def scrape_topic(topic_url):
    """Scrape all answered Q&A threads from one forum.jusline.at topic.

    Follows the topic's rel="next" pagination links until exhausted.

    Args:
        topic_url: Topic path relative to the forum root (no leading slash).

    Returns:
        List of dicts produced by extract_qa_jusline, each augmented with
        "url" and "topic" keys.
    """
    questions_and_answers = []
    html_page = get_soup(f'https://forum.jusline.at/{topic_url}')
    topic_title = html_page.find("h2", class_="forum-title").get_text()
    print(topic_title)
    page = 1
    while True:
        print(f'PROCESSING PAGE: {page}')
        rows = html_page.findAll("dl", class_="row-item topic_read")
        for row in rows:
            try:
                answer_num = row.find("dd", class_="posts")
                # Skip threads that have no answers at all.
                if answer_num is None or answer_num.get_text() == "0 Antworten":
                    continue
                # href is a relative path like "./viewtopic..."; drop the dot.
                article_url = row.find("a", class_="topictitle")["href"][1:]
                article_url = f'https://forum.jusline.at/{article_url}'
                article_html = get_soup(article_url)
                article_data = extract_qa_jusline(article_html)
                article_data["url"] = article_url
                article_data["topic"] = topic_title
                questions_and_answers.append(article_data)
            except Exception as e:
                # Was `except BaseException`, which also swallowed
                # KeyboardInterrupt/SystemExit; Exception keeps a failed
                # article from killing the scrape while Ctrl-C still works.
                print("ERROR PROCESSING ARTICLE")
                print(e)
        next_page_button_link = html_page.find("a", class_="button", rel="next")
        if next_page_button_link is None:
            break
        next_page_url = next_page_button_link["href"]
        html_page = get_soup(f'https://forum.jusline.at/{next_page_url}')
        page += 1
    return questions_and_answers
def scrape_topic_pages(topic):
    """Collect all Q&A entries for one jura-fragen.de legal topic.

    Walks the topic's numbered listing pages until the site serves its
    404 page, scraping every linked article along the way.

    Args:
        topic: Topic slug used in the rechtsgebiet URL.

    Returns:
        List of dicts from extract_qa_jura_fragen, each with a "url" key.
    """
    results = []
    page_num = 0
    while True:
        page_num += 1
        print(f'Processing page: {page_num}')
        soup = get_soup(
            f'https://jura-fragen.de/rechtsgebiet/{topic}/page/{page_num}')
        # Past the last page the site answers with a body carrying the
        # "error404" class — that's the stop signal.
        if 'error404' in soup.body["class"]:
            break
        for title_node in soup.find_all(class_="entry-title"):
            link = title_node.a["href"]
            entry = extract_qa_jura_fragen(get_soup(link))
            entry["url"] = link
            results.append(entry)
    return results
def scrape_pages():
    """Scrape Q&A articles from gutefrage.net's "recht" tag listing.

    Iterates numbered listing pages until the site's error page appears,
    checkpointing all results to ./data/gute_frage.json every second page.

    Returns:
        List of dicts from extract_qa_gutefrage, each with a "url" key.
    """
    page = 1
    questions_and_answers = []
    while True:
        print(f'Processing page: {page}')
        html = get_soup(f'https://www.gutefrage.net/tag/recht/{page}')
        # An ErrorPageSection means we ran past the last listing page.
        error_page_section = html.select("section.ErrorPageSection")
        if error_page_section:
            break
        article_cards = html.select(
            "article.ListingElement.Plate a.ListingElement-questionLink")
        for article_card in article_cards:
            try:
                article_url = f'https://www.gutefrage.net{article_card["href"]}'
                print(article_url)
                article_html = get_soup(article_url)
                article_data = extract_qa_gutefrage(article_html)
                article_data["url"] = article_url
                questions_and_answers.append(article_data)
            except Exception as e:
                # Exception (not BaseException) so Ctrl-C still aborts the run.
                print("ERROR PROCESSING ARTICLE")
                print(e)
        # Checkpoint every second page.
        if page % 2 == 0:
            print(len(questions_and_answers))
            with open('./data/gute_frage.json', 'w') as outfile:
                # Bug fix: the original json.dumps()-ed the list to a string
                # and then json.dump()-ed that string again, writing a
                # doubly-encoded file. Dump the list directly.
                json.dump(questions_and_answers, outfile)
        page += 1
    return questions_and_answers
def scrape_topic_pages(topic):
    """Scrape all ratgeber articles for one juraforum.de topic.

    Reads the "Seite 1 von N:" indicator on the first page to learn the
    page count, then visits every page and scrapes each linked article.

    Args:
        topic: Topic slug used in the ratgeber URL.

    Returns:
        List of dicts from extract_qa_jura_forum, each with a "url" key.
    """
    html_page_1 = get_soup(f'https://www.juraforum.de/ratgeber/{topic}/1')
    page_number_indicator = html_page_1.find(string=re.compile("Seite 1 von"))
    page_number = 1
    # Robustness fix: the original crashed with AttributeError when the
    # indicator was missing, and its `len(...) > 0` guard still let
    # non-numeric text reach int(). Fall back to a single page instead.
    if page_number_indicator is not None:
        # Indicator looks like 'Seite 1 von 10:' -> take '10:' and drop ':'.
        page_number_string = page_number_indicator.strip().split(" ")[-1][:-1]
        if page_number_string.isdigit():
            page_number = int(page_number_string)
    print(f'{page_number} pages in topic {topic}.')
    questions_and_answers = []
    for page in range(1, page_number + 1):
        print(f'Processing page: {page}')
        html = get_soup(f'https://www.juraforum.de/ratgeber/{topic}/{page}')
        articles = html.select("#spalte-inhalt .teaser-xl")
        for article in articles:
            article_url = article.h2.a["href"]
            article_html = get_soup(article_url)
            article_data = extract_qa_jura_forum(article_html)
            article_data["url"] = article_url
            questions_and_answers.append(article_data)
    return questions_and_answers
def scrape_topic_pages():
    """Scrape every forum.jusline.at topic, checkpointing after each one.

    Discovers topic links on the forum start page, scrapes each topic via
    scrape_topic, and rewrites ./data/forum_jusline_checkpoint.json after
    every topic so progress survives interruptions.

    Returns:
        Combined list of Q&A dicts across all scraped topics.
    """
    start_page = get_soup('https://forum.jusline.at/')
    topic_links = start_page.findAll("a", class_="forumtitle")
    # hrefs are relative paths ("./..."); strip the leading dot. The last
    # link is excluded by the slice, matching the original behavior.
    topic_paths = [anchor["href"][1:] for anchor in topic_links][:-1]
    collected = []
    for path in topic_paths:
        collected.extend(scrape_topic(path))
        with open('./data/forum_jusline_checkpoint.json', 'w') as outfile:
            json.dump(collected, outfile)
    return collected