Example #1
def scrape_topic(topic_url):
    questions_and_answers = []

    html_page = get_soup(f'https://forum.jusline.at/{topic_url}')
    topic_title = html_page.find("h2", class_="forum-title").get_text()
    print(topic_title)

    page = 1
    while True:
        print(f'PROCESSING PAGE: {page}')

        rows = html_page.find_all("dl", class_="row-item topic_read")

        for row in rows:
            try:
                # skip topics that have no answers yet
                answer_num = row.find("dd", class_="posts")
                if answer_num is None or answer_num.get_text() == "0 Antworten":
                    continue
                # the href is relative ('./...'); drop the leading dot
                article_url = row.find("a", class_="topictitle")["href"][1:]
                article_url = f'https://forum.jusline.at/{article_url}'
                article_html = get_soup(article_url)
                article_data = extract_qa_jusline(article_html)
                article_data["url"] = article_url
                article_data["topic"] = topic_title
                questions_and_answers.append(article_data)
            except Exception as e:
                print("ERROR PROCESSING ARTICLE")
                print(e)

        # follow the rel="next" pagination link until there is none
        next_page_button_link = html_page.find("a", class_="button", rel="next")
        if next_page_button_link is None:
            break

        next_page_url = next_page_button_link["href"]
        html_page = get_soup(f'https://forum.jusline.at/{next_page_url}')
        page += 1

    return questions_and_answers
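
All five examples depend on a get_soup helper that is not shown. A minimal sketch of what it could look like, assuming requests and BeautifulSoup (the User-Agent value is illustrative); note it must return parsed error pages too, since example #2 inspects the body class of a 404 page:

import requests
from bs4 import BeautifulSoup

def get_soup(url):
    # hypothetical helper: fetch a page and parse it with BeautifulSoup.
    # HTTP errors are deliberately not raised, because some callers
    # inspect the 404 page itself to detect the end of pagination.
    response = requests.get(url, headers={"User-Agent": "Mozilla/5.0"})
    return BeautifulSoup(response.text, "html.parser")
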
Example #2
def scrape_topic_pages(topic):
    page = 1
    questions_and_answers = []
    while True:
        print(f'Processing page: {page}')
        html = get_soup(
            f'https://jura-fragen.de/rechtsgebiet/{topic}/page/{page}')
        if 'error404' in html.body["class"]:  # no more pages
            break

        articles = html.find_all(class_="entry-title")
        for article_title in articles:
            article_url = article_title.a["href"]
            article_html = get_soup(article_url)
            article_data = extract_qa_jura_fragen(article_html)
            article_data["url"] = article_url
            questions_and_answers.append(article_data)
        page += 1

    return questions_and_answers
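
Each site also has its own extract_qa_* parser that is not included. A hypothetical skeleton of the expected shape, returning a dict that the scrapers then enrich with "url" (and sometimes "topic"); every selector below is a placeholder, not the real site markup:

def extract_qa_jura_fragen(article_html):
    # placeholder selectors: adjust to the actual page structure
    return {
        "question": article_html.find("h1").get_text(strip=True),
        "answer": article_html.find("div", class_="entry-content").get_text(strip=True),
    }
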
Example #3
import json

def scrape_pages():
    page = 1
    questions_and_answers = []
    while True:

        print(f'Processing page: {page}')
        html = get_soup(f'https://www.gutefrage.net/tag/recht/{page}')

        # check if error page was reached
        error_page_section = html.select("section.ErrorPageSection")
        if len(error_page_section) != 0:
            break

        article_cards = html.select(
            "article.ListingElement.Plate a.ListingElement-questionLink")
        for article_card in article_cards:
            try:
                article_url = f'https://www.gutefrage.net{article_card["href"]}'
                print(article_url)
                article_html = get_soup(article_url)
                article_data = extract_qa_gutefrage(article_html)
                article_data["url"] = article_url
                questions_and_answers.append(article_data)
            except Exception as e:
                print("ERROR PROCESSING ARTICLE")
                print(e)

        # checkpoint: persist progress to disk every other page
        if page % 2 == 0:
            print(len(questions_and_answers))
            with open('./data/gute_frage.json', 'w') as outfile:
                json.dump(questions_and_answers, outfile)

        page += 1

    return questions_and_answers
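
The checkpoint above truncates the JSON file before rewriting it, so a crash mid-write can destroy the previous checkpoint. A safer pattern, standard library only (the helper name save_checkpoint is illustrative), writes to a temporary file and renames it into place:

import json
import os

def save_checkpoint(data, path):
    # write to a sibling temp file first, then atomically swap it in
    tmp_path = path + ".tmp"
    with open(tmp_path, "w") as outfile:
        json.dump(data, outfile)
    os.replace(tmp_path, path)  # atomic replace on POSIX and Windows
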
Example #4
import re

def scrape_topic_pages(topic):
    html_page_1 = get_soup(
        f'https://www.juraforum.de/ratgeber/{topic}/1')
    
    # the indicator reads e.g. 'Seite 1 von 10:'; take the last token and drop the trailing colon
    page_number_indicator = html_page_1.find(string=re.compile("Seite 1 von"))
    page_number_string = page_number_indicator.strip().split(" ")[-1][:-1] if page_number_indicator else ""
    page_number = int(page_number_string) if page_number_string else 1
    print(f'{page_number} pages in topic {topic}.')
    
    questions_and_answers = []
    for page in range(1, page_number + 1):
        print(f'Processing page: {page}')
        html = get_soup(
            f'https://www.juraforum.de/ratgeber/{topic}/{page}')

        articles = html.select("#spalte-inhalt .teaser-xl")
        for article in articles:
            article_url = article.h2.a["href"]
            article_html = get_soup(article_url)
            article_data = extract_qa_jura_forum(article_html)
            article_data["url"] = article_url
            questions_and_answers.append(article_data)

    return questions_and_answers
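
None of the scrapers pause between requests. A minimal throttled wrapper around the get_soup sketch above, assuming a fixed delay is acceptable (the one-second default is arbitrary):

import time

def get_soup_throttled(url, delay=1.0):
    # hypothetical wrapper: sleep before each fetch to be polite to the server
    time.sleep(delay)
    return get_soup(url)
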
Example #5
import json

def scrape_topic_pages():
    html_page_starter = get_soup('https://forum.jusline.at/')

    forum_topic_urls = html_page_starter.find_all("a", class_="forumtitle")

    # get href and remove . from relative path
    forum_topic_urls = [link["href"][1:] for link in forum_topic_urls]

    forum_topic_urls = forum_topic_urls[:-1]  # drop the last link (presumably not a Q&A forum)

    questions_and_answers = []
    for topic_url in forum_topic_urls:
        questions_and_answers.extend(scrape_topic(topic_url))
        # checkpoint after each topic
        with open('./data/forum_jusline_checkpoint.json', 'w') as outfile:
            json.dump(questions_and_answers, outfile)

    return questions_and_answers
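
A minimal entry point for example #5, assuming the script is run directly (the output path is illustrative, mirroring the checkpoint path):

if __name__ == "__main__":
    data = scrape_topic_pages()
    with open('./data/forum_jusline.json', 'w') as outfile:
        json.dump(data, outfile)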