Example #1
def scrape_topic(topic_url):
    questions_and_answers = []

    html_page = get_soup(f'https://forum.jusline.at/{topic_url}')
    topic_title = html_page.find("h2", class_="forum-title").get_text()
    print(topic_title)

    page = 1
    while True:
        print(f'PROCESSING PAGE: {page}')

        rows = html_page.find_all("dl", class_="row-item topic_read")

        for row in rows:
            try:
                # skip topics that have no answers yet
                answer_num = row.find("dd", class_="posts")
                if answer_num is None or answer_num.get_text() == "0 Antworten":
                    continue
                # the href is relative ('./...'); drop the leading dot
                article_url = row.find("a", class_="topictitle")["href"][1:]
                article_url = f'https://forum.jusline.at/{article_url}'
                article_html = get_soup(article_url)
                article_data = extract_qa_jusline(article_html)
                article_data["url"] = article_url
                article_data["topic"] = topic_title
                questions_and_answers.append(article_data)
            except Exception as e:
                print("ERROR PROCESSING ARTICLE")
                print(e)

        # follow the rel="next" pagination link until there is none
        next_page_button_link = html_page.find("a", class_="button", rel="next")
        if next_page_button_link is None:
            break

        next_page_url = next_page_button_link["href"]
        html_page = get_soup(f'https://forum.jusline.at/{next_page_url}')
        page += 1

    return questions_and_answers
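
All five examples depend on a get_soup helper that is not shown. A minimal sketch of what it could look like, assuming requests and BeautifulSoup (the User-Agent value is illustrative); note it must return parsed error pages too, since example #2 inspects the body class of a 404 page:

import requests
from bs4 import BeautifulSoup

def get_soup(url):
    # hypothetical helper: fetch a page and parse it with BeautifulSoup.
    # HTTP errors are deliberately not raised, because some callers
    # inspect the 404 page itself to detect the end of pagination.
    response = requests.get(url, headers={"User-Agent": "Mozilla/5.0"})
    return BeautifulSoup(response.text, "html.parser")
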
Example #2
def scrape_topic_pages(topic):
    page = 1
    questions_and_answers = []
    while True:
        print(f'Processing page: {page}')
        html = get_soup(
            f'https://jura-fragen.de/rechtsgebiet/{topic}/page/{page}')
        if 'error404' in html.body["class"]:  # no more pages
            break

        articles = html.find_all(class_="entry-title")
        for article_title in articles:
            article_url = article_title.a["href"]
            article_html = get_soup(article_url)
            article_data = extract_qa_jura_fragen(article_html)
            article_data["url"] = article_url
            questions_and_answers.append(article_data)
        page += 1

    return questions_and_answers
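
Each site also has its own extract_qa_* parser that is not included. A hypothetical skeleton of the expected shape, returning a dict that the scrapers then enrich with "url" (and sometimes "topic"); every selector below is a placeholder, not the real site markup:

def extract_qa_jura_fragen(article_html):
    # placeholder selectors: adjust to the actual page structure
    return {
        "question": article_html.find("h1").get_text(strip=True),
        "answer": article_html.find("div", class_="entry-content").get_text(strip=True),
    }
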
Example #3
import json

def scrape_pages():
    page = 1
    questions_and_answers = []
    while True:

        print(f'Processing page: {page}')
        html = get_soup(f'https://www.gutefrage.net/tag/recht/{page}')

        # check if error page was reached
        error_page_section = html.select("section.ErrorPageSection")
        if len(error_page_section) != 0:
            break

        article_cards = html.select(
            "article.ListingElement.Plate a.ListingElement-questionLink")
        for article_card in article_cards:
            try:
                article_url = f'https://www.gutefrage.net{article_card["href"]}'
                print(article_url)
                article_html = get_soup(article_url)
                article_data = extract_qa_gutefrage(article_html)
                article_data["url"] = article_url
                questions_and_answers.append(article_data)
            except Exception as e:
                print("ERROR PROCESSING ARTICLE")
                print(e)

        # checkpoint: persist progress to disk every other page
        if page % 2 == 0:
            print(len(questions_and_answers))
            with open('./data/gute_frage.json', 'w') as outfile:
                json.dump(questions_and_answers, outfile)

        page += 1

    return questions_and_answers
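
The checkpoint above truncates the JSON file before rewriting it, so a crash mid-write can destroy the previous checkpoint. A safer pattern, standard library only (the helper name save_checkpoint is illustrative), writes to a temporary file and renames it into place:

import json
import os

def save_checkpoint(data, path):
    # write to a sibling temp file first, then atomically swap it in
    tmp_path = path + ".tmp"
    with open(tmp_path, "w") as outfile:
        json.dump(data, outfile)
    os.replace(tmp_path, path)  # atomic replace on POSIX and Windows
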
Example #4
import re

def scrape_topic_pages(topic):
    html_page_1 = get_soup(
        f'https://www.juraforum.de/ratgeber/{topic}/1')
    
    # the indicator reads e.g. 'Seite 1 von 10:'; take the last token and drop the trailing colon
    page_number_indicator = html_page_1.find(string=re.compile("Seite 1 von"))
    page_number_string = page_number_indicator.strip().split(" ")[-1][:-1] if page_number_indicator else ""
    page_number = int(page_number_string) if page_number_string else 1
    print(f'{page_number} pages in topic {topic}.')
    
    questions_and_answers = []
    for page in range(1, page_number + 1):
        print(f'Processing page: {page}')
        html = get_soup(
            f'https://www.juraforum.de/ratgeber/{topic}/{page}')

        articles = html.select("#spalte-inhalt .teaser-xl")
        for article in articles:
            article_url = article.h2.a["href"]
            article_html = get_soup(article_url)
            article_data = extract_qa_jura_forum(article_html)
            article_data["url"] = article_url
            questions_and_answers.append(article_data)

    return questions_and_answers
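
None of the scrapers pause between requests. A minimal throttled wrapper around the get_soup sketch above, assuming a fixed delay is acceptable (the one-second default is arbitrary):

import time

def get_soup_throttled(url, delay=1.0):
    # hypothetical wrapper: sleep before each fetch to be polite to the server
    time.sleep(delay)
    return get_soup(url)
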
Example #5
import json

def scrape_topic_pages():
    html_page_starter = get_soup('https://forum.jusline.at/')

    forum_topic_urls = html_page_starter.find_all("a", class_="forumtitle")

    # get href and remove . from relative path
    forum_topic_urls = [link["href"][1:] for link in forum_topic_urls]

    forum_topic_urls = forum_topic_urls[:-1]  # drop the last link (presumably not a Q&A forum)

    questions_and_answers = []
    for topic_url in forum_topic_urls:
        questions_and_answers.extend(scrape_topic(topic_url))
        # checkpoint after each topic
        with open('./data/forum_jusline_checkpoint.json', 'w') as outfile:
            json.dump(questions_and_answers, outfile)

    return questions_and_answers
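
A minimal entry point for example #5, assuming the script is run directly (the output path is illustrative, mirroring the checkpoint path):

if __name__ == "__main__":
    data = scrape_topic_pages()
    with open('./data/forum_jusline.json', 'w') as outfile:
        json.dump(data, outfile)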