예제 #1
0
def parse():
    """Scrape ukr.net: the front page first, then the news category index.

    Each URL is fetched through ``helper.get_parsed_data`` and the result is
    handed to the matching collector: ``get_news_from_main`` for the front
    page, ``get_news_from_categories`` for the category index.  Nothing is
    returned; what the collectors do with the news is not visible here.
    """
    front_page = helper.get_parsed_data("https://www.ukr.net/ua/")
    get_news_from_main(front_page)

    category_index = helper.get_parsed_data(
        "https://www.ukr.net/news/main.html")
    get_news_from_categories(category_index)
예제 #2
0
def get_news_from_categories(parsed_main_page):
    """Collect news for the first category found on the parsed page.

    The categories come from ``get_categories_list(parsed_main_page)``.
    Each entry is indexed as ``entry[0]`` (passed through to
    ``get_news_from_category``) and ``entry[1]`` (a link that gets the
    ``"https:"`` prefix before being fetched with ``to_scroll=True``).

    NOTE(review): only the FIRST category is processed; any further
    entries are ignored.  Confirm whether this limit is intentional.
    """
    remaining = iter(get_categories_list(parsed_main_page))
    nothing = object()  # sentinel: distinguishes "no categories" from any real entry
    first = next(remaining, nothing)
    if first is nothing:
        return

    page = helper.get_parsed_data("https:" + first[1], to_scroll=True)
    get_news_from_category(first[0], page)
예제 #3
0
def get_images(url):
    """Return the ``src`` of every ``<img>`` inside the page's main entry.

    The page is fetched with ``helper.get_parsed_data`` and searched for the
    ``<div class="h-entry c-main">`` container; only images inside that
    container are considered.

    This is a best-effort helper: any ordinary error while fetching or
    walking the page results in an empty list rather than an exception.
    ``KeyboardInterrupt`` and ``SystemExit`` are NOT swallowed, so the
    scraper can still be interrupted or shut down while this runs.

    Args:
        url: Address of the article page to inspect.

    Returns:
        A list of the ``src`` attribute values, in document order.  Empty
        when the container is missing or the page could not be processed.
    """
    try:
        parsed_page = helper.get_parsed_data(url)

        container = parsed_page.find(name="div",
                                     attrs={"class": "h-entry c-main"})
        if container is None:
            # Page does not have the expected layout: nothing to collect.
            return []

        # Skip <img> tags without a src attribute instead of letting one
        # KeyError discard every link that was collected successfully.
        return [
            image["src"] for image in container.findAll(name="img")
            if image.get("src") is not None
        ]
    except Exception:
        return []
예제 #4
0
def get_content(url):
    """Fetch an article page and return its paragraph text.

    The page must contain ``<article class="o-cmr u-content-read">``; the
    stripped strings of every ``<p>`` inside it are concatenated, each one
    followed by a single space (so non-empty text ends with a space).

    Args:
        url: Address of the article page.

    Returns:
        A 3-tuple ``("pseudo_title", text, "pseudo_images")``.  The first and
        last items are fixed placeholder strings; only ``text`` is real data.

    Raises:
        ValueError: If the expected ``<article>`` element is not found.
    """
    page = helper.get_parsed_data(url)
    article = page.find(name="article",
                        attrs={"class": "o-cmr u-content-read"})
    if article is None:
        raise ValueError("Didn't recognise as article")

    # Walk the paragraphs in document order, one text fragment at a time.
    text = "".join(fragment + " "
                   for paragraph in article.findAll(name="p")
                   for fragment in paragraph.stripped_strings)

    return ("pseudo_title", text, "pseudo_images")
예제 #5
0
def get_news_from_categories(categories):
    """Collect news for the first entry of ``categories``.

    Each entry is indexed as ``entry[0]`` (forwarded to
    ``get_news_from_category``) and ``entry[1]`` (fetched as-is through
    ``helper.get_parsed_data``).

    NOTE(review): the function exits after handling one entry, so every
    category after the first is ignored.  Confirm whether this limit is
    intentional.
    """
    for entry in categories:
        page = helper.get_parsed_data(entry[1])
        get_news_from_category(entry[0], page)
        # Only the first entry is handled.
        return
예제 #6
0
def parse():
    """Scrape tsn.ua starting from its front page.

    The front page is fetched with ``helper.get_parsed_data``, the category
    list is extracted from it by ``get_categories_list``, and that list is
    handed to ``get_news_from_categories``.  Nothing is returned.
    """
    front_page = helper.get_parsed_data("https://tsn.ua/")
    get_news_from_categories(get_categories_list(front_page))