def parse():
    """Entry point for the ukr.net scraper.

    Fetches the front page and the main news listing, handing each parsed
    page off to the corresponding extraction routine.
    """
    front_page = helper.get_parsed_data("https://www.ukr.net/ua/")
    get_news_from_main(front_page)

    news_listing = helper.get_parsed_data("https://www.ukr.net/news/main.html")
    get_news_from_categories(news_listing)
def get_news_from_categories(parsed_main_page):
    """Scrape category pages discovered on the parsed main page.

    Each category entry is a pair-like object: ``entry[0]`` is the category
    name, ``entry[1]`` a protocol-relative URL (hence the ``"https:"``
    prefix).

    NOTE(review): the unconditional ``break`` means only the FIRST category
    is processed — presumably a deliberate limit; confirm before removing.
    """
    for entry in get_categories_list(parsed_main_page):
        category_page = helper.get_parsed_data("https:" + entry[1], to_scroll=True)
        get_news_from_category(entry[0], category_page)
        break
def get_images(url):
    """Return the ``src`` attribute of every ``<img>`` inside the article body.

    Looks for the ``div.h-entry.c-main`` container on the page at *url* and
    collects image source links from it.

    Returns:
        list[str]: image URLs; an empty list when the page cannot be fetched,
        the container is missing (``find`` returns ``None`` and ``.findAll``
        raises ``AttributeError``), or an ``<img>`` lacks a ``src``.
    """
    try:
        parsed_page = helper.get_parsed_data(url)
        container = parsed_page.find(name="div", attrs={"class": "h-entry c-main"})
        # Comprehension replaces the manual append loop.
        return [image["src"] for image in container.findAll(name="img")]
    except Exception:
        # Narrowed from a bare ``except:`` — a bare except also swallows
        # SystemExit/KeyboardInterrupt. Best-effort contract is preserved:
        # any scraping failure yields an empty list.
        return []
def get_content(url):
    """Fetch *url* and extract the article text.

    The article is expected in an ``article.o-cmr.u-content-read`` element;
    the text of every ``<p>`` tag inside it is concatenated, each stripped
    string followed by a single space (matching the original output exactly,
    including the trailing space).

    Returns:
        tuple: ``("pseudo_title", text, "pseudo_images")`` — title and images
        are placeholders in this implementation.

    Raises:
        ValueError: when the expected article element is not on the page.
    """
    parsed_page = helper.get_parsed_data(url)
    article = parsed_page.find(name="article", attrs={"class": "o-cmr u-content-read"})
    if article is None:
        raise ValueError("Didn't recognise as article")
    # Collect all paragraph fragments first, then join once — avoids the
    # quadratic cost of repeated ``text += ...`` string concatenation.
    pieces = []
    for tag in article.findAll(name="p"):
        pieces.extend(tag.stripped_strings)
    text = "".join(piece + " " for piece in pieces)
    return ("pseudo_title", text, "pseudo_images")
def get_news_from_categories(categories):
    """Scrape the pages for the given category entries.

    Each entry is pair-like: ``entry[0]`` is the category name, ``entry[1]``
    its absolute URL.

    NOTE(review): the unconditional ``break`` stops after the first
    category — presumably a deliberate limit; confirm before removing.
    """
    for entry in categories:
        category_page = helper.get_parsed_data(entry[1])
        get_news_from_category(entry[0], category_page)
        break
def parse():
    """Entry point for the tsn.ua scraper.

    Parses the front page, extracts the category link list from it, and
    dispatches each category for scraping.

    NOTE(review): this chunk defines ``parse`` (and
    ``get_news_from_categories``) twice — it appears to concatenate two
    scraper modules; verify the intended module boundaries.
    """
    front_page = helper.get_parsed_data("https://tsn.ua/")
    get_news_from_categories(get_categories_list(front_page))