def test_check_html_tags_correct():
    correct_html_tags = False
    soup = soup_session("http://www.businessinsider.com/latest")
    for link in soup.find_all('h2'):
        article_to_open = link.find('a', href=True)
        try:
            no_of_elements = [int(s) for s in article_to_open.text.split() if s.isdigit()]
        except AttributeError:
            continue
        if not no_of_elements:
            continue
        if article_to_open['href'].startswith("http"):
            list_article_link = article_to_open['href']
        else:
            list_article_link = "http://www.businessinsider.com" + article_to_open['href']
        article_soup = soup_session(list_article_link)
        if article_soup.find('h2', attrs={'class': 'slide-title-text'}):
            correct_html_tags = True
            break
    assert correct_html_tags
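# The test above calls soup_session directly; the parsers below reach it through
# the lvm module. It is defined elsewhere in the repo, so this is only a minimal
# sketch of what it presumably does, assuming requests and BeautifulSoup 4
# (the User-Agent header and parser choice are assumptions, not the repo's code):

import requests
from bs4 import BeautifulSoup


def soup_session(link):
    """Fetches a page and returns it parsed as a BeautifulSoup tree.
    Sketch only; the real helper may use a persistent session or other headers."""
    response = requests.get(link, headers={'User-Agent': 'Mozilla/5.0'})
    response.raise_for_status()
    return BeautifulSoup(response.text, 'html.parser')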
def find_article_to_parse(create_post=True):
    """Finds a list article in CollegeHumor's latest article archive and posts the list article to Reddit."""
    website = ArticleType.CollegeHumor
    website_name = convert_enum_to_string(website)
    print(f"Searching {website_name}'s archive.")
    soup = lvm.soup_session(archive_link)
    for article in soup.find_all('h3', attrs={'class': 'title'}):
        article_link = 'http://www.collegehumor.com' + article.find('a')['href']
        if not lvm.article_title_meets_posting_requirements(website, article.text):
            continue
        if article_published_today(article_link):
            article_list_text = get_article_list_text(article_link, lvm.get_article_list_count(article.text))
            if article_list_text and not lvm.post_previously_made(article_link):
                print(f"{website_name} list article found: {article.text}")
                if create_post:
                    post_to_reddit(article.text, article_list_text, article_link, website)
                return True
    print(f"No {website_name} list articles were found to parse at this time.")
    return False
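# ArticleType and convert_enum_to_string are also defined outside this section.
# Based on the members referenced here, they plausibly look like the sketch
# below; the underscore-to-space conversion is an assumption inferred from
# names such as Business_Insider being printed as "Business Insider":

from enum import Enum, auto


class ArticleType(Enum):
    """Websites the bot knows how to parse (members inferred from this section)."""
    CollegeHumor = auto()
    Polygon = auto()
    Business_Insider = auto()
    BuzzFeed = auto()
    Screen_Rant = auto()
    Cracked = auto()


def convert_enum_to_string(website):
    """Turns e.g. ArticleType.Business_Insider into 'Business Insider' (assumed behavior)."""
    return website.name.replace('_', ' ')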
def find_article_to_parse(create_post=True):
    """Finds a list article in Polygon's latest article archive and posts the list article to Reddit."""
    website = ArticleType.Polygon
    website_name = convert_enum_to_string(website)
    print(f"Searching {website_name}'s archive.")
    soup = lvm.soup_session(archive_link)
    for link in soup.find_all('h2', attrs={'class': 'c-entry-box--compact__title'}, limit=max_articles_to_search):
        article_header = link.find('a', href=True)
        article_link = article_header['href']
        print(f"Parsing article: {article_link}")
        time.sleep(1)
        if not lvm.article_title_meets_posting_requirements(website, article_header.text):
            continue
        article_list_text = get_article_list_text(article_link, lvm.get_article_list_count(article_header.text))
        if article_list_text and not lvm.post_previously_made(article_link):
            print(f"{website_name} list article found: {article_header.text}")
            if create_post:
                post_to_reddit(article_header.text, article_list_text, article_link, website)
            return True
    print(f"No {website_name} list articles were found to parse at this time.")
    return False
def find_article_to_parse(create_post=True):
    """Finds a list article in Business Insider's latest article archive and posts the list article to Reddit."""
    website = ArticleType.Business_Insider
    website_name = convert_enum_to_string(website)
    print(f"Searching {website_name}'s archive.")
    soup = lvm.soup_session(archive_link)
    for link in soup.find_all('h2', attrs={'class': 'tout-title default-tout'}):
        article_title = link.find('a', href=True)
        article_link = (article_title['href'] if article_title['href'].startswith("http")
                        else "http://www.businessinsider.com" + article_title['href'])
        print(f"Parsing article: {article_link}")
        time.sleep(1)
        if not lvm.article_title_meets_posting_requirements(website, article_title.text):
            continue
        article_list_text = get_article_list_text(article_link, lvm.get_article_list_count(article_title.text))
        if article_list_text and not lvm.post_previously_made(article_link):
            print(f"{website_name} list article found: {article_title.text}")
            if create_post:
                post_to_reddit(article_title.text, article_list_text, article_link, website)
            return True
    print(f"No {website_name} list articles were found to parse at this time.")
    return False
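# Every find_article_to_parse variant hands off to post_to_reddit, which is not
# shown in this section. A hedged sketch using PRAW; the credential values,
# subreddit name, and post-body layout below are placeholders, not the bot's
# real configuration:

import praw


def post_to_reddit(article_title, article_list_text, article_link, website):
    """Submits the parsed list as a self-post. Sketch only."""
    reddit = praw.Reddit(client_id='CLIENT_ID',
                         client_secret='CLIENT_SECRET',
                         username='BOT_USERNAME',
                         password='BOT_PASSWORD',
                         user_agent='list-article-bot (sketch)')
    body = f"{article_list_text}\n[Source]({article_link})"
    reddit.subreddit('SUBREDDIT_NAME').submit(title=article_title, selftext=body)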
def get_article_list_text(link_to_check, total_list_elements):
    """Concatenates the list elements of the article into a single string. Ensures proper list formatting before making a post."""
    list_counter = 1
    full_list = ""
    soup = lvm.soup_session(link_to_check)
    for article in soup.find_all("h2"):
        list_item_number_element = article.find("span")
        list_item_text_element = article.contents
        list_item_number = list_item_number_element.text if list_item_number_element else str(list_counter)
        list_item_text = (list_item_text_element[-1].strip()
                          if list_item_text_element and isinstance(list_item_text_element[-1], NavigableString)
                          else article.text.strip())
        if list_item_text:
            full_list += f"{list_item_number}. {list_item_text}\n"
            list_counter += 1
    if lvm.article_text_meets_posting_requirements(ArticleType.Screen_Rant, full_list, list_counter, total_list_elements):
        if not full_list.startswith('1. '):
            full_list = lvm.reverse_list(full_list)
        return full_list
    return ""
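# lvm.reverse_list is called here and in the parsers below whenever the parsed
# text does not start with '1. ', i.e. the article counts down instead of up.
# A minimal sketch of what it presumably does; since each line already carries
# its own number, reversing the line order is enough (assumed behavior):

def reverse_list(full_list):
    """Reverses the line order of a numbered list so countdown articles
    ('10.' first) end up starting at '1.'. Sketch of the assumed helper."""
    lines = full_list.strip().split('\n')
    return '\n'.join(reversed(lines)) + '\n'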
def find_article_to_parse(create_post=True):
    """Finds a list article in BuzzFeed's latest article archive and posts the list article to Reddit."""
    website = ArticleType.BuzzFeed
    website_name = convert_enum_to_string(website)
    print(f"Searching {website_name}'s archive.")
    soup = lvm.soup_session(archive_link)
    for link in soup.find_all('article', attrs={'data-buzzblock': 'story-card'}, limit=max_articles_to_search):
        article_title = link.find('a', href=True)
        article_link = article_title['href']
        print(f"Parsing article: {article_link}")
        time.sleep(1)
        if not lvm.article_title_meets_posting_requirements(website, article_title.text):
            continue
        no_of_elements = lvm.get_article_list_count(article_title.text)
        article_list_text = get_article_list_text(article_link, no_of_elements)
        if not article_list_text:
            article_list_text = paragraph_article_text(article_link, no_of_elements)
        if article_list_text and not lvm.post_previously_made(article_link):
            print(f"{website_name} list article found: " + article_title.text)
            if create_post:
                post_to_reddit(article_title.text, article_list_text, article_link, website)
            return True
    print(f"No {website_name} list articles were found to parse at this time.")
    return False
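# lvm.get_article_list_count presumably pulls the element count out of a
# headline such as "21 Pictures That ...". A sketch, assuming the first number
# appearing in the title is the list count:

def get_article_list_count(article_title):
    """Returns the first integer in a list-article title, e.g. 17 for
    '17 Things ...'; returns 0 if no number is present (assumed behavior)."""
    for word in article_title.split():
        if word.isdigit():
            return int(word)
    return 0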
def article_published_today(link):
    """Compares the date the article was written with today's date."""
    soup = lvm.soup_session(link)
    todays_date = date.today().strftime("%B %#d, %Y")  # The # flag (no zero-padded day) is Windows-specific
    date_to_check = soup.find('time', attrs={'class': 'date'})
    return date_to_check is not None and date_to_check.text.strip() == todays_date
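# The %#d flag above only works with Windows strftime; POSIX systems spell the
# same thing %-d. If the bot needs to run on both, one portable variant
# (the helper name is mine, not the repo's):

import platform
from datetime import date


def todays_date_string():
    """Formats today's date as e.g. 'March 4, 2024' without zero-padding the
    day; the no-padding flag differs between Windows and POSIX strftime."""
    day_flag = '%#d' if platform.system() == 'Windows' else '%-d'
    return date.today().strftime(f'%B {day_flag}, %Y')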
def get_article_list_text(link_to_check, total_list_elements):
    """Concatenates the list elements of the article into a single string. Ensures proper list formatting before making a post."""
    list_counter = 1
    full_list = ""
    formatting_options = {
        # Header formatting
        "html_format_1": {
            "wrapper": ["h2"],
            "body": ["strong"]
        }
    }
    soup = lvm.soup_session(link_to_check)
    for option in formatting_options.values():
        wrapper = option["wrapper"]
        body = option["body"]
        for article_point_wrapper in soup.find_all(wrapper[0], attrs=None if len(wrapper) == 1 else {wrapper[1]: wrapper[2]}):
            article_point_text = ""
            for article_point in article_point_wrapper.find_all(body[0], attrs=None if len(body) == 1 else {body[1]: body[2]}):
                article_point_text += article_point.text
            if article_point_text:
                article_point_text = article_point_text.strip()
                if not re.search("^[0-9]+[.]", article_point_text):
                    article_point_text = str(list_counter) + '. ' + article_point_text
                full_list += article_point_text + '\n'
                list_counter += 1
        if lvm.article_text_meets_posting_requirements(ArticleType.Polygon, full_list, list_counter, total_list_elements):
            if not full_list.startswith('1. '):
                full_list = lvm.reverse_list(full_list)
            break
        else:
            list_counter = 1
            full_list = ""
    return full_list
def get_article_list_text(link_to_check, total_list_elements):
    """Concatenates the list elements of the article into a single string. Ensures proper list formatting before making a post."""
    list_counter = 1
    full_list = ""
    formatting_options = {
        # Header formatting
        "html_format_1": {
            "wrapper": ["div", "class", "slide-title clearfix"],
            "body": ["h2", "class", "slide-title-text"]
        },
        # Slide formatting
        "html_format_2": {
            "wrapper": ["div", "class", "slide-module"],
            "body": ["h3"]
        },
        # Paragraph formatting
        "html_format_3": {
            "wrapper": ["ol"],
            "body": ["li"]
        }
    }
    soup = lvm.soup_session(link_to_check)
    for option in formatting_options.values():
        wrapper = option["wrapper"]
        body = option["body"]
        for article_point_wrapper in soup.find_all(wrapper[0], attrs=None if len(wrapper) == 1 else {wrapper[1]: wrapper[2]}):
            for article_point in article_point_wrapper.find_all(body[0], attrs=None if len(body) == 1 else {body[1]: body[2]}):
                if re.search("^[0-9]+[.]", article_point.text):
                    full_list += article_point.text.strip() + '\n'
                else:
                    full_list += str(list_counter) + '. ' + article_point.text.strip() + '\n'
                list_counter += 1
        if lvm.article_text_meets_posting_requirements(ArticleType.Business_Insider, full_list, list_counter, total_list_elements):
            if not full_list.startswith('1. '):
                full_list = lvm.reverse_list(full_list)
            break
        else:
            list_counter = 1
            full_list = ""
    return full_list
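# The wrapper/body lists above double as find_all arguments: a one-element list
# means "match the tag only", a three-element list means "match the tag plus
# one attribute". A self-contained demonstration of that lookup against a toy
# slideshow snippet (the HTML is invented for illustration):

from bs4 import BeautifulSoup

html = """
<div class="slide-title clearfix"><h2 class="slide-title-text">1. First slide</h2></div>
<div class="slide-title clearfix"><h2 class="slide-title-text">2. Second slide</h2></div>
"""
wrapper = ["div", "class", "slide-title clearfix"]
body = ["h2", "class", "slide-title-text"]

demo_soup = BeautifulSoup(html, 'html.parser')
for outer in demo_soup.find_all(wrapper[0], attrs=None if len(wrapper) == 1 else {wrapper[1]: wrapper[2]}):
    for inner in outer.find_all(body[0], attrs=None if len(body) == 1 else {body[1]: body[2]}):
        print(inner.text)  # '1. First slide', then '2. Second slide'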
def get_article_list_text(link_to_check, total_list_elements):
    """Concatenates the list elements of the article into a single string. Ensures proper list formatting before making a post."""
    list_counter = 1
    full_list = ""
    soup = lvm.soup_session(link_to_check)
    for article in soup.find_all('h2'):
        list_item_number_element = article.find('span', attrs={'class': 'subbuzz__number'})
        list_item_text_element = article.find('span', attrs={'class': 'js-subbuzz__title-text'})
        if list_item_number_element and list_item_text_element:
            list_item_number = list_item_number_element.text
            # Tries to add a hyperlink to the article list element being searched, if it has any.
            try:
                for link in list_item_text_element.find_all('a', href=True):
                    link_to_use = link['href']
                    # Removes redirect link if there is any.
                    if link_to_use.startswith('http:') and (r'/https:' in link_to_use or r'/http:' in link_to_use):
                        link_to_use = 'http' + link_to_use.split(r'/http', 1)[1]
                    link_to_use = link_to_use.replace(')', r'\)')
                    full_list += list_item_number + ' [' + list_item_text_element.text + '](' + link_to_use + ')\n'
                    break
            except KeyError as e:
                print("Key Error: " + str(e))
            # If the list element doesn't have a link associated to it, post it as plain text.
            if not list_item_text_element.find_all('a', href=True):
                full_list += list_item_number + ' ' + list_item_text_element.text + '\n'
            list_counter += 1
    if lvm.article_text_meets_posting_requirements(ArticleType.BuzzFeed, full_list, list_counter, total_list_elements):
        if not full_list.startswith('1. '):
            full_list = lvm.reverse_list(full_list)
        return full_list
    return ""
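# The redirect-stripping branch above is easiest to see with a concrete URL.
# A small worked example of the same split (the redirect wrapper URL is made up):

link_to_use = 'http://redirect.example.com/r/https://target.example.com/article'
if link_to_use.startswith('http:') and ('/https:' in link_to_use or '/http:' in link_to_use):
    # Everything after the first '/http' is the real link; re-attach the 'http' prefix.
    link_to_use = 'http' + link_to_use.split('/http', 1)[1]
print(link_to_use)  # https://target.example.com/article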
def paragraph_article_text(link_to_check, total_list_elements):
    """Parses BuzzFeed list articles that are in paragraph form (has the 'p' HTML tag)."""
    list_counter = 1
    full_list = ""
    soup = lvm.soup_session(link_to_check)
    for list_element in soup.find_all('p'):
        if list_element.text and list_element.text[0].isdigit():
            full_list += list_element.text.replace(') ', '. ', 1) + '\n'
            list_counter += 1
    if lvm.article_text_meets_posting_requirements(ArticleType.BuzzFeed, full_list, list_counter, total_list_elements):
        if not full_list.startswith('1. '):
            full_list = lvm.reverse_list(full_list)
        return full_list
    return ""
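# Every parser defers the final go/no-go decision to
# lvm.article_text_meets_posting_requirements. Its exact rules live elsewhere
# in the repo; a plausible sketch is that it checks the number of elements
# actually parsed against the count promised by the headline:

def article_text_meets_posting_requirements(website, full_list, list_counter, total_list_elements):
    """Sketch: accept the parsed text only when it is non-empty and the number
    of elements found matches the headline's count. The real helper may apply
    additional website-specific rules."""
    elements_parsed = list_counter - 1  # list_counter starts at 1
    return bool(full_list) and elements_parsed == total_list_elements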
def get_article_list_text(link_to_check, total_list_elements):
    """Concatenates the list elements of the article into a single string. Ensures proper list formatting before making a post."""
    list_counter = 1
    full_list = ''
    soup = lvm.soup_session(link_to_check)
    for article in soup.find_all('h2'):
        if not article.text or article.text[0].isdigit():
            continue
        full_list += str(list_counter) + '. ' + article.text.strip() + '\n'
        list_counter += 1
    if lvm.article_text_meets_posting_requirements(ArticleType.CollegeHumor, full_list, list_counter, total_list_elements):
        return full_list
    return ''
def find_article_to_parse(create_post=True):
    """Finds a list article in Screen Rant's latest article archive and posts the list article to Reddit."""
    website = ArticleType.Screen_Rant
    website_name = convert_enum_to_string(website)
    print(f"Searching {website_name}'s archive.")
    soup = lvm.soup_session(archive_link)
    for article in soup.find_all("h3", attrs={"class": "bc-title"}, limit=max_articles_to_search):
        article_header = article.find("a", href=True)
        if article_header:
            article_title = article_header['title']
            article_link = (article_header['href'] if article_header['href'].startswith("http")
                            else "http://www.screenrant.com" + article_header['href'])
            print(f"Parsing article: {article_link}")
            time.sleep(1)
            if not lvm.article_title_meets_posting_requirements(website, article_title):
                continue
            article_list_text = get_article_list_text(article_link, lvm.get_article_list_count(article_title))
            if article_list_text and not lvm.post_previously_made(article_link):
                print(f"{website_name} list article found: {article_title}")
                if create_post:
                    post_to_reddit(article_title, article_list_text, article_link, website)
                return True
    print(f"No {website_name} list articles were found to parse at this time.")
    return False
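# lvm.post_previously_made guards every parser against duplicate submissions.
# One hedged way to implement it with PRAW is to scan the bot's recent
# submissions for the same source link; the redditor name, the limit, and the
# praw.ini site name are placeholders, and the real helper might consult a
# local log instead:

import praw


def post_previously_made(article_link, reddit=None):
    """Sketch: returns True if the bot's recent submissions already cite
    this article."""
    if reddit is None:
        reddit = praw.Reddit(site_name='bot')  # assumes a praw.ini section named 'bot'
    for submission in reddit.redditor('BOT_USERNAME').submissions.new(limit=100):
        if article_link in submission.selftext:
            return True
    return False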
def get_article_list_text(link_to_check, total_list_elements):
    """Concatenates the list elements of the article into a single string. Ensures proper list formatting before making a post."""
    list_counter = 1
    full_list = ""
    soup = lvm.soup_session(link_to_check)
    for article in soup.find_all("h2", attrs={"class": "subheading subheading-num"}):
        # Expecting two items in contents list (number and header text).
        if len(article.contents) != 2:
            return ""
        list_item_number_element = article.contents[0] if article.contents[0].name == 'label' else None
        list_item_text_element = article.contents[1] if article.contents[1].name == 'span' else None
        list_item_number = list_item_number_element.text.strip() if list_item_number_element else None
        list_item_text = list_item_text_element.text.strip() if list_item_text_element else None
        if list_item_text:
            full_list += f"{list_item_number}. {list_item_text}\n"
            list_counter += 1
    if lvm.article_text_meets_posting_requirements(ArticleType.Cracked, full_list, list_counter, total_list_elements):
        if not full_list.startswith('1. '):
            full_list = lvm.reverse_list(full_list)
        return full_list
    return ""