def test_check_html_tags_correct():
    """Verifies that the Business Insider list-article HTML markup is unchanged.

    Scans the latest-articles page for the first headline containing a
    number (a list article) and checks that the article's slides still use
    the expected <h2 class="slide-title-text"> markup.  Passes trivially
    when no list article is currently on the page.

    BUG FIX: the original loop never stopped at the first matching
    headline (its trailing ``if not no_of_elements: continue`` was dead
    code), so only the LAST <h2> on the page was ever checked.
    """
    no_of_elements = []  # was initialized to 0 but always holds a list
    correct_html_tags = False
    article_to_open = None
    soup = soup_session("http://www.businessinsider.com/latest")

    for link in soup.find_all('h2'):

        article_to_open = link.find('a', href=True)

        # Headlines without an anchor raise AttributeError on .text.
        try:
            no_of_elements = [
                int(s) for s in article_to_open.text.split() if s.isdigit()
            ]
        except AttributeError:
            continue

        # Stop at the first headline that contains a number (a list article).
        if no_of_elements:
            break

    if no_of_elements:

        # Archive links may be relative; normalize to an absolute URL.
        if article_to_open['href'].startswith("http"):
            list_article_link = article_to_open['href']
        else:
            list_article_link = "http://www.businessinsider.com" + article_to_open[
                'href']

        soup = soup_session(list_article_link)

        if soup.find('h2', attrs={'class': 'slide-title-text'}):
            correct_html_tags = True
    else:
        # No list article on the page right now; nothing to validate.
        correct_html_tags = True

    assert correct_html_tags
# Example #2
def find_article_to_parse(create_post=True):
    """Finds a list article in CollegeHumor's latest article archive and posts the list article to Reddit."""

    site = ArticleType.CollegeHumor
    site_name = convert_enum_to_string(site)

    print(f"Searching {site_name}'s archive.")
    archive_soup = lvm.soup_session(archive_link)

    for header in archive_soup.find_all('h3', attrs={'class': 'title'}):

        # Archive links are relative; build the absolute URL.
        link = 'http://www.collegehumor.com' + header.find('a')['href']

        if not lvm.article_title_meets_posting_requirements(site, header.text):
            continue

        # Only articles published today are eligible for posting.
        if not article_published_today(link):
            continue

        list_text = get_article_list_text(link,
                                          lvm.get_article_list_count(header.text))
        if list_text and not lvm.post_previously_made(link):
            print(f"{site_name} list article found: " + header.text)
            if create_post:
                post_to_reddit(header.text, list_text, link, site)
            return True

    print(f"No {site_name} list articles were found to parse at this time.")
    return False
# Example #3
def find_article_to_parse(create_post=True):
    """Finds a list article in Polygon's latest article archive and posts the list article to Reddit."""

    site = ArticleType.Polygon
    site_name = convert_enum_to_string(site)

    print(f"Searching {site_name}'s archive.")
    soup = lvm.soup_session(archive_link)

    for entry in soup.find_all('h2',
                               attrs={'class': 'c-entry-box--compact__title'},
                               limit=max_articles_to_search):

        anchor = entry.find('a', href=True)
        link = anchor['href']
        print("Parsing article: " + link)
        time.sleep(1)  # be polite to the server between article fetches

        if not lvm.article_title_meets_posting_requirements(site, anchor.text):
            continue

        list_text = get_article_list_text(link,
                                          lvm.get_article_list_count(anchor.text))
        if list_text and not lvm.post_previously_made(link):
            print(f"{site_name} list article found: " + anchor.text)
            if create_post:
                post_to_reddit(anchor.text, list_text, link, site)
            return True

    print(f"No {site_name} list articles were found to parse at this time.")
    return False
def find_article_to_parse(create_post=True):
    """Finds a list article in Business Insider's latest article archive and posts the list article to Reddit."""

    site = ArticleType.Business_Insider
    site_name = convert_enum_to_string(site)

    print(f"Searching {site_name}'s archive.")
    soup = lvm.soup_session(archive_link)

    for tout in soup.find_all('h2', attrs={'class': 'tout-title default-tout'}):

        anchor = tout.find('a', href=True)
        href = anchor['href']
        # Archive links may be relative; normalize to an absolute URL.
        if href.startswith("http"):
            link = href
        else:
            link = "http://www.businessinsider.com" + href

        print("Parsing article: " + link)
        time.sleep(1)  # be polite to the server between article fetches

        if not lvm.article_title_meets_posting_requirements(site, anchor.text):
            continue

        list_text = get_article_list_text(link,
                                          lvm.get_article_list_count(anchor.text))
        if list_text and not lvm.post_previously_made(link):
            print(f"{site_name} list article found: " + anchor.text)
            if create_post:
                post_to_reddit(anchor.text, list_text, link, site)
            return True

    print(f"No {site_name} list articles were found to parse at this time.")
    return False
# Example #5
def get_article_list_text(link_to_check, total_list_elements):
    """Concatenates the list elements of the article into a single string. Ensures proper list formatting before making a post."""

    counter = 1
    result = ""

    soup = lvm.soup_session(link_to_check)

    for heading in soup.find_all("h2"):

        number_span = heading.find("span")
        children = heading.contents

        # Prefer the explicit number from the <span>; fall back to our counter.
        if number_span:
            item_number = number_span.text
        else:
            item_number = str(counter)

        # The item text is the trailing plain-text node when present,
        # otherwise the heading's full text.
        if children and isinstance(children[-1], NavigableString):
            item_text = children[-1].strip()
        else:
            item_text = heading.text.strip()

        if item_text:
            result += f"{item_number}. {item_text}\n"
            counter += 1

    if lvm.article_text_meets_posting_requirements(ArticleType.Screen_Rant,
                                                   result, counter,
                                                   total_list_elements):
        # Countdown-style lists are reversed into ascending order.
        if not result.startswith('1. '):
            result = lvm.reverse_list(result)

        return result
# Example #6
def find_article_to_parse(create_post=True):
    """Finds a list article in BuzzFeed's latest article archive and posts the list article to Reddit."""

    site = ArticleType.BuzzFeed
    site_name = convert_enum_to_string(site)

    print(f"Searching {site_name}'s archive.")
    soup = lvm.soup_session(archive_link)

    for card in soup.find_all('article', attrs={'data-buzzblock': 'story-card'}, limit=max_articles_to_search):

        title_anchor = card.find('a', href=True)
        link = title_anchor['href']
        print("Parsing article: " + link)
        time.sleep(1)  # be polite to the server between article fetches

        if not lvm.article_title_meets_posting_requirements(site, title_anchor.text):
            continue

        element_count = lvm.get_article_list_count(title_anchor.text)

        # Fall back to paragraph-style parsing when no list markup is found.
        list_text = (get_article_list_text(link, element_count)
                     or paragraph_article_text(link, element_count))

        if list_text and not lvm.post_previously_made(link):
            print(f"{site_name} list article found: " + title_anchor.text)
            if create_post:
                post_to_reddit(title_anchor.text, list_text, link, site)
            return True

    print(f"No {site_name} list articles were found to parse at this time.")
    return False
# Example #7
def article_published_today(link):
    """Returns True if the article at *link* was published today.

    Compares the text of the page's <time class="date"> element against
    today's date formatted the way the site renders it (e.g. "March 3, 2021").
    """
    soup = lvm.soup_session(link)

    # The '#' (strip-leading-zero) flag is Windows-specific.
    todays_date = date.today().strftime("%B %#d, %Y")
    date_element = soup.find('time', attrs={'class': 'date'})

    # BUG FIX: the original compared the bs4 Tag object itself to the date
    # string, which can never be equal; compare the element's text instead
    # (and guard against the element being absent).
    return date_element is not None and date_element.text.strip() == todays_date
# Example #8
def get_article_list_text(link_to_check, total_list_elements):
    """Concatenates the list elements of the article into a single string. Ensures proper list formatting before making a post."""

    counter = 1
    result = ""

    # Each format maps a wrapper selector to a body selector; a selector is
    # either [tag] or [tag, attr_name, attr_value].
    formats = {
        # Header formatting
        "html_format_1": {
            "wrapper": ["h2"],
            "body": ["strong"]
        }
    }

    soup = lvm.soup_session(link_to_check)

    for fmt in formats.values():

        wrapper = fmt["wrapper"]
        body = fmt["body"]

        # Hoist the attrs dicts out of the inner loops.
        wrapper_attrs = {wrapper[1]: wrapper[2]} if len(wrapper) > 1 else None
        body_attrs = {body[1]: body[2]} if len(body) > 1 else None

        for point_wrapper in soup.find_all(wrapper[0], attrs=wrapper_attrs):
            # Concatenate the text of every body element inside this wrapper.
            point_text = "".join(
                point.text
                for point in point_wrapper.find_all(body[0], attrs=body_attrs))

            if not point_text:
                continue

            point_text = point_text.strip()
            # Prepend a number unless the text already starts with one.
            if not re.search("^[0-9]+[.]", point_text):
                point_text = str(counter) + '. ' + point_text

            result += point_text + '\n'
            counter += 1

        if lvm.article_text_meets_posting_requirements(ArticleType.Polygon,
                                                       result, counter,
                                                       total_list_elements):
            # Countdown-style lists are reversed into ascending order.
            if not result.startswith('1. '):
                result = lvm.reverse_list(result)
            break
        else:
            # This format produced nothing postable; reset for the next one.
            counter = 1
            result = ""

    return result
def get_article_list_text(link_to_check, total_list_elements):
    """Concatenates the list elements of the article into a single string. Ensures proper list formatting before making a post."""

    counter = 1
    result = ""
    # Each format maps a wrapper selector to a body selector; a selector is
    # either [tag] or [tag, attr_name, attr_value].
    formats = {
        # Header formatting
        "html_format_1": {
            "wrapper": ["div", "class", "slide-title clearfix"],
            "body": ["h2", "class", "slide-title-text"]
        },
        # Slide formatting
        "html_format_2": {
            "wrapper": ["div", "class", "slide-module"],
            "body": ["h3"]
        },
        # Paragraph formatting
        "html_format_3": {
            "wrapper": ["ol"],
            "body": ["li"]
        }
    }

    soup = lvm.soup_session(link_to_check)

    for fmt in formats.values():

        wrapper = fmt["wrapper"]
        body = fmt["body"]

        # Hoist the attrs dicts out of the inner loops.
        wrapper_attrs = {wrapper[1]: wrapper[2]} if len(wrapper) > 1 else None
        body_attrs = {body[1]: body[2]} if len(body) > 1 else None

        for point_wrapper in soup.find_all(wrapper[0], attrs=wrapper_attrs):
            for point in point_wrapper.find_all(body[0], attrs=body_attrs):
                # Keep an existing "N." prefix; otherwise number it ourselves.
                if re.search("^[0-9]+[.]", point.text):
                    result += point.text.strip() + '\n'
                else:
                    result += str(counter) + '. ' + point.text.strip() + '\n'

                counter += 1

        if lvm.article_text_meets_posting_requirements(ArticleType.Business_Insider, result, counter, total_list_elements):
            # Countdown-style lists are reversed into ascending order.
            if not result.startswith('1. '):
                result = lvm.reverse_list(result)
            break
        else:
            # This format produced nothing postable; reset for the next one.
            counter = 1
            result = ""

    return result
# Example #10
def get_article_list_text(link_to_check, total_list_elements):
    """Concatenates the list elements of the article into a single string. Ensures proper list formatting before making a post.

    Each BuzzFeed list item is an <h2> containing a number span
    ('subbuzz__number') and a title span ('js-subbuzz__title-text').
    Items whose title contains a link are emitted as Markdown hyperlinks;
    all other items are emitted as plain text.
    """

    list_counter = 1
    full_list = ""

    soup = lvm.soup_session(link_to_check)

    for article in soup.find_all('h2'):

        list_item_number_element = article.find('span', attrs={'class': 'subbuzz__number'})
        list_item_text_element = article.find('span', attrs={'class': 'js-subbuzz__title-text'})

        if list_item_number_element and list_item_text_element:

            # NOTE(review): presumably the span text already ends with '.'
            # (e.g. "1.") since no separator is added below — confirm.
            list_item_number = list_item_number_element.text

            # Tries to add a hyperlink to the article list element being searched, if it has any.
            try:
                for link in list_item_text_element.find_all('a', href=True):
                    link_to_use = link['href']

                    # Removes redirect link if there is any.
                    if link_to_use.startswith('http:') and (r'/https:' in link_to_use or r'/http:' in link_to_use):
                        # Keep everything after the embedded '/http...' target.
                        link_to_use = 'http' + link_to_use.split(r'/http', 1)[1]

                    # Escape ')' so it cannot terminate the Markdown link early.
                    link_to_use = link_to_use.replace(')', r'\)')

                    full_list += list_item_number + ' [' + list_item_text_element.text + '](' + link_to_use + ')' + '\n'
                    # Only the first link in the title is used.
                    break
            except KeyError as e:
                print("Key Error: " + str(e))
                pass

            # If the list element doesn't have a link associated to it, post it as plain text.
            if not list_item_text_element.find_all('a', href=True):
                full_list += list_item_number + ' ' + list_item_text_element.text + '\n'

            list_counter += 1

    if lvm.article_text_meets_posting_requirements(ArticleType.BuzzFeed, full_list, list_counter, total_list_elements):
        # Countdown-style lists are reversed into ascending order.
        if not full_list.startswith('1. '):
            full_list = lvm.reverse_list(full_list)

        return full_list
# Example #11
def paragraph_article_text(link_to_check, total_list_elements):
    """Parses BuzzFeed list articles that are in paragraph form (has the 'p' HTML tag)."""

    counter = 1
    result = ""

    soup = lvm.soup_session(link_to_check)

    for paragraph in soup.find_all('p'):
        text = paragraph.text
        # List items in paragraph form start with their number, e.g. "3) ...".
        if text and text[0].isdigit():
            result += text.replace(') ', '. ', 1) + '\n'
            counter += 1

    if lvm.article_text_meets_posting_requirements(ArticleType.BuzzFeed, result, counter, total_list_elements):
        # Countdown-style lists are reversed into ascending order.
        if not result.startswith('1. '):
            result = lvm.reverse_list(result)

        return result
# Example #12
def get_article_list_text(link_to_check, total_list_elements):
    """Concatenates the list elements of the article into a single string. Ensures proper list formatting before making a post."""

    counter = 1
    result = ''

    soup = lvm.soup_session(link_to_check)

    for heading in soup.find_all('h2'):
        text = heading.text
        # Skip empty headings and headings that already begin with a digit.
        if not text or text[0].isdigit():
            continue

        result += str(counter) + '. ' + text.strip() + '\n'
        counter += 1

    if lvm.article_text_meets_posting_requirements(ArticleType.CollegeHumor,
                                                   result, counter,
                                                   total_list_elements):
        return result
# Example #13
def find_article_to_parse(create_post=True):
    """Finds a list article in Screen Rant's latest article archive and posts the list article to Reddit."""

    site = ArticleType.Screen_Rant
    site_name = convert_enum_to_string(site)

    print(f"Searching {site_name}'s archive.")
    soup = lvm.soup_session(archive_link)

    for heading in soup.find_all("h3",
                                 attrs={"class": "bc-title"},
                                 limit=max_articles_to_search):
        anchor = heading.find("a", href=True)

        # Headings without an anchor are skipped.
        if not anchor:
            continue

        title = anchor['title']
        # Archive links may be relative; normalize to an absolute URL.
        href = anchor['href']
        link = href if href.startswith("http") else "http://www.screenrant.com" + href

        print(f"Parsing article: {link}")
        time.sleep(1)  # be polite to the server between article fetches

        if not lvm.article_title_meets_posting_requirements(site, title):
            continue

        list_text = get_article_list_text(link, lvm.get_article_list_count(title))
        if list_text and not lvm.post_previously_made(link):
            print(f"{site_name} list article found: {title}")
            if create_post:
                post_to_reddit(title, list_text, link, site)
            return True

    print(f"No {site_name} list articles were found to parse at this time.")
    return False
# Example #14
def get_article_list_text(link_to_check, total_list_elements):
    """Concatenates the list elements of the article into a single string. Ensures proper list formatting before making a post.

    Each Cracked list item is an <h2 class="subheading subheading-num">
    expected to contain exactly a <label> (the item number) and a <span>
    (the item text).  Returns the formatted list (reversed into ascending
    order if the article counts down), or "" / None when the article does
    not meet posting requirements.
    """

    list_counter = 1
    full_list = ""

    soup = lvm.soup_session(link_to_check)

    for article in soup.find_all("h2",
                                 attrs={"class": "subheading subheading-num"}):

        # Expecting two items in contents list (number and header text).
        if len(article.contents) != 2:
            return ""

        list_item_number_element = article.contents[0] if article.contents[
            0].name == 'label' else None
        list_item_text_element = article.contents[1] if article.contents[
            1].name == 'span' else None

        list_item_text = list_item_text_element.text.strip(
        ) if list_item_text_element else None

        if list_item_text:
            # BUG FIX: fall back to the running counter when the <label>
            # element is missing, instead of emitting the literal string
            # "None." (matches the Screen Rant parser's fallback).
            list_item_number = list_item_number_element.text.strip(
            ) if list_item_number_element else str(list_counter)
            full_list += f"{list_item_number}. {list_item_text}\n"
            list_counter += 1

    if lvm.article_text_meets_posting_requirements(ArticleType.Cracked,
                                                   full_list, list_counter,
                                                   total_list_elements):
        # Countdown-style lists are reversed into ascending order.
        if not full_list.startswith('1. '):
            full_list = lvm.reverse_list(full_list)

        return full_list