Exemplo n.º 1
0
    def test_send_to_database(self):
        """Smoke test: write a single TEST entry to the database.

        The test passes as long as ``add_data`` does not raise; nothing is
        read back or compared afterwards.
        """
        entry_kind = Type.TEST
        db_session = get_engine_session()
        kind_value = entry_kind.value
        timestamp = datetime.now().strftime('%Y-%m-%d %H:%M:%S')

        add_data(
            session=db_session,
            entry_type=kind_value,
            handled_utc=timestamp,
            original_url="https://google.com/thisisatest",
            canonical_url=None,
            note="Test",
        )

        # Reaching this line means the insert above did not raise.
        self.assertTrue(True)
Exemplo n.º 2
0
    def test_get_canonical_from_database_by_url(self, use_database=False):
        """Check that entries can be found again through their original URL.

        Args:
            use_database: When true, a batch of existing entries is loaded
                with ``get_data``; otherwise a single hard-coded ``Entry``
                is used as input.

        Every input entry is looked up with ``get_entry_by_original_url`` and
        counted as a correct retrieval when the entry that comes back has the
        same ``entry_id``. The test passes only when all input entries were
        retrieved correctly.
        """
        amount_of_correct_retrievals = 0
        session = get_engine_session()

        # Use data from the database
        if use_database:
            old_entries = get_data(session=session,
                                   limit=100,
                                   offset=5000,
                                   order_descending=True,
                                   canonical_url=True)

        # Or use a single entry as specified below
        else:
            old_entries = [
                Entry(
                    original_url=
                    "https://www.mynbc5.com/amp/article/emily-ferlazzo-joseph-bolton-vermont-missing-update/38004866",
                    canonical_url=
                    "https://abc3340.com/news/inside-your-world/the-federal-government-spends-billions-each-year-maintaining-empty-buildings-nationwide"
                )
            ]

        for old_entry in old_entries:
            log.info("OLD")
            log.info(old_entry.entry_id)
            log.info(old_entry.canonical_url)
            found_entry = get_entry_by_original_url(old_entry.original_url,
                                                    session)

            if not found_entry:
                log.warning("No entry found!")
                continue

            log.info("NEW")
            log.info(found_entry.entry_id)
            # Log the canonical of the entry that was just retrieved, so the
            # OLD and NEW blocks in the log can actually be compared.
            log.info(found_entry.canonical_url)

            if old_entry.entry_id == found_entry.entry_id:
                amount_of_correct_retrievals += 1

        # Sequences expose their size through len(), not a ``.len`` attribute.
        self.assertEqual(amount_of_correct_retrievals, len(old_entries))
Exemplo n.º 3
0
    def test_canonical(self, use_database=True):
        """Re-run canonical detection on old entries and compare the totals.

        Args:
            use_database: When true, old entries are loaded with ``get_data``;
                otherwise a single hard-coded ``Entry`` is used.

        For every old entry the URLs are extracted from ``original_url`` and
        passed to ``get_urls_info``. The number of links with a canonical is
        compared against the number of old entries that had a stored
        canonical. The test fails when fewer canonicals are found now than
        were stored before.
        """
        amount_of_canonicals = 0
        old_amount_of_canonicals = 0

        # Use data from the database
        if use_database:
            old_entries = get_data(session=get_engine_session(),
                                   limit=500,
                                   offset=5000,
                                   order_descending=True,
                                   canonical_url=False)

        # Or use a single entry as specified below
        else:
            old_entries = [
                Entry(
                    original_url=
                    "https://www.google.com/amp/s/abc3340.com/amp/news/inside-your-world/the-federal-government-spends-billions-each-year-maintaining-empty-buildings-nationwide",
                    canonical_url=
                    "https://abc3340.com/news/inside-your-world/the-federal-government-spends-billions-each-year-maintaining-empty-buildings-nationwide"
                )
            ]

        # Loop through every old entry and try to find the canonicals, compare the results
        for old_entry in old_entries:
            if old_entry.canonical_url:
                old_amount_of_canonicals += 1

            urls = get_urls(old_entry.original_url)
            urls_info = get_urls_info(urls)
            if urls_info:
                for link in urls_info:
                    log.info(link.canonical_alt)

                    if link.amp_canonical:
                        log.info(link.amp_canonical)
                    if link.canonical:
                        amount_of_canonicals += 1

                    log.info(f"BODY   : {old_entry.original_url}")
                    log.info(f"OLD    : {old_entry.canonical_url}")
                    log.info(f"NEW    : {link.canonical}")

                    if link.canonical == old_entry.canonical_url:
                        log.info("It's the same!")
                    else:
                        log.info("It's not the same!")
                    # Article similarity comparison (currently disabled):
                    # if link.canonical:
                    #     similarity = get_article_similarity(old_entry.original_url, link.canonical, log_articles=False)
                    #     log.info(f"Article similarity= {similarity}")

            else:
                log.warning("No canonicals found")

        log.info(
            f"\nCanonicals found: Old: {old_amount_of_canonicals}, New: {amount_of_canonicals}"
        )

        # Finding as many or more canonicals than before is fine; finding
        # fewer is a regression and must fail the test.
        self.assertGreaterEqual(
            amount_of_canonicals, old_amount_of_canonicals,
            "Fewer canonicals were found than were stored before")
Exemplo n.º 4
0
def run_bot(type=Type.SUBMISSION,
            guess_and_check=False,
            reply_to_post=True,
            write_to_database=True):
    """Stream new submissions and reply with canonical links for AMP URLs.

    Args:
        type: Stream type handed to ``stream.get_stream`` and to every
            ``Item``; its ``value`` is stored as the entry type of
            database rows.
        guess_and_check: Forwarded to ``get_urls_info``.
        reply_to_post: When true, the generated reply is posted to the
            submission.
        write_to_database: When true, one database entry is added for every
            AMP link of a submission that met the criteria.

    The function loops over the submission stream and only returns when that
    stream stops yielding items.
    """
    # The stream instance carries the session, the type and the local data
    bot_stream = stream.get_stream(type)
    log.info("Set up new stream")

    # Open the submission stream for all allowed subreddits at once
    open_subreddit = bot_stream.praw_session.subreddit
    submissions = open_subreddit(
        "+".join(bot_stream.allowed_subreddits)).stream.submissions()

    for submission in submissions:
        # Collect everything relevant about the submission in one item
        item = Item(
            type=type,
            id=submission.name,
            subreddit=submission.subreddit,
            author=submission.author,
            body=get_submission_body(submission),
        )

        # Decide whether this submission should be handled at all
        meets_criteria, _result_code = check_criteria(
            item=item,
            data=bot_stream,
            history_failed=bot_stream.submissions_failed,
            history_success=bot_stream.submissions_success,
            mustBeAMP=True,
            mustBeNew=True,
            mustNotBeDisallowedSubreddit=False,
            mustNotHaveFailed=True,
            mustNotBeMine=True,
            mustNotBeOptedOut=True,
            mustNotHaveDisallowedMods=False,
        )
        if not meets_criteria:
            continue

        log.info(f"{item.id} in r/{item.subreddit} meets criteria")

        # Extract the URLs from the body and look up their canonicals
        item.links = get_urls_info(get_urls(item.body), guess_and_check)

        canonical_found = (
            any(link.canonical for link in item.links)
            or any(link.amp_canonical for link in item.links)
        )

        if not canonical_found:
            # Without any canonical there is nothing to reply with
            log.warning("No canonicals found")
            update_local_data("submissions_failed", item.id)
            bot_stream.submissions_failed.append(item.id)
        else:
            reply_text, _reply_canonical_text = generate_reply(
                stream_type=bot_stream.type,
                np_subreddits=bot_stream.np_subreddits,
                item_type=item.type,
                links=item.links,
                subreddit=item.subreddit,
            )

            if reply_to_post:
                try:
                    posted = submission.reply(reply_text)
                    log.info(f"Replied to {item.id} with {posted.name}")
                    update_local_data("submissions_success", item.id)
                    bot_stream.submissions_success.append(item.id)

                except (Forbidden, Exception):
                    log.warning("Couldn't post reply!")
                    log.error(traceback.format_exc())
                    update_local_data("submissions_failed", item.id)
                    bot_stream.submissions_failed.append(item.id)

                    # A failed reply may mean the bot is banned in the subreddit
                    if check_if_banned(item.subreddit):
                        update_local_data("disallowed_subreddits",
                                          item.subreddit)
                        bot_stream.disallowed_subreddits.append(item.subreddit)

        if not write_to_database:
            continue

        # Store one entry per AMP link, whether or not a reply was made
        for link in item.links:
            if not link.is_amp:
                continue
            add_data(
                session=get_engine_session(),
                entry_type=type.value,
                handled_utc=datetime.now().strftime('%Y-%m-%d %H:%M:%S'),
                original_url=link.url_clean,
                canonical_url=link.canonical,
            )
Exemplo n.º 5
0
def run_bot(type=Type.MENTION,
            guess_and_check=True,
            reply_to_item=True,
            write_to_database=True):
    """Stream the bot's inbox and handle every incoming message.

    Three kinds of messages are handled:

    * ``comment_reply`` messages are ignored.
    * ``username_mention`` messages start the summon flow: the parent of the
      mention is checked, its URLs are resolved to canonicals, a reply is
      posted (or an explanatory DM is sent to the summoner) and the links
      are optionally written to the database.
    * ``unknown`` messages (DMs) are checked for opt-out requests,
      opt-back-in requests and subreddit ban notices, based on the subject.

    Args:
        type: Stream type handed to ``stream.get_stream``; its ``value`` is
            stored as the entry type of database rows.
        guess_and_check: Forwarded to ``get_urls_info``.
        reply_to_item: When true, the generated reply is posted to the parent
            of the mention (unless a disallow/opt-out result code applies).
        write_to_database: When true, one database entry is added for every
            AMP link found in the parent of a mention.

    The function loops over the inbox stream and only returns when that
    stream stops yielding items.
    """
    # Get the stream instance (contains session, type and data)
    s = stream.get_stream(type)
    log.info("Set up new stream")

    # Start the stream
    for message in s.praw_session.inbox.stream():
        # Mark the item as read
        message.mark_read()

        # Log the message type and id
        log.info(f"New message: {message.type}: {message.fullname}")

        # If the message is a comment_reply, ignore it
        if message.type == "comment_reply":
            continue
        # If the message is an username_mention, start summon process
        if message.type == "username_mention":
            parent = message.parent()
            i = Item(type=Type.MENTION,
                     id=parent.name,
                     subreddit=parent.subreddit,
                     author=parent.author,
                     context=message.context,
                     summoner=message.author,
                     parent_link=parent.permalink)

            # Check if the parent is a comment or submission
            if isinstance(parent, praw.models.Comment):
                i.body = parent.body
                i.parent_type = Type.COMMENT
            elif isinstance(parent, praw.models.Submission):
                i.body = get_submission_body(parent)
                i.parent_type = Type.SUBMISSION
            else:
                # NOTE(review): i.body and i.parent_type are not assigned in
                # this branch, yet processing continues below - confirm that
                # Item provides usable defaults for both.
                log.warning("Unknown parent instance")

            # Check if the item meets the criteria
            meets_criteria, result_code = check_criteria(
                item=i,
                data=s,
                history_failed=s.mentions_failed,
                history_success=s.mentions_success,
                mustBeAMP=True,
                mustBeNew=True,
                mustNotBeDisallowedSubreddit=True,
                mustNotHaveFailed=True,
                mustNotBeMine=True,
                mustNotBeOptedOut=True,
                mustNotHaveDisallowedMods=True)

            # Continue for every result code except ERROR_NO_AMP; note that
            # meets_criteria itself is not consulted here, only result_code
            if result_code != ResultCode.ERROR_NO_AMP:
                log.info(
                    f"{i.id} in r/{i.subreddit} is AMP, result_code={result_code.value}"
                )
                # Get the urls from the body and try to find the canonicals
                urls = get_urls(i.body)
                i.links = get_urls_info(urls, guess_and_check)

                # If a canonical was found, generate a reply, otherwise log a warning
                if any(link.canonical
                       for link in i.links) or any(link.amp_canonical
                                                   for link in i.links):
                    # Generate a reply
                    reply_text, reply_canonical_text = generate_reply(
                        stream_type=s.type,
                        np_subreddits=s.np_subreddits,
                        item_type=i.parent_type,
                        subreddit=i.subreddit,
                        links=i.links,
                        summoned_link=i.context)

                    # Send a DM if AmputatorBot can't reply because it's disallowed by a subreddit, mod or user
                    if result_code == ResultCode.ERROR_DISALLOWED_SUBREDDIT \
                            or result_code == ResultCode.ERROR_DISALLOWED_MOD \
                            or result_code == ResultCode.ERROR_USER_OPTED_OUT:

                        # Generate and send an error DM dynamically based on the error
                        # NOTE: this rebinds the loop variable `message` to the
                        # DM text for the remainder of this iteration (the same
                        # happens in the other dm_generator calls below)
                        subject, message = dm_generator(
                            result_code=result_code,
                            parent_link=i.parent_link,
                            parent_subreddit=i.subreddit,
                            parent_type=i.parent_type.value,
                            first_amp_url=i.links[0].url_clean,
                            canonical_text=reply_canonical_text)
                        s.praw_session.redditor(str(i.summoner)).message(
                            subject, message)
                        log.info(f"Send summoner DM of type {result_code}")

                    # Try to post the reply, send a DM to the summoner
                    elif reply_to_item:
                        try:
                            reply = parent.reply(reply_text)
                            log.info(f"Replied to {i.id} with {reply.name}")
                            update_local_data("mentions_success", i.id)
                            s.mentions_success.append(i.id)

                            # Generate and send a SUCCESS DM to the summoner
                            result_code = ResultCode.SUCCESS
                            subject, message = dm_generator(
                                result_code=result_code,
                                reply_link=reply.permalink,
                                parent_subreddit=i.subreddit,
                                parent_type=i.parent_type.value,
                                parent_link=i.parent_link,
                                first_amp_url=i.links[0].url_clean,
                                canonical_text=reply_canonical_text)
                            s.praw_session.redditor(str(i.summoner)).message(
                                subject, message)
                            log.info(f"Send summoner DM of type {result_code}")

                        # Any failure in the try block (the reply or the
                        # SUCCESS DM) is recorded as a failed mention
                        except (Forbidden, Exception):
                            log.warning("Couldn't post reply!")
                            log.error(traceback.format_exc())
                            update_local_data("mentions_failed", i.id)
                            s.mentions_failed.append(i.id)

                            # Check if AmputatorBot is banned in the subreddit
                            is_banned = check_if_banned(i.subreddit)
                            if is_banned:
                                update_local_data("disallowed_subreddits",
                                                  i.subreddit)
                                s.disallowed_subreddits.append(i.subreddit)

                            # Generate and send an ERROR_REPLY_FAILED DM to the summoner
                            result_code = ResultCode.ERROR_REPLY_FAILED
                            subject, message = dm_generator(
                                result_code=result_code,
                                parent_type=i.parent_type.value,
                                parent_link=i.parent_link,
                                first_amp_url=i.links[0].url_clean,
                                canonical_text=reply_canonical_text)
                            s.praw_session.redditor(str(i.summoner)).message(
                                subject, message)
                            log.info(f"Send summoner DM of type {result_code}")

                # If no canonicals were found, log the failed attempt
                else:
                    log.warning("No canonicals found")
                    update_local_data("mentions_failed", i.id)
                    s.mentions_failed.append(i.id)

                    # Check if the domain is problematic (meaning it's raising frequent errors)
                    if any(link.domain in s.problematic_domains
                           for link in i.links):
                        result_code = ResultCode.ERROR_PROBLEMATIC_DOMAIN
                    else:
                        result_code = ResultCode.ERROR_NO_CANONICALS

                    # Generate and send an error DM to the summoner
                    # (problematic domain or no canonicals)
                    # NOTE(review): i.links[0] raises IndexError if
                    # get_urls_info returned no links at all - confirm that
                    # cannot happen on this path
                    subject, message = dm_generator(
                        result_code=result_code,
                        parent_type=i.parent_type.value,
                        parent_link=i.parent_link,
                        first_amp_url=i.links[0].url_clean)

                    s.praw_session.redditor(str(i.summoner)).message(
                        subject, message)

                # If write_to_database is enabled, make a new entry for every URL
                # (only links flagged as AMP are stored)
                if write_to_database:
                    for link in i.links:
                        if link.is_amp:
                            add_data(session=get_engine_session(),
                                     entry_type=type.value,
                                     handled_utc=datetime.now().strftime(
                                         '%Y-%m-%d %H:%M:%S'),
                                     original_url=link.url_clean,
                                     canonical_url=link.canonical)

        # If the message is a DM / message, check for opt-out and opt-back-in requests
        elif message.type == "unknown":
            # Subjects are compared in lower case
            subject = message.subject.lower()
            if subject == "opt me out of amputatorbot":
                try:
                    author = message.author.name
                    log.info(f"New opt-out request by {author}")

                    # If the user is already opted out, notify the user
                    # (the comparison is case-insensitive)
                    if author.casefold() in list(
                            user.casefold() for user in s.disallowed_users):
                        log.warning("User has already opted out!")
                        s.praw_session.redditor(author).message(
                            subject=
                            "You have already opted out of AmputatorBot",
                            message=
                            "You have already opted out, so AmputatorBot won't reply to your comments "
                            "and submissions anymore. You will still be able to see AmputatorBot's replies to "
                            "other people's content. Block u/AmputatorBot if you don't want that either. "
                            "Cheers!")

                    # If the user hasn't been opted out yet, add user to the list and notify the user
                    else:
                        log.info("User has not opted out yet")
                        update_local_data("disallowed_users", author)
                        s.disallowed_users.append(author)
                        s.praw_session.redditor(author).message(
                            subject=
                            "You have successfully opted out of AmputatorBot",
                            message=
                            "You have successfully opted out of AmputatorBot. AmputatorBot won't reply to your "
                            "comments and submissions anymore (although it can take up to 24 hours to fully "
                            "process your opt-out request). You will still be able to see AmputatorBot's "
                            "replies to other people's content. Block u/AmputatorBot if you don't want that "
                            "either. Cheers!")

                # Failures are logged and the stream continues with the next message
                except (RedditAPIException, Forbidden, Exception):
                    log.error(traceback.format_exc())
                    log.warning(
                        f"Something went wrong while processing opt-out request {message.fullname}"
                    )

            elif subject == "opt me back in again of amputatorbot":
                try:
                    author = message.author.name
                    log.info(f"New opt-back-in request by {author}")

                    # If the user is not opted out, notify the user
                    # (the comparison is case-insensitive)
                    if author.casefold() not in list(
                            user.casefold() for user in s.disallowed_users):
                        log.warning("User is not opted out!")
                        s.praw_session.redditor(author).message(
                            subject="You don't have to opt in of AmputatorBot",
                            message=
                            "This opt-back-in feature is meant only for users who choose to opt-out earlier "
                            "but now regret it. At no point did you opt out of AmputatorBot so there's no "
                            "need to opt back in. Cheers!")

                    # If the user has opted out, remove user from the list and notify the user
                    else:
                        log.info("User is currently opted out")
                        remove_local_data("disallowed_users", author)
                        # NOTE(review): the membership test above is
                        # case-insensitive, but remove() needs an exact match;
                        # presumably a differently-cased stored name raises
                        # here and ends up in the except block below - confirm
                        s.disallowed_users.remove(author)
                        s.praw_session.redditor(author).message(
                            subject=
                            "You have successfully opted back in of AmputatorBot",
                            message=
                            "You have successfully opted back in of AmputatorBot, meaning AmputatorBot can "
                            "reply to your comments and submissions again (although it can take up to 24 hours "
                            "to fully process your opt-back-in request). Thank you! Cheers!"
                        )

                # Failures are logged and the stream continues with the next message
                except (RedditAPIException, Forbidden, Exception):
                    log.error(traceback.format_exc())
                    log.warning(
                        f"Something went wrong while processing opt-back-in request {message.fullname}"
                    )

            # Ban notices: verify the ban before disallowing the subreddit
            elif "you've been permanently banned from participating in" in subject:
                subreddit = message.subreddit
                if subreddit:
                    log.info(f"New ban issued by r/{subreddit}")
                    is_banned = check_if_banned(subreddit)
                    if is_banned:
                        update_local_data("disallowed_subreddits", subreddit)
                        s.disallowed_subreddits.append(subreddit)
                        log.info(f"Added {subreddit} to disallowed_subreddits")
                else:
                    log.warning(
                        f"Message wasn't send by a subreddit, but by {message.author.name}"
                    )

        else:
            log.warning(f"Unknown message type: {message.type}")
            continue
        # Blank log line to separate handled messages in the log output
        log.info("\n")
Exemplo n.º 6
0
    def on_status(self, data):
        """Handle one incoming tweet and reply with canonical links if possible.

        Args:
            data: The incoming tweet object; its ``id`` and ``text`` are read
                and it is passed on to the helper functions.

        Returns:
            Always ``True``, including after an error (which is logged and
            followed by a 120 second pause).
        """
        try:
            item = Item(
                type=Type.TWEET,
                id=data.id,
                body=data.text.encode(encoding='utf-8',
                                      errors='ignore').decode(),
                author=get_twitterer_name(data),
            )

            cached_urls = get_cached_tweet_urls(data)

            # Tweets without any cached URL are not processed further
            if len(cached_urls) < 1:
                return True

            # Decide whether this tweet should be handled at all
            meets_criteria, _result_code = check_tweet_criteria(
                item=item,
                cached_urls=cached_urls,
                tweet=data,
                data=self.s,
                history_failed=self.s.tweets_failed,
                history_success=self.s.tweets_success,
                mustBeAMP=True,
                mustNotBeRetweet=True,
                mustBeCached=True,
                mustBeNew=True,
                mustNotHaveFailed=True,
                mustNotBeMine=True,
                mustNotBeOptedOut=True,
            )
            if not meets_criteria:
                return True

            log.info(f"{item.id} meets criteria")

            # Look up the canonicals for the cached URLs
            item.links = get_urls_info(cached_urls, self.guess_and_check)

            if not any(link.canonical for link in item.links):
                # Without any canonical there is nothing to reply with
                log.warning("No canonicals found")
                update_local_data("tweets_failed", item.id)
                self.s.tweets_failed.append(item.id)
            else:
                # The tweet text is generated even when replying is disabled
                generated_tweet = generate_tweet(item.links)

                if self.reply_to_item:
                    try:
                        posted = self.api.update_status(
                            status=generated_tweet,
                            in_reply_to_status_id=item.id,
                            auto_populate_reply_metadata=True)
                        log.info(f"Replied to {item.id} with {posted.id}")
                        update_local_data("tweets_success", item.id)
                        self.s.tweets_success.append(item.id)

                    except (TweepError, Exception):
                        log.warning("Couldn't post reply!")
                        log.error(traceback.format_exc())
                        update_local_data("tweets_failed", item.id)
                        self.s.tweets_failed.append(item.id)

            # Store one entry per link when database writes are enabled
            if self.write_to_database:
                for link in item.links:
                    add_data(
                        session=get_engine_session(),
                        entry_type=item.type.value,
                        handled_utc=datetime.now().strftime(
                            '%Y-%m-%d %H:%M:%S'),
                        original_url=link.url_clean,
                        canonical_url=link.canonical,
                    )

        except (TweepError, Exception):
            log.error(traceback.format_exc())
            log.warning("\nSomething went wrong while handling a tweet")
            sleep(120)
        return True
Exemplo n.º 7
0
    def test_canonical(self, use_database=True):
        """Re-run canonical detection on old entries and compare the totals.

        Args:
            use_database: When true, old entries are loaded with ``get_data``;
                otherwise a single hard-coded ``Entry`` is used.

        For every old entry the URLs are extracted from ``original_url`` and
        passed to ``get_urls_info``. Each link with a canonical is counted and
        its ``canonical.url`` is compared with the stored canonical URL (the
        outcome of that comparison is only logged). The test fails when fewer
        canonicals are found now than were stored before.
        """
        new_canonicals_amount = 0
        old_canonicals_amount = 0

        # Use data from the database
        if use_database:
            old_entries = get_data(session=get_engine_session(),
                                   limit=100,
                                   offset=5000,
                                   order_descending=True,
                                   canonical_url=None)

        # Or use a single entry as specified below
        else:
            old_entries = [
                Entry(
                    original_url=
                    "www.google.com/amp/s/abc3340.com/amp/news/inside-your-world/the-federal-government-spends-billions-each-year-maintaining-empty-buildings-nationwide",
                    canonical_url=
                    "https://abc3340.com/news/inside-your-world/the-federal-government-spends-billions-each-year-maintaining-empty-buildings-nationwide"
                )
            ]

        # Loop through every old entry and try to find the canonicals, compare the results
        for old_entry in old_entries:
            if old_entry.canonical_url:
                old_canonicals_amount += 1

            urls = get_urls(old_entry.original_url)
            urls_info = get_urls_info(urls)
            if urls_info:
                for link in urls_info:
                    if link.canonical:
                        new_canonicals_amount += 1
                        if link.canonical.url == old_entry.canonical_url:
                            log.info("Canonical URLs match")
                        else:
                            log.warning("Canonical URLs do not match!")

                    log.info(f"BODY : {old_entry.original_url}")
                    log.info(f"OLD  : {old_entry.canonical_url}")
                    log.info(
                        f"NEW  : {link.canonical.url if link.canonical else None}"
                    )

            else:
                log.warning("No canonicals found")

        log.info(
            f"\nCanonicals found: Old: {old_canonicals_amount}, New: {new_canonicals_amount}"
        )

        # Finding as many or more canonicals than before is fine; finding
        # fewer is a regression and must fail the test.
        self.assertGreaterEqual(
            new_canonicals_amount, old_canonicals_amount,
            "Fewer canonicals were found than were stored before")