def check_for_repost(self, post: Post) -> Optional[SearchResults]:
    """
    Run the appropriate repost check for the given post.

    Image posts are checked through the image search service; link posts
    go through the link search + filter pipeline.  Every other post type
    is skipped.

    :param post: Post obj
    :return: Search results, or None when the check is skipped or no
        search index is available
    :rtype: SearchResults
    """
    if post.post_type == 'link':
        raw_results = get_link_reposts(
            post.url,
            self.uowm,
            get_default_link_search_settings(self.config),
            post=post,
            get_total=True)
        return filter_search_results(
            raw_results,
            reddit=self.reddit,
            uitl_api=f'{self.config.util_api}/maintenance/removed')

    if post.post_type != 'image':
        # Unsupported post type - nothing to check
        log.info(
            f'Post {post.post_id} is a {post.post_type} post.  Skipping')
        return None

    try:
        return self.image_service.check_image(
            post.url,
            post=post,
        )
    except NoIndexException:
        # Index not built yet; caller is expected to retry this post later
        log.error(
            'No available index for image repost check.  Trying again later'
        )
        return None
# Example #2 (scrape separator)
    def process_link_repost_request(self,
                                    summons: Summons,
                                    post: Post,
                                    monitored_sub: MonitoredSub = None):
        """
        Handle a user summons on a link post.

        Searches for prior posts sharing the same URL, filters the
        matches, builds the reply comment (a sub-specific template when a
        monitored sub is supplied, otherwise the default one), records a
        link repost when at least one match survives filtering, and sends
        the response.

        :param summons: the Summons being answered
        :param post: the Post the summons was left on
        :param monitored_sub: optional MonitoredSub providing a custom
            comment template
        """
        response = SummonsResponse(summons=summons)

        raw_results = get_link_reposts(
            post.url,
            self.uowm,
            get_default_link_search_settings(self.config),
            post=post,
            get_total=True)
        filtered_results = filter_search_results(
            raw_results,
            reddit=self.reddit.reddit,
            uitl_api=f'{self.config.util_api}/maintenance/removed')

        # Monitored subs get their configured comment template
        if monitored_sub:
            response.message = self.response_builder.build_sub_comment(
                monitored_sub, filtered_results, signature=False)
        else:
            response.message = self.response_builder.build_default_comment(
                filtered_results, signature=False)

        # Persist the repost against the oldest surviving match
        if filtered_results.matches:
            save_link_repost(
                post, filtered_results.matches[0].post, self.uowm, 'summons')

        self._send_response(response)
# Example #3 (scrape separator)
def link_repost_check(self, posts, ):
    """
    Check a batch of ingested posts for link reposts.

    For each post: skip blacklisted URL hashes, search for prior posts
    sharing the same URL, filter the results, then either mark the post
    checked (no matches) or create a LinkRepost record pointing at the
    first match.  A batched summary event is logged after the loop.

    :param posts: iterable of Post objects to check
    """
    with self.uowm.start() as uow:
        for post in posts:
            # NOTE(review): the string below is commented-out code left as a
            # no-op string literal; kept as-is to avoid any byte change.
            """
            if post.url_hash == '540f1167d27dcca2ea2772443beb5c79':
                continue
            """
            # URL hashes seen an extreme number of times are skipped entirely
            if post.url_hash in self.link_blacklist:
                log.info('Skipping blacklisted URL hash %s', post.url_hash)
                continue

            log.debug('Checking URL for repost: %s', post.url_hash)
            search_results = get_link_reposts(post.url, self.uowm, get_default_link_search_settings(self.config),
                                              post=post)

            # Extremely common URLs (>10000 shares) are blacklisted for the
            # rest of this run and an operator notification is sent
            if len(search_results.matches) > 10000:
                log.info('Link hash %s shared %s times. Adding to blacklist', post.url_hash, len(search_results.matches))
                self.link_blacklist.append(post.url_hash)
                self.notification_svc.send_notification(f'URL has been shared {len(search_results.matches)} times. Adding to blacklist. \n\n {post.url}')

            # Drop removed/invalid matches before deciding repost status
            search_results = filter_search_results(
                search_results,
                uitl_api=f'{self.config.util_api}/maintenance/removed'
            )
            search_results.search_times.stop_timer('total_search_time')
            log.info('Link Query Time: %s', search_results.search_times.query_time)
            if not search_results.matches:
                # No surviving matches: just flag the post as checked
                log.debug('Not matching linkes for post %s', post.post_id)
                post.checked_repost = True
                uow.posts.update(post)
                uow.commit()
                continue

            log.info('Found %s matching links', len(search_results.matches))
            log.info('Creating Link Repost. Post %s is a repost of %s', post.post_id, search_results.matches[0].post.post_id)
            # First match is treated as the original being reposted
            repost_of = search_results.matches[0].post
            new_repost = LinkRepost(post_id=post.post_id, repost_of=repost_of.post_id, author=post.author, source='ingest', subreddit=post.subreddit)
            repost_of.repost_count += 1
            post.checked_repost = True
            uow.posts.update(post)
            uow.link_repost.add(new_repost)

            try:
                uow.commit()
                self.event_logger.save_event(RepostEvent(event_type='repost_found', status='success',
                                                         repost_of=search_results.matches[0].post.post_id,
                                                         post_type=post.post_type))
            except IntegrityError as e:
                # Likely a duplicate repost row; roll back and record the
                # failure so it shows up in event metrics
                uow.rollback()
                log.exception('Error saving link repost', exc_info=True)
                self.event_logger.save_event(RepostEvent(event_type='repost_found', status='error',
                                                         repost_of=search_results.matches[0].post.post_id,
                                                         post_type=post.post_type))
        # One summary event per batch, regardless of individual outcomes
        self.event_logger.save_event(
            BatchedEvent(event_type='repost_check', status='success', count=len(posts), post_type='link'))
    def _filter_results_for_reposts(self,
                                    search_results: ImageSearchResults,
                                    sort_by='created') -> ImageSearchResults:
        """
        Take a set of search results and filter out matches that are not reposts.

        Filtering is done via hash presence, removed-post filtering, annoy
        and hamming distance cutoffs, and an optional meme-template pass.
        The surviving matches are sorted before being returned.

        :param search_results: ImageSearchResults containing raw matches
        :param sort_by: sort key passed to sort_reposts (default 'created')
        :rtype: ImageSearchResults
        """

        log.debug('Starting result filters with %s matches',
                  len(search_results.matches))

        # Drop matches that have no dhash to compare against
        search_results.matches = list(
            filter(filter_no_dhash, search_results.matches))

        # Drop matches whose posts have been removed
        search_results = filter_search_results(
            search_results,
            reddit=self.reddit,
            uitl_api=f'{self.config.util_api}/maintenance/removed')

        closest_match = get_closest_image_match(search_results.matches,
                                                check_url=True)
        if closest_match and closest_match.hamming_match_percent > 40:  # TODO - Move to config
            search_results.closest_match = closest_match
            # For meme templates, re-score the closest match with the meme hash
            if search_results.closest_match and search_results.meme_template:
                match_hash = self._get_meme_hash(
                    search_results.closest_match.post.url)
                search_results.closest_match.hamming_distance = hamming(
                    search_results.meme_hash, match_hash)
                search_results.closest_match.hash_size = len(match_hash)

        # Has to be after closest match so we don't drop closest
        search_results.matches = list(
            filter(
                annoy_distance_filter(
                    search_results.search_settings.target_annoy_distance),
                search_results.matches))
        search_results.matches = list(
            filter(
                hamming_distance_filter(
                    search_results.target_hamming_distance),
                search_results.matches))

        # Meme templates get a stricter final pass with their own cutoff
        if search_results.meme_template:
            search_results.search_times.start_timer('meme_filter_time')
            search_results.matches = self._final_meme_filter(
                search_results.meme_hash, search_results.matches,
                search_results.target_meme_hamming_distance)
            search_results.search_times.stop_timer('meme_filter_time')

        search_results.matches = sort_reposts(search_results.matches,
                                              sort_by=sort_by)

        for match in search_results.matches:
            log.debug('Match found: %s - A:%s H:%s P:%s',
                      f'https://redd.it/{match.post.post_id}',
                      round(match.annoy_distance, 5), match.hamming_distance,
                      f'{match.hamming_match_percent}%')

        return search_results
# Example #5 (scrape separator)
 def test_filter_search_results_hit_all_filters(self):
     """
     Verify filter_search_results drops a match for each enabled filter.

     Builds seven matches where six are each eliminated by a different
     filter (same author, crosspost, only-older, same sub, same post,
     max days old) and asserts only the surviving match ('5555') remains.
     The current time is patched so the max_days_old cutoff is stable.
     """
     search_results = get_image_search_results_multi_match()
     # Enable every filter under test
     search_results.search_settings.filter_same_author = True
     search_results.search_settings.filter_crossposts = True
     search_results.search_settings.only_older_matches = True
     search_results.search_settings.same_sub = True
     search_results.search_settings.target_title_match = None
     search_results.search_settings.max_days_old = 4
     # Checked post the matches are compared against
     search_results.checked_post.author = 'barry'
     search_results.checked_post.subreddit = 'sub1'
     search_results.checked_post.post_id = '1111'
     search_results.checked_post.created_at = datetime.utcfromtimestamp(
         1573995250)
     matches = []
     # Dropped by same author
     matches.append(
         ImageSearchMatch(
             'test.com', 1,
             Post(id=1,
                  author='barry',
                  post_id='abc123',
                  created_at=datetime.strptime('2019-01-28 05:20:03',
                                               '%Y-%m-%d %H:%M:%S')), 10,
             10, 32))
     # Dropped by crosspost
     matches.append(
         ImageSearchMatch(
             'test.com', 1,
             Post(id=2,
                  author='steve',
                  post_id='123abc',
                  crosspost_parent='abc',
                  created_at=datetime.strptime('2019-06-28 05:20:03',
                                               '%Y-%m-%d %H:%M:%S')), 10,
             10, 32))
     # Dropped by only older
     matches.append(
         ImageSearchMatch(
             'test.com', 1,
             Post(id=3,
                  author='steve',
                  post_id='3333',
                  title='some normal title',
                  created_at=datetime.utcfromtimestamp(1574081650)), 10,
             0.250, 32))
     # Dropped by same sub
     matches.append(
         ImageSearchMatch(
             'test.com', 1,
             Post(id=4,
                  author='steve',
                  post_id='4444',
                  title='some normal title',
                  subreddit='sub2',
                  created_at=datetime.utcfromtimestamp(1573908850)), 10,
             0.250, 32))
     # Expected survivor: passes every enabled filter
     matches.append(
         ImageSearchMatch(
             'test.com', 1,
             Post(id=5,
                  author='steve',
                  post_id='5555',
                  title='some normal title',
                  subreddit='sub1',
                  created_at=datetime.utcfromtimestamp(1573988200)), 10,
             0.250, 32))
     # Dropped by same post
     matches.append(
         ImageSearchMatch(
             'test.com', 1,
             Post(id=6,
                  post_id='1111',
                  title='some normal title',
                  subreddit='sub1',
                  created_at=datetime.utcfromtimestamp(1573908850)), 10,
             0.250, 32))
     # Dropped by max days old (too far before the patched "now")
     matches.append(
         ImageSearchMatch(
             'test.com', 1,
             Post(id=7,
                  post_id='6666',
                  title='some normal title',
                  subreddit='sub1',
                  created_at=datetime.utcfromtimestamp(1573908850)), 10,
             0.250, 32))
     search_results.matches = matches
     # Freeze "utcnow" so the max_days_old filter is deterministic
     with patch('redditrepostsleuth.core.util.repost_filters.datetime'
                ) as mock_date:
         mock_date.utcnow.return_value = datetime.utcfromtimestamp(
             1574360460)
         r = filter_search_results(search_results)
     self.assertEqual(1, len(search_results.matches))
     self.assertEqual('5555', r.matches[0].post.post_id)
     # NOTE(review): leftover debug print — consider removing
     print('')