Exemplo n.º 1
0
def get_image_search_results_multi_match():
    """Build an ``ImageSearchResults`` fixture that holds three matches.

    The checked post is ``abc123`` (post type ``image``, subreddit ``test``)
    and ``total_search_time`` is set to 10.  The first two matches wrap posts
    with a ``created_at`` timestamp; the third wraps a post that has a title
    instead and passes ``0.250`` where the other two pass ``10``.

    :return: the populated ``ImageSearchResults`` instance
    """
    settings = get_image_search_settings()
    checked = Post(post_id='abc123', post_type='image', subreddit='test')
    results = ImageSearchResults('test.com', settings, checked_post=checked)
    results.search_times = ImageSearchTimes()
    results.search_times.total_search_time = 10

    # (database id, reddit post id, creation timestamp) for the dated matches.
    timestamp_format = '%Y-%m-%d %H:%M:%S'
    dated_rows = (
        (1, '1111', '2019-01-28 05:20:03'),
        (2, '2222', '2019-06-28 05:20:03'),
    )
    for row_id, reddit_id, created in dated_rows:
        dated_post = Post(id=row_id,
                          post_id=reddit_id,
                          created_at=datetime.strptime(created,
                                                       timestamp_format))
        results.matches.append(
            ImageSearchMatch('test.com', 1, dated_post, 10, 10, 32))

    # The last match has no creation date, only a title.
    titled_post = Post(id=3, post_id='3333', title='some normal title')
    results.matches.append(
        ImageSearchMatch('test.com', 1, titled_post, 10, 0.250, 32))
    return results
Exemplo n.º 2
0
 def _get_image_search_results_multi_match(self):
     """Build an ``ImageSearchResults`` fixture that holds two matches.

     The checked post is ``abc123`` (post type ``image``, subreddit ``test``)
     and ``total_search_time`` is set to 10.  Both matches wrap a post with
     a ``created_at`` timestamp; the first reuses the post id ``abc123``.

     :return: the populated ``ImageSearchResults`` instance
     """
     settings = self._get_image_search_settings()
     checked = Post(post_id='abc123', post_type='image', subreddit='test')
     results = ImageSearchResults('test.com', settings, checked_post=checked)
     results.search_times = ImageSearchTimes()
     results.search_times.total_search_time = 10

     # (reddit post id, creation timestamp) for each match, in append order.
     timestamp_format = '%Y-%m-%d %H:%M:%S'
     matched_rows = (
         ('abc123', '2019-01-28 05:20:03'),
         ('123abc', '2019-06-28 05:20:03'),
     )
     for reddit_id, created in matched_rows:
         matched_post = Post(post_id=reddit_id,
                             created_at=datetime.strptime(created, timestamp_format))
         results.matches.append(
             ImageSearchMatch('test.com', 1, matched_post, 10, 10, 32))
     return results
Exemplo n.º 3
0
def get_image_search_results_no_match():
    """Build an ``ImageSearchResults`` fixture with no matches appended.

    The checked post is ``abc123`` (post type ``image``, subreddit ``test``)
    and ``total_search_time`` is set to 10.

    :return: the ``ImageSearchResults`` instance
    """
    settings = get_image_search_settings()
    checked = Post(post_id='abc123', post_type='image', subreddit='test')
    results = ImageSearchResults('test.com', settings, checked_post=checked)
    results.search_times = ImageSearchTimes()
    results.search_times.total_search_time = 10
    return results
    def _log_search(
        self,
        search_results: ImageSearchResults,
        source: str,
        used_current_index: bool,
        used_historical_index: bool,
    ) -> ImageSearchResults:
        """Save an ``ImageSearch`` record describing this search.

        The record is built from the search settings, timings and match count
        on ``search_results``.  When there is no checked post, ``post_id`` and
        ``subreddit`` are both stored as ``'url'``.  If the commit succeeds the
        record is attached as ``search_results.logged_search``; if anything in
        the commit step raises, the failure is logged (without traceback) and
        the results are returned without it.

        :param search_results: results of the search being recorded
        :param source: label stored on the record as ``source``
        :param used_current_index: stored on the record as-is
        :param used_historical_index: stored on the record as-is
        :return: the same ``search_results`` object that was passed in
        """
        checked = search_results.checked_post
        settings = search_results.search_settings
        timings = search_results.search_times
        template = search_results.meme_template

        record = ImageSearch(
            post_id=checked.post_id if checked else 'url',
            used_historical_index=used_historical_index,
            used_current_index=used_current_index,
            target_hamming_distance=search_results.target_hamming_distance,
            target_annoy_distance=settings.target_annoy_distance,
            same_sub=settings.same_sub,
            max_days_old=settings.max_days_old,
            filter_dead_matches=settings.filter_dead_matches,
            only_older_matches=settings.only_older_matches,
            meme_filter=settings.meme_filter,
            meme_template_used=template.id if template else None,
            search_time=timings.total_search_time,
            index_search_time=timings.index_search_time,
            total_filter_time=timings.total_filter_time,
            target_title_match=settings.target_title_match,
            matches_found=len(search_results.matches),
            source=source,
            subreddit=checked.subreddit if checked else 'url',
            search_results=create_search_result_json(search_results),
            target_image_meme_match=settings.target_meme_match_percent,
            target_image_match=settings.target_match_percent,
        )

        with self.uowm.start() as uow:
            uow.image_search.add(record)
            try:
                uow.commit()
                search_results.logged_search = record
            except Exception:
                # Best effort: a failed save must not break the search itself.
                log.exception('Failed to save image search', exc_info=False)

        return search_results
    def _filter_results_for_reposts(self,
                                    search_results: ImageSearchResults,
                                    sort_by='created') -> ImageSearchResults:
        """
        Reduce the matches on ``search_results`` to the final, sorted set.

        Filters run in this order: ``filter_no_dhash``, then
        ``filter_search_results``, then closest-match selection, then the annoy
        and hamming distance cutoffs, then (only when a meme template is set)
        the final meme filter, and finally sorting.

        :param search_results: Results whose ``matches`` are filtered
        :param sort_by: Sort key handed to ``sort_reposts``
        :return: The search results with filtered, sorted matches
        :rtype: ImageSearchResults
        """

        log.debug('Starting result filters with %s matches',
                  len(search_results.matches))

        search_results.matches = [
            candidate for candidate in search_results.matches
            if filter_no_dhash(candidate)
        ]

        search_results = filter_search_results(
            search_results,
            reddit=self.reddit,
            uitl_api=f'{self.config.util_api}/maintenance/removed')

        nearest = get_closest_image_match(search_results.matches,
                                          check_url=True)
        if nearest and nearest.hamming_match_percent > 40:  # TODO - Move to config
            search_results.closest_match = nearest
            if search_results.closest_match and search_results.meme_template:
                # Re-score the closest match against the meme hash.
                closest_hash = self._get_meme_hash(
                    search_results.closest_match.post.url)
                meme_distance = hamming(search_results.meme_hash,
                                        closest_hash)
                search_results.closest_match.hamming_distance = meme_distance
                search_results.closest_match.hash_size = len(closest_hash)

        # Has to be after closest match so we don't drop closest
        within_annoy = annoy_distance_filter(
            search_results.search_settings.target_annoy_distance)
        search_results.matches = [
            candidate for candidate in search_results.matches
            if within_annoy(candidate)
        ]
        within_hamming = hamming_distance_filter(
            search_results.target_hamming_distance)
        search_results.matches = [
            candidate for candidate in search_results.matches
            if within_hamming(candidate)
        ]

        if search_results.meme_template:
            search_results.search_times.start_timer('meme_filter_time')
            meme_filtered = self._final_meme_filter(
                search_results.meme_hash, search_results.matches,
                search_results.target_meme_hamming_distance)
            search_results.matches = meme_filtered
            search_results.search_times.stop_timer('meme_filter_time')

        search_results.matches = sort_reposts(search_results.matches,
                                              sort_by=sort_by)

        for kept in search_results.matches:
            log.debug('Match found: %s - A:%s H:%s P:%s',
                      f'https://redd.it/{kept.post.post_id}',
                      round(kept.annoy_distance, 5), kept.hamming_distance,
                      f'{kept.hamming_match_percent}%')

        return search_results
    def check_image(
            self,
            url: Text,
            post: Post = None,
            source='unknown',
            sort_by='created',
            search_settings: ImageSearchSettings = None) -> ImageSearchResults:
        """
        Search the image index for matches of ``url`` and return the results.

        Steps, in order: fall back to default settings when none are given,
        optionally detect a meme template, query the index, build match
        objects from the historical and current results, remove duplicates,
        score title similarity (only when ``post`` is given), filter and sort
        the matches, then log the timing and persist a record of the search.

        Note that when a meme template is found, ``target_match_percent`` is
        set to 100 on the ``search_settings`` object itself, so a settings
        object supplied by the caller is modified.

        :param url: URL of the image to check
        :param post: Optional post being checked; stored as ``checked_post``
        :param source: Label passed on to ``_log_search_time`` and ``_log_search``
        :param sort_by: Sort key passed on to ``_filter_results_for_reposts``
        :param search_settings: Settings for this search; when falsy,
            ``get_default_image_search_settings(self.config)`` is used
        :return: The populated search results
        """
        log.info('Checking URL for matches: %s', url)

        if not search_settings:
            log.info('No search settings provided, using default')
            search_settings = get_default_image_search_settings(self.config)

        search_results = ImageSearchResults(url,
                                            checked_post=post,
                                            search_settings=search_settings)

        search_results.search_times.start_timer('total_search_time')

        if search_settings.meme_filter:
            search_results.search_times.start_timer('meme_detection_time')
            search_results.meme_template = self._get_meme_template(
                search_results.target_hash)
            search_results.search_times.stop_timer('meme_detection_time')
            if search_results.meme_template:
                search_settings.target_match_percent = 100  # Keep only 100% matches on default hash size
                search_results.meme_hash = self._get_meme_hash(url)
                if not search_results.meme_hash:
                    # Without a meme hash the meme filter cannot run, so the
                    # template is cleared. target_match_percent stays at 100.
                    log.error('No meme hash, disabled meme filter')
                    search_results.meme_template = None
                else:
                    log.info('Using meme filter %s',
                             search_results.meme_template.id)

        log.debug('Search Settings: %s', search_settings)

        # NOTE(review): target_hamming_distance is read after the settings
        # change above; presumably it is derived from target_match_percent -
        # confirm against ImageSearchResults.
        api_search_results = self._get_matches(
            search_results.target_hash,
            search_results.target_hamming_distance,
            search_settings.target_annoy_distance,
            max_matches=search_settings.max_matches,
            max_depth=search_settings.max_depth,
            search_times=search_results.search_times)

        search_results.search_times.index_search_time = api_search_results.index_search_time
        search_results.total_searched = api_search_results.total_searched

        # Historical matches first, then matches from the current index.
        search_results.search_times.start_timer('set_match_post_time')
        search_results.matches = self._build_search_results(
            api_search_results.historical_matches, url,
            search_results.target_hash)
        search_results.matches += self._build_search_results(
            api_search_results.current_matches,
            url,
            search_results.target_hash,
            historical_index=False)
        search_results.search_times.stop_timer('set_match_post_time')

        # The 'remove_duplicate_time' timer also covers title similarity.
        search_results.search_times.start_timer('remove_duplicate_time')
        search_results.matches = self._remove_duplicates(
            search_results.matches)
        if post:
            search_results.matches = set_all_title_similarity(
                search_results.checked_post.title, search_results.matches)
        search_results.search_times.stop_timer('remove_duplicate_time')

        search_results = self._filter_results_for_reposts(search_results,
                                                          sort_by=sort_by)
        search_results.search_times.stop_timer('total_search_time')
        self._log_search_time(search_results, source)

        search_results = self._log_search(
            search_results,
            source,
            api_search_results.used_current_index,
            api_search_results.used_historical_index,
        )

        # NOTE(review): 'Seached' below is a typo for 'Searched'; the message
        # is left unchanged here because it is emitted at runtime.
        log.info('Seached %s items and found %s matches',
                 search_results.total_searched, len(search_results.matches))
        return search_results
Exemplo n.º 7
0
 def test_build_image_report_link_positive(self):
     # With one match and no meme template set in this test, the report text
     # offers a "False Positive" link whose message carries the checked post id.
     checked = Post(post_id='abc123')
     results = ImageSearchResults('test.com', Mock(), checked_post=checked)
     only_match = ImageSearchMatch('test.com', 123, Mock(), 1, 1, 32)
     results.matches.append(only_match)
     report_link = build_image_report_link(results)
     expected = "*I'm not perfect, but you can help. Report [ [False Positive](https://www.reddit.com/message/compose/?to=RepostSleuthBot&subject=False%20Positive&message={\"post_id\": \"abc123\", \"meme_template\": null}) ]*"
     self.assertEqual(expected, report_link)