def _filter_results_for_reposts(
        self,
        search_results: ImageSearchResults,
        sort_by: str = 'created'
) -> ImageSearchResults:
    """
    Narrow the matches held on ``search_results`` down to the final repost list.

    The steps run in a fixed order:

    1. Keep only matches accepted by ``filter_no_dhash``.
    2. Run ``filter_search_results`` with the Reddit client and the
       ``/maintenance/removed`` endpoint of the configured util API.
    3. Pick the closest match and record it on the results if its hamming
       match percent is above 40.  When a meme template is active the closest
       match's hamming distance and hash size are recomputed from the meme hash.
    4. Apply the annoy and hamming distance cutoffs.
    5. When a meme template is active, run the final meme filter (timed under
       ``meme_filter_time``).
    6. Sort the surviving matches and debug-log each one.

    :param search_results: Results object whose ``matches`` are filtered.  It is
        mutated, and may be replaced by whatever ``filter_search_results`` returns.
    :param sort_by: Sort key forwarded to ``sort_reposts``
    :return: The results object with filtered, sorted matches
    :rtype: ImageSearchResults
    """
    log.debug('Starting result filters with %s matches', len(search_results.matches))

    search_results.matches = [
        match for match in search_results.matches if filter_no_dhash(match)
    ]
    search_results = filter_search_results(
        search_results,
        reddit=self.reddit,
        uitl_api=f'{self.config.util_api}/maintenance/removed'
    )

    closest_match = get_closest_image_match(search_results.matches, check_url=True)
    if closest_match and closest_match.hamming_match_percent > 40:  # TODO - Move to config
        search_results.closest_match = closest_match
        if search_results.closest_match and search_results.meme_template:
            match_hash = self._get_meme_hash(search_results.closest_match.post.url)
            if match_hash:
                search_results.closest_match.hamming_distance = hamming(
                    search_results.meme_hash, match_hash)
                search_results.closest_match.hash_size = len(match_hash)
            else:
                # _get_meme_hash can come back empty (check_image handles the same
                # case); keep the existing distance rather than crash on None.
                log.error(
                    'No meme hash for closest match %s, keeping existing hamming distance',
                    search_results.closest_match.post.url
                )

    # Has to be after closest match so we don't drop closest
    within_annoy_distance = annoy_distance_filter(
        search_results.search_settings.target_annoy_distance)
    search_results.matches = [
        match for match in search_results.matches if within_annoy_distance(match)
    ]
    within_hamming_distance = hamming_distance_filter(
        search_results.target_hamming_distance)
    search_results.matches = [
        match for match in search_results.matches if within_hamming_distance(match)
    ]

    if search_results.meme_template:
        search_results.search_times.start_timer('meme_filter_time')
        search_results.matches = self._final_meme_filter(
            search_results.meme_hash,
            search_results.matches,
            search_results.target_meme_hamming_distance
        )
        search_results.search_times.stop_timer('meme_filter_time')

    search_results.matches = sort_reposts(search_results.matches, sort_by=sort_by)

    for match in search_results.matches:
        log.debug(
            'Match found: %s - A:%s H:%s P:%s',
            f'https://redd.it/{match.post.post_id}',
            round(match.annoy_distance, 5),
            match.hamming_distance,
            f'{match.hamming_match_percent}%'
        )

    return search_results
def check_image(
        self,
        url: Text,
        post: Post = None,
        source='unknown',
        sort_by='created',
        search_settings: ImageSearchSettings = None
) -> ImageSearchResults:
    """
    Search the image indexes for matches to ``url`` and return the filtered results.

    Flow: resolve search settings, optionally detect a meme template, query the
    indexes via ``_get_matches``, build match objects from the historical and
    current results, de-duplicate, run the repost filters, then log the search.

    Note that when a meme template is detected the supplied ``search_settings``
    object is modified: its ``target_match_percent`` is set to 100.

    :param url: URL of the image to check
    :param post: Optional post being checked; when given, its title is used to
        set title similarity on the matches
    :param source: Label passed through to the search logging helpers
    :param sort_by: Sort key forwarded to the repost filter
    :param search_settings: Settings for this search; defaults are built from
        the service config when omitted
    :return: Search results with filtered matches and timing data
    :rtype: ImageSearchResults
    """
    log.info('Checking URL for matches: %s', url)

    if not search_settings:
        log.info('No search settings provided, using default')
        search_settings = get_default_image_search_settings(self.config)

    search_results = ImageSearchResults(url, checked_post=post, search_settings=search_settings)
    timers = search_results.search_times
    timers.start_timer('total_search_time')

    if search_settings.meme_filter:
        timers.start_timer('meme_detection_time')
        search_results.meme_template = self._get_meme_template(search_results.target_hash)
        timers.stop_timer('meme_detection_time')

        if search_results.meme_template:
            # Keep only 100% matches on default hash size
            search_settings.target_match_percent = 100
            search_results.meme_hash = self._get_meme_hash(url)
            if search_results.meme_hash:
                log.info('Using meme filter %s', search_results.meme_template.id)
            else:
                log.error('No meme hash, disabled meme filter')
                search_results.meme_template = None

    log.debug('Search Settings: %s', search_settings)

    index_results = self._get_matches(
        search_results.target_hash,
        search_results.target_hamming_distance,
        search_settings.target_annoy_distance,
        max_matches=search_settings.max_matches,
        max_depth=search_settings.max_depth,
        search_times=timers
    )
    timers.index_search_time = index_results.index_search_time
    search_results.total_searched = index_results.total_searched

    # Turn raw index hits into match objects, historical index first.
    timers.start_timer('set_match_post_time')
    historical_matches = self._build_search_results(
        index_results.historical_matches,
        url,
        search_results.target_hash
    )
    search_results.matches = historical_matches
    current_matches = self._build_search_results(
        index_results.current_matches,
        url,
        search_results.target_hash,
        historical_index=False
    )
    search_results.matches += current_matches
    timers.stop_timer('set_match_post_time')

    timers.start_timer('remove_duplicate_time')
    search_results.matches = self._remove_duplicates(search_results.matches)
    if post:
        search_results.matches = set_all_title_similarity(
            search_results.checked_post.title,
            search_results.matches
        )
    timers.stop_timer('remove_duplicate_time')

    # The filter may hand back a different results object, so the timer handle
    # is re-read from the returned value instead of reusing the cached one.
    search_results = self._filter_results_for_reposts(search_results, sort_by=sort_by)
    search_results.search_times.stop_timer('total_search_time')
    self._log_search_time(search_results, source)

    search_results = self._log_search(
        search_results,
        source,
        index_results.used_current_index,
        index_results.used_historical_index,
    )

    log.info(
        'Seached %s items and found %s matches',
        search_results.total_searched,
        len(search_results.matches)
    )
    return search_results