def _GenerateApplicationDicts(self):
    """Rebuild the in-memory sibling-application lookup dicts from the db table.
    
    Populates:
        self._service_ids_to_applicable_service_ids: master_service_id -> list of
            application_service_ids, ordered by their stored service_index.
        self._service_ids_to_interested_service_ids: the reverse mapping,
            application_service_id -> set of master_service_ids that use it.
    """
    
    rows = self._Execute('SELECT master_service_id, service_index, application_service_id FROM tag_sibling_application;')
    
    # group as master_service_id -> [ ( index, application_service_id ) ] so we can order by index
    unsorted_dict = HydrusData.BuildKeyToListDict(
        (master_service_id, (index, application_service_id))
        for (master_service_id, index, application_service_id) in rows
    )
    
    self._service_ids_to_applicable_service_ids = collections.defaultdict(list)
    
    for (master_service_id, index_and_application_service_ids) in unsorted_dict.items():
        
        index_and_application_service_ids.sort()
        
        # sorting by ( index, id ) tuples puts the application services in index order
        self._service_ids_to_applicable_service_ids[master_service_id] = [
            application_service_id for (index, application_service_id) in index_and_application_service_ids
        ]
        
    
    self._service_ids_to_interested_service_ids = collections.defaultdict(set)
    
    # invert: for each application service, which masters care about it?
    for (master_service_id, application_service_ids) in self._service_ids_to_applicable_service_ids.items():
        
        for application_service_id in application_service_ids:
            
            self._service_ids_to_interested_service_ids[application_service_id].add(master_service_id)
            
        
    
def GetTagSiblingsIdsChains(self, service_id, tag_ids):
    """Walk the sibling graph out from tag_ids for the given service.
    
    Performs a breadth-first traversal over both tag_siblings and
    tag_sibling_petitions, following pairs in either direction, and returns a
    defaultdict mapping status -> sorted list of ( bad_tag_id, good_tag_id )
    pairs for every pair reachable from the seed tag_ids.
    """
    
    visited_tag_ids = set()
    frontier_tag_ids = set(tag_ids)
    result_rows = set()
    
    while frontier_tag_ids:
        
        with self._MakeTemporaryIntegerTable(frontier_tag_ids, 'tag_id') as temp_next_tag_ids_table_name:
            
            visited_tag_ids.update(frontier_tag_ids)
            
            frontier_tag_ids = set()
            
            # keep these separate--older sqlite can't do cross join to an OR ON
            
            # temp tag_ids to siblings
            queries = [
                'SELECT status, bad_tag_id, good_tag_id FROM {} CROSS JOIN tag_siblings ON ( bad_tag_id = tag_id ) WHERE service_id = ?'.format(temp_next_tag_ids_table_name),
                'SELECT status, bad_tag_id, good_tag_id FROM {} CROSS JOIN tag_siblings ON ( good_tag_id = tag_id ) WHERE service_id = ?'.format(temp_next_tag_ids_table_name),
                'SELECT status, bad_tag_id, good_tag_id FROM {} CROSS JOIN tag_sibling_petitions ON ( bad_tag_id = tag_id ) WHERE service_id = ?'.format(temp_next_tag_ids_table_name),
                'SELECT status, bad_tag_id, good_tag_id FROM {} CROSS JOIN tag_sibling_petitions ON ( good_tag_id = tag_id ) WHERE service_id = ?'.format(temp_next_tag_ids_table_name)
            ]
            
            combined_query = ' UNION '.join(queries)
            
            for row in self._Execute(combined_query, (service_id, service_id, service_id, service_id)):
                
                result_rows.add(row)
                
                (status, bad_tag_id, good_tag_id) = row
                
                # any tag we have not already searched becomes part of the next frontier
                frontier_tag_ids.update(tag_id for tag_id in (bad_tag_id, good_tag_id) if tag_id not in visited_tag_ids)
                
            
        
    
    statuses_to_pair_ids = collections.defaultdict(list)
    
    for (status, bad_tag_id, good_tag_id) in result_rows:
        
        statuses_to_pair_ids[status].append((bad_tag_id, good_tag_id))
        
    
    for pair_ids in statuses_to_pair_ids.values():
        
        pair_ids.sort()
        
    
    return statuses_to_pair_ids
def MergeTagsManagers(tags_managers):
    """Merge several TagsManagers into one read-only TagsManager.
    
    Only CURRENT and PENDING tags are kept, and only the display (ACTUAL) tag
    context is consulted.
    """
    
    # we cheat here and just get display tags, since this is read only and storage exacts isn't super important
    
    merged_service_keys_to_statuses_to_tags = collections.defaultdict(HydrusData.default_dict_set)
    
    for tags_manager in tags_managers:
        
        service_keys_to_statuses_to_tags = tags_manager.GetServiceKeysToStatusesToTags(ClientTags.TAG_DISPLAY_ACTUAL)
        
        for (service_key, statuses_to_tags) in list(service_keys_to_statuses_to_tags.items()):
            
            # touching the defaultdict here registers the service key even if no statuses survive the filter below
            merged_statuses_to_tags = merged_service_keys_to_statuses_to_tags[service_key]
            
            for (status, tags) in list(statuses_to_tags.items()):
                
                if status in (HC.CONTENT_STATUS_CURRENT, HC.CONTENT_STATUS_PENDING):
                    
                    merged_statuses_to_tags[status].update(tags)
                    
                
            
        
    
    # read only, so the same dict serves as both storage and display
    return TagsManager(merged_service_keys_to_statuses_to_tags, merged_service_keys_to_statuses_to_tags)
def GetSomePetitionedRows(self, service_id: int):
    """Fetch a batch of petitioned file rows for this service.
    
    Returns a list of ( reason_id, [ hash_id, ... ] ) tuples, grouping up to
    100 petitioned rows by their shared reason.
    """
    
    petitioned_files_table_name = GenerateFilesTableName(service_id, HC.CONTENT_STATUS_PETITIONED)
    
    query = 'SELECT reason_id, hash_id FROM {} ORDER BY reason_id LIMIT 100;'.format(petitioned_files_table_name)
    
    reasons_to_hash_ids = HydrusData.BuildKeyToListDict(self._c.execute(query))
    
    return list(reasons_to_hash_ids.items())
def GetJSONDumpNamesToBackupTimestamps(self, dump_type):
    """Map each named json dump of this type to its backup timestamps.
    
    Timestamps come back ascending; the newest timestamp per name is the live
    (non-backup) dump, so it is dropped, and names left with no backups are
    removed entirely.
    """
    
    names_to_backup_timestamps = HydrusData.BuildKeyToListDict(
        self._c.execute('SELECT dump_name, timestamp FROM json_dumps_named WHERE dump_type = ? ORDER BY timestamp ASC;', (dump_type,))
    )
    
    names_with_no_backups = []
    
    for (name, timestamp_list) in names_to_backup_timestamps.items():
        
        # the final (largest) timestamp is the current dump, not a backup
        del timestamp_list[-1]
        
        if not timestamp_list:
            
            names_with_no_backups.append(name)
            
        
    
    for name in names_with_no_backups:
        
        del names_to_backup_timestamps[name]
        
    
    return names_to_backup_timestamps
def GetTagParentsIds(self, service_id):
    """Fetch all parent pairs (actual and petitioned) for a service.
    
    Returns a defaultdict mapping status -> sorted list of
    ( child_tag_id, parent_tag_id ) pairs.
    """
    
    statuses_and_pair_ids = self._Execute(
        'SELECT status, child_tag_id, parent_tag_id FROM tag_parents WHERE service_id = ? UNION SELECT status, child_tag_id, parent_tag_id FROM tag_parent_petitions WHERE service_id = ?;',
        (service_id, service_id)
    ).fetchall()
    
    statuses_to_pair_ids = collections.defaultdict(list)
    
    for (status, child_tag_id, parent_tag_id) in statuses_and_pair_ids:
        
        statuses_to_pair_ids[status].append((child_tag_id, parent_tag_id))
        
    
    for pair_ids in statuses_to_pair_ids.values():
        
        pair_ids.sort()
        
    
    return statuses_to_pair_ids
def DoSomeWork(self, source):
    """Pull a chunk of ( hash, tags ) data from source and write it as mapping content updates.
    
    Returns the basic speed statement for the number of ( tag, hash ) pairs processed.
    """
    
    time_started_precise = HydrusData.GetNowPrecise()
    
    data = source.GetSomeData()
    
    tag_and_hash_pairs = []
    
    for (file_hash, tags) in data:
        
        for tag in tags:
            
            tag_and_hash_pairs.append((tag, file_hash))
            
        
    
    num_done = len(tag_and_hash_pairs)
    
    tags_to_hashes = HydrusData.BuildKeyToListDict(tag_and_hash_pairs)
    
    # petitions need a reason attached; other actions do not
    if self._content_action == HC.CONTENT_UPDATE_PETITION:
        
        reason = 'Mass Migration Job'
        
    else:
        
        reason = None
        
    
    content_updates = [
        HydrusData.ContentUpdate(HC.CONTENT_TYPE_MAPPINGS, self._content_action, (tag, hashes), reason=reason)
        for (tag, hashes) in tags_to_hashes.items()
    ]
    
    service_keys_to_content_updates = {self._tag_service_key: content_updates}
    
    self._controller.WriteSynchronous('content_updates', service_keys_to_content_updates)
    
    return GetBasicSpeedStatement(num_done, time_started_precise)
def Search(self, hash_id, max_hamming_distance):
    """Find hash_ids of files perceptually similar to the given hash_id.
    
    Returns a list of ( similar_hash_id, hamming_distance ) tuples. For
    max_hamming_distance == 0 this is a simple exact-phash-match lookup;
    otherwise the shape_vptree table is searched as a VP-tree with the given
    distance as the search radius. Returns [] when the tree is empty or the
    file has no perceptual hashes.
    """
    
    if max_hamming_distance == 0:
        
        # exact match: any file sharing one of this file's phashes, at distance 0
        similar_hash_ids = self._STL(self._c.execute('SELECT hash_id FROM shape_perceptual_hash_map WHERE phash_id IN ( SELECT phash_id FROM shape_perceptual_hash_map WHERE hash_id = ? );', (hash_id,)))
        
        similar_hash_ids_and_distances = [(similar_hash_id, 0) for similar_hash_id in similar_hash_ids]
        
    else:
        
        search_radius = max_hamming_distance
        
        # the vptree root is the single node with no parent
        top_node_result = self._c.execute('SELECT phash_id FROM shape_vptree WHERE parent_id IS NULL;').fetchone()
        
        if top_node_result is None:
            
            return []
            
        
        (root_node_phash_id,) = top_node_result
        
        # all the phashes attached to the query file; each is searched independently
        search = self._STL(self._c.execute('SELECT phash FROM shape_perceptual_hashes NATURAL JOIN shape_perceptual_hash_map WHERE hash_id = ?;', (hash_id,)))
        
        if len(search) == 0:
            
            return []
            
        
        similar_phash_ids_to_distances = {}
        
        num_cycles = 0
        total_nodes_searched = 0
        
        for search_phash in search:
            
            next_potentials = [root_node_phash_id]
            
            # breadth-first descent of the vptree, one level per cycle
            while len(next_potentials) > 0:
                
                current_potentials = next_potentials
                next_potentials = []
                
                num_cycles += 1
                total_nodes_searched += len(current_potentials)
                
                for group_of_current_potentials in HydrusData.SplitListIntoChunks(current_potentials, 10000):
                    
                    # this is split into fixed lists of results of subgroups because as an iterable it was causing crashes on linux!!
                    # after investigation, it seemed to be SQLite having a problem with part of Get64BitHammingDistance touching phashes it presumably was still hanging on to
                    # the crash was in sqlite code, again presumably on subsequent fetch
                    # adding a delay in seemed to fix it as well. guess it was some memory maintenance buffer/bytes thing
                    # anyway, we now just get the whole lot of results first and then work on the whole lot
                    
                    '''
                    #old method
                    select_statement = 'SELECT phash_id, phash, radius, inner_id, outer_id FROM shape_perceptual_hashes NATURAL JOIN shape_vptree WHERE phash_id = ?;'
                    
                    results = list( self._ExecuteManySelectSingleParam( select_statement, group_of_current_potentials ) )
                    '''
                    
                    with HydrusDB.TemporaryIntegerTable(self._c, group_of_current_potentials, 'phash_id') as temp_table_name:
                        
                        # temp phash_ids to actual phashes and tree info
                        results = self._c.execute('SELECT phash_id, phash, radius, inner_id, outer_id FROM {} CROSS JOIN shape_perceptual_hashes USING ( phash_id ) CROSS JOIN shape_vptree USING ( phash_id );'.format(temp_table_name)).fetchall()
                        
                        for (node_phash_id, node_phash, node_radius, inner_phash_id, outer_phash_id) in results:
                            
                            # first check the node itself--is it similar?
                            
                            node_hamming_distance = HydrusData.Get64BitHammingDistance(search_phash, node_phash)
                            
                            if node_hamming_distance <= search_radius:
                                
                                # keep the smallest distance seen for this node across all search phashes
                                if node_phash_id in similar_phash_ids_to_distances:
                                    
                                    current_distance = similar_phash_ids_to_distances[node_phash_id]
                                    
                                    similar_phash_ids_to_distances[node_phash_id] = min(node_hamming_distance, current_distance)
                                    
                                else:
                                    
                                    similar_phash_ids_to_distances[node_phash_id] = node_hamming_distance
                                    
                                
                            
                            # now how about its children?
                            
                            if node_radius is not None:
                                
                                # we have two spheres--node and search--their centers separated by node_hamming_distance
                                # we want to search inside/outside the node_sphere if the search_sphere intersects with those spaces
                                # there are four possibles:
                                # (----N----)-(--S--) intersects with outer only - distance between N and S > their radii
                                # (----N---(-)-S--) intersects with both
                                # (----N-(--S-)-) intersects with both
                                # (---(-N-S--)-) intersects with inner only - distance between N and S + radius_S does not exceed radius_N
                                
                                if inner_phash_id is not None:
                                    
                                    spheres_disjoint = node_hamming_distance > (node_radius + search_radius)
                                    
                                    if not spheres_disjoint:  # i.e. they intersect at some point
                                        
                                        next_potentials.append(inner_phash_id)
                                        
                                    
                                
                                if outer_phash_id is not None:
                                    
                                    search_sphere_subset_of_node_sphere = (node_hamming_distance + search_radius) <= node_radius
                                    
                                    if not search_sphere_subset_of_node_sphere:  # i.e. search sphere intersects with non-node sphere space at some point
                                        
                                        next_potentials.append(outer_phash_id)
                                        
                                    
                                
                            
                        
                    
                
            
        
        if HG.db_report_mode:
            
            HydrusData.ShowText('Similar file search touched {} nodes over {} cycles.'.format(HydrusData.ToHumanInt(total_nodes_searched), HydrusData.ToHumanInt(num_cycles)))
            
        
        # so, now we have phash_ids and distances. let's map that to actual files.
        # files can have multiple phashes, and phashes can refer to multiple files, so let's make sure we are setting the smallest distance we found
        
        similar_phash_ids = list(similar_phash_ids_to_distances.keys())
        
        with HydrusDB.TemporaryIntegerTable(self._c, similar_phash_ids, 'phash_id') as temp_table_name:
            
            # temp phashes to hash map
            similar_phash_ids_to_hash_ids = HydrusData.BuildKeyToListDict(self._c.execute('SELECT phash_id, hash_id FROM {} CROSS JOIN shape_perceptual_hash_map USING ( phash_id );'.format(temp_table_name)))
            
        
        similar_hash_ids_to_distances = {}
        
        for (phash_id, hash_ids) in similar_phash_ids_to_hash_ids.items():
            
            distance = similar_phash_ids_to_distances[phash_id]
            
            for hash_id in hash_ids:
                
                # a file hit through multiple phashes keeps its best (smallest) distance
                if hash_id not in similar_hash_ids_to_distances:
                    
                    similar_hash_ids_to_distances[hash_id] = distance
                    
                else:
                    
                    current_distance = similar_hash_ids_to_distances[hash_id]
                    
                    if distance < current_distance:
                        
                        similar_hash_ids_to_distances[hash_id] = distance
                        
                    
                
            
        
        similar_hash_ids_and_distances = list(similar_hash_ids_to_distances.items())
        
    
    return similar_hash_ids_and_distances
def GetTagParentsIdsChains(self, service_id, tag_ids):
    """Walk the parent graph out from tag_ids, expanding through sibling chains.
    
    Breadth-first traversal over tag_parents and tag_parent_petitions in both
    directions, additionally pulling in sibling chain members each round so
    pairs connected to our chain only via sibling relationships are found.
    Returns a defaultdict mapping status -> sorted list of
    ( child_tag_id, parent_tag_id ) pairs.
    """
    
    # I experimented with one or two recursive queries, and for siblings, but it mostly ended up hellmode index efficiency. I think ( service_id, integer ) did it in
    
    # note that this has to do sibling lookup as well to fetch pairs that are only connected to our chain by sibling relationships, and we are assuming here that the sibling lookup cache is valid
    
    searched_tag_ids = set()
    next_tag_ids = set(tag_ids)
    result_rows = set()
    
    while len(next_tag_ids) > 0:
        
        tag_ids_seen_this_round = set()
        
        # expand the current frontier through the sibling cache: ideal forms first, then every chain member of those ideals
        ideal_tag_ids = self.modules_tag_siblings.GetIdeals(ClientTags.TAG_DISPLAY_IDEAL, service_id, next_tag_ids)
        
        tag_ids_seen_this_round.update(self.modules_tag_siblings.GetChainsMembersFromIdeals(ClientTags.TAG_DISPLAY_IDEAL, service_id, ideal_tag_ids))
        
        with self._MakeTemporaryIntegerTable(next_tag_ids, 'tag_id') as temp_next_tag_ids_table_name:
            
            searched_tag_ids.update(next_tag_ids)
            
            # keep these separate--older sqlite can't do cross join to an OR ON
            
            # temp tag_ids to parents
            queries = [
                'SELECT status, child_tag_id, parent_tag_id FROM {} CROSS JOIN tag_parents ON ( child_tag_id = tag_id ) WHERE service_id = ?'.format(temp_next_tag_ids_table_name),
                'SELECT status, child_tag_id, parent_tag_id FROM {} CROSS JOIN tag_parents ON ( parent_tag_id = tag_id ) WHERE service_id = ?'.format(temp_next_tag_ids_table_name),
                'SELECT status, child_tag_id, parent_tag_id FROM {} CROSS JOIN tag_parent_petitions ON ( child_tag_id = tag_id ) WHERE service_id = ?'.format(temp_next_tag_ids_table_name),
                'SELECT status, child_tag_id, parent_tag_id FROM {} CROSS JOIN tag_parent_petitions ON ( parent_tag_id = tag_id ) WHERE service_id = ?'.format(temp_next_tag_ids_table_name)
            ]
            
            query = ' UNION '.join(queries)
            
            for row in self._Execute(query, (service_id, service_id, service_id, service_id)):
                
                result_rows.add(row)
                
                (status, child_tag_id, parent_tag_id) = row
                
                tag_ids_seen_this_round.update((child_tag_id, parent_tag_id))
                
            
        
        # next frontier is everything newly seen this round, whether via siblings or parent pairs
        next_tag_ids = tag_ids_seen_this_round.difference(searched_tag_ids)
        
    
    unsorted_statuses_to_pair_ids = HydrusData.BuildKeyToListDict((status, (child_tag_id, parent_tag_id)) for (status, child_tag_id, parent_tag_id) in result_rows)
    
    statuses_to_pair_ids = collections.defaultdict(list)
    
    statuses_to_pair_ids.update({status: sorted(pair_ids) for (status, pair_ids) in unsorted_statuses_to_pair_ids.items()})
    
    return statuses_to_pair_ids