def FilterAllPendingHashIds(self, hash_ids, just_these_service_ids=None):
    """Return the subset of hash_ids that are pending on any specific file service.
    
    If just_these_service_ids is provided, only those services are consulted;
    otherwise every specific file service is checked.
    """
    
    if just_these_service_ids is not None:
        
        service_ids_to_check = just_these_service_ids
        
    else:
        
        service_ids_to_check = self.modules_services.GetServiceIds(HC.SPECIFIC_FILE_SERVICES)
        
    
    all_pending = set()
    
    with HydrusDB.TemporaryIntegerTable(self._c, hash_ids, 'hash_id') as temp_hash_ids_table_name:
        
        for service_id in service_ids_to_check:
            
            pending_table = GenerateFilesTableName(service_id, HC.CONTENT_STATUS_PENDING)
            
            # temp hash ids against this service's pending files table
            query = 'SELECT hash_id FROM {} CROSS JOIN {} USING ( hash_id );'.format(temp_hash_ids_table_name, pending_table)
            
            all_pending.update(self._STI(self._c.execute(query)))
            
        
    
    return all_pending
def GetTotalSize(self, hash_ids: typing.Collection[int]) -> int:
    """Return the summed size in bytes of the given files.
    
    Hash ids with no files_info row contribute nothing. Always returns an
    int--if nothing matches, the result is 0, never None.
    """
    
    if len(hash_ids) == 1:
        
        (hash_id, ) = hash_ids
        
        # fetchone is None here if the hash has no files_info row
        result = self._c.execute(
            'SELECT size FROM files_info WHERE hash_id = ?;',
            (hash_id, )).fetchone()
        
    else:
        
        with HydrusDB.TemporaryIntegerTable(
                self._c, hash_ids, 'hash_id') as temp_hash_ids_table_name:
            
            result = self._c.execute(
                'SELECT SUM( size ) FROM {} CROSS JOIN files_info USING ( hash_id );'
                .format(temp_hash_ids_table_name)).fetchone()
            
        
    
    if result is None:
        
        return 0
        
    
    (total_size, ) = result
    
    # BUGFIX: SUM() over zero matching rows yields a single ( NULL, ) row, so
    # the aggregate path used to return None here despite the -> int contract
    if total_size is None:
        
        return 0
        
    
    return total_size
def _PopulateTagIdsToTagsCache( self, tag_ids ):
    """Ensure every id in tag_ids has an entry in self._tag_ids_to_tags_cache.
    
    Looks tags up from the master tags/namespaces/subtags tables. Any tag_id
    that has no row there is treated as a damaged record: a placeholder
    'unknown tag:<random hex>' is written back into the tags table via
    REPLACE INTO, so this method can have DB write side effects.
    """
    
    # cap cache memory: once it grows past 100k entries, keep only the ids
    # being requested right now
    if len( self._tag_ids_to_tags_cache ) > 100000:
        
        if not isinstance( tag_ids, set ):
            
            tag_ids = set( tag_ids )
            
        
        self._tag_ids_to_tags_cache = { tag_id : tag for ( tag_id, tag ) in self._tag_ids_to_tags_cache.items() if tag_id in tag_ids }
        
    
    uncached_tag_ids = { tag_id for tag_id in tag_ids if tag_id not in self._tag_ids_to_tags_cache }
    
    if len( uncached_tag_ids ) > 0:
        
        if len( uncached_tag_ids ) == 1:
            
            ( uncached_tag_id, ) = uncached_tag_ids
            
            rows = self._c.execute( 'SELECT tag_id, namespace, subtag FROM tags NATURAL JOIN namespaces NATURAL JOIN subtags WHERE tag_id = ?;', ( uncached_tag_id, ) ).fetchall()
            
        else:
            
            with HydrusDB.TemporaryIntegerTable( self._c, uncached_tag_ids, 'tag_id' ) as temp_table_name:
                
                # temp tag_ids to tags to subtags and namespaces
                rows = self._c.execute( 'SELECT tag_id, namespace, subtag FROM {} CROSS JOIN tags USING ( tag_id ) CROSS JOIN subtags USING ( subtag_id ) CROSS JOIN namespaces USING ( namespace_id );'.format( temp_table_name ) ).fetchall()
                
            
        
        uncached_tag_ids_to_tags = { tag_id : HydrusTags.CombineTag( namespace, subtag ) for ( tag_id, namespace, subtag ) in rows }
        
        # fewer rows than requested ids means some tag_ids are orphans with no
        # master row--repair each one with a recognisable placeholder tag
        if len( uncached_tag_ids_to_tags ) < len( uncached_tag_ids ):
            
            for tag_id in uncached_tag_ids:
                
                if tag_id not in uncached_tag_ids_to_tags:
                    
                    tag = 'unknown tag:' + HydrusData.GenerateKey().hex()
                    
                    ( namespace, subtag ) = HydrusTags.SplitTag( tag )
                    
                    namespace_id = self.GetNamespaceId( namespace )
                    subtag_id = self.GetSubtagId( subtag )
                    
                    # write the placeholder back so future lookups succeed
                    self._c.execute( 'REPLACE INTO tags ( tag_id, namespace_id, subtag_id ) VALUES ( ?, ?, ? );', ( tag_id, namespace_id, subtag_id ) )
                    
                    uncached_tag_ids_to_tags[ tag_id ] = tag
                    
                
            
        
        self._tag_ids_to_tags_cache.update( uncached_tag_ids_to_tags )
def GetUndeleteRows(self, service_id, hash_ids):
    """Fetch ( hash_id, original_timestamp ) for each of the given hash_ids
    currently sitting in the service's deleted files table.
    """
    
    deleted_table = GenerateFilesTableName(service_id, HC.CONTENT_STATUS_DELETED)
    
    with HydrusDB.TemporaryIntegerTable(self._c, hash_ids, 'hash_id') as temp_table:
        
        # temp hash ids to the service's deleted files table
        query = 'SELECT hash_id, original_timestamp FROM {} CROSS JOIN {} USING ( hash_id );'.format(temp_table, deleted_table)
        
        return self._c.execute(query).fetchall()
def GetCurrentHashIdsToTimestamps(self, service_id, hash_ids):
    """Map each of the given hash_ids that is current on the service to its
    timestamp. Hash ids not current on the service are simply absent.
    """
    
    current_table = GenerateFilesTableName(service_id, HC.CONTENT_STATUS_CURRENT)
    
    with HydrusDB.TemporaryIntegerTable(self._c, hash_ids, 'hash_id') as temp_table:
        
        # temp hash ids to the service's current files table
        query = 'SELECT hash_id, timestamp FROM {} CROSS JOIN {} USING ( hash_id );'.format(temp_table, current_table)
        
        return dict(self._c.execute(query))
def FilterPendingHashIds(self, service_id, hash_ids):
    """Return the subset of hash_ids pending on the given service, as a set.
    
    For the combined file service the entire input is returned.
    """
    
    if service_id == self.modules_services.combined_file_service_id:
        
        return set(hash_ids)
        
    
    pending_table = GenerateFilesTableName(service_id, HC.CONTENT_STATUS_PENDING)
    
    with HydrusDB.TemporaryIntegerTable(self._c, hash_ids, 'hash_id') as temp_table:
        
        # temp hash ids to the service's pending files table
        query = 'SELECT hash_id FROM {} CROSS JOIN {} USING ( hash_id );'.format(temp_table, pending_table)
        
        return self._STS(self._c.execute(query))
def _PopulateHashIdsToHashesCache(self, hash_ids):
    """Ensure every id in hash_ids has an entry in self._hash_ids_to_hashes_cache.
    
    Reads the fast local_hashes_cache table first, then falls back to the
    master hashes module for anything still missing. An oversized cache is
    first trimmed down to just the ids requested here.
    """
    
    if len(self._hash_ids_to_hashes_cache) > 100000:
        
        if not isinstance(hash_ids, set):
            
            hash_ids = set(hash_ids)
            
        
        # cap memory: keep only the entries we are about to serve
        self._hash_ids_to_hashes_cache = {h_id: h for (h_id, h) in self._hash_ids_to_hashes_cache.items() if h_id in hash_ids}
        
    
    missing = {h_id for h_id in hash_ids if h_id not in self._hash_ids_to_hashes_cache}
    
    if len(missing) == 0:
        
        return
        
    
    if len(missing) == 1:
        
        (missing_id, ) = missing
        
        # this makes 0 or 1 rows, so a dict over the cursor is fine
        found = dict(self._c.execute('SELECT hash_id, hash FROM local_hashes_cache WHERE hash_id = ?;', (missing_id, )))
        
    else:
        
        with HydrusDB.TemporaryIntegerTable(self._c, missing, 'hash_id') as temp_table_name:
            
            # temp hash_ids to actual hashes
            found = dict(self._c.execute('SELECT hash_id, hash FROM {} CROSS JOIN local_hashes_cache USING ( hash_id );'.format(temp_table_name)))
            
        
    
    self._hash_ids_to_hashes_cache.update(found)
    
    still_missing = {h_id for h_id in missing if h_id not in self._hash_ids_to_hashes_cache}
    
    if len(still_missing) > 0:
        
        # not in the local cache table--ask the definitive hashes module
        self._hash_ids_to_hashes_cache.update(self.modules_hashes.GetHashIdsToHashes(hash_ids=still_missing))
def _PopulateTagIdsToTagsCache(self, tag_ids):
    """Ensure every id in tag_ids has an entry in self._tag_ids_to_tags_cache.
    
    Reads the fast local_tags_cache table first, then falls back to the
    master tags module for anything still missing. An oversized cache is
    first trimmed down to just the ids requested here.
    """
    
    if len(self._tag_ids_to_tags_cache) > 100000:
        
        if not isinstance(tag_ids, set):
            
            tag_ids = set(tag_ids)
            
        
        # cap memory: keep only the entries we are about to serve
        self._tag_ids_to_tags_cache = {t_id: t for (t_id, t) in self._tag_ids_to_tags_cache.items() if t_id in tag_ids}
        
    
    missing = {t_id for t_id in tag_ids if t_id not in self._tag_ids_to_tags_cache}
    
    if len(missing) == 0:
        
        return
        
    
    if len(missing) == 1:
        
        (missing_id, ) = missing
        
        # this makes 0 or 1 rows, so a dict over the cursor is fine
        found = dict(self._c.execute('SELECT tag_id, tag FROM local_tags_cache WHERE tag_id = ?;', (missing_id, )))
        
    else:
        
        with HydrusDB.TemporaryIntegerTable(self._c, missing, 'tag_id') as temp_table_name:
            
            # temp tag_ids to actual tags
            found = dict(self._c.execute('SELECT tag_id, tag FROM {} CROSS JOIN local_tags_cache USING ( tag_id );'.format(temp_table_name)))
            
        
    
    self._tag_ids_to_tags_cache.update(found)
    
    still_missing = {t_id for t_id in missing if t_id not in self._tag_ids_to_tags_cache}
    
    if len(still_missing) > 0:
        
        # not in the local cache table--ask the definitive tags module
        self._tag_ids_to_tags_cache.update(self.modules_tags.GetTagIdsToTags(tag_ids=still_missing))
def GetServiceIdCounts(self, hash_ids) -> typing.Dict[int, int]:
    """For every specific file service, count how many of the given hash_ids
    are current on it. Returns { service_id : count }.
    """
    
    counts = {}
    
    with HydrusDB.TemporaryIntegerTable(self._c, hash_ids, 'hash_id') as temp_hash_ids_table_name:
        
        for service_id in self.modules_services.GetServiceIds(HC.SPECIFIC_FILE_SERVICES):
            
            current_table = GenerateFilesTableName(service_id, HC.CONTENT_STATUS_CURRENT)
            
            # temp hashes to this service's current files
            query = 'SELECT COUNT( * ) FROM {} CROSS JOIN {} USING ( hash_id );'.format(temp_hash_ids_table_name, current_table)
            
            (count, ) = self._c.execute(query).fetchone()
            
            counts[service_id] = count
            
        
    
    return counts
def GetNumViewable(self, hash_ids: typing.Collection[int]) -> int:
    """Count how many of the given hash_ids have a mime in HC.SEARCHABLE_MIMES."""
    
    if len(hash_ids) == 1:
        
        (hash_id, ) = hash_ids
        
        mimes = self._STL(self._c.execute('SELECT mime FROM files_info WHERE hash_id = ?;', (hash_id, )))
        
    else:
        
        with HydrusDB.TemporaryIntegerTable(self._c, hash_ids, 'hash_id') as temp_hash_ids_table_name:
            
            # temp hash ids to file info
            query = 'SELECT mime FROM {} CROSS JOIN files_info USING ( hash_id );'.format(temp_hash_ids_table_name)
            
            mimes = self._STL(self._c.execute(query))
            
        
    
    viewable = [mime for mime in mimes if mime in HC.SEARCHABLE_MIMES]
    
    return len(viewable)
def Search(self, hash_id, max_hamming_distance):
    """Find files perceptually similar to the given file.
    
    Returns a list of ( hash_id, hamming_distance ) pairs within
    max_hamming_distance of any of the file's perceptual hashes. A distance
    of 0 is an exact phash match and is answered with a direct lookup;
    otherwise the shape_vptree is walked breadth-first, pruning subtrees
    whose bounding spheres cannot intersect the search sphere.
    """
    
    if max_hamming_distance == 0:
        
        # exact match: any file sharing one of this file's phashes
        similar_hash_ids = self._STL( self._c.execute( 'SELECT hash_id FROM shape_perceptual_hash_map WHERE phash_id IN ( SELECT phash_id FROM shape_perceptual_hash_map WHERE hash_id = ? );', ( hash_id, ) ) )
        
        similar_hash_ids_and_distances = [ ( similar_hash_id, 0 ) for similar_hash_id in similar_hash_ids ]
        
    else:
        
        search_radius = max_hamming_distance
        
        # the vptree root is the single node with no parent
        top_node_result = self._c.execute( 'SELECT phash_id FROM shape_vptree WHERE parent_id IS NULL;' ).fetchone()
        
        if top_node_result is None:
            
            # no tree at all--nothing to search
            return []
            
        
        ( root_node_phash_id, ) = top_node_result
        
        search = self._STL( self._c.execute( 'SELECT phash FROM shape_perceptual_hashes NATURAL JOIN shape_perceptual_hash_map WHERE hash_id = ?;', ( hash_id, ) ) )
        
        if len( search ) == 0:
            
            # the query file has no phashes--nothing to compare against
            return []
            
        
        similar_phash_ids_to_distances = {}
        
        num_cycles = 0
        total_nodes_searched = 0
        
        # breadth-first walk of the vptree, once per phash of the query file
        for search_phash in search:
            
            next_potentials = [ root_node_phash_id ]
            
            while len( next_potentials ) > 0:
                
                current_potentials = next_potentials
                next_potentials = []
                
                num_cycles += 1
                total_nodes_searched += len( current_potentials )
                
                for group_of_current_potentials in HydrusData.SplitListIntoChunks( current_potentials, 10000 ):
                    
                    # this is split into fixed lists of results of subgroups because as an iterable it was causing crashes on linux!!
                    # after investigation, it seemed to be SQLite having a problem with part of Get64BitHammingDistance touching phashes it presumably was still hanging on to
                    # the crash was in sqlite code, again presumably on subsequent fetch
                    # adding a delay in seemed to fix it as well. guess it was some memory maintenance buffer/bytes thing
                    # anyway, we now just get the whole lot of results first and then work on the whole lot
                    
                    '''
                    #old method
                    select_statement = 'SELECT phash_id, phash, radius, inner_id, outer_id FROM shape_perceptual_hashes NATURAL JOIN shape_vptree WHERE phash_id = ?;'
                    
                    results = list( self._ExecuteManySelectSingleParam( select_statement, group_of_current_potentials ) )
                    '''
                    
                    with HydrusDB.TemporaryIntegerTable( self._c, group_of_current_potentials, 'phash_id' ) as temp_table_name:
                        
                        # temp phash_ids to actual phashes and tree info
                        results = self._c.execute( 'SELECT phash_id, phash, radius, inner_id, outer_id FROM {} CROSS JOIN shape_perceptual_hashes USING ( phash_id ) CROSS JOIN shape_vptree USING ( phash_id );'.format( temp_table_name ) ).fetchall()
                        
                    
                    for ( node_phash_id, node_phash, node_radius, inner_phash_id, outer_phash_id ) in results:
                        
                        # first check the node itself--is it similar?
                        
                        node_hamming_distance = HydrusData.Get64BitHammingDistance( search_phash, node_phash )
                        
                        if node_hamming_distance <= search_radius:
                            
                            # keep the smallest distance seen across all of the query's phashes
                            if node_phash_id in similar_phash_ids_to_distances:
                                
                                current_distance = similar_phash_ids_to_distances[ node_phash_id ]
                                
                                similar_phash_ids_to_distances[ node_phash_id ] = min( node_hamming_distance, current_distance )
                                
                            else:
                                
                                similar_phash_ids_to_distances[ node_phash_id ] = node_hamming_distance
                                
                            
                        
                        # now how about its children?
                        
                        if node_radius is not None:
                            
                            # we have two spheres--node and search--their centers separated by node_hamming_distance
                            # we want to search inside/outside the node_sphere if the search_sphere intersects with those spaces
                            # there are four possibles:
                            # (----N----)-(--S--) intersects with outer only - distance between N and S > their radii
                            # (----N---(-)-S--) intersects with both
                            # (----N-(--S-)-) intersects with both
                            # (---(-N-S--)-) intersects with inner only - distance between N and S + radius_S does not exceed radius_N
                            
                            if inner_phash_id is not None:
                                
                                spheres_disjoint = node_hamming_distance > ( node_radius + search_radius )
                                
                                if not spheres_disjoint: # i.e. they intersect at some point
                                    
                                    next_potentials.append( inner_phash_id )
                                    
                                
                            
                            if outer_phash_id is not None:
                                
                                search_sphere_subset_of_node_sphere = ( node_hamming_distance + search_radius ) <= node_radius
                                
                                if not search_sphere_subset_of_node_sphere: # i.e. search sphere intersects with non-node sphere space at some point
                                    
                                    next_potentials.append( outer_phash_id )
                                    
                                
                            
                        
                    
                
            
        
        if HG.db_report_mode:
            
            HydrusData.ShowText( 'Similar file search touched {} nodes over {} cycles.'.format( HydrusData.ToHumanInt( total_nodes_searched ), HydrusData.ToHumanInt( num_cycles ) ) )
            
        
        # so, now we have phash_ids and distances. let's map that to actual files.
        # files can have multiple phashes, and phashes can refer to multiple files, so let's make sure we are setting the smallest distance we found
        
        similar_phash_ids = list( similar_phash_ids_to_distances.keys() )
        
        with HydrusDB.TemporaryIntegerTable( self._c, similar_phash_ids, 'phash_id' ) as temp_table_name:
            
            # temp phashes to hash map
            similar_phash_ids_to_hash_ids = HydrusData.BuildKeyToListDict( self._c.execute( 'SELECT phash_id, hash_id FROM {} CROSS JOIN shape_perceptual_hash_map USING ( phash_id );'.format( temp_table_name ) ) )
            
        
        similar_hash_ids_to_distances = {}
        
        for ( phash_id, hash_ids ) in similar_phash_ids_to_hash_ids.items():
            
            distance = similar_phash_ids_to_distances[ phash_id ]
            
            for hash_id in hash_ids:
                
                # a file keeps the minimum distance over all of its matched phashes
                if hash_id not in similar_hash_ids_to_distances:
                    
                    similar_hash_ids_to_distances[ hash_id ] = distance
                    
                else:
                    
                    current_distance = similar_hash_ids_to_distances[ hash_id ]
                    
                    if distance < current_distance:
                        
                        similar_hash_ids_to_distances[ hash_id ] = distance
                        
                    
                
            
        
        similar_hash_ids_and_distances = list( similar_hash_ids_to_distances.items() )
        
    
    return similar_hash_ids_and_distances
def MaintainTree(self, maintenance_mode=HC.MAINTENANCE_FORCED, job_key=None, stop_time=None):
    """Rebalance every branch of the similar-files vptree that is queued in
    shape_maintenance_branch_regen.
    
    Repeatedly picks the queued node with the largest subtree population and
    regenerates that branch until the queue is empty, the job is cancelled,
    or the controller says to stop. If no job_key is supplied, one is created
    and published as a modal message after five seconds of work.
    """
    
    time_started = HydrusData.GetNow()
    pub_job_key = False
    job_key_pubbed = False
    
    if job_key is None:
        
        job_key = ClientThreading.JobKey(cancellable=True)
        
        # we own this job_key, so we are responsible for publishing it
        pub_job_key = True
        
    
    try:
        
        job_key.SetVariable('popup_title', 'similar files metadata maintenance')
        
        rebalance_phash_ids = self._STL(
            self._c.execute(
                'SELECT phash_id FROM shape_maintenance_branch_regen;'))
        
        num_to_do = len(rebalance_phash_ids)
        
        while len(rebalance_phash_ids) > 0:
            
            # only surface the popup if the work is actually taking a while
            if pub_job_key and not job_key_pubbed and HydrusData.TimeHasPassed(
                    time_started + 5):
                
                HG.client_controller.pub('modal_message', job_key)
                
                job_key_pubbed = True
                
            
            (i_paused, should_quit) = job_key.WaitIfNeeded()
            
            should_stop = HG.client_controller.ShouldStopThisWork(
                maintenance_mode, stop_time=stop_time)
            
            if should_quit or should_stop:
                
                # bail early--the finally block still tidies the job_key up
                return
                
            
            num_done = num_to_do - len(rebalance_phash_ids)
            
            text = 'rebalancing similar file metadata - ' + HydrusData.ConvertValueRangeToPrettyString(
                num_done, num_to_do)
            
            HG.client_controller.frame_splash_status.SetSubtext(text)
            job_key.SetVariable('popup_text_1', text)
            job_key.SetVariable('popup_gauge_1', (num_done, num_to_do))
            
            with HydrusDB.TemporaryIntegerTable(
                    self._c, rebalance_phash_ids, 'phash_id') as temp_table_name:
                
                # temp phashes to tree: pick the queued node with the biggest
                # subtree, so one regeneration clears as much of the queue as possible
                (biggest_phash_id, ) = self._c.execute(
                    'SELECT phash_id FROM {} CROSS JOIN shape_vptree USING ( phash_id ) ORDER BY inner_population + outer_population DESC;'
                    .format(temp_table_name)).fetchone()
                
            
            self._RegenerateBranch(job_key, biggest_phash_id)
            
            # the regen above will have removed entries from the queue; refresh
            rebalance_phash_ids = self._STL(
                self._c.execute(
                    'SELECT phash_id FROM shape_maintenance_branch_regen;'))
            
        
    finally:
        
        job_key.SetVariable('popup_text_1', 'done!')
        
        job_key.DeleteVariable('popup_gauge_1')
        job_key.DeleteVariable('popup_text_2')  # used in the regenbranch call
        
        job_key.Finish()
        
        job_key.Delete(5)
def _RegenerateBranch(self, job_key, phash_id): job_key.SetVariable('popup_text_2', 'reviewing existing branch') # grab everything in the branch (parent_id, ) = self._c.execute( 'SELECT parent_id FROM shape_vptree WHERE phash_id = ?;', (phash_id, )).fetchone() cte_table_name = 'branch ( branch_phash_id )' initial_select = 'SELECT ?' recursive_select = 'SELECT phash_id FROM shape_vptree, branch ON parent_id = branch_phash_id' with_clause = 'WITH RECURSIVE ' + cte_table_name + ' AS ( ' + initial_select + ' UNION ALL ' + recursive_select + ')' unbalanced_nodes = self._c.execute( with_clause + ' SELECT branch_phash_id, phash FROM branch, shape_perceptual_hashes ON phash_id = branch_phash_id;', (phash_id, )).fetchall() # removal of old branch, maintenance schedule, and orphan phashes job_key.SetVariable( 'popup_text_2', HydrusData.ToHumanInt(len(unbalanced_nodes)) + ' leaves found--now clearing out old branch') unbalanced_phash_ids = {p_id for (p_id, p_h) in unbalanced_nodes} self._c.executemany('DELETE FROM shape_vptree WHERE phash_id = ?;', ((p_id, ) for p_id in unbalanced_phash_ids)) self._c.executemany( 'DELETE FROM shape_maintenance_branch_regen WHERE phash_id = ?;', ((p_id, ) for p_id in unbalanced_phash_ids)) with HydrusDB.TemporaryIntegerTable( self._c, unbalanced_phash_ids, 'phash_id') as temp_phash_ids_table_name: useful_phash_ids = self._STS( self._c.execute( 'SELECT phash_id FROM {} CROSS JOIN shape_perceptual_hash_map USING ( phash_id );' .format(temp_phash_ids_table_name))) orphan_phash_ids = unbalanced_phash_ids.difference(useful_phash_ids) self._c.executemany( 'DELETE FROM shape_perceptual_hashes WHERE phash_id = ?;', ((p_id, ) for p_id in orphan_phash_ids)) useful_nodes = [ row for row in unbalanced_nodes if row[0] in useful_phash_ids ] useful_population = len(useful_nodes) # now create the new branch, starting by choosing a new root and updating the parent's left/right reference to that if useful_population > 0: (new_phash_id, new_phash) = 
self._PopBestRootNode( useful_nodes) #HydrusData.RandomPop( useful_nodes ) else: new_phash_id = None if parent_id is not None: (parent_inner_id, ) = self._c.execute( 'SELECT inner_id FROM shape_vptree WHERE phash_id = ?;', (parent_id, )).fetchone() if parent_inner_id == phash_id: query = 'UPDATE shape_vptree SET inner_id = ?, inner_population = ? WHERE phash_id = ?;' else: query = 'UPDATE shape_vptree SET outer_id = ?, outer_population = ? WHERE phash_id = ?;' self._c.execute(query, (new_phash_id, useful_population, parent_id)) if useful_population > 0: self._GenerateBranch(job_key, parent_id, new_phash_id, new_phash, useful_nodes)
def _PopulateHashIdsToHashesCache( self, hash_ids, exception_on_error = False ):
    """Ensure every id in hash_ids has an entry in self._hash_ids_to_hashes_cache.
    
    Looks hashes up from the master hashes table. A hash_id with no row there
    is an orphan id: either raise DataMissing (exception_on_error=True) or
    log loudly and substitute a recognisable random placeholder hash so the
    caller can continue.
    """
    
    # cap cache memory: once it grows past 100k entries, keep only the ids
    # being requested right now
    if len( self._hash_ids_to_hashes_cache ) > 100000:
        
        if not isinstance( hash_ids, set ):
            
            hash_ids = set( hash_ids )
            
        
        self._hash_ids_to_hashes_cache = { hash_id : hash for ( hash_id, hash ) in self._hash_ids_to_hashes_cache.items() if hash_id in hash_ids }
        
    
    uncached_hash_ids = { hash_id for hash_id in hash_ids if hash_id not in self._hash_ids_to_hashes_cache }
    
    if len( uncached_hash_ids ) > 0:
        
        # only show the scary user-facing message once per call
        pubbed_error = False
        
        if len( uncached_hash_ids ) == 1:
            
            ( uncached_hash_id, ) = uncached_hash_ids
            
            rows = self._c.execute( 'SELECT hash_id, hash FROM hashes WHERE hash_id = ?;', ( uncached_hash_id, ) ).fetchall()
            
        else:
            
            with HydrusDB.TemporaryIntegerTable( self._c, uncached_hash_ids, 'hash_id' ) as temp_table_name:
                
                # temp hash_ids to actual hashes
                rows = self._c.execute( 'SELECT hash_id, hash FROM {} CROSS JOIN hashes USING ( hash_id );'.format( temp_table_name ) ).fetchall()
                
            
        
        uncached_hash_ids_to_hashes = dict( rows )
        
        # fewer rows than requested ids means some hash_ids have no master
        # row--that is a serious orphan-id condition
        if len( uncached_hash_ids_to_hashes ) < len( uncached_hash_ids ):
            
            for hash_id in uncached_hash_ids:
                
                if hash_id not in uncached_hash_ids_to_hashes:
                    
                    if exception_on_error:
                        
                        raise HydrusExceptions.DataMissing( 'Did not find all entries for those hash ids!' )
                        
                    
                    HydrusData.DebugPrint( 'Database hash error: hash_id ' + str( hash_id ) + ' was missing!' )
                    HydrusData.PrintException( Exception( 'Missing file identifier stack trace.' ) )
                    
                    if not pubbed_error:
                        
                        HydrusData.ShowText( 'A file identifier was missing! This is a serious error that means your client database has an orphan file id! Think about contacting hydrus dev!' )
                        
                        pubbed_error = True
                        
                    
                    # substitute a recognisable placeholder: fixed 'aaaa...' prefix plus random tail
                    hash = bytes.fromhex( 'aaaaaaaaaaaaaaaa' ) + os.urandom( 16 )
                    
                    uncached_hash_ids_to_hashes[ hash_id ] = hash
                    
                
            
        
        self._hash_ids_to_hashes_cache.update( uncached_hash_ids_to_hashes )