def DiscardBlankPerceptualHashes(perceptual_hashes):
    
    # keep only the hashes that are more than 4 bits away from the blank (flat-colour) perceptual hash
    perceptual_hashes = { perceptual_hash for perceptual_hash in perceptual_hashes if HydrusData.Get64BitHammingDistance(perceptual_hash, CC.BLANK_PERCEPTUAL_HASH) > 4 }
    
    return perceptual_hashes

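# The helper below is a minimal, self-contained sketch of what a 64-bit Hamming distance
# such as HydrusData.Get64BitHammingDistance might compute, assuming each perceptual hash
# is represented as a 64-bit integer. The function name and the integer representation are
# assumptions for illustration, not the library's actual signature.
def hamming_distance_64_sketch(phash_a, phash_b):
    
    # XOR leaves a 1 bit wherever the two hashes differ; count those set bits
    return bin((phash_a ^ phash_b) & 0xFFFFFFFFFFFFFFFF).count('1')

# usage: a hash within Hamming distance 4 of the blank hash would be discarded above
# hamming_distance_64_sketch(0b1011100, 0b0000000) -> 4
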
def Search(self, hash_id, max_hamming_distance):
    
    if max_hamming_distance == 0:
        
        similar_hash_ids = self._STL(self._c.execute('SELECT hash_id FROM shape_perceptual_hash_map WHERE phash_id IN ( SELECT phash_id FROM shape_perceptual_hash_map WHERE hash_id = ? );', (hash_id,)))
        
        similar_hash_ids_and_distances = [(similar_hash_id, 0) for similar_hash_id in similar_hash_ids]
        
    else:
        
        search_radius = max_hamming_distance
        
        top_node_result = self._c.execute('SELECT phash_id FROM shape_vptree WHERE parent_id IS NULL;').fetchone()
        
        if top_node_result is None:
            
            return []
            
        (root_node_phash_id,) = top_node_result
        
        search = self._STL(self._c.execute('SELECT phash FROM shape_perceptual_hashes NATURAL JOIN shape_perceptual_hash_map WHERE hash_id = ?;', (hash_id,)))
        
        if len(search) == 0:
            
            return []
            
        similar_phash_ids_to_distances = {}
        
        num_cycles = 0
        total_nodes_searched = 0
        
        for search_phash in search:
            
            next_potentials = [root_node_phash_id]
            
            while len(next_potentials) > 0:
                
                current_potentials = next_potentials
                next_potentials = []
                
                num_cycles += 1
                total_nodes_searched += len(current_potentials)
                
                for group_of_current_potentials in HydrusData.SplitListIntoChunks(current_potentials, 10000):
                    
                    # this is split into fixed lists of results of subgroups because as an iterable it was causing crashes on linux!!
                    # after investigation, it seemed to be SQLite having a problem with part of Get64BitHammingDistance touching phashes it presumably was still hanging on to
                    # the crash was in sqlite code, again presumably on subsequent fetch
                    # adding a delay in seemed to fix it as well. guess it was some memory maintenance buffer/bytes thing
                    # anyway, we now just get the whole lot of results first and then work on the whole lot
                    '''
                    #old method
                    select_statement = 'SELECT phash_id, phash, radius, inner_id, outer_id FROM shape_perceptual_hashes NATURAL JOIN shape_vptree WHERE phash_id = ?;'
                    
                    results = list( self._ExecuteManySelectSingleParam( select_statement, group_of_current_potentials ) )
                    '''
                    
                    with HydrusDB.TemporaryIntegerTable(self._c, group_of_current_potentials, 'phash_id') as temp_table_name:
                        
                        # temp phash_ids to actual phashes and tree info
                        results = self._c.execute('SELECT phash_id, phash, radius, inner_id, outer_id FROM {} CROSS JOIN shape_perceptual_hashes USING ( phash_id ) CROSS JOIN shape_vptree USING ( phash_id );'.format(temp_table_name)).fetchall()
                        
                    for (node_phash_id, node_phash, node_radius, inner_phash_id, outer_phash_id) in results:
                        
                        # first check the node itself--is it similar?
                        
                        node_hamming_distance = HydrusData.Get64BitHammingDistance(search_phash, node_phash)
                        
                        if node_hamming_distance <= search_radius:
                            
                            if node_phash_id in similar_phash_ids_to_distances:
                                
                                current_distance = similar_phash_ids_to_distances[node_phash_id]
                                
                                similar_phash_ids_to_distances[node_phash_id] = min(node_hamming_distance, current_distance)
                                
                            else:
                                
                                similar_phash_ids_to_distances[node_phash_id] = node_hamming_distance
                                
                        # now how about its children?
                        
                        if node_radius is not None:
                            
                            # we have two spheres--node and search--their centers separated by node_hamming_distance
                            # we want to search inside/outside the node_sphere if the search_sphere intersects with those spaces
                            # there are four possibles:
                            # (----N----)-(--S--) intersects with outer only - distance between N and S > their radii
                            # (----N---(-)-S--) intersects with both
                            # (----N-(--S-)-) intersects with both
                            # (---(-N-S--)-) intersects with inner only - distance between N and S + radius_S does not exceed radius_N
                            
                            if inner_phash_id is not None:
                                
                                spheres_disjoint = node_hamming_distance > (node_radius + search_radius)
                                
                                if not spheres_disjoint: # i.e. they intersect at some point
                                    
                                    next_potentials.append(inner_phash_id)
                                    
                            if outer_phash_id is not None:
                                
                                search_sphere_subset_of_node_sphere = (node_hamming_distance + search_radius) <= node_radius
                                
                                if not search_sphere_subset_of_node_sphere: # i.e. search sphere intersects with non-node sphere space at some point
                                    
                                    next_potentials.append(outer_phash_id)
                                    
        if HG.db_report_mode:
            
            HydrusData.ShowText('Similar file search touched {} nodes over {} cycles.'.format(HydrusData.ToHumanInt(total_nodes_searched), HydrusData.ToHumanInt(num_cycles)))
            
        # so, now we have phash_ids and distances. let's map that to actual files.
        # files can have multiple phashes, and phashes can refer to multiple files, so let's make sure we are setting the smallest distance we found
        
        similar_phash_ids = list(similar_phash_ids_to_distances.keys())
        
        with HydrusDB.TemporaryIntegerTable(self._c, similar_phash_ids, 'phash_id') as temp_table_name:
            
            # temp phashes to hash map
            similar_phash_ids_to_hash_ids = HydrusData.BuildKeyToListDict(self._c.execute('SELECT phash_id, hash_id FROM {} CROSS JOIN shape_perceptual_hash_map USING ( phash_id );'.format(temp_table_name)))
            
        similar_hash_ids_to_distances = {}
        
        for (phash_id, hash_ids) in similar_phash_ids_to_hash_ids.items():
            
            distance = similar_phash_ids_to_distances[phash_id]
            
            for hash_id in hash_ids:
                
                if hash_id not in similar_hash_ids_to_distances:
                    
                    similar_hash_ids_to_distances[hash_id] = distance
                    
                else:
                    
                    current_distance = similar_hash_ids_to_distances[hash_id]
                    
                    if distance < current_distance:
                        
                        similar_hash_ids_to_distances[hash_id] = distance
                        
        similar_hash_ids_and_distances = list(similar_hash_ids_to_distances.items())
        
    return similar_hash_ids_and_distances

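# A small standalone sketch of the branch-pruning rule used in the search above: given the
# distance from the query hash to a node, the node's radius and the search radius, decide
# which child branches of a vp-tree node can still contain matches. The function and branch
# names here are illustrative only, not part of the actual class.
def vptree_branches_to_search_sketch(distance_to_node, node_radius, search_radius):
    
    branches = []
    
    # the inner branch can be skipped only if the search sphere and the node sphere are disjoint
    if distance_to_node <= node_radius + search_radius:
        
        branches.append('inner')
        
    # the outer branch can be skipped only if the search sphere lies entirely inside the node sphere
    if distance_to_node + search_radius > node_radius:
        
        branches.append('outer')
        
    return branches

# usage: a query far outside the node sphere only needs the outer branch
# vptree_branches_to_search_sketch(distance_to_node = 20, node_radius = 5, search_radius = 4) -> ['outer']
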
def _PopBestRootNode(self, node_rows):
    
    if len(node_rows) == 1:
        
        root_row = node_rows.pop()
        
        return root_row
        
    MAX_VIEWPOINTS = 256
    MAX_SAMPLE = 64
    
    if len(node_rows) > MAX_VIEWPOINTS:
        
        viewpoints = random.sample(node_rows, MAX_VIEWPOINTS)
        
    else:
        
        viewpoints = node_rows
        
    if len(node_rows) > MAX_SAMPLE:
        
        sample = random.sample(node_rows, MAX_SAMPLE)
        
    else:
        
        sample = node_rows
        
    final_scores = []
    
    for (v_id, v_phash) in viewpoints:
        
        views = sorted((HydrusData.Get64BitHammingDistance(v_phash, s_phash) for (s_id, s_phash) in sample if v_id != s_id))
        
        # let's figure out the ratio of left_children to right_children, preferring 1:1, and convert it to a discrete integer score
        
        median_index = len(views) // 2
        
        radius = views[median_index]
        
        num_left = len([1 for view in views if view < radius])
        num_radius = len([1 for view in views if view == radius])
        num_right = len([1 for view in views if view > radius])
        
        if num_left <= num_right:
            
            num_left += num_radius
            
        else:
            
            num_right += num_radius
            
        smaller = min(num_left, num_right)
        larger = max(num_left, num_right)
        
        ratio = smaller / larger
        
        ratio_score = int(ratio * MAX_SAMPLE / 2)
        
        # now let's calc the standard deviation--larger sd tends to mean less sphere overlap when searching
        
        mean_view = sum(views) / len(views)
        squared_diffs = [(view - mean_view) ** 2 for view in views]
        sd = (sum(squared_diffs) / len(squared_diffs)) ** 0.5
        
        final_scores.append((ratio_score, sd, v_id))
        
    final_scores.sort()
    
    # we now have a list like [ ( 11, 4.0, [id] ), ( 15, 3.7, [id] ), ( 15, 4.3, [id] ) ]
    
    (ratio_gumpf, sd_gumpf, root_id) = final_scores.pop()
    
    for (i, (v_id, v_phash)) in enumerate(node_rows):
        
        if v_id == root_id:
            
            root_row = node_rows.pop(i)
            
            return root_row

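# A hedged sketch of the scoring idea above, operating on a plain list of distances ('views')
# from one candidate vantage point to the sampled nodes. It returns the same
# (balance score, standard deviation) pair the method sorts on; the function name and
# standalone form are illustrative assumptions, not part of the actual class.
def score_vantage_point_sketch(views, max_sample = 64):
    
    views = sorted(views)
    
    radius = views[len(views) // 2]
    
    num_left = sum(1 for view in views if view < radius)
    num_radius = sum(1 for view in views if view == radius)
    num_right = sum(1 for view in views if view > radius)
    
    # distances landing exactly on the radius count for whichever side is currently smaller
    if num_left <= num_right:
        
        num_left += num_radius
        
    else:
        
        num_right += num_radius
        
    ratio_score = int(min(num_left, num_right) / max(num_left, num_right) * max_sample / 2)
    
    mean_view = sum(views) / len(views)
    sd = (sum((view - mean_view) ** 2 for view in views) / len(views)) ** 0.5
    
    return (ratio_score, sd)

# usage: a perfectly balanced split earns the maximum score of max_sample / 2
# score_vantage_point_sketch([1, 2, 3, 4], max_sample = 64) -> (32, 1.118...)
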
def _AddLeaf(self, phash_id, phash):
    
    result = self._c.execute('SELECT phash_id FROM shape_vptree WHERE parent_id IS NULL;').fetchone()
    
    if result is None:
        
        parent_id = None
        
    else:
        
        (root_node_phash_id,) = result
        
        ancestors_we_are_inside = []
        ancestors_we_are_outside = []
        
        an_ancestor_is_unbalanced = False
        
        next_ancestor_id = root_node_phash_id
        
        while next_ancestor_id is not None:
            
            ancestor_id = next_ancestor_id
            
            (ancestor_phash, ancestor_radius, ancestor_inner_id, ancestor_inner_population, ancestor_outer_id, ancestor_outer_population) = self._c.execute('SELECT phash, radius, inner_id, inner_population, outer_id, outer_population FROM shape_perceptual_hashes NATURAL JOIN shape_vptree WHERE phash_id = ?;', (ancestor_id,)).fetchone()
            
            distance_to_ancestor = HydrusData.Get64BitHammingDistance(phash, ancestor_phash)
            
            if ancestor_radius is None or distance_to_ancestor <= ancestor_radius:
                
                ancestors_we_are_inside.append(ancestor_id)
                ancestor_inner_population += 1
                next_ancestor_id = ancestor_inner_id
                
                if ancestor_inner_id is None:
                    
                    self._c.execute('UPDATE shape_vptree SET inner_id = ?, radius = ? WHERE phash_id = ?;', (phash_id, distance_to_ancestor, ancestor_id))
                    
                    parent_id = ancestor_id
                    
            else:
                
                ancestors_we_are_outside.append(ancestor_id)
                ancestor_outer_population += 1
                next_ancestor_id = ancestor_outer_id
                
                if ancestor_outer_id is None:
                    
                    self._c.execute('UPDATE shape_vptree SET outer_id = ? WHERE phash_id = ?;', (phash_id, ancestor_id))
                    
                    parent_id = ancestor_id
                    
            if not an_ancestor_is_unbalanced and ancestor_inner_population + ancestor_outer_population > 16:
                
                larger = max(ancestor_inner_population, ancestor_outer_population)
                smaller = min(ancestor_inner_population, ancestor_outer_population)
                
                if smaller / larger < 0.5:
                    
                    self._c.execute('INSERT OR IGNORE INTO shape_maintenance_branch_regen ( phash_id ) VALUES ( ? );', (ancestor_id,))
                    
                    # we only do this for the eldest ancestor, as the eventual rebalancing will affect all children
                    
                    an_ancestor_is_unbalanced = True
                    
        self._c.executemany('UPDATE shape_vptree SET inner_population = inner_population + 1 WHERE phash_id = ?;', ((ancestor_id,) for ancestor_id in ancestors_we_are_inside))
        self._c.executemany('UPDATE shape_vptree SET outer_population = outer_population + 1 WHERE phash_id = ?;', ((ancestor_id,) for ancestor_id in ancestors_we_are_outside))
        
    radius = None
    
    inner_id = None
    inner_population = 0
    
    outer_id = None
    outer_population = 0
    
    self._c.execute('INSERT OR REPLACE INTO shape_vptree ( phash_id, parent_id, radius, inner_id, inner_population, outer_id, outer_population ) VALUES ( ?, ?, ?, ?, ?, ?, ? );', (phash_id, parent_id, radius, inner_id, inner_population, outer_id, outer_population))

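# A minimal sketch of the rebalance trigger used above: a branch is queued for regeneration
# once it holds more than 16 descendants and one side has less than half the population of
# the other. The standalone function name is illustrative only.
def branch_is_unbalanced_sketch(inner_population, outer_population):
    
    if inner_population + outer_population <= 16:
        
        return False
        
    larger = max(inner_population, outer_population)
    smaller = min(inner_population, outer_population)
    
    return smaller / larger < 0.5

# usage:
# branch_is_unbalanced_sketch(20, 5) -> True
# branch_is_unbalanced_sketch(10, 9) -> False
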
def _GenerateBranch(self, job_key, parent_id, phash_id, phash, children):
    
    process_queue = collections.deque()
    
    process_queue.append((parent_id, phash_id, phash, children))
    
    insert_rows = []
    
    num_done = 0
    num_to_do = len(children) + 1
    
    while len(process_queue) > 0:
        
        job_key.SetVariable('popup_text_2', 'generating new branch -- ' + HydrusData.ConvertValueRangeToPrettyString(num_done, num_to_do))
        
        (parent_id, phash_id, phash, children) = process_queue.popleft()
        
        if len(children) == 0:
            
            inner_id = None
            inner_population = 0
            
            outer_id = None
            outer_population = 0
            
            radius = None
            
        else:
            
            children = sorted(((HydrusData.Get64BitHammingDistance(phash, child_phash), child_id, child_phash) for (child_id, child_phash) in children))
            
            median_index = len(children) // 2
            
            median_radius = children[median_index][0]
            
            inner_children = [(child_id, child_phash) for (distance, child_id, child_phash) in children if distance < median_radius]
            radius_children = [(child_id, child_phash) for (distance, child_id, child_phash) in children if distance == median_radius]
            outer_children = [(child_id, child_phash) for (distance, child_id, child_phash) in children if distance > median_radius]
            
            if len(inner_children) <= len(outer_children):
                
                radius = median_radius
                
                inner_children.extend(radius_children)
                
            else:
                
                radius = median_radius - 1
                
                outer_children.extend(radius_children)
                
            inner_population = len(inner_children)
            outer_population = len(outer_children)
            
            (inner_id, inner_phash) = self._PopBestRootNode(inner_children) #HydrusData.MedianPop( inner_children )
            
            if len(outer_children) == 0:
                
                outer_id = None
                
            else:
                
                (outer_id, outer_phash) = self._PopBestRootNode(outer_children) #HydrusData.MedianPop( outer_children )
                
        insert_rows.append((phash_id, parent_id, radius, inner_id, inner_population, outer_id, outer_population))
        
        if inner_id is not None:
            
            process_queue.append((phash_id, inner_id, inner_phash, inner_children))
            
        if outer_id is not None:
            
            process_queue.append((phash_id, outer_id, outer_phash, outer_children))
            
        num_done += 1
        
    job_key.SetVariable('popup_text_2', 'branch constructed, now committing')
    
    self._c.executemany('INSERT OR REPLACE INTO shape_vptree ( phash_id, parent_id, radius, inner_id, inner_population, outer_id, outer_population ) VALUES ( ?, ?, ?, ?, ?, ?, ? );', insert_rows)

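# A self-contained sketch of the median-radius split performed above, working on plain
# (distance, child) pairs instead of db rows. It shows how the radius is chosen so that the
# children sitting exactly on the median distance go to whichever side is currently smaller.
# The function name and return shape are assumptions for illustration.
def split_children_by_median_sketch(distances_and_children):
    
    distances_and_children = sorted(distances_and_children)
    
    median_radius = distances_and_children[len(distances_and_children) // 2][0]
    
    inner = [child for (distance, child) in distances_and_children if distance < median_radius]
    on_radius = [child for (distance, child) in distances_and_children if distance == median_radius]
    outer = [child for (distance, child) in distances_and_children if distance > median_radius]
    
    if len(inner) <= len(outer):
        
        radius = median_radius
        
        inner.extend(on_radius)
        
    else:
        
        # shrink the radius by one so the on-radius children fall outside it
        radius = median_radius - 1
        
        outer.extend(on_radius)
        
    return (radius, inner, outer)

# usage:
# split_children_by_median_sketch([(2, 'a'), (5, 'b'), (5, 'c'), (9, 'd')]) -> (5, ['a', 'b', 'c'], ['d'])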