예제 #1
0
def DiscardBlankPerceptualHashes(phashes):
    """Return the subset of phashes that are not near-blank.

    A phash within hamming distance 4 of CC.BLANK_PHASH is treated as
    blank (e.g. a flat-colour image) and dropped; everything else is kept.
    """

    non_blank_phashes = set()

    for phash in phashes:

        if HydrusData.Get64BitHammingDistance(phash, CC.BLANK_PHASH) > 4:

            non_blank_phashes.add(phash)

    return non_blank_phashes
예제 #2
0
def DiscardBlankPerceptualHashes(perceptual_hashes):
    """Filter out perceptual hashes that are close to the blank hash.

    Any hash at most hamming distance 4 from CC.BLANK_PERCEPTUAL_HASH is
    considered blank and discarded; the survivors are returned as a set.
    """

    # hoist the constant lookup out of the comprehension
    blank_hash = CC.BLANK_PERCEPTUAL_HASH

    surviving_hashes = {
        candidate
        for candidate in perceptual_hashes
        if HydrusData.Get64BitHammingDistance(candidate, blank_hash) > 4
    }

    return surviving_hashes
    def Search(self, hash_id, max_hamming_distance):
        """Find files perceptually similar to the given file.

        Walks the shape_vptree VP-tree for every phash of hash_id and
        collects all phashes within max_hamming_distance, then maps those
        phashes back to files.

        :param hash_id: db id of the file to search around
        :param max_hamming_distance: inclusive similarity radius; 0 means
            exact phash match and takes a fast path with no tree walk
        :returns: list of ( similar_hash_id, distance ) pairs; may include
            hash_id itself
        """

        if max_hamming_distance == 0:

            # fast path: exact duplicates share a phash_id, so a simple
            # self-join on the phash map suffices -- no tree traversal
            similar_hash_ids = self._STL(
                self._c.execute(
                    'SELECT hash_id FROM shape_perceptual_hash_map WHERE phash_id IN ( SELECT phash_id FROM shape_perceptual_hash_map WHERE hash_id = ? );',
                    (hash_id, )))

            similar_hash_ids_and_distances = [
                (similar_hash_id, 0) for similar_hash_id in similar_hash_ids
            ]

        else:

            search_radius = max_hamming_distance

            # the tree root is the only node with no parent
            top_node_result = self._c.execute(
                'SELECT phash_id FROM shape_vptree WHERE parent_id IS NULL;'
            ).fetchone()

            if top_node_result is None:

                # empty tree, nothing to search
                return []

            (root_node_phash_id, ) = top_node_result

            # all phashes attached to the query file (a file can have several)
            search = self._STL(
                self._c.execute(
                    'SELECT phash FROM shape_perceptual_hashes NATURAL JOIN shape_perceptual_hash_map WHERE hash_id = ?;',
                    (hash_id, )))

            if len(search) == 0:

                # the file has no phashes recorded, so nothing can match
                return []

            # phash_id -> smallest hamming distance found so far
            similar_phash_ids_to_distances = {}

            num_cycles = 0
            total_nodes_searched = 0

            for search_phash in search:

                # breadth-first walk of the VP-tree, level by level
                next_potentials = [root_node_phash_id]

                while len(next_potentials) > 0:

                    current_potentials = next_potentials
                    next_potentials = []

                    num_cycles += 1
                    total_nodes_searched += len(current_potentials)

                    for group_of_current_potentials in HydrusData.SplitListIntoChunks(
                            current_potentials, 10000):

                        # this is split into fixed lists of results of subgroups because as an iterable it was causing crashes on linux!!
                        # after investigation, it seemed to be SQLite having a problem with part of Get64BitHammingDistance touching phashes it presumably was still hanging on to
                        # the crash was in sqlite code, again presumably on subsequent fetch
                        # adding a delay in seemed to fix it as well. guess it was some memory maintenance buffer/bytes thing
                        # anyway, we now just get the whole lot of results first and then work on the whole lot
                        '''
                        #old method
                        select_statement = 'SELECT phash_id, phash, radius, inner_id, outer_id FROM shape_perceptual_hashes NATURAL JOIN shape_vptree WHERE phash_id = ?;'
                        
                        results = list( self._ExecuteManySelectSingleParam( select_statement, group_of_current_potentials ) )
                        '''

                        with HydrusDB.TemporaryIntegerTable(
                                self._c, group_of_current_potentials,
                                'phash_id') as temp_table_name:

                            # temp phash_ids to actual phashes and tree info
                            results = self._c.execute(
                                'SELECT phash_id, phash, radius, inner_id, outer_id FROM {} CROSS JOIN shape_perceptual_hashes USING ( phash_id ) CROSS JOIN shape_vptree USING ( phash_id );'
                                .format(temp_table_name)).fetchall()

                        for (node_phash_id, node_phash, node_radius,
                             inner_phash_id, outer_phash_id) in results:

                            # first check the node itself--is it similar?

                            node_hamming_distance = HydrusData.Get64BitHammingDistance(
                                search_phash, node_phash)

                            if node_hamming_distance <= search_radius:

                                if node_phash_id in similar_phash_ids_to_distances:

                                    # already hit via another search_phash;
                                    # keep the smaller distance
                                    current_distance = similar_phash_ids_to_distances[
                                        node_phash_id]

                                    similar_phash_ids_to_distances[
                                        node_phash_id] = min(
                                            node_hamming_distance,
                                            current_distance)

                                else:

                                    similar_phash_ids_to_distances[
                                        node_phash_id] = node_hamming_distance

                            # now how about its children?

                            if node_radius is not None:

                                # we have two spheres--node and search--their centers separated by node_hamming_distance
                                # we want to search inside/outside the node_sphere if the search_sphere intersects with those spaces
                                # there are four possibles:
                                # (----N----)-(--S--)    intersects with outer only - distance between N and S > their radii
                                # (----N---(-)-S--)      intersects with both
                                # (----N-(--S-)-)        intersects with both
                                # (---(-N-S--)-)         intersects with inner only - distance between N and S + radius_S does not exceed radius_N

                                if inner_phash_id is not None:

                                    spheres_disjoint = node_hamming_distance > (
                                        node_radius + search_radius)

                                    if not spheres_disjoint:  # i.e. they intersect at some point

                                        next_potentials.append(inner_phash_id)

                                if outer_phash_id is not None:

                                    search_sphere_subset_of_node_sphere = (
                                        node_hamming_distance +
                                        search_radius) <= node_radius

                                    if not search_sphere_subset_of_node_sphere:  # i.e. search sphere intersects with non-node sphere space at some point

                                        next_potentials.append(outer_phash_id)

            if HG.db_report_mode:

                HydrusData.ShowText(
                    'Similar file search touched {} nodes over {} cycles.'.
                    format(HydrusData.ToHumanInt(total_nodes_searched),
                           HydrusData.ToHumanInt(num_cycles)))

            # so, now we have phash_ids and distances. let's map that to actual files.
            # files can have multiple phashes, and phashes can refer to multiple files, so let's make sure we are setting the smallest distance we found

            similar_phash_ids = list(similar_phash_ids_to_distances.keys())

            with HydrusDB.TemporaryIntegerTable(self._c, similar_phash_ids,
                                                'phash_id') as temp_table_name:

                # temp phashes to hash map
                similar_phash_ids_to_hash_ids = HydrusData.BuildKeyToListDict(
                    self._c.execute(
                        'SELECT phash_id, hash_id FROM {} CROSS JOIN shape_perceptual_hash_map USING ( phash_id );'
                        .format(temp_table_name)))

            similar_hash_ids_to_distances = {}

            for (phash_id, hash_ids) in similar_phash_ids_to_hash_ids.items():

                distance = similar_phash_ids_to_distances[phash_id]

                for hash_id in hash_ids:

                    if hash_id not in similar_hash_ids_to_distances:

                        similar_hash_ids_to_distances[hash_id] = distance

                    else:

                        # a file reached through several phashes keeps its
                        # best (smallest) distance
                        current_distance = similar_hash_ids_to_distances[
                            hash_id]

                        if distance < current_distance:

                            similar_hash_ids_to_distances[hash_id] = distance

            similar_hash_ids_and_distances = list(
                similar_hash_ids_to_distances.items())

        return similar_hash_ids_and_distances
    def _PopBestRootNode(self, node_rows):
        """Pick a good vantage-point root from node_rows, remove it, and return it.

        Scores a random selection of candidate viewpoints against a random
        sample of the other nodes, preferring a candidate whose median-radius
        split is close to 1:1 (balanced tree) and, on ties, whose distance
        distribution has the larger standard deviation (less sphere overlap
        during search).

        :param node_rows: mutable list of ( phash_id, phash ) rows; the
            chosen row is popped from it in place
        :returns: the chosen ( phash_id, phash ) row
        """

        if len(node_rows) == 1:

            # nothing to choose between
            return node_rows.pop()

        MAX_VIEWPOINTS = 256
        MAX_SAMPLE = 64

        # cap the candidate pool and the scoring sample to keep this cheap
        if len(node_rows) > MAX_VIEWPOINTS:

            viewpoints = random.sample(node_rows, MAX_VIEWPOINTS)

        else:

            viewpoints = node_rows

        if len(node_rows) > MAX_SAMPLE:

            sample = random.sample(node_rows, MAX_SAMPLE)

        else:

            sample = node_rows

        final_scores = []

        for (candidate_id, candidate_phash) in viewpoints:

            views = sorted(
                HydrusData.Get64BitHammingDistance(candidate_phash, other_phash)
                for (other_id, other_phash) in sample
                if other_id != candidate_id)

            # ratio of left to right children under the median radius,
            # preferring 1:1, expressed as a discrete integer score

            radius = views[len(views) // 2]

            num_left = sum(1 for view in views if view < radius)
            num_radius = sum(1 for view in views if view == radius)
            num_right = sum(1 for view in views if view > radius)

            # ties on the radius go to whichever side is currently smaller
            if num_left <= num_right:

                num_left += num_radius

            else:

                num_right += num_radius

            ratio = min(num_left, num_right) / max(num_left, num_right)

            ratio_score = int(ratio * MAX_SAMPLE / 2)

            # standard deviation as tiebreak--larger sd tends to mean less
            # sphere overlap when searching

            mean_view = sum(views) / len(views)
            variance = sum((view - mean_view)**2 for view in views) / len(views)
            sd = variance**0.5

            final_scores.append((ratio_score, sd, candidate_id))

        # tuple comparison: best ratio score first, then best sd
        (best_ratio_score, best_sd, root_id) = max(final_scores)

        for (index, (candidate_id, candidate_phash)) in enumerate(node_rows):

            if candidate_id == root_id:

                return node_rows.pop(index)
    def _AddLeaf(self, phash_id, phash):
        """Insert a phash as a new leaf into the shape_vptree VP-tree.

        Descends from the root, going inside a node's radius sphere when the
        new phash is within it and outside otherwise, until an empty slot is
        found. Updates ancestor populations along the path and flags the
        eldest unbalanced ancestor for branch regeneration.

        :param phash_id: db id of the phash being inserted
        :param phash: the perceptual hash value itself
        """

        result = self._c.execute(
            'SELECT phash_id FROM shape_vptree WHERE parent_id IS NULL;'
        ).fetchone()

        if result is None:

            # tree is empty: the new leaf becomes the root
            parent_id = None

        else:

            (root_node_phash_id, ) = result

            ancestors_we_are_inside = []
            ancestors_we_are_outside = []

            an_ancestor_is_unbalanced = False

            next_ancestor_id = root_node_phash_id

            # walk down the tree until we fall off an empty child slot
            while next_ancestor_id is not None:

                ancestor_id = next_ancestor_id

                (
                    ancestor_phash, ancestor_radius, ancestor_inner_id,
                    ancestor_inner_population, ancestor_outer_id,
                    ancestor_outer_population
                ) = self._c.execute(
                    'SELECT phash, radius, inner_id, inner_population, outer_id, outer_population FROM shape_perceptual_hashes NATURAL JOIN shape_vptree WHERE phash_id = ?;',
                    (ancestor_id, )).fetchone()

                distance_to_ancestor = HydrusData.Get64BitHammingDistance(
                    phash, ancestor_phash)

                # a None radius means the ancestor has no inner child yet, so
                # we always go inside and the radius becomes our distance
                if ancestor_radius is None or distance_to_ancestor <= ancestor_radius:

                    ancestors_we_are_inside.append(ancestor_id)
                    ancestor_inner_population += 1
                    next_ancestor_id = ancestor_inner_id

                    if ancestor_inner_id is None:

                        # empty inner slot: attach here and fix the radius
                        self._c.execute(
                            'UPDATE shape_vptree SET inner_id = ?, radius = ? WHERE phash_id = ?;',
                            (phash_id, distance_to_ancestor, ancestor_id))

                        parent_id = ancestor_id

                else:

                    ancestors_we_are_outside.append(ancestor_id)
                    ancestor_outer_population += 1
                    next_ancestor_id = ancestor_outer_id

                    if ancestor_outer_id is None:

                        # empty outer slot: attach here
                        self._c.execute(
                            'UPDATE shape_vptree SET outer_id = ? WHERE phash_id = ?;',
                            (phash_id, ancestor_id))

                        parent_id = ancestor_id

                # populations here include the +1 for the leaf we are adding
                if not an_ancestor_is_unbalanced and ancestor_inner_population + ancestor_outer_population > 16:

                    larger = max(ancestor_inner_population,
                                 ancestor_outer_population)
                    smaller = min(ancestor_inner_population,
                                  ancestor_outer_population)

                    if smaller / larger < 0.5:

                        self._c.execute(
                            'INSERT OR IGNORE INTO shape_maintenance_branch_regen ( phash_id ) VALUES ( ? );',
                            (ancestor_id, ))

                        # we only do this for the eldest ancestor, as the eventual rebalancing will affect all children

                        an_ancestor_is_unbalanced = True

            # persist the population bumps for the whole descent path
            self._c.executemany(
                'UPDATE shape_vptree SET inner_population = inner_population + 1 WHERE phash_id = ?;',
                ((ancestor_id, ) for ancestor_id in ancestors_we_are_inside))
            self._c.executemany(
                'UPDATE shape_vptree SET outer_population = outer_population + 1 WHERE phash_id = ?;',
                ((ancestor_id, ) for ancestor_id in ancestors_we_are_outside))

        # the new node starts as a childless leaf
        radius = None
        inner_id = None
        inner_population = 0
        outer_id = None
        outer_population = 0

        self._c.execute(
            'INSERT OR REPLACE INTO shape_vptree ( phash_id, parent_id, radius, inner_id, inner_population, outer_id, outer_population ) VALUES ( ?, ?, ?, ?, ?, ?, ? );',
            (phash_id, parent_id, radius, inner_id, inner_population, outer_id,
             outer_population))
    def _GenerateBranch(self, job_key, parent_id, phash_id, phash, children):
        """Build a whole VP-tree branch from scratch and commit it in one batch.

        Iteratively partitions each node's children into inner/outer sets by
        the median hamming distance, picks a good root for each set via
        _PopBestRootNode, and accumulates the rows before a single
        executemany insert.

        :param job_key: progress reporter; popup_text_2 is updated as we go
        :param parent_id: phash_id of the node this branch hangs off, or None
        :param phash_id: phash_id of the branch's root node
        :param phash: perceptual hash of the branch's root node
        :param children: list of ( phash_id, phash ) rows to place under it
        """

        # iterative breadth-first construction instead of recursion
        process_queue = collections.deque()

        process_queue.append((parent_id, phash_id, phash, children))

        insert_rows = []

        num_done = 0
        num_to_do = len(children) + 1

        while len(process_queue) > 0:

            job_key.SetVariable(
                'popup_text_2', 'generating new branch -- ' +
                HydrusData.ConvertValueRangeToPrettyString(
                    num_done, num_to_do))

            (parent_id, phash_id, phash, children) = process_queue.popleft()

            if len(children) == 0:

                # leaf node
                inner_id = None
                inner_population = 0

                outer_id = None
                outer_population = 0

                radius = None

            else:

                # sort children by distance from this node so the median
                # gives us the partition radius
                children = sorted(
                    ((HydrusData.Get64BitHammingDistance(phash, child_phash),
                      child_id, child_phash)
                     for (child_id, child_phash) in children))

                median_index = len(children) // 2

                median_radius = children[median_index][0]

                inner_children = [(child_id, child_phash)
                                  for (distance, child_id,
                                       child_phash) in children
                                  if distance < median_radius]
                radius_children = [(child_id, child_phash)
                                   for (distance, child_id,
                                        child_phash) in children
                                   if distance == median_radius]
                outer_children = [(child_id, child_phash)
                                  for (distance, child_id,
                                       child_phash) in children
                                  if distance > median_radius]

                # children exactly on the median go to whichever side is
                # smaller, and the stored radius is adjusted so the inner
                # test ( distance <= radius ) stays consistent
                if len(inner_children) <= len(outer_children):

                    radius = median_radius

                    inner_children.extend(radius_children)

                else:

                    radius = median_radius - 1

                    outer_children.extend(radius_children)

                inner_population = len(inner_children)
                outer_population = len(outer_children)

                (inner_id, inner_phash) = self._PopBestRootNode(
                    inner_children)  #HydrusData.MedianPop( inner_children )

                if len(outer_children) == 0:

                    outer_id = None

                else:

                    (outer_id, outer_phash) = self._PopBestRootNode(
                        outer_children
                    )  #HydrusData.MedianPop( outer_children )

            insert_rows.append((phash_id, parent_id, radius, inner_id,
                                inner_population, outer_id, outer_population))

            # enqueue the subtrees rooted at the chosen inner/outer nodes
            if inner_id is not None:

                process_queue.append(
                    (phash_id, inner_id, inner_phash, inner_children))

            if outer_id is not None:

                process_queue.append(
                    (phash_id, outer_id, outer_phash, outer_children))

            num_done += 1

        job_key.SetVariable('popup_text_2',
                            'branch constructed, now committing')

        # one batched insert keeps the db work after construction minimal
        self._c.executemany(
            'INSERT OR REPLACE INTO shape_vptree ( phash_id, parent_id, radius, inner_id, inner_population, outer_id, outer_population ) VALUES ( ?, ?, ?, ?, ?, ?, ? );',
            insert_rows)