Exemplo n.º 1
0
    def _GenerateApplicationDicts(self):
        """Rebuild the in-memory sibling-application lookup dicts from the db.

        Populates two attributes:
          _service_ids_to_applicable_service_ids: master service id -> list of
            application service ids, ordered by their stored service_index.
          _service_ids_to_interested_service_ids: the inverse map, application
            service id -> set of master service ids that apply it.
        """

        rows = self._Execute(
            'SELECT master_service_id, service_index, application_service_id FROM tag_sibling_application;'
        )

        # group ( index, application_service_id ) pairs under each master service
        grouped = HydrusData.BuildKeyToListDict(
            (master_service_id, (index, application_service_id))
            for (master_service_id, index, application_service_id) in rows)

        self._service_ids_to_applicable_service_ids = collections.defaultdict(
            list)

        for (master_service_id, indexed_application_ids) in grouped.items():

            # sorting on the ( index, id ) tuples restores the stored order
            self._service_ids_to_applicable_service_ids[master_service_id] = [
                application_service_id
                for (index,
                     application_service_id) in sorted(indexed_application_ids)
            ]

        self._service_ids_to_interested_service_ids = collections.defaultdict(
            set)

        # invert the mapping: each application service is interested in its masters
        for (master_service_id, application_service_ids
             ) in self._service_ids_to_applicable_service_ids.items():

            for application_service_id in application_service_ids:

                self._service_ids_to_interested_service_ids[
                    application_service_id].add(master_service_id)
Exemplo n.º 2
0
    def GetTagSiblingsIdsChains(self, service_id, tag_ids):
        """Walk the sibling graph outward from tag_ids for this service.

        Follows both actual and petitioned sibling pairs in either direction
        until the connected chain is exhausted, then returns a defaultdict
        mapping status -> sorted list of ( bad_tag_id, good_tag_id ) pairs.
        """

        seen_tag_ids = set()
        frontier = set(tag_ids)
        pair_rows = set()

        while frontier:

            with self._MakeTemporaryIntegerTable(
                    frontier, 'tag_id') as temp_next_tag_ids_table_name:

                seen_tag_ids.update(frontier)

                frontier = set()

                # keep these separate--older sqlite can't do cross join to an OR ON

                # temp tag_ids to siblings
                query = ' UNION '.join(
                    template.format(temp_next_tag_ids_table_name)
                    for template in (
                        'SELECT status, bad_tag_id, good_tag_id FROM {} CROSS JOIN tag_siblings ON ( bad_tag_id = tag_id ) WHERE service_id = ?',
                        'SELECT status, bad_tag_id, good_tag_id FROM {} CROSS JOIN tag_siblings ON ( good_tag_id = tag_id ) WHERE service_id = ?',
                        'SELECT status, bad_tag_id, good_tag_id FROM {} CROSS JOIN tag_sibling_petitions ON ( bad_tag_id = tag_id ) WHERE service_id = ?',
                        'SELECT status, bad_tag_id, good_tag_id FROM {} CROSS JOIN tag_sibling_petitions ON ( good_tag_id = tag_id ) WHERE service_id = ?'
                    ))

                for row in self._Execute(
                        query,
                    (service_id, service_id, service_id, service_id)):

                    pair_rows.add(row)

                    (status, bad_tag_id, good_tag_id) = row

                    # any tag we have not yet expanded seeds the next round
                    for tag_id in (bad_tag_id, good_tag_id):

                        if tag_id not in seen_tag_ids:

                            frontier.add(tag_id)

        grouped = HydrusData.BuildKeyToListDict(
            (status, (bad_tag_id, good_tag_id))
            for (status, bad_tag_id, good_tag_id) in pair_rows)

        statuses_to_pair_ids = collections.defaultdict(list)

        for (status, pair_ids) in grouped.items():

            statuses_to_pair_ids[status] = sorted(pair_ids)

        return statuses_to_pair_ids
Exemplo n.º 3
0
    def MergeTagsManagers(tags_managers):
        """Combine several TagsManagers into one read-only TagsManager.

        Only current and pending statuses are kept. Display-level tags are
        used as a shortcut since this result is read only and storage
        exactness is not important here.
        """

        # we cheat here and just get display tags, since this is read only and storage exacts isn't super important

        keep_statuses = (HC.CONTENT_STATUS_CURRENT, HC.CONTENT_STATUS_PENDING)

        def _iter_filtered_pairs():

            # yields ( service_key, statuses_to_tags ) across every manager,
            # with statuses_to_tags restricted to current/pending
            for tags_manager in tags_managers:

                service_keys_to_statuses_to_tags = tags_manager.GetServiceKeysToStatusesToTags(
                    ClientTags.TAG_DISPLAY_ACTUAL)

                for (service_key, statuses_to_tags
                     ) in service_keys_to_statuses_to_tags.items():

                    filtered = {
                        status: tags
                        for (status, tags) in list(statuses_to_tags.items())
                        if status in keep_statuses
                    }

                    yield (service_key, filtered)

        # service_key : [ statuses_to_tags ]
        per_service = HydrusData.BuildKeyToListDict(_iter_filtered_pairs())

        # now let's merge so we have service_key : statuses_to_tags

        merged_service_keys_to_statuses_to_tags = collections.defaultdict(
            HydrusData.default_dict_set)

        for (service_key,
             several_statuses_to_tags) in list(per_service.items()):

            statuses_to_tags = HydrusData.default_dict_set()

            # union tags per status across all of this service's dicts
            for single_statuses_to_tags in several_statuses_to_tags:

                for (status, tags) in list(single_statuses_to_tags.items()):

                    statuses_to_tags[status].update(tags)

            merged_service_keys_to_statuses_to_tags[
                service_key] = statuses_to_tags

        return TagsManager(merged_service_keys_to_statuses_to_tags,
                           merged_service_keys_to_statuses_to_tags)
    def GetSomePetitionedRows(self, service_id: int):
        """Fetch up to 100 petitioned file rows for a service, grouped by reason.

        Returns a list of ( reason_id, [ hash_id, ... ] ) tuples.
        """

        petitioned_files_table_name = GenerateFilesTableName(
            service_id, HC.CONTENT_STATUS_PETITIONED)

        query = 'SELECT reason_id, hash_id FROM {} ORDER BY reason_id LIMIT 100;'.format(
            petitioned_files_table_name)

        reasons_to_hash_ids = HydrusData.BuildKeyToListDict(
            self._c.execute(query))

        return list(reasons_to_hash_ids.items())
Exemplo n.º 5
0
 def GetJSONDumpNamesToBackupTimestamps( self, dump_type ):
     """Map each named json dump of the given type to its backup timestamps.
     
     Timestamps come back ascending, so the last entry per name is the live
     (non-backup) dump; it is discarded, and names left with no backups are
     dropped from the result entirely.
     """
     
     rows = self._c.execute( 'SELECT dump_name, timestamp FROM json_dumps_named WHERE dump_type = ? ORDER BY timestamp ASC;', ( dump_type, ) )
     
     names_to_backup_timestamps = HydrusData.BuildKeyToListDict( rows )
     
     for name in list( names_to_backup_timestamps.keys() ):
         
         timestamp_list = names_to_backup_timestamps[ name ]
         
         timestamp_list.pop() # remove the non backup timestamp
         
         if len( timestamp_list ) == 0:
             
             del names_to_backup_timestamps[ name ]
             
         
     
     return names_to_backup_timestamps
Exemplo n.º 6
0
    def GetTagParentsIds(self, service_id):
        """Return a defaultdict mapping status -> sorted list of
        ( child_tag_id, parent_tag_id ) pairs, combining both tag_parents and
        tag_parent_petitions for this service."""

        rows = self._Execute(
            'SELECT status, child_tag_id, parent_tag_id FROM tag_parents WHERE service_id = ? UNION SELECT status, child_tag_id, parent_tag_id FROM tag_parent_petitions WHERE service_id = ?;',
            (service_id, service_id)).fetchall()

        statuses_to_pair_ids = collections.defaultdict(list)

        # group the pairs by status
        for (status, child_tag_id, parent_tag_id) in rows:

            statuses_to_pair_ids[status].append((child_tag_id, parent_tag_id))

        # each status's pair list comes back sorted
        for status in statuses_to_pair_ids:

            statuses_to_pair_ids[status].sort()

        return statuses_to_pair_ids
Exemplo n.º 7
0
    def DoSomeWork(self, source):
        """Pull one batch of ( hash, tags ) data from source, turn it into
        mapping content updates for the configured content action, and write
        them synchronously. Returns a basic speed statement for the batch."""

        time_started_precise = HydrusData.GetNowPrecise()

        data = source.GetSomeData()

        # flatten to ( tag, hash ) pairs; the pair count is the work done
        pairs = [(tag, hash) for (hash, tags) in data for tag in tags]

        num_done = len(pairs)

        tags_to_hashes = HydrusData.BuildKeyToListDict(pairs)

        # petitions carry a reason; other actions do not
        if self._content_action == HC.CONTENT_UPDATE_PETITION:

            reason = 'Mass Migration Job'

        else:

            reason = None

        content_updates = [
            HydrusData.ContentUpdate(HC.CONTENT_TYPE_MAPPINGS,
                                     self._content_action, (tag, hashes),
                                     reason=reason)
            for (tag, hashes) in tags_to_hashes.items()
        ]

        service_keys_to_content_updates = {
            self._tag_service_key: content_updates
        }

        self._controller.WriteSynchronous('content_updates',
                                          service_keys_to_content_updates)

        return GetBasicSpeedStatement(num_done, time_started_precise)
    def Search(self, hash_id, max_hamming_distance):
        """Find files perceptually similar to hash_id.

        Returns a list of ( similar_hash_id, hamming_distance ) tuples. A
        max_hamming_distance of 0 is answered with a direct phash-equality
        lookup; any larger radius walks the shape_vptree vantage-point tree.
        Files with no phashes, or an empty tree, yield [].

        NOTE(review): the chunked fetch-then-process shape in the tree walk is
        a deliberate crash workaround (see comments below) — do not convert it
        back to streaming iteration.
        """

        if max_hamming_distance == 0:

            # exact case: every file sharing at least one phash with ours
            similar_hash_ids = self._STL(
                self._c.execute(
                    'SELECT hash_id FROM shape_perceptual_hash_map WHERE phash_id IN ( SELECT phash_id FROM shape_perceptual_hash_map WHERE hash_id = ? );',
                    (hash_id, )))

            similar_hash_ids_and_distances = [
                (similar_hash_id, 0) for similar_hash_id in similar_hash_ids
            ]

        else:

            search_radius = max_hamming_distance

            # the vp-tree root is the single node with no parent
            top_node_result = self._c.execute(
                'SELECT phash_id FROM shape_vptree WHERE parent_id IS NULL;'
            ).fetchone()

            if top_node_result is None:

                return []

            (root_node_phash_id, ) = top_node_result

            # all phashes recorded for the query file; each is searched separately
            search = self._STL(
                self._c.execute(
                    'SELECT phash FROM shape_perceptual_hashes NATURAL JOIN shape_perceptual_hash_map WHERE hash_id = ?;',
                    (hash_id, )))

            if len(search) == 0:

                return []

            # phash_id -> smallest hamming distance found across all search phashes
            similar_phash_ids_to_distances = {}

            num_cycles = 0
            total_nodes_searched = 0

            for search_phash in search:

                # level-by-level walk: next_potentials holds the candidate nodes
                # for the next cycle
                next_potentials = [root_node_phash_id]

                while len(next_potentials) > 0:

                    current_potentials = next_potentials
                    next_potentials = []

                    num_cycles += 1
                    total_nodes_searched += len(current_potentials)

                    for group_of_current_potentials in HydrusData.SplitListIntoChunks(
                            current_potentials, 10000):

                        # this is split into fixed lists of results of subgroups because as an iterable it was causing crashes on linux!!
                        # after investigation, it seemed to be SQLite having a problem with part of Get64BitHammingDistance touching phashes it presumably was still hanging on to
                        # the crash was in sqlite code, again presumably on subsequent fetch
                        # adding a delay in seemed to fix it as well. guess it was some memory maintenance buffer/bytes thing
                        # anyway, we now just get the whole lot of results first and then work on the whole lot
                        '''
                        #old method
                        select_statement = 'SELECT phash_id, phash, radius, inner_id, outer_id FROM shape_perceptual_hashes NATURAL JOIN shape_vptree WHERE phash_id = ?;'
                        
                        results = list( self._ExecuteManySelectSingleParam( select_statement, group_of_current_potentials ) )
                        '''

                        with HydrusDB.TemporaryIntegerTable(
                                self._c, group_of_current_potentials,
                                'phash_id') as temp_table_name:

                            # temp phash_ids to actual phashes and tree info
                            results = self._c.execute(
                                'SELECT phash_id, phash, radius, inner_id, outer_id FROM {} CROSS JOIN shape_perceptual_hashes USING ( phash_id ) CROSS JOIN shape_vptree USING ( phash_id );'
                                .format(temp_table_name)).fetchall()

                        for (node_phash_id, node_phash, node_radius,
                             inner_phash_id, outer_phash_id) in results:

                            # first check the node itself--is it similar?

                            node_hamming_distance = HydrusData.Get64BitHammingDistance(
                                search_phash, node_phash)

                            if node_hamming_distance <= search_radius:

                                # record the node, keeping the smallest distance seen
                                if node_phash_id in similar_phash_ids_to_distances:

                                    current_distance = similar_phash_ids_to_distances[
                                        node_phash_id]

                                    similar_phash_ids_to_distances[
                                        node_phash_id] = min(
                                            node_hamming_distance,
                                            current_distance)

                                else:

                                    similar_phash_ids_to_distances[
                                        node_phash_id] = node_hamming_distance

                            # now how about its children?

                            # node_radius is None for leaf nodes--nothing below to descend into
                            if node_radius is not None:

                                # we have two spheres--node and search--their centers separated by node_hamming_distance
                                # we want to search inside/outside the node_sphere if the search_sphere intersects with those spaces
                                # there are four possibles:
                                # (----N----)-(--S--)    intersects with outer only - distance between N and S > their radii
                                # (----N---(-)-S--)      intersects with both
                                # (----N-(--S-)-)        intersects with both
                                # (---(-N-S--)-)         intersects with inner only - distance between N and S + radius_S does not exceed radius_N

                                if inner_phash_id is not None:

                                    spheres_disjoint = node_hamming_distance > (
                                        node_radius + search_radius)

                                    if not spheres_disjoint:  # i.e. they intersect at some point

                                        next_potentials.append(inner_phash_id)

                                if outer_phash_id is not None:

                                    search_sphere_subset_of_node_sphere = (
                                        node_hamming_distance +
                                        search_radius) <= node_radius

                                    if not search_sphere_subset_of_node_sphere:  # i.e. search sphere intersects with non-node sphere space at some point

                                        next_potentials.append(outer_phash_id)

            if HG.db_report_mode:

                HydrusData.ShowText(
                    'Similar file search touched {} nodes over {} cycles.'.
                    format(HydrusData.ToHumanInt(total_nodes_searched),
                           HydrusData.ToHumanInt(num_cycles)))

            # so, now we have phash_ids and distances. let's map that to actual files.
            # files can have multiple phashes, and phashes can refer to multiple files, so let's make sure we are setting the smallest distance we found

            similar_phash_ids = list(similar_phash_ids_to_distances.keys())

            with HydrusDB.TemporaryIntegerTable(self._c, similar_phash_ids,
                                                'phash_id') as temp_table_name:

                # temp phashes to hash map
                similar_phash_ids_to_hash_ids = HydrusData.BuildKeyToListDict(
                    self._c.execute(
                        'SELECT phash_id, hash_id FROM {} CROSS JOIN shape_perceptual_hash_map USING ( phash_id );'
                        .format(temp_table_name)))

            # hash_id -> smallest distance over all of that file's phashes
            similar_hash_ids_to_distances = {}

            for (phash_id, hash_ids) in similar_phash_ids_to_hash_ids.items():

                distance = similar_phash_ids_to_distances[phash_id]

                for hash_id in hash_ids:

                    if hash_id not in similar_hash_ids_to_distances:

                        similar_hash_ids_to_distances[hash_id] = distance

                    else:

                        current_distance = similar_hash_ids_to_distances[
                            hash_id]

                        if distance < current_distance:

                            similar_hash_ids_to_distances[hash_id] = distance

            similar_hash_ids_and_distances = list(
                similar_hash_ids_to_distances.items())

        return similar_hash_ids_and_distances
Exemplo n.º 9
0
    def GetTagParentsIdsChains(self, service_id, tag_ids):
        """Return a defaultdict mapping status -> sorted list of
        ( child_tag_id, parent_tag_id ) pairs for every parent relationship
        connected, directly or transitively, to tag_ids on this service.

        Sibling chains are also expanded each round so that pairs linked to
        the chain only through sibling relationships are included.
        """

        # I experimented with one or two recursive queries, and for siblings, but it mostly ended up hellmode index efficiency. I think ( service_id, integer ) did it in

        # note that this has to do sibling lookup as well to fetch pairs that are only connected to our chain by sibling relationships, and we are assuming here that the sibling lookup cache is valid

        searched_tag_ids = set()
        next_tag_ids = set(tag_ids)
        result_rows = set()

        while len(next_tag_ids) > 0:

            tag_ids_seen_this_round = set()

            # expand via siblings: map current tags to ideals, then pull every
            # member of those ideal chains
            ideal_tag_ids = self.modules_tag_siblings.GetIdeals(
                ClientTags.TAG_DISPLAY_IDEAL, service_id, next_tag_ids)

            tag_ids_seen_this_round.update(
                self.modules_tag_siblings.GetChainsMembersFromIdeals(
                    ClientTags.TAG_DISPLAY_IDEAL, service_id, ideal_tag_ids))

            with self._MakeTemporaryIntegerTable(
                    next_tag_ids, 'tag_id') as temp_next_tag_ids_table_name:

                searched_tag_ids.update(next_tag_ids)

                # keep these separate--older sqlite can't do cross join to an OR ON

                # temp tag_ids to parents
                queries = [
                    'SELECT status, child_tag_id, parent_tag_id FROM {} CROSS JOIN tag_parents ON ( child_tag_id = tag_id ) WHERE service_id = ?'
                    .format(temp_next_tag_ids_table_name),
                    'SELECT status, child_tag_id, parent_tag_id FROM {} CROSS JOIN tag_parents ON ( parent_tag_id = tag_id ) WHERE service_id = ?'
                    .format(temp_next_tag_ids_table_name),
                    'SELECT status, child_tag_id, parent_tag_id FROM {} CROSS JOIN tag_parent_petitions ON ( child_tag_id = tag_id ) WHERE service_id = ?'
                    .format(temp_next_tag_ids_table_name),
                    'SELECT status, child_tag_id, parent_tag_id FROM {} CROSS JOIN tag_parent_petitions ON ( parent_tag_id = tag_id ) WHERE service_id = ?'
                    .format(temp_next_tag_ids_table_name)
                ]

                query = ' UNION '.join(queries)

                for row in self._Execute(
                        query,
                    (service_id, service_id, service_id, service_id)):

                    result_rows.add(row)

                    (status, child_tag_id, parent_tag_id) = row

                    tag_ids_seen_this_round.update(
                        (child_tag_id, parent_tag_id))

            # only tags not yet searched seed the next round, so the loop terminates
            next_tag_ids = tag_ids_seen_this_round.difference(searched_tag_ids)

        unsorted_statuses_to_pair_ids = HydrusData.BuildKeyToListDict(
            (status, (child_tag_id, parent_tag_id))
            for (status, child_tag_id, parent_tag_id) in result_rows)

        statuses_to_pair_ids = collections.defaultdict(list)

        statuses_to_pair_ids.update({
            status: sorted(pair_ids)
            for (status, pair_ids) in unsorted_statuses_to_pair_ids.items()
        })

        return statuses_to_pair_ids