Пример #1
0
 def GetHashIdsThatHaveTagsSimpleLocation( self, tag_display_type: int, file_service_key: bytes, tag_search_context: ClientSearch.TagSearchContext, namespace_ids_table_name = None, hash_ids_table_name = None, job_key = None ):
     
     mapping_and_tag_table_names = self.modules_tag_search.GetMappingAndTagTables( tag_display_type, file_service_key, tag_search_context )
     
     if hash_ids_table_name is None:
         
         if namespace_ids_table_name is None:
             
             # hellmode
             queries = [ 'SELECT DISTINCT hash_id FROM {};'.format( mappings_table_name ) for ( mappings_table_name, tags_table_name ) in mapping_and_tag_table_names ]
             
         else:
             
             # temp namespaces to tags to mappings
             queries = [ 'SELECT DISTINCT hash_id FROM {} CROSS JOIN {} USING ( namespace_id ) CROSS JOIN {} USING ( tag_id );'.format( namespace_ids_table_name, tags_table_name, mappings_table_name ) for ( mappings_table_name, tags_table_name ) in mapping_and_tag_table_names ]
             
         
     else:
         
         if namespace_ids_table_name is None:
             
             queries = [ 'SELECT hash_id FROM {} WHERE EXISTS ( SELECT 1 FROM {} WHERE {}.hash_id = {}.hash_id );'.format( hash_ids_table_name, mappings_table_name, mappings_table_name, hash_ids_table_name ) for ( mappings_table_name, tags_table_name ) in mapping_and_tag_table_names ]
             
         else:
             
             # temp hashes to mappings to tags to temp namespaces
             # this was originally a 'WHERE EXISTS' thing, but doing that on a three way cross join is too complex for that to work well
             # let's hope DISTINCT can save time too
             queries = [ 'SELECT DISTINCT hash_id FROM {} CROSS JOIN {} USING ( hash_id ) CROSS JOIN {} USING ( tag_id ) CROSS JOIN {} USING ( namespace_id );'.format( hash_ids_table_name, mappings_table_name, tags_table_name, namespace_ids_table_name ) for ( mappings_table_name, tags_table_name ) in mapping_and_tag_table_names ]
             
         
     
     cancelled_hook = None
     
     if job_key is not None:
         
         cancelled_hook = job_key.IsCancelled
         
     
     nonzero_tag_hash_ids = set()
     
     for query in queries:
         
         cursor = self._Execute( query )
         
         nonzero_tag_hash_ids.update( self._STI( HydrusDB.ReadFromCancellableCursor( cursor, 10240, cancelled_hook ) ) )
         
         if job_key is not None and job_key.IsCancelled():
             
             return set()
             
         
     
     return nonzero_tag_hash_ids
Пример #2
0
    def GetSubtagIdsFromWildcardIntoTable(self,
                                          file_service_id: int,
                                          tag_service_id: int,
                                          subtag_wildcard,
                                          subtag_id_table_name,
                                          job_key=None):

        if tag_service_id == self.modules_services.combined_tag_service_id:

            search_tag_service_ids = self.modules_services.GetServiceIds(
                HC.REAL_TAG_SERVICES)

        else:

            search_tag_service_ids = (tag_service_id, )

        for search_tag_service_id in search_tag_service_ids:

            if '*' in subtag_wildcard:

                subtags_fts4_table_name = self.GetSubtagsFTS4TableName(
                    file_service_id, search_tag_service_id)

                wildcard_has_fts4_searchable_characters = WildcardHasFTS4SearchableCharacters(
                    subtag_wildcard)

                if subtag_wildcard == '*':

                    # hellmode, but shouldn't be called normally
                    cursor = self._Execute('SELECT docid FROM {};'.format(
                        subtags_fts4_table_name))

                elif ClientSearch.IsComplexWildcard(
                        subtag_wildcard
                ) or not wildcard_has_fts4_searchable_characters:

                    # FTS4 does not support complex wildcards, so instead we'll search our raw subtags
                    # however, since we want to search 'searchable' text, we use the 'searchable subtags map' to cross between real and searchable

                    like_param = ConvertWildcardToSQLiteLikeParameter(
                        subtag_wildcard)

                    if subtag_wildcard.startswith(
                            '*'
                    ) or not wildcard_has_fts4_searchable_characters:

                        # this is a SCAN, but there we go
                        # a potential optimisation here, in future, is to store fts4 of subtags reversed, then for '*amus', we can just search that reverse cache for 'suma*'
                        # and this would only double the size of the fts4 cache, the largest cache in the whole db! a steal!
                        # it also would not fix '*amu*', but with some cleverness could speed up '*amus ar*'

                        query = 'SELECT docid FROM {} WHERE subtag LIKE ?;'.format(
                            subtags_fts4_table_name)

                        cursor = self._Execute(query, (like_param, ))

                    else:

                        # we have an optimisation here--rather than searching all subtags for bl*ah, let's search all the bl* subtags for bl*ah!

                        prefix_fts4_wildcard = subtag_wildcard.split('*')[0]

                        prefix_fts4_wildcard_param = '"{}*"'.format(
                            prefix_fts4_wildcard)

                        query = 'SELECT docid FROM {} WHERE subtag MATCH ? AND subtag LIKE ?;'.format(
                            subtags_fts4_table_name)

                        cursor = self._Execute(
                            query, (prefix_fts4_wildcard_param, like_param))

                else:

                    # we want the " " wrapping our search text to keep whitespace words connected and in order
                    # "samus ar*" should not match "around samus"

                    # simple 'sam*' style subtag, so we can search fts4 no prob

                    subtags_fts4_param = '"{}"'.format(subtag_wildcard)

                    cursor = self._Execute(
                        'SELECT docid FROM {} WHERE subtag MATCH ?;'.format(
                            subtags_fts4_table_name), (subtags_fts4_param, ))

                cancelled_hook = None

                if job_key is not None:

                    cancelled_hook = job_key.IsCancelled

                loop_of_subtag_id_tuples = HydrusDB.ReadFromCancellableCursor(
                    cursor, 1024, cancelled_hook=cancelled_hook)

                self._ExecuteMany(
                    'INSERT OR IGNORE INTO {} ( subtag_id ) VALUES ( ? );'.
                    format(subtag_id_table_name), loop_of_subtag_id_tuples)

            else:

                # old notes from before we had searchable subtag map. I deleted that map once, albeit in an older and less efficient form. *don't delete it again, it has use*
                #
                # NOTE: doing a subtag = 'blah' lookup on subtags_fts4 tables is ultra slow, lmao!
                # attempts to match '/a/' to 'a' with clever FTS4 MATCHing (i.e. a MATCH on a*\b, then an '= a') proved not super successful
                # in testing, it was still a bit slow. my guess is it is still iterating through all the nodes for ^a*, the \b just makes it a bit more efficient sometimes
                # in tests '^a\b' was about twice as fast as 'a*', so the \b might not even be helping at all
                # so, I decided to move back to a lean and upgraded searchable subtag map, and here we are

                searchable_subtag = subtag_wildcard

                if self.modules_tags.SubtagExists(searchable_subtag):

                    searchable_subtag_id = self.modules_tags.GetSubtagId(
                        searchable_subtag)

                    self._Execute(
                        'INSERT OR IGNORE INTO {} ( subtag_id ) VALUES ( ? );'.
                        format(subtag_id_table_name), (searchable_subtag_id, ))

                    subtags_searchable_map_table_name = self.GetSubtagsSearchableMapTableName(
                        file_service_id, search_tag_service_id)

                    self._Execute(
                        'INSERT OR IGNORE INTO {} ( subtag_id ) SELECT subtag_id FROM {} WHERE searchable_subtag_id = ?;'
                        .format(subtag_id_table_name,
                                subtags_searchable_map_table_name),
                        (searchable_subtag_id, ))

            if job_key is not None and job_key.IsCancelled():

                self._Execute('DELETE FROM {};'.format(subtag_id_table_name))

                return
Пример #3
0
 def GetHashIdsAndNonZeroTagCounts( self, tag_display_type: int, location_context: ClientLocation.LocationContext, tag_search_context: ClientSearch.TagSearchContext, hash_ids, namespace_wildcard = None, job_key = None ):
     
     if namespace_wildcard == '*':
         
         namespace_wildcard = None
         
     
     if namespace_wildcard is None:
         
         namespace_ids = []
         
     else:
         
         namespace_ids = self.modules_tag_search.GetNamespaceIdsFromWildcard( namespace_wildcard )
         
     
     with self._MakeTemporaryIntegerTable( namespace_ids, 'namespace_id' ) as temp_namespace_ids_table_name:
         
         ( file_service_keys, file_location_is_cross_referenced ) = location_context.GetCoveringCurrentFileServiceKeys()
         
         mapping_and_tag_table_names = set()
         
         for file_service_key in file_service_keys:
             
             mapping_and_tag_table_names.update( self.modules_tag_search.GetMappingAndTagTables( tag_display_type, file_service_key, tag_search_context ) )
             
         
         # reason why I (JOIN each table) rather than (join the UNION) is based on previous hell with having query planner figure out a "( a UNION b UNION c ) NATURAL JOIN stuff" situation
         # although the following sometimes makes certifiable 2KB ( 6 UNION * 4-table ) queries, it actually works fast
         
         # OK, a new problem is mass UNION leads to terrible cancelability because the first row cannot be fetched until the first n - 1 union queries are done
         # I tried some gubbins to try to do a pseudo table-union rather than query union and do 'get files->distinct tag count for this union of tables, and fetch hash_ids first on the union', but did not have luck
         
         # so NOW we are just going to do it in bits of files mate. this also reduces memory use from the distinct-making UNION with large numbers of hash_ids
         
         results = []
         
         BLOCK_SIZE = max( 64, int( len( hash_ids ) ** 0.5 ) ) # go for square root for now
         
         for group_of_hash_ids in HydrusData.SplitIteratorIntoChunks( hash_ids, BLOCK_SIZE ):
             
             with self._MakeTemporaryIntegerTable( group_of_hash_ids, 'hash_id' ) as hash_ids_table_name:
                 
                 if namespace_wildcard is None:
                     
                     # temp hashes to mappings
                     select_statements = [ 'SELECT hash_id, tag_id FROM {} CROSS JOIN {} USING ( hash_id )'.format( hash_ids_table_name, mappings_table_name ) for ( mappings_table_name, tags_table_name ) in mapping_and_tag_table_names ]
                     
                 else:
                     
                     # temp hashes to mappings to tags to namespaces
                     select_statements = [ 'SELECT hash_id, tag_id FROM {} CROSS JOIN {} USING ( hash_id ) CROSS JOIN {} USING ( tag_id ) CROSS JOIN {} USING ( namespace_id )'.format( hash_ids_table_name, mappings_table_name, tags_table_name, temp_namespace_ids_table_name ) for ( mappings_table_name, tags_table_name ) in mapping_and_tag_table_names ]
                     
                 
                 unions = '( {} )'.format( ' UNION '.join( select_statements ) )
                 
                 query = 'SELECT hash_id, COUNT( tag_id ) FROM {} GROUP BY hash_id;'.format( unions )
                 
                 cursor = self._Execute( query )
                 
                 cancelled_hook = None
                 
                 if job_key is not None:
                     
                     cancelled_hook = job_key.IsCancelled
                     
                 
                 loop_of_results = HydrusDB.ReadFromCancellableCursor( cursor, 64, cancelled_hook = cancelled_hook )
                 
                 if job_key is not None and job_key.IsCancelled():
                     
                     return results
                     
                 
                 results.extend( loop_of_results )
                 
             
         
         return results
Пример #4
0
 def GetHashIdsFromTagIds( self, tag_display_type: int, file_service_key: bytes, tag_search_context: ClientSearch.TagSearchContext, tag_ids: typing.Collection[ int ], hash_ids = None, hash_ids_table_name = None, job_key = None ):
     
     do_hash_table_join = False
     
     if hash_ids_table_name is not None and hash_ids is not None:
         
         tag_service_id = self.modules_services.GetServiceId( tag_search_context.service_key )
         file_service_id = self.modules_services.GetServiceId( file_service_key )
         
         estimated_count = self.modules_mappings_counts.GetAutocompleteCountEstimate( tag_display_type, tag_service_id, file_service_id, tag_ids, tag_search_context.include_current_tags, tag_search_context.include_pending_tags )
         
         if ClientDBMappingsStorage.DoingAFileJoinTagSearchIsFaster( len( hash_ids ), estimated_count ):
             
             do_hash_table_join = True
             
         
     
     result_hash_ids = set()
     
     table_names = self.modules_tag_search.GetMappingTables( tag_display_type, file_service_key, tag_search_context )
     
     cancelled_hook = None
     
     if job_key is not None:
         
         cancelled_hook = job_key.IsCancelled
         
     
     if len( tag_ids ) == 1:
         
         ( tag_id, ) = tag_ids
         
         if do_hash_table_join:
             
             # temp hashes to mappings
             queries = [ 'SELECT hash_id FROM {} CROSS JOIN {} USING ( hash_id ) WHERE tag_id = ?'.format( hash_ids_table_name, table_name ) for table_name in table_names ]
             
         else:
             
             queries = [ 'SELECT hash_id FROM {} WHERE tag_id = ?;'.format( table_name ) for table_name in table_names ]
             
         
         for query in queries:
             
             cursor = self._Execute( query, ( tag_id, ) )
             
             result_hash_ids.update( self._STI( HydrusDB.ReadFromCancellableCursor( cursor, 1024, cancelled_hook ) ) )
             
         
     else:
         
         with self._MakeTemporaryIntegerTable( tag_ids, 'tag_id' ) as temp_tag_ids_table_name:
             
             if do_hash_table_join:
                 
                 # temp hashes to mappings to temp tags
                 # old method, does not do EXISTS efficiently, it makes a list instead and checks that
                 # queries = [ 'SELECT hash_id FROM {} WHERE EXISTS ( SELECT 1 FROM {} CROSS JOIN {} USING ( tag_id ) WHERE {}.hash_id = {}.hash_id );'.format( hash_ids_table_name, table_name, temp_tag_ids_table_name, table_name, hash_ids_table_name ) for table_name in table_names ]
                 # new method, this seems to actually do the correlated scalar subquery, although it does seem to be sqlite voodoo
                 queries = [ 'SELECT hash_id FROM {} WHERE EXISTS ( SELECT 1 FROM {} WHERE {}.hash_id = {}.hash_id AND EXISTS ( SELECT 1 FROM {} WHERE {}.tag_id = {}.tag_id ) );'.format( hash_ids_table_name, table_name, table_name, hash_ids_table_name, temp_tag_ids_table_name, table_name, temp_tag_ids_table_name ) for table_name in table_names ]
                 
             else:
                 
                 # temp tags to mappings
                 queries = [ 'SELECT hash_id FROM {} CROSS JOIN {} USING ( tag_id );'.format( temp_tag_ids_table_name, table_name ) for table_name in table_names ]
                 
             
             for query in queries:
                 
                 cursor = self._Execute( query )
                 
                 result_hash_ids.update( self._STI( HydrusDB.ReadFromCancellableCursor( cursor, 1024, cancelled_hook ) ) )
                 
             
         
     
     return result_hash_ids