# provide search cache
#cl.AttachCache(redis_cache)

# setup the different facets
cl.AttachFacets(
    Facet('author', sql_col='author', sql_table='author_terms', sph_field='authors'),
    Facet('keyword', sql_col='tag', sql_table='title_terms', attr='title_attr', sph_field='title'),
    Facet('venue', sql_col='source_id', sql_table='venue_terms', sph_field='source_id'),
    Facet('year', sql_table=None)
)

# we want to sort the facet values by their counts
group_sort = '@count desc'

# setup sorting and ordering of each facet
for f in cl.facets:
    f.SetMaxNumValues(15)
    # sort the facet values by decreasing count
    f.SetGroupSort(group_sort)
    # order the terms alphabetically within each facet
    f.SetOrderBy('@term', order='asc')

# the query should always be parsed beforehand
query_parser = QueryParser(MultiFieldQuery, user_sph_map={
    'author'  : 'authors',
    'keyword' : 'title',
    'venue'   : 'source_id',
    'year'    : 'year',
})
cl.AttachQueryParser(query_parser)
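# Usage sketch: after the setup above, a multi-field query can be issued
# directly on the client (the author and year below are illustrative):
hits = cl.Query('(@author hofmann) (@year 2004)')
print('%s hits found' % hits['total_found'])
for match in hits['matches']:
    print('%s: %s' % (match['id'], match['attrs']))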
    sql_table='plot_keyword_terms'),
    Facet('director'),
    Facet('actor'),
)

# for all facets compute the count, group by, and the score below
group_func = '''
sum(
    if (runtime_attr > 45,
        if (nb_votes_attr > 1000,
            if (nb_votes_attr < 10000,
                nb_votes_attr * user_rating_attr,
                10000 * user_rating_attr),
            1000 * user_rating_attr),
        300 * user_rating_attr)
)'''

# setup sorting and ordering of each facet
for f in cl.facets:
    f.SetGroupFunc(group_func)
    # order the terms alphabetically within each facet
    f.SetOrderBy('@term')

# the query should always be parsed beforehand
query_parser = QueryParser(MultiFieldQuery, user_sph_map={
    'genre'    : 'genres',
    'keyword'  : 'plot_keywords',
    'director' : 'directors',
    'actor'    : 'actors'
})
cl.AttachQueryParser(query_parser)
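# For clarity, a plain Python rendering of the nested "if" expression used in
# group_func above (a hypothetical helper, not part of fSphinx):
def item_score(runtime, nb_votes, user_rating):
    # short items (45 min or less) get a fixed low weight
    if runtime <= 45:
        return 300 * user_rating
    # items with few votes get a fixed medium weight
    if nb_votes <= 1000:
        return 1000 * user_rating
    # popular items are weighted by their vote count, capped at 10000
    if nb_votes < 10000:
        return nb_votes * user_rating
    return 10000 * user_rating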
cl.AttachDBFetch(db_fetch)

# setup the different facets
cl.AttachFacets(
    Facet('author', sph_field='authors'),
    Facet('journal', sph_field='journal_title_abbreviation'),
    Facet('mesh', attr='mesh_attr', sql_col='mesh_term', sql_table='mesh_terms_terms'),
    Facet('year', sql_table=None),
)

# setup sorting and ordering of each facet
for f in cl.facets:
    # group by a custom function
    f.SetGroupFunc('sum(citation_count_attr)')
    # f.SetGroupSort('@count desc')
    # order the terms alphabetically within each facet
    f.SetOrderBy('@term', order='asc')
    # fixes a Sphinx bug: SetSelect with distributed indexes
    f._set_select = f._set_select + ', %s, %s, pmid_attr, citation_count_attr' % (f._attr, 'year_attr')
    # stop after having processed 500 results (see optimization)
    f.SetCutOff(500)

# the query should always be parsed beforehand
query_parser = QueryParser(MultiFieldQuery, user_sph_map={
    'author'  : 'authors',
    'journal' : 'journal_title_abbreviation',
    'mesh'    : 'mesh',
    'year'    : 'year',
})
cl.AttachQueryParser(query_parser)
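# db_fetch (attached at the top of this block) is not defined in the snippet.
# A sketch of how it might be built with fSphinx's DBFetch, assuming an
# existing database handle "db" and an illustrative SQL statement in which
# $id is replaced by the ids of the matched hits:
db_fetch = DBFetch(db, sql='select * from papers where id in ($id)')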
class SimClient(object):
    """Creates a wrapped sphinx client together with a computed index.

    The computed index is queried if a similarity search query is
    encountered. In this case the function sphinx_setup is called in order
    to reset the wrapped sphinx client.

    The log_score of each item is found in the Sphinx attribute
    "log_score_attr". It must be set to 1 and declared as a float in your
    Sphinx configuration file.
    """
    def __init__(self, cl=None, query_handler=None, sphinx_setup=None, **opts):
        # essential options
        self.Wrap(cl)
        self.query_handler = query_handler
        self.SetSphinxSetup(sphinx_setup)

        # other options
        index_path = opts.get('index_path', '')
        if index_path:
            self.LoadIndex(index_path)
        self.max_items = opts.get('max_items', 1000)
        self.max_terms = opts.get('max_terms', 20)
        self.exclude_queried = opts.get('exclude_queried', True)
        self.allow_empty = opts.get('allow_empty', True)
        if self.allow_empty:
            QuerySimilar.ALLOW_EMPTY = True

    def __getattr__(self, name):
        return getattr(self.wrap_cl, name)

    def Wrap(self, cl):
        """Use this method to wrap the sphinx client.
        """
        self.wrap_cl = cl
        if getattr(cl, 'query_parser', None):
            user_sph_map = cl.query_parser.kwargs.get('user_sph_map', {})
        else:
            user_sph_map = {}
        self.query_parser = QueryParser(QuerySimilar, user_sph_map=user_sph_map)
        return self

    def LoadIndex(self, index_path):
        """Load the similarity search index in memory.
        """
        self.query_handler = bsets.QueryHandler(bsets.load_index(index_path))

    def SetSphinxSetup(self, setup):
        """Set the setup function which will be triggered in similarity search
        on the sphinx client.

        This function takes a sphinx client and operates on it in order to
        change sorting mode or ranking etc. The Sphinx attribute
        "log_score_attr" holds each item's log score.
        """
        self.sphinx_setup = setup

    def Query(self, query, index='*', comment=''):
        """If the query has item ids perform a similarity search query,
        otherwise perform a normal sphinx query.
        """
        # parse the query which is assumed to be a string
        self.query = self.query_parser.Parse(query)
        self.time_similarity = 0

        item_ids = self.query.GetItemIds()
        if item_ids:
            # perform similarity search on the set of query items
            log_scores = self.DoSimQuery(item_ids)
            # setup the sphinx client with log scores
            self._SetupSphinxClient(item_ids, dict(log_scores))

        # perform the Sphinx query
        hits = self.DoSphinxQuery(self.query, index, comment)

        if item_ids:
            # add the statistics to the matches
            self._AddStats(hits, item_ids)
            # and other statistics
            hits['time_similarity'] = self.time_similarity

        return hits

    @CacheIO
    def DoSimQuery(self, item_ids):
        """Performs the actual similarity search query.
        """
        results = self.query_handler.query(item_ids, self.max_items)
        self.time_similarity = results.time
        return results.log_scores

    def DoSphinxQuery(self, query, index='*', comment=''):
        """Performs a normal sphinx query.
        """
        if isinstance(self.wrap_cl, FSphinxClient):
            return self.wrap_cl.Query(query)
        else:
            # check we don't lose the parsed query
            return self.wrap_cl.Query(query.sphinx)

    def _SetupSphinxClient(self, item_ids, log_scores):
        # this fixes a nasty bug in the sphinxapi with sockets timing out
        self.wrap_cl._timeout = None

        # override log_score_attr and exclude selected ids
        self.wrap_cl.SetOverride('log_score_attr', sphinxapi.SPH_ATTR_FLOAT, log_scores)
        if self.exclude_queried:
            self.wrap_cl.SetFilter('@id', item_ids, exclude=True)

        # only hits with non-zero log scores are considered if the query is empty
        if not self.query.sphinx and self.allow_empty:
            self.wrap_cl.SetFilterFloatRange('log_score_attr', 0.0, 1.0, exclude=True)

        # further setup of the wrapped sphinx client
        if self.sphinx_setup:
            self.sphinx_setup(self.wrap_cl)

    def _AddStats(self, sphinx_results, item_ids):
        scores = self._GetDetailedScores(
            [match['id'] for match in sphinx_results['matches']], item_ids)
        for score, match in zip(scores, sphinx_results['matches']):
            match['attrs']['@sim_scores'] = score

    @CacheIO
    def _GetDetailedScores(self, result_ids, query_item_ids=None):
        scores = self.query_handler.get_detailed_scores(
            result_ids, query_item_ids, max_terms=self.max_terms)
        self.time_similarity = self.query_handler.time
        return scores

    def Clone(self, memo={}):
        """Creates a copy of this client.

        This makes sure the whole index is not recopied.
        """
        return self.__deepcopy__(memo)

    def __deepcopy__(self, memo):
        cl = self.__class__()
        attrs = utils.save_attrs(
            self, [a for a in self.__dict__ if a not in ['query_handler']])
        utils.load_attrs(cl, attrs)
        if self.query_handler:
            cl.query_handler = bsets.QueryHandler(
                self.query_handler.computed_index)
        return cl

    @classmethod
    def FromConfig(cls, path):
        """Creates a client from a config file.
        """
        return FSphinxClient.FromConfig(path)
# setup the different facets
cl.AttachFacets(
    Facet('organizaciones', attr='organizacion_attr', sql_col='alias', sql_table='Organizacion'),
    Facet('empresas', attr='empresa_attr', sql_col='alias', sql_table='Organizacion'),
    Facet('relacion_empresa', attr='relacionEmp_attr', sql_col='relationship', sql_table='tipoRelacionP20'),
    Facet('relacion_persona', attr='relacionPers_attr', sql_col='alias', sql_table='persona'),
    Facet('relacion_familiar', attr='relacionFam_attr', sql_col='alias', sql_table='persona'),
)

# setup sorting and ordering of each facet
for f in cl.facets:
    #f.SetGroupFunc(group_func)
    # order the terms alphabetically within each facet
    f.SetOrderBy('@term')

# the query should always be parsed beforehand
query_parser = QueryParser(MultiFieldQuery, user_sph_map={
    'organizaciones'    : 'organizacion',
    'empresas'          : 'empresa',
    'relacion_empresa'  : 'cargos',
    'relacion_persona'  : 'personas_relacionadas',
    'relacion_familiar' : 'familiares'
})
cl.AttachQueryParser(query_parser)
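# Illustrative parse of a user query against the field names mapped above
# (the search terms are made up):
query = query_parser.Parse('(@organizaciones acme) (@relacion_persona juan)')
hits = cl.Query(query)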