Example #1
def Wrap(self, cl):
    """Use this method to wrap the sphinx client.
    """
    self.wrap_cl = cl
    if getattr(cl, 'query_parser', None):
        user_sph_map = cl.query_parser.kwargs.get('user_sph_map', {})
    else:
        user_sph_map = {}
    self.query_parser = QueryParser(QuerySimilar,
                                    user_sph_map=user_sph_map)
    return self
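Wrap is normally reached through SimClient's constructor (see Example #5); a minimal sketch, assuming cl is an already configured sphinx client:

sim_cl = SimClient(cl)         # __init__ calls self.Wrap(cl)
assert sim_cl.wrap_cl is cl    # unknown attributes fall through to cl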
Example #2
# optionally attach a search cache (disabled here)
#cl.AttachCache(redis_cache)

# setup the different facets
cl.AttachFacets(
    Facet('author', sql_col='author', sql_table='author_terms', sph_field='authors'),
    Facet('keyword', sql_col='tag', sql_table='title_terms', attr='title_attr', sph_field='title'),
    Facet('venue', sql_col='source_id', sql_table='venue_terms', sph_field='source_id'),
    Facet('year', sql_table=None)
)

# sort each facet's terms by decreasing count
group_sort = '@count desc'

# setup sorting and ordering of each facet
for f in cl.facets:
    f.SetMaxNumValues(15)
    # sort the groups with the expression above
    f.SetGroupSort(group_sort)
    # order the terms alphabetically within each facet
    f.SetOrderBy('@term', order='asc')

# the query should always be parsed beforehand
query_parser = QueryParser(MultiFieldQuery, user_sph_map={
    'author': 'authors',
    'keyword': 'title',
    'venue': 'source_id',
    'year': 'year',
})
cl.AttachQueryParser(query_parser)
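A short usage sketch with the setup above; the query string is illustrative:

# '@author smith' is rewritten to the Sphinx field 'authors' via user_sph_map
results = cl.Query('@author smith boolean networks')
for facet in cl.facets:
    print(facet)  # each facet now holds its computed terms and counts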
Example #3
# setup the different facets (the facets before 'keyword' were cut off in this
# snippet and are reconstructed here from the query parser below)
cl.AttachFacets(
    Facet('genre'),
    Facet('keyword', sql_table='plot_keyword_terms'),
    Facet('director'),
    Facet('actor'),
)

# for all facets compute the term counts together with this group score
group_func = '''
sum(
    if (runtime_attr > 45,
        if (nb_votes_attr > 1000,
            if (nb_votes_attr < 10000, nb_votes_attr * user_rating_attr, 10000 * user_rating_attr),
        1000 * user_rating_attr),
    300 * user_rating_attr)
)'''
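
# For readability, the same weighting written out in plain Python; names drop
# the _attr suffix and this helper is an illustration only, not used below.
# Sphinx applies sum(...) over the matches of each group.
def _group_score(runtime, nb_votes, user_rating):
    if runtime > 45:
        if nb_votes > 1000:
            # votes are capped at 10000 before weighting the rating
            return min(nb_votes, 10000) * user_rating
        return 1000 * user_rating
    return 300 * user_rating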

# setup sorting and ordering of each facet
for f in cl.facets:
    f.SetGroupFunc(group_func)
    # order the terms alphabetically within each facet
    f.SetOrderBy('@term')

# the query should always be parsed beforehand
query_parser = QueryParser(MultiFieldQuery,
                           user_sph_map={
                               'genre': 'genres',
                               'keyword': 'plot_keywords',
                               'director': 'directors',
                               'actor': 'actors'
                           })
cl.AttachQueryParser(query_parser)
Example #4
cl.AttachDBFetch(db_fetch)

# setup the different facets
cl.AttachFacets(
    Facet('author', sph_field='authors'),
    Facet('journal', sph_field='journal_title_abbreviation'),
    Facet('mesh', attr='mesh_attr', sql_col='mesh_term', sql_table='mesh_terms_terms'),
    Facet('year', sql_table=None),
)

# setup sorting and ordering of each facet
for f in cl.facets:
    # group by a custom function
    f.SetGroupFunc('sum(citation_count_attr)')
    # f.SetGroupSort('@count desc')
    # order the terms alphabetically within each facet
    f.SetOrderBy('@term', order='asc')
    # work around a Sphinx bug with SetSelect and distributed indexes
    f._set_select += ', %s, year_attr, pmid_attr, citation_count_attr' % f._attr
    # stop after having processed 500 results (see optimization)
    f.SetCutOff(500)

# the query should always be parsed beforehand
query_parser = QueryParser(MultiFieldQuery, user_sph_map={
    'author': 'authors',
    'journal': 'journal_title_abbreviation',
    'mesh': 'mesh',
    'year': 'year',
})
cl.AttachQueryParser(query_parser)
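What a run against this setup might look like; the query is illustrative and total_found is a standard sphinxapi result key:

results = cl.Query('@author smith @year 2005')
print(results['total_found'])
for facet in cl.facets:
    # terms ranked by summed citation counts, computed from at most
    # 500 processed matches (SetCutOff above)
    print(facet)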
Example #5
class SimClient(object):
    """Creates a wrapped sphinx client together with a computed index.
    
    The computed index is queried if a similarity search query is encountered.
    In this case the function sphinx_setup is called in order to reset
    the wrapped sphinx client.
    
    The log_score of each item is found in the Sphinx attribute "log_score_attr". 
    It must be set to 1 and declared as a float in your Sphinx configuration file.
    """
    def __init__(self, cl=None, query_handler=None, sphinx_setup=None, **opts):
        # essential options
        self.Wrap(cl)
        self.query_handler = query_handler
        self.SetSphinxSetup(sphinx_setup)
        # other options
        index_path = opts.get('index_path', '')
        if index_path:
            self.LoadIndex(index_path)
        self.max_items = opts.get('max_items', 1000)
        self.max_terms = opts.get('max_terms', 20)
        self.exclude_queried = opts.get('exclude_queried', True)
        self.allow_empty = opts.get('allow_empty', True)
        if self.allow_empty:
            QuerySimilar.ALLOW_EMPTY = True

    def __getattr__(self, name):
        return getattr(self.wrap_cl, name)

    def Wrap(self, cl):
        """Use this method to wrap the sphinx client.
        """
        self.wrap_cl = cl
        if getattr(cl, 'query_parser', None):
            user_sph_map = cl.query_parser.kwargs.get('user_sph_map', {})
        else:
            user_sph_map = {}
        self.query_parser = QueryParser(QuerySimilar,
                                        user_sph_map=user_sph_map)
        return self

    def LoadIndex(self, index_path):
        """Load the similarity search index in memory.
        """
        self.query_handler = bsets.QueryHandler(bsets.load_index(index_path))

    def SetSphinxSetup(self, setup):
        """Set the setup function which will be triggered in similarity search 
        on the sphinx client.
        
        This function takes a sphinx client and operates on it in order to
        change sorting mode or ranking etc ... 
        
        The Sphinx attribute "log_score_attr" holds each item log score.
        """
        self.sphinx_setup = setup

    def Query(self, query, index='*', comment=''):
        """If the query has item ids perform a similarity search query otherwise
        perform a normal sphinx query.
        """
        # parse the query which is assumed to be a string
        self.query = self.query_parser.Parse(query)
        self.time_similarity = 0

        item_ids = self.query.GetItemIds()
        if item_ids:
            # perform similarity search on the set of query items
            log_scores = self.DoSimQuery(item_ids)
            # setup the sphinx client with log scores
            self._SetupSphinxClient(item_ids, dict(log_scores))

        # perform the Sphinx query
        hits = self.DoSphinxQuery(self.query, index, comment)

        if item_ids:
            # add the statistics to the matches
            self._AddStats(hits, item_ids)

        # and other statistics
        hits['time_similarity'] = self.time_similarity

        return hits

    @CacheIO
    def DoSimQuery(self, item_ids):
        """Performs the actual simlarity search query.
        """
        results = self.query_handler.query(item_ids, self.max_items)
        self.time_similarity = results.time

        return results.log_scores

    def DoSphinxQuery(self, query, index='*', comment=''):
        """Peforms a normal sphinx query.
        """
        if isinstance(self.wrap_cl, FSphinxClient):
            return self.wrap_cl.Query(query)
        else:
            # make sure we don't lose the parsed query
            return self.wrap_cl.Query(query.sphinx)

    def _SetupSphinxClient(self, item_ids, log_scores):
        # this fixes a nasty bug in the sphinxapi with sockets timing out
        self.wrap_cl._timeout = None

        # override log_score_attr and exclude selected ids
        self.wrap_cl.SetOverride('log_score_attr', sphinxapi.SPH_ATTR_FLOAT,
                                 log_scores)
        if self.exclude_queried:
            self.wrap_cl.SetFilter('@id', item_ids, exclude=True)

        # if the query is empty, only hits with non-zero log scores are considered
        if not self.query.sphinx and self.allow_empty:
            self.wrap_cl.SetFilterFloatRange('log_score_attr',
                                             0.0,
                                             1.0,
                                             exclude=True)

        # further setup of the wrapped sphinx client
        if self.sphinx_setup:
            self.sphinx_setup(self.wrap_cl)

    def _AddStats(self, sphinx_results, item_ids):
        all_scores = self._GetDetailedScores(
            [match['id'] for match in sphinx_results['matches']], item_ids)
        for scores, match in zip(all_scores, sphinx_results['matches']):
            match['attrs']['@sim_scores'] = scores

    @CacheIO
    def _GetDetailedScores(self, result_ids, query_item_ids=None):
        scores = self.query_handler.get_detailed_scores(
            result_ids, query_item_ids, max_terms=self.max_terms)
        self.time_similarity = self.query_handler.time

        return scores

    def Clone(self, memo=None):
        """Creates a copy of this client.

        This makes sure the whole index is not recopied.
        """
        return self.__deepcopy__(memo or {})

    def __deepcopy__(self, memo):
        cl = self.__class__()
        attrs = utils.save_attrs(
            self, [a for a in self.__dict__ if a not in ['query_handler']])
        utils.load_attrs(cl, attrs)
        if self.query_handler:
            cl.query_handler = bsets.QueryHandler(
                self.query_handler.computed_index)
        return cl

    @classmethod
    def FromConfig(cls, path):
        """Creates a client from a config file.
        """
        return FSphinxClient.FromConfig(path)
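A minimal usage sketch for SimClient; the import path, config and index paths are assumptions, and how item ids are written in a query is left to QuerySimilar:

from fsphinx import FSphinxClient

# hypothetical paths; LoadIndex expects a similarity index built beforehand
cl = SimClient(FSphinxClient.FromConfig('./sphinx_client.py'),
               index_path='./sim_index', max_terms=10)

# a plain text query goes straight to Sphinx; a query whose parsed form
# carries item ids (as understood by QuerySimilar) triggers DoSimQuery first
hits = cl.Query('drama')
print(hits['time_similarity'])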
Example #6
# setup the different facets
cl.AttachFacets(
    Facet('organizaciones', attr='organizacion_attr', sql_col='alias', sql_table='Organizacion'),
    Facet('empresas', attr='empresa_attr', sql_col='alias', sql_table='Organizacion'),
    Facet('relacion_empresa', attr='relacionEmp_attr', sql_col='relationship', sql_table='tipoRelacionP20'),
    Facet('relacion_persona', attr='relacionPers_attr', sql_col='alias', sql_table='persona'),
    Facet('relacion_familiar', attr='relacionFam_attr', sql_col='alias', sql_table='persona'),
)

# a group scoring function (group_func) could be computed here, as in Example #3

# setup sorting and ordering of each facet
for f in cl.facets:
    # f.SetGroupFunc(group_func)
    # order the terms alphabetically within each facet
    f.SetOrderBy('@term')

# the query should always be parsed beforehand
query_parser = QueryParser(MultiFieldQuery, user_sph_map={
    'organizaciones': 'organizacion',
    'empresas': 'empresa',
    'relacion_empresa': 'cargos',
    'relacion_persona': 'personas_relacionadas',
    'relacion_familiar': 'familiares'
})
cl.AttachQueryParser(query_parser)
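To see user_sph_map at work, the attached parser can also be used directly; query.sphinx holds the translated query string (the form sent to Sphinx in Example #5):

query = query_parser.Parse('@empresas acme @relacion_persona "juan perez"')
# the user-facing fields are rewritten to the Sphinx fields
# 'empresa' and 'personas_relacionadas'
print(query.sphinx)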