Пример #1
0
    def testPairs(self):
        """Exercise weighted union/intersection over every tree/bucket pair
        and every weight combination, in both argument orders."""
        t1 = IIBTree([(1, 10), (3, 30), (7, 70)])
        t2 = IIBTree([(3, 30), (5, 50), (7, 7), (9, 90)])
        allkeys = [1, 3, 5, 7, 9]
        b1 = IIBucket(t1)
        b2 = IIBucket(t2)
        containers = (t1, t2, b1, b2)
        weightings = ((0, 0), (1, 10), (10, 1), (2, 3))
        for x in containers:
            for key in x.keys():
                self.assertEqual(key in allkeys, 1)
            for y in containers:
                for w1, w2 in weightings:
                    # Union: every key present in either mapping contributes
                    # its weighted sum (missing keys contribute 0).
                    expected = sorted(
                        (key, x.get(key, 0) * w1 + y.get(key, 0) * w2)
                        for key in allkeys
                        if x.has_key(key) or y.has_key(key))
                    for pairs in ([(x, w1), (y, w2)], [(y, w2), (x, w1)]):
                        got = mass_weightedUnion(pairs)
                        self.assertEqual(expected, list(got.items()))

                    # Intersection: only keys present in both mappings.
                    expected = sorted(
                        (key, x[key] * w1 + y[key] * w2)
                        for key in allkeys
                        if x.has_key(key) and y.has_key(key))
                    for pairs in ([(x, w1), (y, w2)], [(y, w2), (x, w1)]):
                        got = mass_weightedIntersection(pairs)
                        self.assertEqual(expected, list(got.items()))
Пример #2
0
 def convertScores(scores,
                   type=type,
                   TupleType=TupleType,
                   IIBTree=IIBTree):
     """Return *scores* unchanged if it is a tuple or an IIBTree;
     otherwise copy it into a fresh IIBTree.

     The default arguments bind globals at definition time as a CPython
     name-lookup micro-optimization.
     """
     # Bug fix: the original compared against ``IIBTree()`` -- a freshly
     # created *instance* -- which the type of ``scores`` can never be,
     # so every non-tuple input (including existing IIBTrees) was
     # needlessly copied.  Compare against the class itself.
     if type(scores) is not TupleType and type(scores) is not IIBTree:
         scores = IIBTree(scores)
     return scores
Пример #3
0
    def __init__(self, lexicon):
        """Initialize the index's persistent data structures.

        ``self._wordinfo``: wid -> {docid -> weight}, i.e. t -> D -> w(D, t).
        Each concrete indexer has its own notion of term weight, but every
        one of them uses _wordinfo to map wids to a docid-to-weight map.

        Out-of-vocabulary (OOV) wids come in two flavors: wid 0 is the
        explicit OOV marker, and the lexicon may also hand back a non-zero
        wid this index has never seen -- e.g. when the last document holding
        a word was unindexed (the wid stays in the lexicon but leaves
        _wordinfo), or when a lexicon shared across indices learned a word
        from some other index.  A word is in-vocabulary for this index if
        and only if _wordinfo.has_key(wid); wid 0 must never be a key.

        ``self._docweight``: docid -> doc weight, with the weight semantics
        again left to the concrete indexer.

        ``self._docwords``: docid -> WidCode'd list of wids; used for
        un-indexing and for phrase search.

        ``self.length`` / ``self.document_count``: BTree Length objects,
        giving efficient length computation without write conflicts.
        """
        self._lexicon = lexicon
        self._wordinfo = IOBTree()
        self._docweight = IIBTree()
        self._docwords = IOBTree()
        self.length = Length()
        self.document_count = Length()
Пример #4
0
 def search_phrase(self, phrase):
     """Return an IIBTree of docid -> weight for documents containing
     *phrase* as a contiguous word sequence."""
     wids = self._lexicon.termToWordIds(phrase)
     if len(self._remove_oov_wids(wids)) != len(wids):
         # Some word is out-of-vocabulary, so no document can match.
         return IIBTree()
     hits = mass_weightedIntersection(self._search_wids(wids))
     if not hits:
         return hits
     # Keep only hits whose encoded word list actually contains the
     # phrase's wid sequence contiguously.
     code = WidCode.encode(wids)
     matches = IIBTree()
     for docid, weight in hits.items():
         if self._docwords[docid].find(code) >= 0:
             matches[docid] = weight
     return matches
Пример #5
0
 def testIdentity(self):
     """A single input with weight 1 must round-trip unchanged through
     both union and intersection."""
     tree = IIBTree([(1, 2)])
     bucket = IIBucket([(1, 2)])
     for mapping in (tree, bucket):
         for combine in (mass_weightedUnion, mass_weightedIntersection):
             result = combine([(mapping, 1)])
             self.assertEqual(len(result), 1)
             self.assertEqual(list(result.items()), list(mapping.items()))
Пример #6
0
 def __init__(self, id=None, **kwargs):
     """Set up the listing together with its child, score and
     reward-ratio indexes."""
     super(TracListing, self).__init__(id, **kwargs)
     # Parent-to-child index: int parent ticket id ->
     # PersistentList of int child ticket ids.
     self._children = IOBTree()
     # int (ticket#) -> int (sum/score)
     self._scores = IIBTree()
     # int (ticket#) -> float (ratio)
     self._reward = IOBTree()
    def _apply_index(self, request):
        """Apply the index to query parameters given in 'request'.

        The argument should be a mapping object.

        If the request does not contain the needed parameters, then
        None is returned.

        If the request contains a parameter with the name of the
        column and this parameter is either a Record or a class
        instance then it is assumed that the parameters of this index
        are passed as attribute (Note: this is the recommended way to
        pass parameters since Zope 2.4)

        Otherwise two objects are returned.  The first object is a
        ResultSet containing the record numbers of the matching
        records.  The second object is a tuple containing the names of
        all data fields used.
        """
        if query_blocker.blocked:
            return
        record = parseIndexRequest(request, self.id)
        if record.keys is None:
            return None
        template_params = {
            'keys': record.keys,
        }
        query_body = self._apply_template(template_params)
        logger.info(query_body)
        es_kwargs = dict(
            index=index_name(),
            body=query_body,
            size=BATCH_SIZE,
            scroll='1m',
            _source_include=['rid'],
        )
        es = get_query_client()
        result = es.search(**es_kwargs)
        # initial return value, other batches to be applied

        def score(record):
            # Scale the float relevance score to an int for IIBTree storage.
            return int(10000 * float(record['_score']))

        retval = IIBTree()
        for r in result['hits']['hits']:
            retval[r['_source']['rid']] = score(r)

        total = result['hits']['total']
        if total > BATCH_SIZE:
            sid = result['_scroll_id']
            counter = BATCH_SIZE
            while counter < total:
                result = es.scroll(scroll_id=sid, scroll='1m')
                # Bug fix: Elasticsearch may return a new scroll id on each
                # scroll request; always use the most recently returned one.
                sid = result.get('_scroll_id', sid)
                for record in result['hits']['hits']:
                    retval[record['_source']['rid']] = score(record)
                counter += BATCH_SIZE
        return retval, (self.id,)
Пример #8
0
 def clear(self):
     """ Complete reset """
     # Fresh, empty index structures; presumably _index maps indexed
     # value -> docids and _unindex docid -> value -- confirm against
     # the owning index class.
     self._index = IOBTree()
     self._unindex = IIBTree()
     self._length = Length()
     # Create the change counter lazily on first reset; on later resets
     # just bump it so observers notice the content changed.
     if self._counter is None:
         self._counter = Length()
     else:
         self._increment_counter()
Пример #9
0
 def clear(self):
     """Reset all persistent structures of this index to empty."""
     self._index = IITreeSet()
     self._index_length = BTrees.Length.Length()
     # Which boolean value the _index set represents; starts as 1 (True).
     self._index_value = 1
     self._unindex = IIBTree()
     self._length = BTrees.Length.Length()
     # Create the change counter lazily; afterwards bump it so that
     # counter-based caches are invalidated.
     if self._counter is None:
         self._counter = BTrees.Length.Length()
     else:
         self._increment_counter()
Пример #10
0
 def __init__(self, name, root):
     """Initialize the version container.

     m_order maintains a newest-first mapping of int -> version id.
     m_date maintains a mapping of a packed date (int # of minutes
     since the epoch) to a lookup key in m_order. The two structures
     are separate because we only support minute precision for date
     lookups (and multiple versions could be added in a minute).
     """
     # Creation timestamp (seconds since the epoch).
     self.date_created = time.time()
     self.m_order = IOBTree()
     self.m_date = IIBTree()
     self.name = name
     self.root = root
Пример #11
0
 def testScalarMultiply(self):
     """Weighting a single input by a scalar multiplies every value,
     for both union and intersection."""
     tree = IIBTree([(1, 2), (2, 3), (3, 4)])
     allkeys = [1, 2, 3]
     bucket = IIBucket(tree)
     for mapping in (tree, bucket):
         self.assertEqual(list(mapping.keys()), allkeys)
         for combine in (mass_weightedUnion, mass_weightedIntersection):
             for factor in (0, 1, 5, 10):
                 result = combine([(mapping, factor)])
                 self.assertEqual(allkeys, list(result.keys()))
                 for key in mapping.keys():
                     self.assertEqual(mapping[key] * factor, result[key])
Пример #12
0
 def test_walk_w_normal_btree(self):
     """walk() on a healthy, populated BTree is not implemented yet."""
     from BTrees.IIBTree import IIBTree
     obj = IIBTree()
     for i in range(1000):
         obj[i] = i
     walker = self._makeOne(obj)
     # Cleanup: the original test built unused locals (path, parent,
     # is_mapping, keys, kids, lo, hi) that walk() never receives --
     # walk() takes no arguments -- so they were dead code.
     self.assertRaises(NotImplementedError, walker.walk)
Пример #13
0
 def _mass_add_wordinfo(self, wid2weight, docid):
     """Record *docid*'s weight for every wid in *wid2weight* in one
     pass, promoting small dicts to IIBTrees at DICT_CUTOFF."""
     dict_type = type({})
     lookup = self._wordinfo.get
     fresh_words = 0
     for wid, weight in wid2weight.items():
         mapping = lookup(wid)
         if mapping is None:
             # First document ever seen for this wid.
             mapping = {}
             fresh_words += 1
         elif (isinstance(mapping, dict_type)
               and len(mapping) == self.DICT_CUTOFF):
             # The small-dict representation is now large enough to
             # justify a BTree.
             mapping = IIBTree(mapping)
         mapping[docid] = weight
         # Reassignment is required so persistence notices the change.
         self._wordinfo[wid] = mapping
     self.length.change(fresh_words)
Пример #14
0
def optimize_dateindex(index):
    """Migrate ``index._unindex`` from an OIBTree to an IIBTree,
    committing a savepoint every 10000 items to bound memory use."""
    old_unindex = index._unindex
    if isinstance(old_unindex, IIBTree):
        # Already migrated; nothing to do.
        return
    index._unindex = _unindex = IIBTree()
    # Consistency fix: pass lazy %-style arguments to the logger instead
    # of pre-formatting with '%' (matches the other variant of this
    # function and avoids formatting when the level is disabled).
    logger.info('Converting to IIBTree for index `%s`.', index.getId())
    for pos, (k, v) in enumerate(old_unindex.items()):
        _unindex[k] = v
        if pos and pos % 10000 == 0:
            transaction.savepoint(optimistic=True)
            logger.info('Processed %s items.', pos)

    transaction.savepoint(optimistic=True)
    logger.info('Finished conversion.')
Пример #15
0
    def insertDocument(self, docid, widlist):
        """Index *docid*: delegate to Storage, then record the term
        frequency of every wid appearing in *widlist*."""
        Storage.insertDocument(self, docid, widlist)

        # wid -> number of occurrences in this document.
        # Fixes: dict.has_key() does not exist on Python 3 (dict.get()
        # works on both 2 and 3); the unused ``num_wids`` local and the
        # "occurences" typo were removed as well.
        occurrences = {}
        for wid in widlist:
            occurrences[wid] = occurrences.get(wid, 0) + 1

        self._frequencies[docid] = IIBTree()
        tree = self._frequencies[docid]
        for wid, num in occurrences.items():
            tree[wid] = num
Пример #16
0
    def insertForwardIndexEntry(self, entry, documentId, score=1):
        """Uses the information provided to update the indexes.

        The basic logic for choice of data structure is based on
        the number of entries as follows:

            1      tuple
            2-3    dictionary
            4+     bucket.

        :param entry: the indexed value, used as key into self._index.
        :param documentId: integer document id to record under *entry*.
        :param score: integer score for this (entry, documentId) pair;
            defaults to 1.
        """

        index = self._index
        indexRow = index.get(entry, None)

        if indexRow is not None:
            if type(indexRow) is TupleType:
                # Tuples are only used for rows which have only
                # a single entry.  Since we now need more, we'll
                # promote it to a mapping object (dictionary).

                # First, make sure we're not already in it, if so
                # update the score if necessary.
                if indexRow[0] == documentId:
                    if indexRow[1] != score:
                        indexRow = (documentId, score)
                        index[entry] = indexRow
                else:
                    indexRow = {
                        indexRow[0]: indexRow[1],
                        documentId: score,
                    }
                    index[entry] = indexRow
            else:
                if indexRow.get(documentId, -1) != score:
                    # score changed (or new entry)

                    if type(indexRow) is DictType:
                        indexRow[documentId] = score
                        if len(indexRow) > 3:
                            # Big enough to give its own database record
                            indexRow = IIBTree(indexRow)
                        # Reassign so the persistence machinery sees the
                        # change (dicts are not persistence-aware).
                        index[entry] = indexRow
                    else:
                        # Not a dict -- presumably already an IIBTree,
                        # which persists in-place mutation.
                        indexRow[documentId] = score
        else:
            # We don't have any information at this point, so we'll
            # put our first entry in, and use a tuple to save space
            index[entry] = (documentId, score)
Пример #17
0
def optimize_dateindex(index):
    """Convert ``index._unindex`` from an OIBTree into an IIBTree."""
    old_unindex = index._unindex
    if isinstance(old_unindex, IIBTree):
        # Already an IIBTree -- nothing to convert.
        return
    index._unindex = _unindex = IIBTree()
    logger.info('Converting to IIBTree for index `%s`.', index.getId())
    for position, (key, value) in enumerate(old_unindex.items()):
        _unindex[key] = value
        # Savepoint every 10000 items so huge indexes stay within memory.
        # Note: flake8 erroneously complains about module formatter.
        if position and position % 10000 == 0:  # noqa S001
            transaction.savepoint(optimistic=True)
            logger.info('Processed %s items.', position)

    transaction.savepoint(optimistic=True)
    logger.info('Finished conversion.')
Пример #18
0
    def testMany(self):
        """Union/intersection of N randomly ordered, weighted IIBTrees."""
        import random
        N = 15  # number of IIBTrees to feed in
        L = []
        commonkey = N * 1000
        allkeys = {commonkey: 1}
        for i in range(N):
            t = IIBTree()
            t[commonkey] = i
            for j in range(N - i):
                key = i + j
                allkeys[key] = 1
                t[key] = N * i + j
            L.append((t, i + 1))
        random.shuffle(L)
        # Bug fix: on Python 3, dict.keys() returns a view with no .sort();
        # sorted() over the dict works on both Python 2 and 3.
        allkeys = sorted(allkeys)

        # Test the union: every key contributes its weighted sum.
        expected = []
        for key in allkeys:
            total = 0  # renamed from ``sum`` to stop shadowing the builtin
            for t, w in L:
                if key in t:  # ``in`` instead of the Py2-only has_key()
                    total += t[key] * w
            expected.append((key, total))
        got = mass_weightedUnion(L)
        self.assertEqual(expected, list(got.items()))

        # Test the intersection: a key must be present in every tree.
        expected = []
        for key in allkeys:
            total = 0
            for t, w in L:
                if key in t:
                    total += t[key] * w
                else:
                    break
            else:
                # We didn't break out of the loop so it's in the intersection.
                expected.append((key, total))
        got = mass_weightedIntersection(L)
        self.assertEqual(expected, list(got.items()))
Пример #19
0
 def insert(self, idx, results, relnames=None, treePrefix=None):
     """Accumulate per-depth counts for each brain's path (and all of
     its ancestor paths) into *idx*, a mapping of path -> IIBTree of
     depth -> count.

     NOTE(review): ``relnames`` is accepted but never used in this
     body -- presumably kept for interface compatibility; confirm
     against callers.
     """
     unindex = None
     for brain in results:
         # Use the first brain to get a reference to the index, then reuse
         # that reference
         unindex = unindex or brain.global_catalog._catalog.indexes[
             'path']._unindex
         path = brain.getPath()
         if treePrefix and not path.startswith(treePrefix):
             # Prefer a recorded path for this RID that lies under the
             # requested tree prefix.
             for p in unindex[brain.getRID()]:
                 if p.startswith(treePrefix):
                     path = p
                     break
         # Keep only what follows the first three '/' separators.
         path = path.split('/', 3)[-1]
         for depth in xrange(path.count('/') + 1):
             comp = idx.setdefault(path, IIBTree())
             comp[depth] = comp.get(depth, 0) + 1
             # Strip the last path segment to count the parent next.
             path = path.rsplit('/', 1)[0]
Пример #20
0
 def __init__(self, datafs, writable=0, trans=0, pack=0):
     """Open (or create) the ZODB-backed mail index stored in *datafs*.

     :param datafs: path to the FileStorage data file.
     :param writable: open read/write when true, read-only otherwise.
     :param trans: stored as trans_limit -- presumably a commit
         interval; confirm against the methods that use it.
     :param pack: stored as pack_limit -- presumably a pack interval;
         confirm against the methods that use it.
     """
     self.trans_limit = trans
     self.pack_limit = pack
     self.trans_count = 0
     self.pack_count = 0
     self.stopdict = get_stopdict()
     self.mh = mhlib.MH()
     self.filestorage = FileStorage(datafs, read_only=(not writable))
     self.database = DB(self.filestorage)
     self.connection = self.database.open()
     self.root = self.connection.root()
     # Fetch persistent structures from the root, creating each one on
     # first run (a missing key means a brand-new database).
     try:
         self.index = self.root["index"]
     except KeyError:
         self.index = self.root["index"] = TextIndex()
     try:
         self.docpaths = self.root["docpaths"]
     except KeyError:
         self.docpaths = self.root["docpaths"] = IOBTree()
     try:
         self.doctimes = self.root["doctimes"]
     except KeyError:
         self.doctimes = self.root["doctimes"] = IIBTree()
     try:
         self.watchfolders = self.root["watchfolders"]
     except KeyError:
         self.watchfolders = self.root["watchfolders"] = {}
     # Non-persistent reverse map (path -> docid), rebuilt on every start.
     self.path2docid = OIBTree()
     for docid in self.docpaths.keys():
         path = self.docpaths[docid]
         self.path2docid[path] = docid
     try:
         self.maxdocid = max(self.docpaths.keys())
     except ValueError:
         # max() of an empty sequence: no documents indexed yet.
         self.maxdocid = 0
     print len(self.docpaths), "Document ids"
     print len(self.path2docid), "Pathnames"
     print self.index.lexicon.length(), "Words"
Пример #21
0
    def _add_wordinfo(self, wid, f, docid):
        """Record weight *f* for word *wid* in document *docid*.

        A wordinfo is stored as a plain dict while it has fewer than
        DICT_CUTOFF docids, then promoted to an IIBTree.  Rationale:
        the pickle of a dict is substantially smaller than that of an
        IIBTree for small mappings (a 10-element dict pickles to about
        half the size of the equivalent IIBTree, and 10 is 2/3 of 2**4,
        the dict resize threshold, hence the cutoff of 10).  For large
        mappings the IIBTree wins in-memory: it stores two C arrays of
        ints (keys and values) and holds up to 120 pairs per bucket,
        whereas a live dict of the same size uses more space even though
        a 500-element dict pickle is 92% the size of the IIBTree's.
        """
        doc2score = self._wordinfo.get(wid)
        if doc2score is None:
            # First document for this wid: new in-vocabulary word.
            doc2score = {}
            self.length.change(1)
        elif (isinstance(doc2score, type({}))
              and len(doc2score) == self.DICT_CUTOFF):
            # Called once per update, so check the cutoff each time.
            # Checking the type first avoids an expensive len(IIBTree)
            # when the mapping was already promoted.
            doc2score = IIBTree(doc2score)
        doc2score[docid] = f
        self._wordinfo[wid] = doc2score  # not redundant:  Persistency!
Пример #22
0
def convert_to_booleanindex(catalog, index):
    """Convert *index* (inside *catalog*) into a BooleanIndex in place.

    _unindex becomes an IIBTree of docid -> 0/1 and _index becomes the
    IITreeSet holding whichever value set is smaller.
    """
    if isinstance(index, BooleanIndex):
        return
    # Consistency fix: lazy %-style logger arguments instead of eager
    # '%' formatting (matches the other conversion helpers).
    logger.info('Converting index `%s` to BooleanIndex.', index.getId())
    index.__class__ = BooleanIndex
    index._p_changed = True
    catalog._catalog._p_changed = True

    # convert _unindex from IOBTree to IIBTree
    sets = {0: IITreeSet(), 1: IITreeSet()}
    old_unindex = index._unindex
    index._unindex = _unindex = IIBTree()
    for k, v in old_unindex.items():
        # docid to value (True, False)
        value = int(bool(v))
        _unindex[k] = value
        sets[value].add(k)
    del old_unindex

    # convert _index from OOBTree to IITreeSet and set lengths
    false_length = len(sets[0])
    true_length = len(sets[1])
    index._length = Length(false_length + true_length)
    # we put the smaller set into the index
    if false_length < true_length:
        index._index_value = 0
        index._index_length = Length(false_length)
        index._index = sets[0]
        del sets[1]
    else:
        index._index_value = 1
        index._index_length = Length(true_length)
        index._index = sets[1]
        del sets[0]
    transaction.savepoint(optimistic=True)
    logger.info('Finished conversion.')
    def _apply_index(self, request):
        """Apply the index to query parameters given in 'request'.

        The argument should be a mapping object.

        If the request does not contain the needed parameters, then
        None is returned.

        If the request contains a parameter with the name of the
        column and this parameter is either a Record or a class
        instance then it is assumed that the parameters of this index
        are passed as attribute (Note: this is the recommended way to
        pass parameters since Zope 2.4)

        Otherwise two objects are returned.  The first object is a
        ResultSet containing the record numbers of the matching
        records.  The second object is a tuple containing the names of
        all data fields used.
        """
        record = parseIndexRequest(request, self.id)
        if record.keys is None:
            return None
        keys = []
        for key in record.keys:
            # Strip characters that would break the query string, then
            # make sure the key is bytes.
            key = key.replace("\\", "").replace('"', "")
            if not isinstance(key, bytes):
                key = key.encode("utf8")
            keys.append(key)
        template_params = {"keys": keys}
        __traceback_info__ = "template parameters: {0}".format(template_params)
        query_body = self._apply_template(template_params)
        logger.info(query_body)
        es_kwargs = dict(
            index=index_name(),
            body=query_body,
            size=BATCH_SIZE,
            scroll="1m",
            _source_includes=["rid"],
        )
        es = get_query_client()
        try:
            result = es.search(**es_kwargs)
        except RequestError:
            logger.info("Query failed:\n{0}".format(query_body))
            return None
        except TransportError:
            logger.exception("ElasticSearch failed")
            return None
        # initial return value, other batches to be applied

        def score(record):
            # Scale the float relevance score to an int for IIBTree storage.
            return int(10000 * float(record["_score"]))

        retval = IIBTree()
        for r in result["hits"]["hits"]:
            retval[r["_source"]["rid"]] = score(r)

        total = result["hits"]["total"]["value"]
        if total > BATCH_SIZE:
            sid = result["_scroll_id"]
            counter = BATCH_SIZE
            while counter < total:
                result = es.scroll(scroll_id=sid, scroll="1m")
                # Bug fix: Elasticsearch may return a new scroll id on each
                # scroll request; always use the most recently returned one.
                sid = result.get("_scroll_id", sid)
                for record in result["hits"]["hits"]:
                    retval[record["_source"]["rid"]] = score(record)
                counter += BATCH_SIZE
        return retval, (self.id,)
Пример #24
0
    def _apply_index(self, request):
        """Apply the index to query parameters given in 'request'.

        The argument should be a mapping object.

        If the request does not contain the needed parameters, then
        None is returned.

        If the request contains a parameter with the name of the
        column and this parameter is either a Record or a class
        instance then it is assumed that the parameters of this index
        are passed as attribute (Note: this is the recommended way to
        pass parameters since Zope 2.4)

        Otherwise two objects are returned.  The first object is a
        ResultSet containing the record numbers of the matching
        records.  The second object is a tuple containing the names of
        all data fields used.
        """
        config = get_configuration()
        timeout = getattr(config, 'request_timeout', 20)
        search_fields = getattr(config, 'search_fields', None)
        if not search_fields:
            search_fields = SEARCH_FIELDS
        search_fields = search_fields.split()
        logger.info(search_fields)
        if query_blocker.blocked:
            return
        record = parseIndexRequest(request, self.id)
        if record.keys is None:
            return None
        es = get_query_client()
        search = Search(using=es, index=index_name())
        search = search.params(request_timeout=timeout)
        # Deterministic sort order so search_after pagination is stable.
        search = search.sort('rid', '_id')
        search = search.source(include='rid')
        query_string = record.keys[0].decode('utf8')
        logger.info(query_string)
        if '*' in query_string:
            query_string = query_string.replace('*', ' ')
        query_string = query_string.strip()
        search = search.query('simple_query_string',
                              query=query_string,
                              fields=search_fields)
        results_count = search.count()
        search = search.params(request_timeout=timeout,
                               size=BATCH_SIZE,
                               track_scores=True)
        # setup highlighting
        for field in search_fields:
            name = field.split('^')[0]  # drop any boost suffix ("title^2")
            if name == 'title':
                # title shows up in results anyway
                continue
            search = search.highlight(name, fragment_size=FRAGMENT_SIZE)

        # initial return value, other batches to be applied
        retval = IIBTree()
        highlights = OOBTree()
        last_seen = None
        count = 0
        # NOTE(review): relies on integer division ('/') and xrange below,
        # so this code assumes Python 2; on Python 3 '/' would yield a
        # float and xrange does not exist.
        batch_count = results_count / BATCH_SIZE
        if results_count % BATCH_SIZE != 0:
            batch_count = batch_count + 1
        for i in xrange(batch_count):
            if last_seen is not None:
                # Paginate with search_after using the previous batch's
                # last (rid, _id) sort key.
                search = search.update_from_dict({'search_after': last_seen})
            try:
                results = search.execute(ignore_cache=True)
            except TransportError:
                # No es client, return empty results
                logger.exception('ElasticSearch client not available.')
                return IIBTree(), (self.id, )

            for r in results:
                rid = getattr(r, 'rid', None)
                if rid is not None:
                    retval[rid] = int(10000 * float(r.meta.score))
                    # Index query returns only rids, so we need
                    # to save highlights for later use
                    highlight_list = []
                    if getattr(r.meta, 'highlight', None) is not None:
                        for key in dir(r.meta.highlight):
                            highlight_list.extend(r.meta.highlight[key])
                    highlights[r.meta.id] = highlight_list
                # NOTE(review): last_seen is updated even when rid is None,
                # feeding [None, id] into search_after -- confirm intended.
                last_seen = [rid, r.meta.id]
                count = count + 1

        # store highlights
        try:
            annotations = IAnnotations(self.REQUEST)
            annotations[HIGHLIGHT_KEY] = highlights
        except TypeError:
            # maybe we are in a test
            pass

        return retval, (self.id, )
Пример #25
0
 def clear(self):
     """Reset this index's persistent structures to their empty state."""
     self._index = IITreeSet()
     self._index_length = BTrees.Length.Length()
     # Which boolean value the _index set represents; starts as 1 (True).
     self._index_value = 1
     self._unindex = IIBTree()
     self._length = BTrees.Length.Length()
Пример #26
0
 def clear(self):
     """ Complete reset """
     # Replace both index structures and the length counter with fresh,
     # empty instances.
     self._index = IOBTree()
     self._unindex = IIBTree()
     self._length = Length()
Пример #27
0
    def _apply_index(self, request, cid=''):
        """Apply query specified by request, a mapping containing the query.

        Returns two objects on success: the resultSet containing the
        matching record numbers, and a tuple containing the names of
        the fields used.

        Returns None if request is not valid for this index.

        NOTE(review): ``cid`` is accepted but unused in this body --
        presumably part of the pluggable-index interface.
        """
        if disable_solr:
            return None

        cm = self.connection_manager
        q = []  # List of query texts to pass as "q"
        # NOTE(review): ``q`` above is never appended to or read below.
        queried = []  # List of field names queried
        stored = []  # List of stored field names
        solr_params = {}

        # Get the Solr parameters from the catalog query
        if request.has_key('solr_params'):
            solr_params.update(request['solr_params'])

        # Include parameters from field queries
        for field in cm.schema.fields:
            name = field.name
            if field.stored:
                stored.append(name)
            if not request.has_key(name):
                continue

            field_query = self._decode_param(request[name])
            field_params = field.handler.parse_query(field, field_query)
            if field_params:
                queried.append(name)
                for k in field_params:
                    to_add = field_params[k]
                    if k not in solr_params:
                        solr_params[k] = to_add
                    else:
                        # add to the list, promoting a scalar to a
                        # one-element list first
                        v = solr_params[k]
                        if not isinstance(v, list):
                            v = [v]
                            solr_params[k] = v
                        if isinstance(to_add, basestring):
                            v.append(to_add)
                        else:
                            v.extend(to_add)

        if not solr_params:
            return None

        solr_params['fields'] = cm.schema.uniqueKey
        # We only add highlighting for any field that is marked as stored.
        # 'queried' returns the list of fields queried,
        # a specific list of names will narrow the list.
        to_highlight = []
        hfields = solr_params.get('highlight', None)
        if hfields and stored:
            if hfields == 'queried':
                solr_params['highlight'] = queried
            for fname in hfields:
                if fname in stored:
                    to_highlight.append(fname)
                else:
                    log.debug(
                        "Requested field isn't marked as 'stored', "
                        "cannot enable highlighting: %s", fname)
            solr_params['highlight'] = to_highlight
        if not solr_params.get('q'):
            # Solr requires a 'q' parameter, so provide an
            # all-inclusive one. If the query is using dismax, then
            # use the 'q.alt' parameter since dismax does not know how
            # to parse '*:*' in the 'q' param.
            if solr_params.get('defType', '') == 'dismax':
                solr_params['q.alt'] = '*:*'
                solr_params['q'] = ''
            else:
                solr_params['q'] = '*:*'

        # Additional fields can be added into the query above in the
        # field_params check. The 'q' variable cannot be sent to solr
        # multiple times (as is the case when it is a list). Only the
        # first instance of the 'q' param will be recognized by solr, so
        # we turn it back into a string here.
        #
        # XXX: Should the logic for field_params be changed above?
        if isinstance(solr_params['q'], list):
            solr_params['q'] = ' '.join(solr_params['q'])

        # Decode all strings using list from `expected_encodings`,
        # then transcode to UTF-8
        transcoded_params = self._transcode_params(solr_params)

        log.debug("querying: %r", solr_params)
        response = cm.connection.query(**transcoded_params)
        if request.has_key('solr_callback'):
            # Call a function with the Solr response object
            callback = request['solr_callback']
            callback(response)

        # Since highlighting can be either enabled by default in the Solr
        # config, or as a query parameter we just check to see if the
        # response has any highlighting returned.
        if hasattr(response, 'highlighting'):
            catalog = get_catalog(self, name=self.catalog_name)
            if catalog:
                # Key the brain class on the exact (field, value) pairs
                # queried, so a brain is only reused for identical queries.
                hkey = tuple(
                    sorted([(fname, request.get(fname)) for fname in queried]))
                if not issubclass(catalog._v_brains, HighlightingBrain) or \
                   (hasattr(catalog._v_brains, 'highlighting_key') and \
                    catalog._v_brains.highlighting_key != hkey) or \
                   (hasattr(catalog._v_brains, 'catalog_name') and \
                    catalog._v_brains.catalog_name != self.catalog_name):
                    # We use an inline class here so that the brain has
                    # enough data to retrieve the stored highlighting data
                    class myhighlightingbrains(HighlightingBrain):
                        highlighting_key = hkey
                        highlighting = response.highlighting

                    catalog.useBrains(myhighlightingbrains)
                    log.debug("Creating new custom brain class, hkey: '%s'",
                              hkey)
                else:
                    catalog._v_brains.highlighting = response.highlighting
                    log.debug("Using existing custom brain class, hkey: '%s'",
                              hkey)
            else:
                log.debug(
                    "Cannot retrieve catalog '%s', highlighting unavailable",
                    self.catalog_name)

        uniqueKey = cm.schema.uniqueKey
        result = IIBTree()
        for r in response:
            # Scale the float Solr score to an int for IIBTree storage.
            result[int(r[uniqueKey])] = int(r.get('score', 0) * 1000)

        return result, queried
Пример #28
0
 def clear(self):
     """Reset the index to an empty state.

     All internal mappings are rebound to fresh, empty BTrees; the
     previous structures are simply dropped (left to garbage
     collection) rather than emptied in place.
     """
     self._doc2wid = IOBTree()  # docid -> [wordids]
     self._wid2doc = IOBTree()  # wordid -> [docids]
     self._docweight = IIBTree()  # docid -> (# terms in document)
     self._length = BTrees.Length.Length()  # document count (conflict-aware counter)
Пример #29
0
    def _apply_index(self, request):
        """Apply the index to query parameters given in 'request'.

        The argument should be a mapping object.

        If the request does not contain the needed parameters, then
        None is returned.

        If the request contains a parameter with the name of the
        column and this parameter is either a Record or a class
        instance then it is assumed that the parameters of this index
        are passed as attribute (Note: this is the recommended way to
        pass parameters since Zope 2.4)

        Otherwise two objects are returned.  The first object is a
        ResultSet containing the record numbers of the matching
        records.  The second object is a tuple containing the names of
        all data fields used.
        """
        # Pull tunables from the add-on configuration, falling back to
        # module-level defaults when unset or empty.
        config = get_configuration()
        timeout = getattr(config, 'request_timeout', 20)
        search_fields = getattr(config, 'search_fields', None)
        if not search_fields:
            search_fields = SEARCH_FIELDS
        # search_fields is a whitespace-separated string of field specs
        # (apparently with optional boost suffixes such as 'title^2',
        # given the split('^') below) -- turn it into a list.
        search_fields = search_fields.split()
        if query_blocker.blocked:
            # Querying is globally blocked (e.g. during some maintenance
            # operation); behave as if this index was not addressed.
            return
        record = parseIndexRequest(request, self.id)
        if record.keys is None:
            # The request carries no parameters for this index.
            return None
        es = get_query_client()
        search = Search(using=es, index=index_name())
        search = search.params(
            request_timeout=timeout,
            size=BATCH_SIZE,
            preserve_order=True,
        )
        # Only the Zope catalog record id ('rid') is needed per hit.
        search = search.source(include='rid')
        query_string = record.keys[0]
        if query_string and query_string.startswith('*'):
            # plone.app.querystring contains op sends a leading *, remove it
            query_string = query_string[1:]
        search = search.query('simple_query_string',
                              query=query_string,
                              fields=search_fields)
        # setup highlighting
        for field in search_fields:
            # Strip any boost suffix to get the bare field name.
            name = field.split('^')[0]
            if name == 'title':
                # title shows up in results anyway
                continue
            search = search.highlight(name, fragment_size=FRAGMENT_SIZE)

        try:
            # scan() iterates over all matching hits (scroll-style),
            # not just the first page.
            result = search.scan()
        except TransportError:
            # No es client, return empty results
            logger.exception('ElasticSearch client not available.')
            return IIBTree(), (self.id, )
        # initial return value, other batches to be applied

        retval = IIBTree()
        highlights = OOBTree()
        for r in result:
            if getattr(r, 'rid', None) is None:
                # something was indexed with no rid. Ignore for now.
                # this is only for highlights, so no big deal if we
                # skip one
                continue
            # Scale the float relevance score to an int, since IIBTree
            # values must be integers.
            retval[r.rid] = int(10000 * float(r.meta.score))
            # Index query returns only rids, so we need
            # to save highlights for later use
            highlight_list = []
            if getattr(r.meta, 'highlight', None) is not None:
                for key in dir(r.meta.highlight):
                    highlight_list.extend(r.meta.highlight[key])
            highlights[r.meta.id] = highlight_list

        # store highlights
        try:
            # Stash the highlight fragments on the current request so a
            # later component (presumably the brain/view layer) can read
            # them via HIGHLIGHT_KEY.
            annotations = IAnnotations(self.REQUEST)
            annotations[HIGHLIGHT_KEY] = highlights
        except TypeError:
            # maybe we are in a test
            pass

        return retval, (self.id, )
Пример #30
0
 def _makeOne(self):
     """Return a fresh, empty IIBTree instance (the object under test)."""
     from BTrees.IIBTree import IIBTree as _btree_factory
     return _btree_factory()