Exemplo n.º 1
0
    def group(self, seq):
        sortIndex = self._sortIndex
        sortReverse = self._sortReverse
        ns = len(seq)
        ni = len(sortIndex)
        if ns >= 0.1 * ni:
            # result large compared to index -- sort via index
            handled = IISet()
            hn = 0
            _load = getattr(sortIndex, '_load', None)
            if _load is None:
                # not an optimized index
                items = sortIndex.items()

                _load = lambda (x1, x2): x2
                if sortReverse: items.reverse()
            elif sortReverse:
                gRO = getattr(sortIndex, 'getReverseOrder', None)
                items = gRO and gRO()
                if items is None:
                    items = list(sortIndex._index.keys())
                    items.reverse()
            else:
                items = sortIndex._index.keys()
            for i in items:
                ids = intersection(seq, _load(i))
                if ids:
                    handled.update(ids)
                    hn += len(ids)
                    yield i, ids
            if hn != len(seq): yield None, difference(seq, handled)
        else:
            # result relatively small -- sort via result
            m = OOBTree()
            keyFor = getattr(sortIndex, 'keyForDocument', None)
            # work around "nogopip" bug: it defines "keyForDocument" as an integer
            if not callable(keyFor):
                # this will fail, when the index neither defines a reasonable
                # "keyForDocument" nor "documentToKeyMap". In this case,
                # the index cannot be used for sorting.
                keyFor = lambda doc, map=sortIndex.documentToKeyMap(): map[doc]
            noValue = IITreeSet()
            for doc in seq.keys():
                try:
                    k = keyFor(doc)
                except KeyError:
                    noValue.insert(doc)
                    continue
                l = m.get(k)
                if l is None: l = m[k] = IITreeSet()
                l.insert(doc)
            items = m.items()
            if sortReverse:
                items = list(items)
                items.reverse()
            for i in items:
                yield i
            if noValue: yield None, noValue
Exemplo n.º 2
0
 def group(self, seq):
   sortIndex = self._sortIndex; sortReverse = self._sortReverse
   ns = len(seq); ni = len(sortIndex)
   if ns >= 0.1 * ni:
     # result large compared to index -- sort via index
     handled = IISet(); hn = 0
     _load = getattr(sortIndex, '_load', None)
     if _load is None:
       # not an optimized index
       items = sortIndex.items()
       
       _load = lambda (x1, x2): x2
       if sortReverse: items.reverse()
     elif sortReverse:
       gRO = getattr(sortIndex, 'getReverseOrder', None)
       items = gRO and gRO()
       if items is None:
         items = list(sortIndex._index.keys()); items.reverse()
     else: items = sortIndex._index.keys()
     for i in items:
       ids = intersection(seq, _load(i))
       if ids:
         handled.update(ids); hn += len(ids)
         yield i, ids
     if hn != len(seq): yield None, difference(seq, handled)
   else:
     # result relatively small -- sort via result
     m = OOBTree()
     keyFor = getattr(sortIndex, 'keyForDocument', None)
     # work around "nogopip" bug: it defines "keyForDocument" as an integer
     if not callable(keyFor):
       # this will fail, when the index neither defines a reasonable
       # "keyForDocument" nor "documentToKeyMap". In this case,
       # the index cannot be used for sorting.
       keyFor = lambda doc, map=sortIndex.documentToKeyMap(): map[doc]
     noValue = IITreeSet()
     for doc in seq.keys():
       try: k = keyFor(doc)
       except KeyError: noValue.insert(doc); continue
       l = m.get(k)
       if l is None: l = m[k] = IITreeSet()
       l.insert(doc)
     items = m.items()
     if sortReverse: items = list(items); items.reverse()
     for i in items: yield i
     if noValue: yield None, noValue
Exemplo n.º 3
0
    def lookupWordsBySimilarity(self, word):       
        """ perform a similarity lookup """

        lst = self._lexicon.getSimiliarWords(word)

        docids = IISet()
        used_words = {} 

        getwid = self._lexicon.getWordId

        for word, threshold in lst:
            used_words[word] = threshold
            wid = getwid(word)

            docids.update( self._storage.get(wid) )

        return ResultSet(docids, used_words)
Exemplo n.º 4
0
    def _lookup(self, words, do_autoexpand=1):
        """ search a word or a list of words in the lexicon and 
            return a ResultSet of found documents.
        """

        docids = IISet()
        used_words = {} 

        #  remove stopwords from data
        if self.use_stopwords:
            words = self.use_stopwords.process( words ) 

        if self.use_thesaurus and self.thesaurus_mode == 'expand_always':
            TH = ThesaurusRegistry.get(self.use_thesaurus)
            for word in words[:]:
                r = TH.getTermsFor(word)
                words.extend(r)

        for word in words:

            # perform casefolding if necessary
            if self.splitter_casefolding:
                word = word.lower()

            if self.use_normalizer:
                word = NormalizerRegistry.get(self.use_normalizer).process(word)    
 
            used_words[word] = 1.0

            wid = self._lexicon.getWordId(word)

            # Retrieve list of docIds for this wordid
            if wid is not None:
                docids.update( self._storage.get(wid) )

            # perform autoexpansion of terms by performing
            # a search using right-truncation
            if do_autoexpand and self.autoexpand and len(word) >= self.autoexpand_limit:
                rs = self.lookupWordsByTruncation(word, right=1)
                docids.update(rs.docIds())
                wlen = len(word)
                for w in rs.words().keys():
                    used_words[w] = TRUNC_WEIGHT[len(w)-wlen]

        return ResultSet(docids, used_words)
Exemplo n.º 5
0
 def group(self, seq):
   sortIndex = self._sortIndex; sortReverse = self._sortReverse
   ns = len(seq); ni = len(sortIndex)
   if ns >= 0.1 * ni:
     # result large compared to index -- sort via index
     handled = IISet(); hn = 0
     _load = getattr(sortIndex, '_load', None)
     if _load is None:
       # not an optimized index
       items = sortIndex.items()
       
       _load = lambda (x1, x2): x2
       if sortReverse: items.reverse()
     elif sortReverse:
       gRO = getattr(sortIndex, 'getReverseOrder', None)
       items = gRO and gRO()
       if items is None:
         items = list(sortIndex._index.keys()); items.reverse()
     else: items = sortIndex._index.keys()
     for i in items:
       ids = intersection(seq, _load(i))
       if ids:
         handled.update(ids); hn += len(ids)
         yield i, ids
     if hn != len(seq): yield None, difference(seq, handled)
   else:
     # result relatively small -- sort via result
     keyFor = sortIndex.keyForDocument; m = OOBTree()
     noValue = IITreeSet()
     for doc in seq.keys():
       try: k = keyFor(doc)
       except KeyError: noValue.insert(doc); continue
       l = m.get(k)
       if l is None: l = m[k] = IITreeSet()
       l.insert(doc)
     items = m.items()
     if sortReverse: items = list(items); items.reverse()
     for i in items: yield i
     if noValue: yield None, noValue