Example #1
File: text.py  Project: barkinet/zerodb
    def _mass_add_wordinfo(self, wid2weight, docid):
        # self._wordinfo: IOBTree of wid -> (docid -> weight) tree
        get_doc2score = self._wordinfo.get
        new_word_count = 0

        # Fill up cache for performance over the network
        wids = list(wid2weight.keys())
        parallel_traversal(self._wordinfo, wids)
        parallel_traversal(list(map(get_doc2score, wids)), [docid] * len(wids))

        for wid, weight in wid2weight.items():
            doc2score = get_doc2score(wid)
            if doc2score is None:
                doc2score = {}
                new_word_count += 1
            elif (isinstance(doc2score, dict) and
                  len(doc2score) == self.DICT_CUTOFF):
                doc2score = self.family.IF.BTree(doc2score)
            doc2score[docid] = weight
            self._wordinfo[wid] = doc2score  # not redundant: re-assigning persists the change
        try:
            self.wordCount.change(new_word_count)
        except AttributeError:
            # upgrade wordCount to Length object
            self.wordCount = Length(len(self._wordinfo))
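
A note on the helper used throughout these examples: the implementation of parallel_traversal is not shown on this page. Judging from the call sites, it accepts either a single BTree with a list of keys, or a sequence of trees paired element-wise with a sequence of keys, and warms the client cache by fetching the needed BTree buckets in parallel over the network. A naive sequential stand-in for reading the examples locally (hypothetical; not zerodb's actual implementation, which performs the fetches concurrently):

def parallel_traversal(trees, keys):
    # Naive stand-in: touch each (tree, key) pair one at a time. The real
    # helper fetches the needed buckets in parallel to cut round-trips.
    keys = list(keys)
    if hasattr(trees, "get"):        # a single BTree shared by every key
        trees = [trees] * len(keys)
    else:                            # one tree per key, e.g. a map() result
        trees = list(trees)
    for tree, key in zip(trees, keys):
        if tree is not None:         # map(get_doc2score, ...) can yield None
            _ = key in tree          # traversing to the key pulls its bucket into cache
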
Example #2
    def index_doc(self, docid, text):
        if docid in self._docwords:
            return self._reindex_doc(docid, text)

        wids = self._lexicon.sourceToWordIds(text)

        # XXX Counter is slow. If it is an issue, need to include C module
        # http://stackoverflow.com/questions/2522152/python-is-a-dictionary-slow-to-find-frequency-of-each-character
        widcnt = Counter(wids)
        widset = list(widcnt.keys())  # materialize: the keys view is reused below
        widcode = PersistentWid.encode_wid(wids if self.keep_phrases else widset)
        self._docwords[docid] = widcode

        if widset:
            weights, lengths = self._get_doctrees(widset)
            docscores = self._get_widscores(widcnt, docid)
            parallel_traversal(*zip(*[(weights[w], docscores[w]) for w in widset]))
            prefetch(list(lengths.values()) + [self.documentCount])

            for w in widset:
                weights[w].add(docscores[w])
                lengths[w].change(1)

        self.documentCount.change(1)

        return len(wids)
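
The XXX note above flags collections.Counter as a possible bottleneck. What the call computes is just a wid -> frequency map; an equivalent plain-dict loop, useful as a baseline should that concern ever materialize:

from collections import Counter

wids = [3, 7, 3, 3, 9]
assert Counter(wids) == {3: 3, 7: 1, 9: 1}

# Equivalent hand-rolled frequency count
widcnt = {}
for w in wids:
    widcnt[w] = widcnt.get(w, 0) + 1
assert widcnt == Counter(wids)
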
Example #3
    def termToWordIds(self, text):
        last = _text2list(text)
        for element in self._pipeline:
            last = element.process(last)
        if len(last) > 1:
            # Prefetch all word entries in parallel before the lookups below
            parallel_traversal(self._wids, last)
        # Unknown words map to wid 0
        return [self._wids.get(word, 0) for word in last]
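
The pipeline elements above are only required to expose process(words) -> words. A hypothetical element fitting that protocol (the class name is illustrative, not part of zerodb):

class CaseNormalizer:
    """Illustrative pipeline element: lowercases every token."""
    def process(self, words):
        return [w.lower() for w in words]

assert CaseNormalizer().process(["Foo", "BAR"]) == ["foo", "bar"]
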
Example #4
File: text_okapi.py  Project: zerodb/zerodb
    def _search_wids(self, wids):
        # Bulk-fetch all the info we want to use
        if len(wids) > 1:
            parallel_traversal(self._wordinfo, wids)
        prefetch_trees([self._wordinfo[wid] for wid in wids])

        docids = list(set(itertools.chain(
            *[self._wordinfo[wid].keys() for wid in wids])))
        if len(docids) > 1:
            parallel_traversal(self._docweight, docids)

        return super(OkapiIndex, self)._search_wids(wids)
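
The docids expression flattens the per-wid docid sequences and de-duplicates them; itertools.chain.from_iterable does the same without star-unpacking an intermediate list, which may read more clearly:

import itertools

per_wid_docids = [[1, 2], [2, 3], [3]]
docids = list(set(itertools.chain.from_iterable(per_wid_docids)))
assert sorted(docids) == [1, 2, 3]
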
Example #5
File: text.py  Project: barkinet/zerodb
    def sourceToWordIds(self, text):
        if text is None:
            text = ''
        last = _text2list(text)
        for element in self._pipeline:
            last = element.process(last)
        if not isinstance(self.wordCount, Length):
            # Make sure wordCount is overridden with a BTrees.Length.Length
            self.wordCount = Length(self.wordCount())
        # Strategically unload the length value so that we get the most
        # recent value written to the database, minimizing conflicting wids.
        # Because the length is stored independently, this loads the most
        # recently stored value regardless of whether MVCC is enabled.
        self.wordCount._p_deactivate()
        parallel_traversal(self._wids, last)
        return list(map(self._getWordIdCreate, last))
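
BTrees.Length.Length, which the snippet upgrades wordCount to, is ZODB's conflict-resolving counter: concurrent change() calls merge instead of raising write conflicts. Minimal usage:

from BTrees.Length import Length

wordCount = Length(0)
wordCount.change(2)    # increment by a delta; concurrent deltas are merged
wordCount.change(-1)
assert wordCount() == 1     # calling the object returns its value
assert wordCount.value == 1
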
Example #6
    def unindex_doc(self, docid):
        if docid not in self._docwords:
            return
        wids = self.get_words(docid)
        ctr = Counter(wids)
        wids = list(ctr)
        if wids:  # guard: a document indexed with no words has no per-wid state
            weights, lengths = self._get_doctrees(wids)
            scores = self._get_widscores(ctr, docid)
            parallel_traversal(*zip(*[(weights[w], scores[w]) for w in wids]))
            for w in wids:
                lengths[w].change(-1)
                weights[w].remove(scores[w])
                if lengths[w].value == 0:
                    del self._wordinfo[w]
        del self._docwords[docid]
        self.documentCount.change(-1)
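
The weights[w] values are TreeSets of (weight, docid) pairs (see _get_doctrees below), so unindexing a document means removing its exact pair; remove() raises KeyError if the pair is absent. A minimal sketch using the OO family directly:

from BTrees.OOBTree import OOTreeSet  # what self.family.OO.TreeSet resolves to for the OO family

weights_w = OOTreeSet()
weights_w.add((0.7, 42))     # (weight, docid)
weights_w.remove((0.7, 42))
assert len(weights_w) == 0
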
Example #7
    def _reindex_doc(self, docid, text):
        # We should change Length only for new wids used
        old_wids = self.get_words(docid)
        old_ctr = Counter(old_wids)
        old_widset = set(old_ctr)
        new_wids = self._lexicon.sourceToWordIds(text)
        new_ctr = Counter(new_wids)
        new_widset = set(new_ctr)
        removed_wids = old_widset - new_widset
        added_wids = new_widset - old_widset
        all_wids = list(new_widset | old_widset)

        weights, lengths = self._get_doctrees(all_wids)

        for w in removed_wids:
            lengths[w].change(-1)
        for w in added_wids:
            lengths[w].change(1)

        old_docscores = self._get_widscores(old_ctr, docid)
        new_docscores = self._get_widscores(new_ctr, docid)
        parallel_traversal(*zip(*[
            (weights[w], old_docscores.get(w) or new_docscores.get(w))
            for w in all_wids]))
        # Weights need updating whenever len(old_wids) != len(new_wids),
        # which is the common case, so we always rewrite them all
        for w in old_widset:
            try:
                weights[w].remove(old_docscores[w])
            except KeyError:
                # This should never happen; if it does, it's a bad sign,
                # but we can still proceed
                logging.error("Old weight-docid pair not found!")
        for w in new_widset:
            weights[w].add(new_docscores[w])

        self._docwords[docid] = PersistentWid.encode_wid(
            new_wids if self.keep_phrases else new_widset)

        return len(new_wids)
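
The incremental reindex is driven by plain set algebra over the old and new wid sets; only the differences touch the per-wid Length counters, while the weights are rewritten for every wid involved:

old_widset = {1, 2, 3}
new_widset = {3, 4}
assert old_widset - new_widset == {1, 2}        # removed_wids: length -1 each
assert new_widset - old_widset == {4}           # added_wids: length +1 each
assert old_widset | new_widset == {1, 2, 3, 4}  # all_wids: weights rewritten
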
Example #8
    def _get_doctrees(self, wids):
        """
        Gets persistent objects used for indexes for wids
        returns: {wid -> TreeSet((weight, docid))}, {wid -> Length}
        """
        weights = {}
        lengths = {}
        parallel_traversal(self._wordinfo, wids)

        for wid in wids:
            record = self._wordinfo.get(wid)
            if record is None:
                length = Length(0)
                wdocid = self.family.OO.TreeSet()
                self._wordinfo[wid] = (wdocid, length)
                self.wordCount.change(1)
            else:
                wdocid, length = record

            weights[wid] = wdocid
            lengths[wid] = length

        return weights, lengths
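
Each _wordinfo record pairs a TreeSet of (weight, docid) entries with a Length counting the documents that contain the wid. Constructing and updating one record outside the index, as a sketch with the OO family:

from BTrees.Length import Length
from BTrees.OOBTree import OOTreeSet

record = (OOTreeSet(), Length(0))   # what a fresh _wordinfo[wid] holds
wdocid, length = record
wdocid.add((0.5, 7))                # document 7 indexed with weight 0.5
length.change(1)
assert length.value == len(wdocid) == 1
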
Example #9
    def _remove_oov_wids(self, wids):
        # Prefetch the distinct wids, then drop those missing from the index
        parallel_traversal(self._wordinfo, set(wids))
        return [wid for wid in wids if wid in self._wordinfo]
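
"OOV" here means out-of-vocabulary: wids with no _wordinfo entry are dropped, while order and duplicates among the survivors are preserved. A dict-backed illustration of the filtering step:

wordinfo = {1: "entry", 3: "entry"}   # stand-in for the _wordinfo BTree
wids = [1, 2, 3, 3, 9]
assert [w for w in wids if w in wordinfo] == [1, 3, 3]
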