Example #1
    def _makeOne(self, family=None):
        from zope.index.text.lexicon import Lexicon
        from zope.index.text.lexicon import Splitter
        if family is None:
            family = self._getBTreesFamily()
        lexicon = Lexicon(Splitter())
        return self._getTargetClass()(lexicon, family=family)
Example #2
    def tfIdfBlock(self, data, field):
        '''Creates TF/IDF canopy of a given set of data'''

        # Pipeline element whose stop-word set is captured from the enclosing
        # method when the class body is evaluated
        class CustomStopWordRemover(object):
            stop_words = self.stop_words[field].copy()

            def process(self, lst):
                return [w for w in lst if w not in self.stop_words]

        index = TextIndex(Lexicon(Splitter(), CustomStopWordRemover()))

        # Replace the default index with a cosine-similarity index that shares the lexicon
        index.index = CosineIndex(index.lexicon)

        index_to_id = {}
        base_tokens = {}

        for i, (record_id, doc) in enumerate(data, 1):
            index_to_id[i] = record_id
            base_tokens[i] = doc
            index.index_doc(i, doc)

        canopies = (tfidf._createCanopies(index,
                                          base_tokens,
                                          threshold,
                                          field)
                    for threshold in self.tfidf_fields[field])

        for canopy in canopies:
            key, index_canopy = canopy
            id_canopy = dict((index_to_id[k], index_to_id[v])
                             for k, v in index_canopy.items())
            self.canopies[key] = defaultdict(str, id_canopy)
Example #3
File: catalog.py Project: Jickelsen/Arche
def _default_indexes():
    return {
        'title': CatalogFieldIndex(get_title),
        'description': CatalogFieldIndex(get_description),
        'type_name': CatalogFieldIndex(get_type_name),
        'sortable_title': CatalogFieldIndex(get_sortable_title),
        'path': CatalogPathIndex(get_path),
        'searchable_text': CatalogTextIndex(
            get_searchable_text,
            lexicon=Lexicon(Splitter(), CaseNormalizer())),
        'uid': CatalogFieldIndex(get_uid),
        'tags': CatalogKeywordIndex(get_tags),
        'search_visible': CatalogFieldIndex(get_search_visible),
        'date': CatalogFieldIndex(get_date),
        'modified': CatalogFieldIndex(get_modified),
        'created': CatalogFieldIndex(get_created),
        'wf_state': CatalogFieldIndex(get_wf_state),
        'workflow': CatalogFieldIndex(get_workflow),
    }.items()
Example #4
    def _makeIndexAndParser(self):
        from zope.index.text.lexicon import Lexicon
        from zope.index.text.lexicon import Splitter
        from zope.index.text.queryparser import QueryParser
        lexicon = Lexicon(Splitter())
        parser = QueryParser(lexicon)
        index = FauxIndex()
        return index, parser
Example #5
File: tfidf.py Project: dwyerk/dedupe
    def __init__(self, field, stop_words=[]):
        self.field = field

        splitter = Splitter()
        stop_word_remover = CustomStopWordRemover(stop_words)
        operator_escaper = OperatorEscaper()
        lexicon = Lexicon(splitter, stop_word_remover, operator_escaper)

        self._index = TextIndex(lexicon)
        self._index.index = CosineIndex(self._index.lexicon)

        self._i_to_id = {}
        self._parseTerms = self._index.lexicon.parseTerms
Example #6
    def __init__(self, lexicon=None, index=None):
        """Provisional constructor.

        This creates the lexicon and index if not passed in.
        """
        _explicit_lexicon = True
        if lexicon is None:
            _explicit_lexicon = False
            lexicon = Lexicon(Splitter(), CaseNormalizer(), StopWordRemover())
        if index is None:
            index = OkapiIndex(lexicon)
        # Prefer an explicitly passed lexicon; otherwise use the index's own lexicon
        self.lexicon = lexicon if _explicit_lexicon else index.lexicon
        self.index = index
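
The constructor above matches the provisional constructor of zope.index.text's TextIndex, which creates the lexicon and index when they are not passed in. Assuming that class, a minimal usage sketch looks like the following; the sample documents and query are illustrative only, not from the source:

from zope.index.text.textindex import TextIndex

index = TextIndex()   # default pipeline: Splitter, CaseNormalizer, StopWordRemover over an OkapiIndex
index.index_doc(1, "The quick brown fox")
index.index_doc(2, "A lazy dog")
results = index.apply("quick fox")   # mapping of document id -> relevance score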
Example #7
    def __init__(self, discriminator, lexicon=None, index=None):
        _lexicon = lexicon
        if lexicon is None:
            _lexicon = Lexicon(
                Splitter(),
                CaseNormalizer(),
                StopWordRemover(),
            )
        if index is None:
            index = OkapiIndex(_lexicon, family=self.family)
        super(TextIndex, self).__init__(discriminator, lexicon, index)
        if lexicon is None:
            self.lexicon = index.lexicon
        self.index = index
        self.clear()
Example #8
def stopWords(data):
    index = TextIndex(Lexicon(Splitter()))

    for i, (_, doc) in enumerate(data, 1):
        index.index_doc(i, doc)

    # (document frequency, word) pairs, most frequent first
    doc_freq = [(len(index.index._wordinfo[wid]), word)
                for word, wid in index.lexicon.items()]

    doc_freq.sort(reverse=True)

    N = float(index.index.documentCount())
    threshold = int(max(1000, N * 0.05))

    stop_words = set()

    # Treat any word appearing in more than `threshold` documents as a stop word
    for frequency, word in doc_freq:
        if frequency > threshold:
            stop_words.add(word)
        else:
            break

    return stop_words
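
A brief usage sketch, not from the source project, showing how the stop words computed above could be fed back into a lexicon pipeline. It assumes stopWords from this example and CustomStopWordRemover from Example #5's module are in scope; the sample records are made up:

from zope.index.text.lexicon import Lexicon
from zope.index.text.lexicon import Splitter
from zope.index.text.textindex import TextIndex

data = [('id1', 'the quick brown fox'),
        ('id2', 'the lazy dog jumps')]

stop_words = stopWords(data)                   # words over the document-frequency threshold (empty for toy data)
remover = CustomStopWordRemover(stop_words)    # pipeline element shown in Example #5
index = TextIndex(Lexicon(Splitter(), remover))

for i, (_, doc) in enumerate(data, 1):
    index.index_doc(i, doc)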
Example #9
    def _makeLexicon(self, *pipeline):
        from zope.index.text.lexicon import Lexicon
        from zope.index.text.lexicon import Splitter
        if not pipeline:
            pipeline = (Splitter(),)
        return Lexicon(*pipeline)
Example #10
    def _makeLexicon(self):
        from zope.index.text.lexicon import Lexicon
        return Lexicon(*self._makePipeline())
Example #11
def _create_full_text_index(self, language):
    lexicon = Lexicon(
        FullTextIndexProcessor(language, self.stemming)
    )
    return OkapiIndex(lexicon)
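
FullTextIndexProcessor is project-specific, so only its role is visible here: it is the sole element of the lexicon's pipeline. In zope.index.text a pipeline element only needs a process() method that maps a sequence of terms to a new sequence, as CustomStopWordRemover in Example #2 illustrates. Below is a minimal, hypothetical element of that kind, using placeholder whitespace splitting and lowercasing rather than the real language-aware processing:

from zope.index.text.lexicon import Lexicon
from zope.index.text.okapiindex import OkapiIndex


class SimpleTextProcessor(object):
    """Hypothetical pipeline element: split on whitespace and lowercase."""

    def process(self, seq):
        terms = []
        for text in seq:
            terms.extend(text.lower().split())
        return terms


lexicon = Lexicon(SimpleTextProcessor())
index = OkapiIndex(lexicon)
index.index_doc(1, "Some Document Text")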