Example #1
    def process(self, session, data):
        # input should be:
        #   {txt: {'text': txt, 'occurences': 1, 'proxLoc': []}}
        # the only thing we can push through is txt
        # output is in the same form as from the tokenizer (e.g. ready to be merged)

        if not data:
            return data

        kw = {}
        # peek at one entry to see whether proximity locations are present,
        # then put it back
        first = data.popitem()
        prox = 'proxLoc' in first[1]
        data[first[0]] = first[1]
        if isinstance(data, dict):
            for k in data:
                rdr = lucene.StringReader(data[k]['text'])
                res = self.analyzer.tokenStream('data', rdr)

                # offset information is also available from the tokens
                toks = [t.term() for t in res]

                kw[k] = {'text': toks, 'occurences': 1}
                if prox:
                    kw[k]['proxLoc'] = data[k]['proxLoc']
        return kw
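For illustration, a minimal sketch of the dictionary shapes this method expects and returns; the key, text, and proxLoc values below are hypothetical, and the output assumes an analyzer that lowercases and splits on whitespace:

    # hypothetical input: entries keyed by the raw text
    data = {
        'Some raw text': {'text': 'Some raw text', 'occurences': 1, 'proxLoc': [0]},
    }
    # expected shape of process(session, data):
    # {'Some raw text': {'text': ['some', 'raw', 'text'], 'occurences': 1, 'proxLoc': [0]}}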
Example #2
    def process_string(self, session, data):
        rdr = lucene.StringReader(data)
        toks = self.tokenizer(rdr)
        # return (term, start offset) pairs
        return [(t.term(), t.startOffset()) for t in toks]
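The return value here is a flat list of (term, start offset) pairs, one per token; for example (illustrative values, assuming a whitespace-style tokenizer):

    # process_string(session, 'Some raw text') would return something like:
    # [('some', 0), ('raw', 5), ('text', 9)]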
Example #3
    def process_string(self, session, data):
        rdr = lucene.StringReader(data)
        toks = self.tokenizer(rdr)
        return [t.term() for t in toks]
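The examples above rely on an older PyLucene API in which a TokenStream is iterable and each token exposes term() and startOffset(). In newer PyLucene releases (roughly 4.x onwards) the attribute-based API is used instead. A minimal standalone sketch, assuming a recent PyLucene where StandardAnalyzer has a no-argument constructor; the field name 'data' and the sample text are illustrative:

    import lucene
    from java.io import StringReader
    from org.apache.lucene.analysis.standard import StandardAnalyzer
    from org.apache.lucene.analysis.tokenattributes import CharTermAttribute

    lucene.initVM()
    analyzer = StandardAnalyzer()
    stream = analyzer.tokenStream('data', StringReader('Some raw text'))
    # with the attribute API, terms are read from a CharTermAttribute
    # while stepping through the stream with incrementToken()
    term_attr = stream.addAttribute(CharTermAttribute.class_)
    stream.reset()
    toks = []
    while stream.incrementToken():
        toks.append(term_attr.toString())
    stream.end()
    stream.close()
    print(toks)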