Пример #1
0
def iterTerms(n, text, emmit_head_tail=False):
    '''Generate lower-cased n-gram terms for every sentence of a text.

    The text is cut into sentences with ``splitSentence`` and each sentence
    is turned into n-grams with ``util.ngram(n, sentence)``. Every n-gram is
    lower-cased with ``.lower()`` before it is yielded.

    When ``emmit_head_tail`` is true, sentence boundaries are marked with
    extra terms:

    * right after the first term of a sentence, the same term is yielded
      again with a ``'B'`` prefix;
    * after the last term of a sentence, that term is yielded again with
      an ``'E'`` prefix.

    For a sentence whose uni-grams are ``t1``, ``t2``, ``t3`` the output
    with ``emmit_head_tail=True`` is::

        [t1, 'B' + t1, t2, t3, 'E' + t3]

    A sentence that produces no n-grams contributes nothing to the output.

    :param n: order of the n-grams, passed through to ``util.ngram``
    :param text: text to split into sentences and terms
    :param emmit_head_tail: also yield the ``'B'``/``'E'`` boundary terms
    '''
    for sentence in splitSentence(text):
        # Remembers the most recent lower-cased term so the tail marker can
        # be produced once the sentence is exhausted.
        last_term = None
        for position, gram in enumerate(util.ngram(n, sentence)):
            last_term = gram.lower()
            yield last_term
            if position == 0 and emmit_head_tail:
                yield 'B' + last_term
        if last_term is not None and emmit_head_tail:
            yield 'E' + last_term
Пример #2
0
def iterTerms(n, text, emmit_head_tail=False):
    """Yield the lower-cased n-gram terms of each sentence in ``text``.

    Sentences are obtained from ``splitSentence(text)``; the n-grams of a
    sentence come from ``util.ngram(n, sentence)`` and are lower-cased with
    ``.lower()`` before being yielded.

    If ``emmit_head_tail`` is true, two additional marker terms are yielded
    per non-empty sentence: the first term prefixed with ``'B'`` (directly
    after the first term itself) and the last term prefixed with ``'E'``
    (after all terms of the sentence).

    Example, for a sentence with uni-grams ``t1``, ``t2``, ``t3`` and
    ``emmit_head_tail=True``::

        [t1, 'B' + t1, t2, t3, 'E' + t3]

    :param n: n-gram order handed to ``util.ngram``
    :param text: input text
    :param emmit_head_tail: whether to yield the ``'B'``/``'E'`` markers
    """
    for sentence in splitSentence(text):
        # The head marker is owed at most once per sentence, and only when
        # boundary markers were requested.
        head_pending = emmit_head_tail
        tail = None
        for gram in util.ngram(n, sentence):
            tail = gram.lower()
            yield tail
            if head_pending:
                yield 'B' + tail
                head_pending = False
        # ``tail`` is still None when the sentence produced no terms.
        if emmit_head_tail and tail is not None:
            yield 'E' + tail
Пример #3
0
 def splitTerms(self, text, categories=None):
     '''Segment ``text`` into terms using the lexicon of some categories.

     ``categories`` is a list of category names to score against. When it
     is empty or ``None``, the names returned by ``self.getCategoryList()``
     are used instead. Names for which ``self.getCategory`` returns a falsy
     value are reported through ``self.logger.error`` and skipped.

     For every n from 1 up to ``self.ngram`` each n-gram of ``text`` (as
     produced by ``util.ngram``) is scored with ``self._getTermScore``; the
     per-n lists of ``(term, score)`` pairs are handed to
     ``findBestSegment``.

     Returns the first element of the pair returned by
     ``findBestSegment``; the second element is only logged as the best
     score.

     '''
     every_name = self.getCategoryList()
     wanted_names = categories if categories else every_name

     resolved = []
     for category_name in wanted_names:
         category = self.getCategory(category_name)
         if category:
             resolved.append(category)
         else:
             self.logger.error('Category %s not exist', category_name)

     # One list of (term, score) pairs per n-gram order, in increasing order.
     scored_by_order = []
     highest_order = self.ngram
     for order in range(1, highest_order + 1):
         scored = []
         for gram in util.ngram(order, text):
             gram_score = self._getTermScore(gram, order, resolved)
             self.logger.debug('Term=%s, Score=%s', gram, gram_score)
             scored.append((gram, gram_score))
         scored_by_order.append(scored)

     best_terms, top_score = findBestSegment(scored_by_order)
     self.logger.debug('Best score: %s', top_score)
     return best_terms
Пример #4
0
 def splitTerms(self, text, categories=None):
     """Split text into terms using lexicon data of the given categories.

     ``categories`` is a list of category names to read lexicon data from.
     If it is empty or ``None``, the names returned by
     ``self.getCategoryList()`` are used. Names for which
     ``self.getCategory`` returns a falsy value are logged as errors and
     ignored.

     Every n-gram of ``text`` (from ``util.ngram``) for n in
     ``1..self.ngram`` is scored with ``self._getTermScore``, and the
     resulting per-n lists of ``(term, score)`` pairs are passed to
     ``findBestSegment``.

     Returns the first element of the pair returned by
     ``findBestSegment``; the second element (the best score) is only
     logged.

     """
     all_category = self.getCategoryList()
     if not categories:
         categories = all_category
     c_list = []
     for name in categories:
         c = self.getCategory(name)
         if not c:
             self.logger.error('Category %s not exist', name)
             continue
         c_list.append(c)
     grams = []
     # ``range`` instead of ``xrange``: ``xrange`` does not exist on
     # Python 3, while ``range`` iterates the same values on both versions.
     for n in range(1, self.ngram + 1):
         terms = []
         for term in util.ngram(n, text):
             score = self._getTermScore(term, n, c_list)
             self.logger.debug('Term=%s, Score=%s', term, score)
             terms.append((term, score))
         grams.append(terms)
     terms, best_score = findBestSegment(grams)
     self.logger.debug('Best score: %s', best_score)
     return terms