def iterTerms(n, text, emmit_head_tail=False):
    '''Lazily yield the lower-cased n-gram terms of every sentence in text.

    The text is first cut into sentences with splitSentence(); each
    sentence is then passed to util.ngram(n, sentence) and every term it
    produces is lower-cased with .lower() before being yielded.

    When emmit_head_tail is true, two extra marker terms are produced per
    non-empty sentence: the first term again with a 'B' prefix (right
    after the first term itself) and the last term again with an 'E'
    prefix (after all terms of the sentence).

    For example, if a sentence produces the uni-gram terms C1, C2, C3,
    the output with emmit_head_tail enabled is:

        ['C1', 'BC1', 'C2', 'C3', 'EC3']

    A sentence that produces no terms yields nothing, markers included.
    '''
    for sentence in splitSentence(text):
        # Remembers the most recent lower-cased term so the tail marker
        # can be emitted once the sentence is exhausted.
        last_term = None
        for position, raw_term in enumerate(util.ngram(n, sentence)):
            last_term = raw_term.lower()
            yield last_term
            # Head marker follows the very first term of the sentence.
            if position == 0 and emmit_head_tail:
                yield 'B' + last_term
        # None means the sentence had no terms, so there is no tail.
        if last_term is not None and emmit_head_tail:
            yield 'E' + last_term
def iterTerms(n, text, emmit_head_tail=False):
    """Generate lower-cased n-gram terms for each sentence found in text.

    Sentences come from splitSentence(text); the terms of one sentence
    come from util.ngram(n, sentence). Each term is converted with
    .lower() and yielded in order.

    If emmit_head_tail is true, the first term of a sentence is yielded a
    second time prefixed with 'B', and the last term is yielded a second
    time prefixed with 'E'. With uni-gram terms C1, C2, C3 the generator
    therefore produces:

        ['C1', 'BC1', 'C2', 'C3', 'EC3']

    Sentences without any term contribute nothing to the output.
    """
    for sentence in splitSentence(text):
        at_head = True
        lowered = None
        for gram in util.ngram(n, sentence):
            lowered = gram.lower()
            yield lowered
            # Only the first term of the sentence gets the 'B' marker.
            if at_head and emmit_head_tail:
                yield 'B' + lowered
            at_head = False
        # lowered still holds the final term here; it stays None when the
        # sentence produced no terms at all.
        if emmit_head_tail and lowered is not None:
            yield 'E' + lowered
def splitTerms(self, text, categories=None):
    '''Segment text into terms using lexicon scores.

    categories is a list of category names whose lexicon data should be
    consulted. When it is empty or None, every name returned by
    self.getCategoryList() is used instead. Names for which
    self.getCategory() returns a false value are logged as errors and
    skipped.

    For every n from 1 to self.ngram, each n-gram of text is scored with
    self._getTermScore(); the per-n lists of (term, score) pairs are then
    handed to findBestSegment(), whose first result is returned.
    '''
    known_names = self.getCategoryList()
    wanted_names = categories if categories else known_names

    # Resolve names to category objects, dropping the ones that are missing.
    resolved = []
    for name in wanted_names:
        category = self.getCategory(name)
        if category:
            resolved.append(category)
        else:
            self.logger.error('Category %s not exist', name)

    # One list of (term, score) pairs per n-gram size, smallest size first.
    scored_by_size = []
    for size in range(1, self.ngram + 1):
        scored = []
        for term in util.ngram(size, text):
            score = self._getTermScore(term, size, resolved)
            self.logger.debug('Term=%s, Score=%s', term, score)
            scored.append((term, score))
        scored_by_size.append(scored)

    best_terms, best_score = findBestSegment(scored_by_size)
    self.logger.debug('Best score: %s', best_score)
    return best_terms
def splitTerms(self, text, categories=None):
    """Split text into terms.

    categories is a list of category names to read lexicon data from.
    If it is empty or None, all categories reported by
    self.getCategoryList() are used. A name for which self.getCategory()
    returns a false value is logged as an error and ignored.

    For each n-gram size from 1 up to and including self.ngram, every
    n-gram of text is scored through self._getTermScore() against the
    resolved categories. The resulting lists of (term, score) pairs, one
    list per size, are passed to findBestSegment(); the terms it selects
    are returned and its best score is logged at debug level.
    """
    all_category = self.getCategoryList()
    if not categories:
        categories = all_category

    c_list = []
    for name in categories:
        c = self.getCategory(name)
        if not c:
            self.logger.error('Category %s not exist', name)
            continue
        c_list.append(c)

    grams = []
    # range() rather than xrange(): xrange does not exist on Python 3,
    # while range iterates over the same values on both Python 2 and 3.
    for n in range(1, self.ngram + 1):
        terms = []
        for term in util.ngram(n, text):
            score = self._getTermScore(term, n, c_list)
            self.logger.debug('Term=%s, Score=%s', term, score)
            terms.append((term, score))
        grams.append(terms)

    terms, best_score = findBestSegment(grams)
    self.logger.debug('Best score: %s', best_score)
    return terms