def extract(self, taggedTerms):
    """See interfaces.ITermExtractor.

    Walk the ``(term, tag, norm)`` triples in ``taggedTerms`` with a
    two-state machine. A run is opened by a token whose tag starts with
    ``'N'`` or by a capitalized token tagged ``'JJ'``, and is extended
    by every following token whose tag starts with ``'N'``. Each token
    of a run is handed to ``_add``; a run of two or more tokens is
    additionally counted as one space-joined composite term.

    ``taggedTerms`` is emptied in place, exactly as before.

    Returns a list of ``(word, occurrences, strength)`` tuples for the
    terms accepted by ``self.filter``, where ``strength`` is the number
    of whitespace-separated words in ``word``.
    """
    terms = {}

    def count_composite(run):
        # Single-token runs are left to whatever _add recorded for them;
        # only runs of two or more tokens become a composite term here.
        if len(run) > 1:
            composite = ' '.join([token for token, _norm in run])
            terms[composite] = terms.get(composite, 0) + 1

    # Phase 1: A little state machine is used to build simple and
    # composite terms.
    multiterm = []
    state = SEARCH
    # One pass over the list instead of repeated pop(0), which shifts
    # the whole list on every call.
    for term, tag, norm in taggedTerms:
        is_noun = tag.startswith('N')
        if state == SEARCH:
            if is_noun or (tag == 'JJ' and term[0].isupper()):
                state = NOUN
                _add(term, norm, multiterm, terms)
        elif state == NOUN:
            if is_noun:
                _add(term, norm, multiterm, terms)
            else:
                state = SEARCH
                count_composite(multiterm)
                multiterm = []
    # A run that reaches the end of the input has no closing token, so
    # it must be counted here; it used to be silently dropped.
    count_composite(multiterm)
    # Keep the historical side effect: the caller's list ends up empty.
    del taggedTerms[:]

    # Phase 2: Only select the terms that fulfill the filter criteria.
    # Also create the term strength.
    selected = []
    for word, occur in terms.items():
        strength = len(word.split())
        if self.filter(word, occur, strength):
            selected.append((word, occur, strength))
    return selected
def get_wordnet_pos(tag):
    """Translate a part-of-speech tag into a ``wordnet`` POS constant.

    The start of ``tag`` selects the result: ``J`` gives
    ``wordnet.ADJ``, ``V`` gives ``wordnet.VERB``, ``N`` gives
    ``wordnet.NOUN`` and ``R`` gives ``wordnet.ADV``. Any other tag
    yields an empty string.
    """
    # Prefixes are tried in this order; the constant is looked up on
    # ``wordnet`` only for the prefix that actually matches.
    prefix_table = (
        ('J', 'ADJ'),
        ('V', 'VERB'),
        ('N', 'NOUN'),
        ('R', 'ADV'),
    )
    for prefix, constant_name in prefix_table:
        if tag.startswith(prefix):
            return getattr(wordnet, constant_name)
    return ''
def extract(self, taggedTerms, splits, KEEP_ORIGINAL_SPACING, RETURN_BIO=False):
    """See interfaces.ITermExtractor.

    Warning: this is destructive to ``taggedTerms`` and ``splits``;
    both lists are emptied in place (side effect).

    ``taggedTerms`` holds ``(term, tag, norm)`` triples and ``splits``
    holds one entry per triple. A run is opened by a token whose tag
    starts with ``'N'`` or by a capitalized token tagged ``'JJ'``, and
    extended by following tokens whose tag starts with ``'N'``. Each
    token of a run is passed to ``_add`` and each finished run to
    ``_keepterm`` together with ``KEEP_ORIGINAL_SPACING``.

    Returns, when ``RETURN_BIO`` is true, a list with one tag per input
    token: ``"B"`` for the token that opens a run, ``"I"`` for tokens
    that continue it and ``"O"`` for everything else. Otherwise returns
    a list of ``(word, occurrences, strength)`` tuples for the terms
    accepted by ``self.filter``, where ``strength`` is the number of
    whitespace-separated words in ``word``.
    """
    terms = {}
    bio_encoding = []
    # Phase 1: A little state machine is used to build simple and
    # composite terms.
    multiterm = []
    state = SEARCH
    assert len(taggedTerms) == len(splits)
    # One pass over both lists instead of repeated pop(0), which shifts
    # the whole list on every call.
    for (term, tag, norm), split in zip(taggedTerms, splits):
        is_noun = tag.startswith('N')
        if state == SEARCH and (is_noun or (tag == 'JJ' and term[0].isupper())):
            state = NOUN
            _add(term, norm, split, multiterm, terms)
            # Whichever token opens the run gets "B". The capitalized
            # adjective case used to emit "I" with no preceding "B".
            bio_encoding.append("B")
        elif state == NOUN and is_noun:
            _add(term, norm, split, multiterm, terms)
            bio_encoding.append("I")
        elif state == NOUN:
            state = SEARCH
            if multiterm:
                _keepterm(multiterm, terms, KEEP_ORIGINAL_SPACING)
            multiterm = []
            bio_encoding.append("O")
        else:
            bio_encoding.append("O")
    # Keep the documented side effect: both input lists end up empty.
    del taggedTerms[:]
    del splits[:]
    # Potentially keep the last term, if there is one. -jpt
    if multiterm:
        _keepterm(multiterm, terms, KEEP_ORIGINAL_SPACING)
    if RETURN_BIO:
        return bio_encoding
    # Phase 2: Only select the terms that fulfill the filter criteria.
    # Also create the term strength.
    selected = []
    for word, occur in terms.items():
        strength = len(word.split())
        if self.filter(word, occur, strength):
            selected.append((word, occur, strength))
    return selected
def extract(self, taggedTerms, splits, KEEP_ORIGINAL_SPACING, RETURN_BIO=False):
    """See interfaces.ITermExtractor.

    Warning: this is destructive to ``taggedTerms`` and ``splits``;
    both lists are emptied in place (side effect).

    Each ``(term, tag, norm)`` triple in ``taggedTerms`` is paired with
    the entry at the same position in ``splits``. A token whose tag
    starts with ``"N"``, or a capitalized token tagged ``"JJ"``, opens a
    run; following tokens whose tag starts with ``"N"`` extend it. Run
    tokens go to ``_add``, and a completed run goes to ``_keepterm``
    along with ``KEEP_ORIGINAL_SPACING``.

    Returns, when ``RETURN_BIO`` is true, one tag per input token:
    ``"B"`` for a run's first token, ``"I"`` for its remaining tokens,
    ``"O"`` for tokens outside any run. Otherwise returns the
    ``(word, occurrences, strength)`` tuples accepted by
    ``self.filter``; ``strength`` is the count of whitespace-separated
    words in ``word``.
    """
    terms = {}
    bio_encoding = []
    # Phase 1: A little state machine is used to build simple and
    # composite terms.
    multiterm = []
    state = SEARCH
    assert len(taggedTerms) == len(splits)
    # Iterate once rather than calling pop(0) per token; each pop(0)
    # shifts every remaining element.
    for (term, tag, norm), split in zip(taggedTerms, splits):
        starts_with_n = tag.startswith("N")
        if state == SEARCH:
            if starts_with_n or (tag == "JJ" and term[0].isupper()):
                state = NOUN
                _add(term, norm, split, multiterm, terms)
                # The opening token is always "B". Previously a
                # capitalized adjective opened a run with "I", leaving
                # the run without a "B" tag.
                bio_encoding.append("B")
            else:
                bio_encoding.append("O")
        elif state == NOUN:
            if starts_with_n:
                _add(term, norm, split, multiterm, terms)
                bio_encoding.append("I")
            else:
                state = SEARCH
                if multiterm:
                    _keepterm(multiterm, terms, KEEP_ORIGINAL_SPACING)
                multiterm = []
                bio_encoding.append("O")
        else:
            bio_encoding.append("O")
    # Preserve the documented side effect of draining both inputs.
    del taggedTerms[:]
    del splits[:]
    # Potentially keep the last term, if there is one. -jpt
    if multiterm:
        _keepterm(multiterm, terms, KEEP_ORIGINAL_SPACING)
    if RETURN_BIO:
        return bio_encoding
    # Phase 2: Only select the terms that fulfill the filter criteria.
    # Also create the term strength.
    selected = []
    for word, occur in terms.items():
        strength = len(word.split())
        if self.filter(word, occur, strength):
            selected.append((word, occur, strength))
    return selected