def tag(cls, text):
    """Return an ordered, de-duplicated collection of tags for ``text``.

    The text is tokenized and POS-tagged with NLTK; tokens are kept when
    their POS label is in ``pos_include`` and the token itself is not in
    ``stop_words``. Surviving tokens are singularized, and tokens shorter
    than two characters are dropped.

    Args:
        text: The headline/text to extract tags from.

    Returns:
        A plain empty list when ``text`` is falsy; otherwise a ``CIList``
        holding the tags in order of first appearance.
    """
    if not text:
        return []

    cleaned = text.replace("'", "")
    cap_type = capitalization_type(cleaned)
    tokenizer = BasicTokenizer()

    tagged = nltk.pos_tag(tokenizer.tokenize(cleaned))
    log.info('POS before lower casing:%s', str(tagged))

    if cap_type == CapType.ALLCAPS:
        # An all-caps headline makes the POS tagger emit too many proper
        # nouns, so the text is lower-cased and tagged a second time.
        tagged = nltk.pos_tag(tokenizer.tokenize(cleaned.lower()))
        log.info('POS after lower casing:%s', str(tagged))

    # Keep tokens with an accepted POS label that are not stopwords,
    # then reduce each survivor to its singular form.
    candidates = [
        singularize(word)
        for word, pos_label in tagged
        if pos_label in pos_include and word not in stop_words
    ]

    # Tag order is preserved for aesthetic reasons, so set() is avoided.
    # The first occurrence of a tag wins, which keeps an upper-cased
    # spelling when it is the one seen first.
    unique = CIList()
    for candidate in candidates:
        if candidate in unique or len(candidate) < 2:
            continue
        unique.append(candidate)
    return unique
def tag(cls, text):
    """Return an ordered, de-duplicated collection of tags for ``text``.

    Each token of the text is featurized and classified with
    ``apply_multinomial_NB``; tokens labelled ``'ham'`` are kept as tag
    candidates. Candidates found in ``stop_words`` are removed, the rest
    are singularized, and tokens shorter than two characters are dropped.

    Args:
        text: The headline/text to extract tags from.

    Returns:
        A plain empty list when ``text`` is falsy (``None`` or ``""``),
        so that missing input no longer raises on ``text.replace``;
        otherwise a ``CIList`` holding the tags in order of first
        appearance.
    """
    if not text:
        return []

    text = text.replace("'", "")
    cap_type = capitalization_type(text)
    bt = BasicTokenizer()
    # NOTE(review): another tag() implementation in this file compares
    # against CapType.ALLCAPS rather than the string 'ALLCAPS'; confirm
    # that capitalization_type() returns a value equal to this string.
    context = bt.tokenize(text.lower() if cap_type == 'ALLCAPS' else text)

    tags = []
    for i, word in enumerate(context):
        features = featurize(i, context)
        # The classifier input carries the running ``tags`` list (the
        # same list object that is appended to below) along with the
        # token's features under both the 'features' and 'tokens' keys.
        d = dict(
            word=word,
            context=text,
            features=features,
            tokens=features,
            tags=tags,
        )
        m, _ = apply_multinomial_NB(C, V, prior, condprob, d)
        if m == 'ham':
            tags.append(word)

    # Strip out stopwords, then reduce each survivor to its singular form.
    tags = [singularize(t) for t in tags if t not in stop_words]

    # Tag order is preserved for aesthetic reasons, so set() is avoided.
    # The first occurrence of a tag wins, which keeps an upper-cased
    # spelling when it is the one seen first.
    tags_ = CIList()
    for t in tags:
        if t in tags_ or len(t) < 2:
            continue
        tags_.append(t)
    return tags_