def invertConditionalFreqDist(CFDist):
    """Invert a ConditionalFreqDist of sense -> samples into stem -> sense counts.

    For every (condition, sample) pair in ``CFDist``, each string token inside
    the sample is run through the Porter stemmer and used as a condition of the
    inverted distribution.  The count recorded is the sample's count under the
    original condition, filed under the sense -- the condition with any trailing
    POS suffix cut off at the first '_'.
    """
    iCFDist = ConditionalFreqDist()
    # Build the stemmer once for the whole inversion; it carries no per-token state.
    Stemmer = PorterStemmer()
    for cond in CFDist.conditions():
        sense = cond.split("_")[0]  # Cut off any POS (hoisted: invariant per condition)
        for val in CFDist[cond].samples():
            for tok in val:
                # isinstance() is the idiomatic type test and, unlike the
                # original type(tok) == str, also accepts str subclasses.
                if isinstance(tok, str):
                    iCFDist[Stemmer.raw_stem(tok)].inc(sense, CFDist[cond].count(val))
    return iCFDist
def invertConditionalFreqDist(CFDist):
    """Build the inverse of ``CFDist``: stemmed token -> frequency of each sense.

    NOTE(review): this is a duplicate definition of invertConditionalFreqDist;
    at import time the later definition is the one that wins.
    """
    inverted = ConditionalFreqDist()
    stemmer = PorterStemmer()
    for condition in CFDist.conditions():
        freqdist = CFDist[condition]
        for sample in freqdist.samples():
            # Drop any trailing POS marker from the condition name.
            sense = condition.split('_')[0]
            count = freqdist.count(sample)
            for token in sample:
                if type(token) == str:
                    inverted[stemmer.raw_stem(token)].inc(sense, count)
    return inverted
def _split_tagged_tokens(tagged_tokens):
    """Split tagged tokens into per-sentence word lists and tag lists.

    Walks every subtoken of every token, lower-casing the text, and closes a
    sentence whenever the tag is '.'.  Returns a 4-tuple:
    (sentences_of_words, unique_words, sentences_of_tags, unique_tags).
    """
    from nltk.set import MutableSet
    from nltk.stemmer.porter import PorterStemmer
    sentence_words = []
    current_words = []
    seen_words = MutableSet()
    sentence_tags = []
    current_tags = []
    seen_tags = MutableSet()
    # Kept for the commented-out stemming variants below.
    stemmer = PorterStemmer()
    for token in tagged_tokens:
        for sub_token in token['SUBTOKENS']:
            word = sub_token['TEXT'].lower()  # normalise case
            #word = stemmer.stem_word(word)   # oh, and stem them too
            #word = token.type().base()
            tag = sub_token['TAG']
            seen_words.insert(word)
            seen_tags.insert(tag)
            current_words.append(word)
            current_tags.append(tag)
            # A '.' tag marks a sentence boundary: flush the current sentence.
            if tag == '.':
                sentence_words.append(current_words)
                current_words = []
                sentence_tags.append(current_tags)
                current_tags = []
    return sentence_words, seen_words.elements(), sentence_tags, seen_tags.elements()
def remove_stem(list):
    """Return a copy of *list* (a list of rows of words) with every word stemmed.

    The input is not modified.  NOTE: the parameter name shadows the builtin
    ``list``; it is kept unchanged for backward compatibility with callers.
    """
    # Build the stemmer once: the original constructed a fresh PorterStemmer()
    # for every single word inside the inner loop, which was pure waste.
    stemmer = PorterStemmer()
    return [[stemmer.stem(word) for word in row] for row in list]
def demo():
    """Run _demo_stemmer on a regexp-based stemmer, then on the Porter stemmer."""
    # A simple regular-expression stemmer that strips common suffixes.
    _demo_stemmer(RegexpStemmer('ing$|s$|e$'))
    # The full Porter algorithm, for comparison.
    from nltk.stemmer.porter import PorterStemmer
    _demo_stemmer(PorterStemmer())
def demo(): from pprint import pprint # load stoplist stoplist = _unwrap_tokens(stopwords.read('english')['WORDS']) # load a bit of the brown corpus items = brown.items('humor') tagged_tokens = brown.read(items[0]) from nltk.tokenreader import TaggedTokenReader time_flies = TaggedTokenReader().read_token( 'Time/NN fly/VB like/IN an/DT arrow/NN') # create the tagger, using WordNet dictionary = WordNetDictionary(stoplist, None, brown_nouns, brown_verbs, brown_adjs, brown_advs) # window of -+ 5 words tagger = LeskWordSenseTagger(5, dictionary, WordNetStemmer(), True, 'bag') print 'Running with 5 word window, bag of words, WordNet' pretty_print(tagger.tag(time_flies), dictionary) pretty_print(tagger.tag(tagged_tokens[:200]), dictionary) # now change to set of words tagger = LeskWordSenseTagger(5, dictionary, WordNetStemmer(), True, 'set') print 'Running with 5 word window, set of words, WordNet' pretty_print(tagger.tag(time_flies), dictionary) pretty_print(tagger.tag(tagged_tokens[:200]), dictionary) # create the tagger, using roget print 'Creating Roget dictionary (may take a while)...' stemmer = PorterStemmer() dictionary = RogetDictionary(stoplist, stemmer) tagger = LeskWordSenseTagger(5, dictionary, stemmer, True, 'set') print 'Running with 5 word window, set of words, Roget' pretty_print(tagger.tag(time_flies), dictionary) pretty_print(tagger.tag(tagged_tokens[:200]), dictionary) # use the simulated annealing tagger, with WordNet dictionary = WordNetDictionary(stoplist, None, brown_nouns, brown_verbs, brown_adjs, brown_advs) tagger = SimulatedAnnealingWordSenseTagger( dictionary, [20 * (0.5**n) for n in range(100)], WordNetStemmer(), True, 'bag') print 'Running with bag of words, WordNet, simulated annealing tagger' pretty_print(tagger.tag(time_flies), dictionary) pretty_print(tagger.tag(tagged_tokens[:200]), dictionary)
def unigramTag(self, dirStats, tagged): from nltk.stemmer.porter import PorterStemmer for t in tagged['SUBTOKENS']: PorterStemmer().stem(t) self.SenseTagger.tag(tagged) print 'unigramTag: tagged --',tagged TagString = '[???' for t in tagged['SUBTOKENS']: try: TagString += ' ['+t['SENSE']+' '+t['TEXT']+']' except KeyError: print "Couldn't find Sense tag for", t, 'in', tagged TagString += '[??? ' + +t['TEXT']+']' TagString += ']' dirStats.parse_list[-1] = TagString print 'Tagged:', TagString return None
def extractSurfaceSemantics(token,parent):
    """Build a 'text_POS_senses' surface-semantics string for *token*.

    Looks the lower-cased token text up in the global ``Senses`` table and
    falls back, in order, to: WordNet lookup, de-inflection (strip trailing
    's' or Porter-stem), and spell-correction via enchant.  Returns the
    upper-cased original text as a default when no senses can be found.

    NOTE(review): reconstructed from a whitespace-collapsed source; the
    attachment of the ``elif`` branch and comment boundaries are best-effort
    -- verify against the original file.
    """
    global Senses
    POS=getPartOfSpeech(token,parent)
    tokenSenses = {}
    text = token['TEXT'].lower()
    default = token['TEXT'].upper()
    # Only open-class words carry WordNet senses.
    if POS in ['N', 'V', 'ADV', 'ADJ']:
        try:
            #Redo as test = foo while not tokenSensesword: try: foo ; except KeyError: foo = next foo
            tokenSenses = Senses[text]
        except KeyError:
            logger.warning('extractSurfaceSemantics : Text not in tagged senses: %s', text)
            try:
                #logger.warning('extractSurfaceSemantics : Previously unseen word but in WordNet?: %s', text)
                # stringified range of possible senses without spaces
                tokenSenses = {POS : range(1,len(pywordnet.getWord(text,POS).getSenses())+1)}
            except KeyError:
                try:
                    logger.warning('extractSurfaceSemantics : Inflected version of WordNet word? %s', text)
                    # Crude de-inflection: strip a plural 's', else Porter-stem.
                    if text.endswith('s'):
                        text = text[:-1]
                        tokenSenses = Senses[text]
                    else:
                        stemmer = PorterStemmer()
                        # Update WordNetStemmer to NLTK 1.4 API
                        stemmer.stem(token)
                        text = token['STEM']
                        tokenSenses = Senses[text]
                except KeyError:
                    text = token['TEXT'].lower()
                    try:
                        logger.warning('extractSurfaceSemantics : Misspelling / typo of WordNet word? %s', text)
                        spellchecker = enchant.DictWithPWL('en_US', Lexicon)
                        s = ''
                        # Prefer a suggestion already present in the sense table.
                        for s in spellchecker.suggest(text):
                            if s in Senses:
                                tokenSenses = Senses[s]
                                break
                        # Otherwise fall back to WordNet senses of the top suggestion.
                        if not tokenSenses and spellchecker.suggest(text):
                            s = spellchecker.suggest(text)[0]
                            tokenSenses = {POS : range(1,len(pywordnet.getWord(s,POS).getSenses())+1)}
                        if s and Options.Spellcheck:
                            logger.warning('extractSurfaceSemantics : Found spelling correction %s for %s', s,text)
                            text = s
                        #logger.debug('*** extractSurfaceSemantics : Implement spelling correction. *** ')
                        #raise KeyError
                    except KeyError:
                        logger.error('extractSurfaceSemantics : Unknown token: %s', text)
                        return default
        # Handle experienced typos.
        if 'see' in tokenSenses: ### FIXME adding to dict for typos that are other words
            text = tokenSenses['see']
            try:
                tokenSenses = Senses[text]
            except:
                return default
        # Handle morphology variants that wordnet understands.
        elif isinstance(tokenSenses, tuple):
            # NOTE(review): item assignment on a tuple would raise TypeError;
            # this branch looks broken as written -- confirm intent.
            text,tokenSenses[POS] = tokenSenses[POS]
        try:
            return '_'.join([text,POS,','.join([str(i) for i in tokenSenses[POS]])])
        except KeyError:
            #logger.warning('extractSurfaceSemantics : Expected POS %s for token %s, Got %s, Using %s',
            #    POS, token, tokenSenses.keys(), tokenSenses.keys()[0])
            # Expected POS absent: fall back to the first POS the table offers.
            if tokenSenses.keys():
                POS = token['POS'] = tokenSenses.keys()[0]
                return '_'.join([text,POS,','.join([str(i) for i in tokenSenses.values()[0]])])
        except Exception,e:
            logger.error('extractSurfaceSemantics: %s: Could not find sense %s for token %s', e, POS, token)
            #tokenSenses, text
def extractSurfaceSemantics(token, parent):
    """Build a 'text_POS_senses' surface-semantics string for *token*.

    Duplicate (reformatted) copy of extractSurfaceSemantics: looks the token
    text up in the global ``Senses`` table with fallbacks to WordNet lookup,
    de-inflection, and enchant spell-correction; returns the upper-cased text
    as a default on total failure.

    NOTE(review): reconstructed from a whitespace-collapsed source; verify the
    ``elif`` attachment and comment boundaries against the original file.
    """
    global Senses
    POS = getPartOfSpeech(token, parent)
    tokenSenses = {}
    text = token["TEXT"].lower()
    default = token["TEXT"].upper()
    # Only open-class words carry WordNet senses.
    if POS in ["N", "V", "ADV", "ADJ"]:
        try:
            # Redo as test = foo while not tokenSensesword: try: foo ; except KeyError: foo = next foo
            tokenSenses = Senses[text]
        except KeyError:
            logger.warning("extractSurfaceSemantics : Text not in tagged senses: %s", text)
            try:
                # logger.warning('extractSurfaceSemantics : Previously unseen word but in WordNet?: %s', text)
                # stringified range of possible senses without spaces
                tokenSenses = {POS: range(1, len(pywordnet.getWord(text, POS).getSenses()) + 1)}
            except KeyError:
                try:
                    logger.warning("extractSurfaceSemantics : Inflected version of WordNet word? %s", text)
                    # Crude de-inflection: strip a plural 's', else Porter-stem.
                    if text.endswith("s"):
                        text = text[:-1]
                        tokenSenses = Senses[text]
                    else:
                        stemmer = PorterStemmer()
                        # Update WordNetStemmer to NLTK 1.4 API
                        stemmer.stem(token)
                        text = token["STEM"]
                        tokenSenses = Senses[text]
                except KeyError:
                    text = token["TEXT"].lower()
                    try:
                        logger.warning("extractSurfaceSemantics : Misspelling / typo of WordNet word? %s", text)
                        spellchecker = enchant.DictWithPWL("en_US", Lexicon)
                        s = ""
                        # Prefer a suggestion already present in the sense table.
                        for s in spellchecker.suggest(text):
                            if s in Senses:
                                tokenSenses = Senses[s]
                                break
                        # Otherwise fall back to WordNet senses of the top suggestion.
                        if not tokenSenses and spellchecker.suggest(text):
                            s = spellchecker.suggest(text)[0]
                            tokenSenses = {POS: range(1, len(pywordnet.getWord(s, POS).getSenses()) + 1)}
                        if s and Options.Spellcheck:
                            logger.warning("extractSurfaceSemantics : Found spelling correction %s for %s", s, text)
                            text = s
                        # logger.debug('*** extractSurfaceSemantics : Implement spelling correction. *** ')
                        # raise KeyError
                    except KeyError:
                        logger.error("extractSurfaceSemantics : Unknown token: %s", text)
                        return default
        # Handle experienced typos.
        if "see" in tokenSenses:  ### FIXME adding to dict for typos that are other words
            text = tokenSenses["see"]
            try:
                tokenSenses = Senses[text]
            except:
                return default
        # Handle morphology variants that wordnet understands.
        elif isinstance(tokenSenses, tuple):
            # NOTE(review): item assignment on a tuple would raise TypeError;
            # this branch looks broken as written -- confirm intent.
            text, tokenSenses[POS] = tokenSenses[POS]
        try:
            return "_".join([text, POS, ",".join([str(i) for i in tokenSenses[POS]])])
        except KeyError:
            # logger.warning('extractSurfaceSemantics : Expected POS %s for token %s, Got %s, Using %s',
            #     POS, token, tokenSenses.keys(), tokenSenses.keys()[0])
            # Expected POS absent: fall back to the first POS the table offers.
            if tokenSenses.keys():
                POS = token["POS"] = tokenSenses.keys()[0]
                return "_".join([text, POS, ",".join([str(i) for i in tokenSenses.values()[0]])])
        except Exception, e:
            logger.error(
                "extractSurfaceSemantics: %s: Could not find sense %s for token %s", e, POS, token
            )
            # tokenSenses, text