import nltk
from nltk.collocations import BigramCollocationFinder, TrigramCollocationFinder


def __init__(self, tokens):
    # phrasal stopwords to drop from the final phrase list
    stopwords = {('5', 'stars')}
    num_words = len(tokens)

    # top bigrams by PMI, ignoring bigrams rarer than 0.02% of the corpus
    finder = BigramCollocationFinder.from_words(tokens)
    bigram_measures = nltk.collocations.BigramAssocMeasures()
    finder.apply_freq_filter(int(.0002 * num_words))  # some parameter tuning?
    bigrams = finder.nbest(bigram_measures.pmi, 15)

    # top trigrams by PMI, with a looser frequency filter
    tfinder = TrigramCollocationFinder.from_words(tokens)
    trigram_measures = nltk.collocations.TrigramAssocMeasures()
    tfinder.apply_freq_filter(int(.0001 * num_words))
    trigrams = tfinder.nbest(trigram_measures.pmi, 10)

    # merge bigrams and trigrams: build candidate trigrams from pairs of
    # bigrams that overlap on one word
    phrases = list(bigrams)  # copy so we don't mutate `bigrams` below
    combined = []
    for bigram in bigrams:
        # drop the bigram tuple itself ({bigram}, not set(bigram), which
        # would build a set of the bigram's two words)
        other_bigrams = list(set(bigrams) - {bigram})
        for other_bigram in other_bigrams:
            if bigram[1] == other_bigram[0]:
                combined.append((bigram[0], bigram[1], other_bigram[1]))

    # a top trigram that also arises as a merged bigram pair replaces
    # its two component bigrams
    for trigram in trigrams:
        if trigram in set(combined):
            phrases.append(trigram)
            phrases.remove((trigram[0], trigram[1]))
            phrases.remove((trigram[1], trigram[2]))

    # keep noun phrases only (head word mostly tagged noun), drop stopwords
    p = POS()
    self.phrases = [phrase for phrase in phrases
                    if p.percent_noun(phrase[-1]) > 0.5
                    and phrase not in stopwords]
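# Usage sketch for the collocation-based phrase extractor above. The class
# name `PhraseExtractor` is hypothetical (only the __init__ is shown), and
# `tokens` is assumed to be a flat list of lowercased word tokens.
import nltk

reviews = open('reviews.txt').read().lower()
tokens = nltk.word_tokenize(reviews)
extractor = PhraseExtractor(tokens)   # hypothetical wrapper class
print(extractor.phrases)              # e.g. [('customer', 'service'), ...]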
import nltk
import pandas as pd


def __init__(self, tokens, category):
    # these generic words will be ignored
    stopwords = ['product', 'price', 'reviews', 'unit', 'model',
                 'purchase', 'amount', 'item']
    # add the category words and their plurals (crude) to the stopwords
    for el in category.split():
        stopwords.append(el)
        stopwords.append(el + 's')  # will have to change this!

    self.tokens = tokens
    self.num_words = len(self.tokens)

    # frequency distribution over the review tokens
    self.unigram_fd = nltk.FreqDist(self.tokens)
    self.unique_words = len(self.unigram_fd)

    # frequent unigram nouns: top 2% of the vocabulary, nouns only
    pos = POS()
    common_unigrams = self.unigram_fd.most_common(int(.02 * self.unique_words))
    self.unigrams = [pair for pair in common_unigrams
                     if pair[0] not in stopwords
                     and pos.percent_noun(pair[0]) > 0.5]
    # A raw-count threshold (pair[1] > int(.001 * self.num_words)) was also
    # tried here; results were slightly better without it.

    # DataFrame of counts indexed by word, review corpus
    words, counts = zip(*self.unigrams)
    df_reviews = pd.DataFrame(list(counts), index=list(words),
                              columns=['count_reviews'])

    # word list from the 'generic' corpus; count its unigrams, bigrams
    # and trigrams
    generic_words = self.chat_words()
    self.generic_words = [w.lower() for w in generic_words]
    generic_fd = (nltk.FreqDist(generic_words)
                  + nltk.FreqDist(nltk.bigrams(generic_words))
                  + nltk.FreqDist(nltk.trigrams(generic_words)))
    generic_keys, generic_counts = zip(*generic_fd.items())
    df_generic = pd.DataFrame(list(generic_counts), index=list(generic_keys),
                              columns=['count_generic'])

    # merge the two on words; words absent from the generic corpus get 0
    df = df_reviews.join(df_generic)
    self.df = df.fillna(0)

    # score words by log term frequency times inverse generic frequency,
    # then order from highest to lowest score
    term_freq = self.term_freq_log()
    inverse_generic_freq = self.inverse_generic_freq()
    self.scores = (term_freq * inverse_generic_freq).sort_values()
    self.unigrams = list(reversed(self.scores.index))
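# The helpers term_freq_log() and inverse_generic_freq() are not shown in
# this snippet. A minimal sketch of what a scoring pair of this shape could
# look like (assumed forms; the real methods may differ):
import numpy as np


def term_freq_log(self):
    # log-damped frequency of each word in the review corpus
    return np.log(1 + self.df['count_reviews'])


def inverse_generic_freq(self):
    # down-weight words that are common in the generic chat corpus
    generic_total = self.df['count_generic'].sum()
    return np.log(generic_total / (1 + self.df['count_generic']))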
def frequent_nouns_counts(self, tokens):
    """Return frequently mentioned nouns WITH COUNTS."""
    unigram_fd = nltk.FreqDist(tokens)
    pos = POS()
    common_unigrams = unigram_fd.most_common(int(self.top_pct * len(unigram_fd)))
    nouns = [pair for pair in common_unigrams
             if pair[0] not in self.stopwords()
             and pos.percent_noun(pair[0]) > 0.5]
    return nouns
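# Usage sketch for frequent_nouns_counts(). The host class is not shown; it
# is assumed to expose a `top_pct` attribute (fraction of the vocabulary to
# scan) and a stopwords() method, as the method body implies.
import nltk

extractor = NounExtractor(top_pct=0.02)   # hypothetical host class
tokens = nltk.word_tokenize(open('reviews.txt').read().lower())
for noun, count in extractor.frequent_nouns_counts(tokens):
    print(noun, count)                    # e.g. "battery 137"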
def phrases(self, tokens):
    bigrams = self.top_bigrams(tokens)
    trigrams = self.top_trigrams(tokens)
    phrases = self.merged_phrases(bigrams, trigrams)
    # clean the phrases to be NOUN PHRASES and w/o phrasal stopwords
    p = POS()
    stopwords = {('5', 'stars')}
    phrases = [phrase for phrase in phrases
               if p.percent_noun(phrase[-1]) > 0.5
               and phrase not in stopwords]
    return phrases
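# Usage sketch for phrases(). top_bigrams(), top_trigrams() and
# merged_phrases() are assumed to be the refactored pieces of the first
# __init__ snippet above (PMI-ranked collocations plus the overlap merge).
extractor = PhraseExtractor()             # hypothetical host class
for phrase in extractor.phrases(tokens):
    print(' '.join(phrase))               # e.g. "battery life"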
from SSBParser import SSBParser
from POS import POS
from unknown_word_handler import unknown_word_handler

parser = SSBParser()
train_clean_file = 'BROWN-clean.pos.txt'
test_clean_file = 'SnapshotBROWN-clean.pos.txt'
parser.gen_clean_file('BROWN.pos.all', train_clean_file)
parser.gen_clean_file('SnapshotBROWN.pos.all.txt', test_clean_file)

print('1(i)  : Baseline statistical tagger implemented for the entire Brown corpus.')
print('***********************************************************************')
print('1(ii) : Calculating performance for the snapshot.')
pos = POS(train_clean_file, test_clean_file)
print('***********************************************************************')
print('1(iii): Calculating performance for news collected from the web (input file: news.txt).')
uwh = unknown_word_handler('news.txt', 'news-clean.txt', pos)
test_clean_file = 'news-clean.txt'
pos = POS(train_clean_file, test_clean_file)
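# The POS class trains and scores the baseline tagger; its internals are not
# shown here. As a rough illustration of what a baseline statistical tagger
# on Brown usually means (most-frequent tag per word, unknowns defaulting to
# 'NN'), here is an NLTK sketch; the actual POS implementation may differ.
import nltk
from nltk.corpus import brown

train_sents = brown.tagged_sents()[:50000]
baseline = nltk.UnigramTagger(train_sents,
                              backoff=nltk.DefaultTagger('NN'))
# use .evaluate() instead of .accuracy() on NLTK versions before 3.6
print(baseline.accuracy(brown.tagged_sents()[50000:51000]))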
class runable(object):
    '''
    Class for selecting and extracting keywords from online content.
    '''
    def __init__(self, llwl='Brown', llNL=2, percen=80, NE=True, Col=True,
                 Gram=True, Chu=True):
        '''
        @param llwl: LogLikelihood corpus name ('Brown', 'AmE06', 'BE06')
        @param llNL: LogLikelihood n-gram length
        @param percen: percentile cutoff for output (default 80, i.e. the
                       top 20% of keywords is returned)
        @param NE: use named-entity extraction (default True)
        @param Col: use collocations (default True)
        @param Gram: use N-grams (default True)
        @param Chu: use chunking (default True)
        '''
        self.NEs = NE
        self.Col = Col
        self.Gram = Gram
        self.Chu = Chu
        self.p = percen
        print('Starting to build', llwl)
        self.LL = LogLikelihood(wordlist=llwl, NLength=llNL)
        print('LL Loaded')
        self.POS = POS()
        print('POS Loaded')
        self.GD = GetData()
        print('GD Loaded')
        self.Cu = Chunker(self.POS)
        print('Cu Loaded')
        self.FL = Filter()
        print('FL Loaded')
        self.CC = Collocation(self.POS)
        print('CC Loaded')
        self.Ng = NGram()
        print('Ng Loaded')
        self.S = Select(percentil=self.p)
        print('S Loaded')
        self.To = Tokenize(self.FL)
        print('To Loaded')

    def Select(self, url, depth):
        '''
        Determine the best keywords for a webpage.
        @param url: the base URL to start sampling from
        @param depth: the depth of the website to be sampled
        @return: the list of selected keywords, ordered with the
                 highest-rated words at the start of the array
        '''
        # get data from the web page
        text = self.GD.getWebPage(url, depth)
        # tokenize sentences and words
        tok = self.To.Tok(text)
        # POS-tag the text
        pos = self.POS.POSTag(tok, 'tok')
        # log likelihood
        log = self.LL.calcualte(tok)  # [sic] name as defined by LogLikelihood
        # collocations
        col = self.CC.col(pos, tok) if self.Col else []
        # named-entity extraction
        if self.NEs:
            ne = self.Cu.Chunks(pos,
                                nodes=['PERSON', 'ORGANIZATION', 'LOCATION'])
        else:
            ne = []
        # extract noun phrases
        chu = [self.Cu.parse(p) for p in pos] if self.Chu else []
        # create N-grams
        ga = self.Ng.Grams(pos, n=6) if self.Gram else []
        return self.S.keywords(ne, ga, col, chu, log)
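# Usage sketch for runable, assuming the collaborator classes (LogLikelihood,
# GetData, Chunker, etc.) are importable from this package. The URL and depth
# are placeholders.
r = runable(llwl='Brown', percen=80)   # keep keywords above the 80th percentile
keywords = r.Select('http://example.com', depth=2)
print(keywords[:10])                   # highest-rated keywords first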