def extrapolate(self, sent):
    # tags the part of speech in each word
    tagged = pos_tag(word_tokenize(sent))
    tag_list = []
    for item in tagged:
        tag_list.append(list(item))

    # puts nouns and verbs in their base form
    for idx, item in enumerate(tag_list):
        if item[1][0] == 'V':
            tag_list[idx][0] = wnl().lemmatize(item[0], 'v')
        elif item[1] == 'NN' or item[1] == 'NNS':
            tag_list[idx][0] = wnl().lemmatize(item[0], 'n')

    synonyms = [[] for i in range(len(tag_list))]
    # finds synonyms for each noun, verb and adjective in tag_list and puts
    # them in the corresponding index in synonyms
    for idx, item in enumerate(tag_list):
        if item[1][0] == 'V':
            synonyms[idx] = self.find_synonyms(item[0], wordnet.VERB)
            # for v in synonyms[idx]:
            #     v = en.verb.past(v)
        elif item[1] == 'NN' or item[1] == 'NNS':
            synonyms[idx] = self.find_synonyms(item[0], wordnet.NOUN)
        elif item[1][0] == 'J':
            synonyms[idx] = self.find_synonyms(item[0], wordnet.ADJ)

    # gets rid of duplicate synonyms at each index
    for si, s in enumerate(synonyms):
        synonyms[si] = list(set(s))
        # print(tag_list[si][0], ": ", synonyms[si])
    self.sent_syns = synonyms

    search_sent = []
    # creates a list of similar sentences to search for
    for idx, item in enumerate(tag_list):
        # substitutes each synonym for the word at the corresponding index
        for s in synonyms[idx]:
            temp = sub(r"\b%s\b" % item[0], s, sent)
            search_sent.append(temp)
    # gets rid of duplicate sentences
    search_sent = list(set(search_sent))
    # print("\nSample list of synonymous sentences:")
    # for i in range(min(len(search_sent), 20)):
    #     print(search_sent[i])
    return search_sent

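# The find_synonyms() helper called in extrapolate() is not shown here. The
# sketch below is a hypothetical stand-in (an assumption, not the original
# implementation) that collects lemma names from WordNet synsets for a word
# with the given part of speech.
from nltk.corpus import wordnet

def find_synonyms_sketch(word, pos):
    synonyms = []
    for synset in wordnet.synsets(word, pos=pos):
        for lemma_name in synset.lemma_names():
            # WordNet joins multi-word lemmas with underscores; use spaces instead.
            synonyms.append(lemma_name.replace('_', ' '))
    return list(set(synonyms))

# Example: find_synonyms_sketch('run', wordnet.VERB) returns words such as
# 'operate', 'function' and 'execute' alongside 'run' itself.
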
def calc_fd(self):
    """
    This function calculates the frequency distributions of unigrams, bigrams
    and trigrams from all the scraped ads (combined). It also finds the bigrams
    and trigrams that are likely combinations of words (i.e. those that make
    sense).

    Output:
    fd_mono - unigram frequency distribution.
    fd_bi - bigram frequency distribution.
    fd_tri - trigram frequency distribution.
    goodbi - bigrams with a high pointwise mutual information score.
    goodtri - trigrams with a high pointwise mutual information score.
    """
    all_mono = []
    all_bi = []
    all_tri = []
    lmt = wnl()
    combined_ads = []

    # Concatenate unigrams, bigrams and trigrams from different ads together so
    # that we don't need to make frequency distributions for each one (we only
    # care about the collective anyway).
    for ad in self.ads:
        combined_ads = combined_ads + ad
        btemp = nltk.bigrams(ad)
        all_bi += btemp
        ttemp = nltk.trigrams(ad)
        all_tri += ttemp
        # Lemmatize unigrams (this prevents things like cat and cats from being
        # counted as different words).
        mono_stem = [lmt.lemmatize(w) for w in ad]
        all_mono += mono_stem

    # Do some collocation analysis using pmi - pointwise mutual information.
    # This measures how likely it is that a bigram/trigram of words actually
    # makes sense together.
    bigram_measures = nltk.collocations.BigramAssocMeasures()
    trigram_measures = nltk.collocations.TrigramAssocMeasures()
    finder = BigramCollocationFinder.from_words(combined_ads)
    # Only keep grams that occur in at least half of the ads, then take the
    # 100 with the highest pmi.
    finder.apply_freq_filter(0.5 * self.n_ad)
    goodbi = finder.nbest(bigram_measures.pmi, 100)
    finder = TrigramCollocationFinder.from_words(combined_ads)
    finder.apply_freq_filter(0.5 * self.n_ad)
    goodtri = finder.nbest(trigram_measures.pmi, 100)

    # Calculate frequency distributions.
    fd_mono = nltk.FreqDist(all_mono)
    fd_bi = nltk.FreqDist(all_bi)
    fd_bi = KEY_TUPLE_TO_LIST(fd_bi)
    fd_tri = nltk.FreqDist(all_tri)
    fd_tri = KEY_TUPLE_TO_LIST(fd_tri)
    goodbi = TUPLE_TO_LIST(goodbi)
    goodtri = TUPLE_TO_LIST(goodtri)

    return fd_mono, fd_bi, fd_tri, goodbi, goodtri

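# A minimal, self-contained sketch of the collocation step used in calc_fd()
# above. The toy token list is made up; BigramCollocationFinder and the pmi
# measure are the same NLTK APIs as in the function.
import nltk

def pmi_bigrams_sketch(tokens, min_freq=2, top_n=5):
    bigram_measures = nltk.collocations.BigramAssocMeasures()
    finder = nltk.collocations.BigramCollocationFinder.from_words(tokens)
    finder.apply_freq_filter(min_freq)  # drop bigrams seen fewer than min_freq times
    return finder.nbest(bigram_measures.pmi, top_n)  # top_n bigrams ranked by pmi

# Example: "new york" surfaces as the only high-pmi bigram in this toy corpus.
# print(pmi_bigrams_sketch("new york is far from york county in new york state".split()))
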
def calc_all_fd(self):
    """
    This function creates a mega-frequency distribution dictionary. It takes a
    list of frequency distributions (from each ad) for each job and puts all of
    this info into a single dictionary keyed to job name.

    Result:
    self.fd_all - the mega-dictionary described above.
    """
    fd_all = []
    lmt = wnl()
    for ad in self.ads:
        btemp = nltk.bigrams(ad)
        fd_btemp = nltk.FreqDist(btemp)
        fd_btemp = KEY_TUPLE_TO_LIST(fd_btemp)
        ttemp = nltk.trigrams(ad)
        fd_ttemp = nltk.FreqDist(ttemp)
        fd_ttemp = KEY_TUPLE_TO_LIST(fd_ttemp)
        mono_stem = [lmt.lemmatize(w) for w in ad]
        fd_mtemp = nltk.FreqDist(mono_stem)
        # Merge the unigram, bigram and trigram counts for this ad into one dict.
        fd_all.append(dict(list(fd_mtemp.items()) +
                           list(fd_btemp.items()) +
                           list(fd_ttemp.items())))
    self.fd_all = fd_all

def clean(comment, dtm=False, lemmatize=False, stop_words=False):
    # Strip URLs, @-mentions, punctuation and digits before tokenizing.
    no_http = re.sub(r'''https?://[\w/._-]+''', '', comment)
    no_at = re.sub(r'@\w+', '', no_http)
    no_punc = re.sub(f'[{string.punctuation}]', '', no_at)
    no_nums = re.sub(r'[0-9]+', '', no_punc)
    cleaned = nltk.word_tokenize(no_nums)
    if lemmatize:
        lemma = wnl()
        cleaned = [lemma.lemmatize(i) for i in cleaned]
    if stop_words:
        # lang = language(comment)
        stop_words = set(stopwords.words('english'))
        cleaned = [i for i in cleaned if i not in stop_words and len(i) > 1]
    if dtm:
        # Return a document-term matrix instead of the token list.
        cv = cvec()
        cv_df = cv.fit_transform(cleaned)
        cleaned = pd.DataFrame(cv_df.toarray(), columns=cv.get_feature_names())
    return cleaned

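# Hypothetical usage of clean() above (the sample comment is made up; assumes
# the NLTK 'punkt' tokenizer, WordNet data and stopwords corpus are installed):
# >>> clean("Check https://example.com @user I counted 12 cats and 3 dogs!!!",
# ...       lemmatize=True, stop_words=True)
# ['Check', 'counted', 'cat', 'dog']
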
def lemmatize_nouns_in_tweet(self, tweet):
    # print('Nouns lemmatization...', end=' ')
    # timestamp1 = time.time()
    l = ' '.join([wnl().lemmatize(t) for t in tweet.split()])
    # timestamp2 = time.time()
    # print('{0:.2f} seconds'.format(timestamp2 - timestamp1))
    return l

def lemmatize(self, token_list):
    lemmatizer = wnl()
    lemmed_tokens = []
    tagged = nltk.pos_tag(token_list)
    for word, pos_tag in tagged:
        word = ''.join(
            [letter for letter in word if letter in self.alphabet])
        lem_word = lemmatizer.lemmatize(word,
                                        pos=self._get_wordnet_pos(pos_tag))
        lemmed_tokens.append(lem_word)
    return lemmed_tokens

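# The _get_wordnet_pos() helper used above is not shown. The sketch below is a
# hypothetical version (an assumption, not the original) of the usual Penn
# Treebank tag -> WordNet POS mapping that lemmatize() depends on.
from nltk.corpus import wordnet

def _get_wordnet_pos_sketch(treebank_tag):
    if treebank_tag.startswith('J'):
        return wordnet.ADJ
    if treebank_tag.startswith('V'):
        return wordnet.VERB
    if treebank_tag.startswith('R'):
        return wordnet.ADV
    # WordNetLemmatizer defaults to nouns, so fall back to NOUN for other tags.
    return wordnet.NOUN
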
def gen_text_score(sentence_a, sentence_b):
    print("Generating similarity score")
    jaccard_coefficient = 0.0
    lemma_for_senta = [
        wnl().lemmatize(word.lower().strip(), tag)
        for word, tag in tags_for_sent(sentence_a)
        if check_for_tags_and_stopwords(word, tag)
    ]
    lemma_for_sentb = [
        wnl().lemmatize(word.lower().strip(), tag)
        for word, tag in tags_for_sent(sentence_b)
        if check_for_tags_and_stopwords(word, tag)
    ]
    jaccard_coefficient = len(
        set(lemma_for_senta).intersection(lemma_for_sentb)) / float(
            len(set(lemma_for_senta).union(lemma_for_sentb)))
    return jaccard_coefficient

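# Worked example of the Jaccard computation above with made-up lemma sets:
# set_a = {'cat', 'sit', 'mat'} and set_b = {'cat', 'sleep', 'mat'} give an
# intersection of 2 ({'cat', 'mat'}) and a union of 4, so the score is 2 / 4 = 0.5.
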
def __ngramwords(self, txtdata):
    stop_words = set(stopwords.words('english'))  # set of stop words
    # temporary word list
    temp_wordlist = []
    # split the text into sentences and tokenize each one with nltk
    senlist = nltk.sent_tokenize(txtdata, language="english")
    wordlist = []
    for sent in senlist:
        wordlist.append(nltk.word_tokenize(sent, language="english"))
    # add a POS tag to each word
    tags = []
    for tokens in wordlist:
        tags.append(nltk.pos_tag(tokens, lang='eng'))
    # filter words by part of speech
    for sent in tags:
        for words in sent:
            # filter stop words
            if words[0] in stop_words:
                continue
            # print(words[0] + ":" + words[1])
            if words[1][:2] in self.word_type_list_In and words[1] not in self.word_type_list_Ex \
                    and str(words[0]).lower() not in self.word_list_Ex:
                if words[1][:2] == "VB":
                    # reduce verbs to their base form
                    add_word = wnl().lemmatize(words[0], "v")
                elif words[1][:2] == "NN":
                    # reduce nouns to their base form with morphy
                    # (fall back to the original token if morphy finds nothing)
                    add_word = wordnet.morphy(words[0]) or words[0]
                elif words[1][:3] in ("JJR", "JJS"):
                    # reduce comparative/superlative adjectives to the base form
                    add_word = wnl().lemmatize(word=words[0], pos=wordnet.ADJ)
                elif words[1][:3] in ("RBR", "RBS"):
                    # reduce comparative/superlative adverbs to the base form
                    add_word = wnl().lemmatize(word=words[0], pos=wordnet.ADV)
                else:
                    add_word = words[0]
                temp_wordlist.append(add_word)
    # create n-gram words
    ngram_wordslist = self.__word_grams(temp_wordlist, 1, 3)
    return ngram_wordslist

def pickNounAndLemmatize(self, sentences):
    wordList = []
    for sentence in sentences:
        temp = []
        words = sentence.split(" ")
        words = list(filter(None, words))
        tag_tuples = nltk.pos_tag(words)
        for tup in tag_tuples:
            if 'NN' in tup[1]:
                # strip non-letters, lowercase and lemmatize, then keep the word
                # only if it is not a stopword and is longer than two characters
                word = re.sub('[^A-Za-z]+', '', tup[0])
                word = word.lower()
                lmtzr = wnl()
                word = lmtzr.lemmatize(word)
                if (self.not_stopword(word) and len(word) > 2):
                    temp.append(word)
        wordList.append(temp)
    return wordList

def process_text(message: str):
    # Create stop words
    stop_words = stopwords.words("english")
    stop_words += CUSTOMIZED_STOP_WORDS
    # Remove everything that is not a letter
    text = re.sub("[^a-zA-Z]", ' ', message)
    # Convert to lowercase
    text = text.lower()
    # Collapse remaining digits and non-word characters (runs of spaces)
    text = re.sub('(\\d|\\W)+', " ", text)
    # Create tokens (deduplicated)
    text = list(set(nltk.word_tokenize(text)))
    text = [wnl().lemmatize(word) for word in text if word not in stop_words]
    # Map words onto their canonical form where a known similar word exists
    for i in range(len(text)):
        if text[i] in similar_words.similar_words_dict:
            text[i] = similar_words.similar_words_dict[text[i]]
    return text

def process(question):
    if const.showlog == 1:
        print("[wiki-buddy] Parsing user question...")
    question = question.replace("?", "")
    spquestion = question.split()
    tagwords = pos_tag(spquestion)
    verbindex = []
    caveat = ""
    splitnouns = False
    # question words and verbs in the question
    rawqtype = [word for word, pos in tagwords
                if pos in ('WDT', 'WP', 'WP$', 'WRB')]
    rawverb = [word for word, pos in tagwords
               if pos in ('VB', 'VBD', 'VBG', 'VBN', 'VBP', 'VBZ', 'TO')]
    # blank out the question words and verbs, remembering where the verbs were
    for a in range(0, len(spquestion)):
        for n in range(0, len(rawqtype)):
            if rawqtype[n] in spquestion[a]:
                spquestion[a] = ""
        for n in range(0, len(rawverb)):
            if rawverb[n] in spquestion[a]:
                spquestion[a] = ""
                verbindex.append(a)
    # anything between the question word and the first verb becomes the caveat
    if len(verbindex) > 0 and verbindex[0] > 1:
        for n in range(1, verbindex[0]):
            caveat = caveat + " " + spquestion[n]
        caveat = caveat.strip()
        question = question.replace(caveat, " ")
        spquestion = question.split()
        tagwords = pos_tag(spquestion)
    # prefer proper nouns for the keyword if any are present
    pindex = [word for word, pos in tagwords if pos in ('NNP', 'NNPS')]
    if len(pindex) > 0:
        rawnoun = [word for word, pos in tagwords
                   if pos in ('NN', 'NNS', 'NNP', 'NNPS', 'IN', 'CC', 'CD',
                              'JJ', 'JJR', 'JJS')]
    else:
        rawnoun = [word for word, pos in tagwords
                   if pos in ('NN', 'NNS', 'IN', 'CC', 'CD', 'JJ', 'JJR', 'JJS')]
    useless = [word for word, pos in tagwords if pos == 'DT']
    for a in range(0, len(spquestion)):
        for n in range(0, len(rawnoun)):
            if rawnoun[n] in spquestion[a]:
                spquestion[a] = ""
        for n in range(0, len(useless)):
            if useless[n] in spquestion[a]:
                spquestion[a] = ""
    question = " ".join(spquestion)
    qtype = " ".join(rawqtype)
    for n in range(0, len(rawverb)):
        if rawverb[n] in const.omitverblist:
            rawverb[n] = ""
    verb = " ".join(rawverb)
    keyword = " ".join(rawnoun)
    verb = verb.strip()
    caveat = caveat + " " + verb
    qtype = qtype.strip()
    keyword = keyword.strip()
    caveat = caveat.strip()
    qtype = "[" + qtype.lower() + "]"
    keyword = wnl().lemmatize(keyword)
    caveat = caveat.strip()
    if caveat == "" and " " in keyword:
        keyword, caveat = splitkey(keyword)
    tokenizer = nltk.data.load('tokenizers/punkt/english.pickle')
    if const.showlog == 1:
        print("[wiki-buddy] Reading Wikipedia articles for keyword '" + keyword + "'...")
    display_url = ""
    try:
        rawdata = wikipedia.page(title=keyword, auto_suggest=True)
        fulltext = rawdata.content
        display_url = rawdata.url
    except:
        fulltext = ""
    try:
        summary = wikipedia.summary(keyword)
    except:
        summary = ""
    try:
        categories = wikipedia.page(title=keyword, auto_suggest=True).categories
    except:
        categories = []
    if fulltext != "":
        sentences = nltk.sent_tokenize(fulltext)
        for n in range(0, len(const.omitpuctlist)):
            fulltext = fulltext.replace(const.omitpuctlist[n], "")
    elif summary != "":
        sentences = nltk.sent_tokenize(summary)
        for n in range(0, len(const.omitpuctlist)):
            summary = summary.replace(const.omitpuctlist[n], "")
    else:
        qtype = "[null]"
        sentences = []
    words = fulltext.split()
    for n in range(0, len(words)):
        words[n] = wnl().lemmatize(words[n])
    if const.showlog == 1:
        print("[wiki-buddy] User question was processed into the following chunks.")
        print(" Question type: " + qtype)
        print(" Keyword: " + keyword)
        print(" Caveat: " + caveat)
    return qtype, keyword, caveat, fulltext, summary, sentences, words, categories, display_url

def nltk_stemming(word):
    lmtzr = wnl()
    return lmtzr.lemmatize(word)

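# Despite its name, nltk_stemming() lemmatizes rather than stems. A quick
# doctest-style illustration (assumes the WordNet corpus is downloaded):
# >>> nltk_stemming('corpora')
# 'corpus'
# >>> nltk_stemming('running')   # no POS hint, so the noun reading is kept as-is
# 'running'
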
def remove_junk(self, tm, tb, tt, gb, gt):
    """
    This code cleans the keyword list (i.e. removes similar words, etc.)

    Input:
    tm - dictionary of unigram relevance scores.
    tb - dictionary of bigram relevance scores.
    tt - dictionary of trigram relevance scores.
    gb - bigrams with high pmi scores.
    gt - trigrams with high pmi scores.

    Output:
    tm, tb and tt cleaned.
    """
    # Filter out all of the bigrams/trigrams with low pmi scores (i.e. only
    # keep the ones in gb and gt).
    tempb = {}
    for term in gb:
        tempb[term] = tb.get(term, 0)
    tb = tempb
    tempt = {}
    for term in gt:
        tempt[term] = tt.get(term, 0)
    tt = tempt

    # Get rid of terms that contain the job name.
    job_name = self.name
    if ' ' in str(job_name):
        sjn = job_name.split()
        for w in sjn:
            tm.pop(w, None)
        if len(sjn) == 2:
            tb.pop(job_name, None)
        elif len(sjn) == 3:
            tt.pop(job_name, None)
    else:
        tm.pop(job_name, None)

    # Set the stopwords + other words that seem to come up often but obviously
    # make no sense.
    sw = stopwords.words('english') + \
        ['yes', 'no', 'your', 'youll', 'benefits', 'go', 'river', 'amp',
         'us', 'e', 'permit', 'requires', 'work', 'types', 'dot', 'without',
         'plus', 'must', 'way', 'new', 'job', 'click', 'http', 'winning',
         '/', 'intended', 'youre', 'location', 'conditions', 'sized',
         'use', 'may', 'june', 'year', 'o', 'g', 'n', 'take', 'right',
         'term', 'always', 'existing', 'onto', 'youve', 'experience',
         'really', 'ensure', 'difference', 'ensures', 'v', 'years', 'onto']

    monopop = []
    bipop = []
    tripop = []
    # Remove stop words (or terms containing stop words).
    for key in tm.keys():
        if key in sw:
            monopop.append(key)
    for key in tb.keys():
        k = key.split()
        for w in k:
            if w in sw:
                bipop.append(key)
                break
    for key in tt.keys():
        k = key.split()
        for w in k:
            if w in sw:
                tripop.append(key)
                break
    for p in set(monopop):
        tm.pop(p, None)
    for p in set(bipop):
        tb.pop(p, None)
    for p in set(tripop):
        tt.pop(p, None)

    # Take care of trigrams with duplicating bigrams (i.e. banker residential
    # brokerage and residential brokerage company, etc.): remove the one with
    # the lower relevance score (or if equal, just keep one).
    monopop = []
    bipop = []
    tripop = []
    # maxr maps each constituent bigram to (best relevance, trigram that holds it).
    maxr = {}
    for term, rel in tt.items():
        k = term.split()
        c1 = k[0] + ' ' + k[1]
        c2 = k[1] + ' ' + k[2]
        if maxr.get(c1, (0, ''))[0] <= rel:
            maxr[c1] = (rel, term)
        if maxr.get(c2, (0, ''))[0] <= rel:
            maxr[c2] = (rel, term)
    for term, rel in tt.items():
        k = term.split()
        c1 = k[0] + ' ' + k[1]
        c2 = k[1] + ' ' + k[2]
        if maxr[c1][1] != term and maxr[c2][1] != term:
            tripop.append(term)

    # Do some lemmatizing on the units making up bigrams and trigrams to get
    # rid of similar words or bigrams contained in trigrams (unigrams contained
    # in bigrams).
    lmt = wnl()
    for term, rel in tt.items():
        k = term.split()
        c1 = k[0] + ' ' + k[1]
        c2 = k[1] + ' ' + k[2]
        if math.fabs(rel - tb.get(c1, 1e5)) < 140. and rel > 70.:
            bipop.append(c1)
        if math.fabs(rel - tb.get(c2, 1e5)) < 140. and rel > 70.:
            bipop.append(c2)
        if c1 == job_name or c2 == job_name:
            tripop.append(term)
        kl = [lmt.lemmatize(w) for w in k]
        if (math.fabs(rel - tm.get(kl[0], rel)) < 140. or
                math.fabs(rel - tm.get(kl[1], rel)) < 140. or
                math.fabs(rel - tm.get(kl[2], rel)) < 140.) and rel > 34.:
            monopop += kl
    for term, rel in tb.items():
        k = term.split()
        kl = [lmt.lemmatize(w) for w in k]
        if (math.fabs(rel - tm.get(kl[0], rel)) < 70. or
                math.fabs(rel - tm.get(kl[1], rel)) < 70.) and rel > 17.:
            monopop += kl

    # Do some stemming on unigrams to get rid of similar words.
    stemmer = nltk.PorterStemmer()
    for t1, r1 in tm.items():
        if r1 < 17.:
            continue
        st1 = stemmer.stem(t1)
        for t2, r2 in tm.items():
            if t1 == t2 or r2 < 17.:
                continue
            st2 = stemmer.stem(t2)
            # Drop whichever of the two has the lower relevance score.
            if r1 < r2:
                small = t1
            else:
                small = t2
            if (st1 == st2) or (st1 == st2 + "e") or (st1 + "e" == st2) or \
                    (st1[:-1] == st2) or (st1 == st2[:-1]) or \
                    (st1[:-1] == st2[:-1]):
                monopop.append(small)
    for p in set(monopop):
        tm.pop(p, None)
    for p in set(bipop):
        tb.pop(p, None)
    for p in set(tripop):
        tt.pop(p, None)

    return tm, tb, tt

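# Illustration of the stem-matching heuristic above, using NLTK's PorterStemmer:
# >>> stemmer = nltk.PorterStemmer()
# >>> stemmer.stem('company'), stemmer.stem('companies')
# ('compani', 'compani')
# Because the stems match, the lower-scoring of the two unigrams is dropped.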