def analyzer(self, question):
    def is_noun(tag):
        return tag in ['NN', 'NNS', 'NNP', 'NNPS']

    def is_verb(tag):
        return tag in ['VB', 'VBD', 'VBG', 'VBN', 'VBP', 'VBZ']

    def is_adverb(tag):
        return tag in ['RB', 'RBR', 'RBS']

    def is_adjective(tag):
        return tag in ['JJ', 'JJR', 'JJS']

    def penn_to_wn(tag):
        # Map a Penn Treebank tag to the corresponding WordNet POS constant.
        if is_adjective(tag):
            return wn.ADJ
        elif is_noun(tag):
            return wn.NOUN
        elif is_adverb(tag):
            return wn.ADV
        elif is_verb(tag):
            return wn.VERB
        return wn.NOUN

    # Example question: "How do i view my course on Canvas"
    keywords_list = []
    tagged_sent = nltk.pos_tag(word_tokenize(question))
    tokenizer = []
    mongo_dict = {}
    # Drop stop words and strip punctuation from the remaining tokens.
    for word_tuple in tagged_sent:
        if word_tuple[0] not in self.stop_words and word_tuple[0]:
            word_list = list(word_tuple)
            word_list[0] = re.sub('[!?%$*.@]', '', word_list[0])
            tokenizer.append(tuple(word_list))
    # Keep proper nouns as-is; lemmatize everything else and add its synonyms.
    for tag in tokenizer:
        print("tag", tag[0])
        print(self.dictionary.synonym(tag[0].lower()))
        if tag[1] == 'NNP':
            keywords_list.append(tag[0].lower())
        else:
            wn_tag = penn_to_wn(tag[1])
            word = WordNetLemmatizer().lemmatize(tag[0], wn_tag)
            print("word -->", word)
            print("self.dictionary.synonym(word.lower()) -->", self.dictionary.synonym(word.lower()))
            keywords_list.append(word.lower())
            synonym_list = self.dictionary.synonym(word.lower())
            if synonym_list:
                keywords_list.extend(synonym_list)
    mongo_dict["keywords"] = list(set(keywords_list))
    mongo_dict["text"] = "Yes, Canvas can be integrated with products like: McGraw-Hill Connect, " \
        "Macmillan Education, Cengage Learning MindTap, and Pearson's MyLab & Mastering. " \
        "Please visit: http://www.sjsu.edu/ecampus/teaching-tools/canvas/integrating-publisher-db/index.html for more information."
    print(mongo_dict)
    self.dbclient.insert(mongo_dict)
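# A minimal standalone sketch of the tag-mapping step above: POS-tag a sentence,
# translate each Penn Treebank tag to a WordNet POS, and lemmatize accordingly.
# Only the sample sentence is assumed; requires the usual NLTK data downloads
# (punkt, averaged_perceptron_tagger, wordnet).
import nltk
from nltk import word_tokenize
from nltk.corpus import wordnet as wn
from nltk.stem import WordNetLemmatizer

def demo_penn_to_wn(tag):
    if tag.startswith('JJ'):
        return wn.ADJ
    if tag.startswith('RB'):
        return wn.ADV
    if tag.startswith('VB'):
        return wn.VERB
    return wn.NOUN

lemmatizer = WordNetLemmatizer()
for token, tag in nltk.pos_tag(word_tokenize("How do I view my courses on Canvas")):
    print(token, '->', lemmatizer.lemmatize(token.lower(), demo_penn_to_wn(tag)))
# e.g. "courses -> course" while "canvas" is left unchanged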
def removeNoise(tokens, stopWords=()):
    cleaned_tokens = []
    for token, tag in pos_tag(tokens):
        # Strip URLs and @-mentions before lemmatizing.
        token = re.sub('http[s]?://(?:[a-zA-Z]|[0-9]|[$-_@.&+#]|[!*\(\),]|'
                       '(?:%[0-9a-fA-F][0-9a-fA-F]))+', '', token)
        token = re.sub("(@[A-Za-z0-9_]+)", "", token)
        # Map the Penn Treebank tag to a WordNet POS for the lemmatizer.
        if tag.startswith('NN'):
            pos = 'n'
        elif tag.startswith('VB'):
            pos = 'v'
        else:
            pos = 'a'
        token = WordNetLemmatizer().lemmatize(token, pos)
        if len(token) > 0 and token not in string.punctuation and token.lower() not in stopWords:
            cleaned_tokens.append(token.lower())
    return cleaned_tokens
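# Hedged usage sketch for removeNoise; the sample token list is invented.
from nltk import pos_tag
from nltk.corpus import stopwords
from nltk.stem import WordNetLemmatizer
import re, string

tokens = ["@user", "check", "https://example.com", "the", "movies", "were", "amazing"]
print(removeNoise(tokens, stopWords=stopwords.words('english')))
# e.g. ['check', 'movie', 'amazing'], depending on the tagger's choices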
def calcola_fattore_normalizzazione(all_ss, parola):
    # Normalization factor: the highest corpus count among lemmas in all_ss
    # matching any lemmatized reading of the word, floored at 0.1.
    nome = WordNetLemmatizer().lemmatize(parola, 'n')
    verbo = WordNetLemmatizer().lemmatize(parola, 'v')
    aggettivo = WordNetLemmatizer().lemmatize(parola, 'a')
    avverbio = WordNetLemmatizer().lemmatize(parola, 'r')
    fattore = 0.1
    for ss in all_ss:
        for l in ss.lemmas():
            if l.name().lower() == nome.lower() or l.name().lower() == verbo.lower() \
                    or l.name().lower() == aggettivo.lower() or l.name().lower() == avverbio.lower():
                if l.count() > fattore:
                    fattore = l.count()
    return fattore
def calcola_posizione_lemma(ss, parola):
    # Return the index of the word's lemma within the synset, or -1 if absent.
    nome = WordNetLemmatizer().lemmatize(parola, 'n')
    verbo = WordNetLemmatizer().lemmatize(parola, 'v')
    aggettivo = WordNetLemmatizer().lemmatize(parola, 'a')
    avverbio = WordNetLemmatizer().lemmatize(parola, 'r')
    i = 0
    for l in ss.lemmas():
        if l.name().lower() == nome.lower() or l.name().lower() == verbo.lower() \
                or l.name().lower() == aggettivo.lower() or l.name().lower() == avverbio.lower():
            return i
        i += 1
    return -1
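# Minimal sketch driving the two helpers above with NLTK's WordNet; assumes the
# wordnet corpus is downloaded.
from nltk.corpus import wordnet as wn
from nltk.stem import WordNetLemmatizer

all_ss = wn.synsets('dog')
print(calcola_fattore_normalizzazione(all_ss, 'dogs'))  # highest lemma count, at least 0.1
print(calcola_posizione_lemma(all_ss[0], 'dogs'))       # position of 'dog' in dog.n.01, here 0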
def semantic_distractor_de(model, word):
    # Ten nearest neighbours of the word's noun lemma in the embedding space.
    semantic_lst = []
    lemma = WordNetLemmatizer().lemmatize(word, pos="n")
    lemma = lemma.lower()
    for w in model.most_similar(lemma, [], 10):
        semantic_lst.append(w[0])
    return semantic_lst
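# Hedged usage sketch; the vector file name is hypothetical. most_similar is
# called positionally as (positive, negative, topn), matching the code above.
from gensim.models import KeyedVectors
from nltk.stem import WordNetLemmatizer

model = KeyedVectors.load_word2vec_format('embeddings.vec')  # assumed file
print(semantic_distractor_de(model, 'houses'))  # ten neighbours of 'house'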
def generate_distractor(model, word):
    distractor_set = set()
    #distractor_set.update(lst)
    print("answer:", word)
    print("1.(semantic)", semantic_distractor(model, word))
    #distractor_set.update(hypernym_distractor(word))
    print("2.(shape)", shape_distractor(word))
    print("3.(hypernym)", hypernym_distractor(word))
    #distractor_set.update(hyponym_distractor(word))
    print("4.(hyponym)", hyponym_distractor(word))
    #distractor_set.update(synonym_distractor(word))
    #distractor_set.update(antonym_distractor(word))
    antonym_lst = list(set(antonym_distractor(word)))
    print("5.(antonym)", antonym_lst)
    if len(antonym_lst) != 0:
        first_antonym = antonym_lst[0]
        print("6.(antonym's hypernym)", hypernym_distractor(first_antonym))
        print("7.(antonym's hyponym)", hyponym_distractor(first_antonym))
        print("8.(antonym's shape)", shape_distractor(first_antonym))
        antonym_semantic_lst = []
        antonym_lemma = WordNetLemmatizer().lemmatize(first_antonym, pos="n")
        antonym_lemma = antonym_lemma.lower()
        for w in model.most_similar(antonym_lemma, [], 5):
            antonym_semantic_lst.append(w[0])
        print("9.(antonym's semantic)", antonym_semantic_lst)
    #distractor_set.update(shape_distractor(word))
    print("* avoid synonyms:", set(synonym_distractor(word)) - {word}, "\n")
def Q2b():
    # nltk.download('wordnet')
    text = nltk.load('text.txt', encoding='gbk')
    # code for Q2a
    token_list = nltk.sent_tokenize(text)
    english_punctuations = [
        ',', '.', ':', ';', '?', '(', ')', '[', ']', '&', '!',
        '*', '@', '#', '$', '%', '"', '\'s', '``', "''", "-"
    ]
    token_list = [nltk.word_tokenize(sen) for sen in token_list]
    new_token = []
    for sens in token_list:
        sens = [word for word in sens if word not in english_punctuations]
        new_token.append(sens)
    new_token = [nltk.pos_tag(sen) for sen in new_token]
    print(new_token)
    lemmatized = []
    for sen in new_token:
        for word in sen:
            # Lemmatize verbs as verbs, everything else as nouns.
            if "V" in word[1]:
                w = WordNetLemmatizer().lemmatize(word[0].lower(), 'v')
            else:
                w = WordNetLemmatizer().lemmatize(word[0], 'n')
            lemmatized.append(w.lower())
    # test = [WordNetLemmatizer().lemmatize(new_token)]
    # print(new_token[1])
    print(lemmatized)
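# The POS passed to lemmatize changes the result, which is why Q2b branches on
# the tag; a quick self-contained check:
from nltk.stem import WordNetLemmatizer

wnl = WordNetLemmatizer()
print(wnl.lemmatize('running', 'v'))  # -> run
print(wnl.lemmatize('running', 'n'))  # -> running (treated as a noun)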
def noiseRemoval(reviewTokens, stop_words=()):
    # print("review token", reviewTokens)
    cleaned_tokens = []
    for token, tag in pos_tag(reviewTokens):
        # Strip URLs, @-mentions, and stray symbols.
        token = re.sub("[http[s]?://(!@#$;:!*%)(&^~])", '', token)
        # print("token", token)
        token = re.sub(r"http\S+", '', token)
        token = re.sub("(@[A-Za-z0-9_]+)", '', token)
        token = re.sub("[@#:),’]", '', token)
        token = re.sub(r'^https?:\/\/.*[\r\n]*', '', token)
        #token = re.sub("'’", '', token)
        #print(token, " ", tag)
        #print(token)
        # Map the Penn Treebank tag to a WordNet POS.
        if tag.startswith("VB"):
            pos = 'v'
        elif tag.startswith('NN'):
            pos = 'n'
        else:
            pos = 'a'
        rootWord = WordNetLemmatizer().lemmatize(token, pos)
        rootWord = rootWord.lower()
        if rootWord not in stop_words and rootWord not in string.punctuation:
            cleaned_tokens.append(rootWord)
    return cleaned_tokens
def find_lemma_opinion(word):
    # 's' is WordNet's satellite-adjective POS.
    if 'not ' in word:
        word = word.replace('not ', '')
        word = WordNetLemmatizer().lemmatize(word, 's')
        word = 'not ' + word
    else:
        word = WordNetLemmatizer().lemmatize(word, 's')
    return word.lower()
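# Quick check of find_lemma_opinion on a negated opinion word; in recent NLTK
# versions the 's' POS shares the adjective exception list.
from nltk.stem import WordNetLemmatizer

print(find_lemma_opinion('not worst'))  # -> 'not bad'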
def unify_word(word):  # went -> go, apples -> apple, BIG -> big
    """unify verb tense and noun singular"""
    ADJ, ADJ_SAT, ADV, NOUN, VERB = 'a', 's', 'r', 'n', 'v'
    for wt in [ADJ, ADJ_SAT, ADV, NOUN, VERB]:
        try:
            word = WordNetLemmatizer().lemmatize(word, pos=wt)
        except:
            pass
    return word.lower()
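# Sanity check of unify_word against the examples in its own comment.
from nltk.stem import WordNetLemmatizer

for w in ['went', 'apples', 'BIG']:
    print(w, '->', unify_word(w))  # went -> go, apples -> apple, BIG -> big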
def countexp_noun(para, searchText):
    # Count how many of the search nouns appear (lemmatized) in the paragraph.
    cnt = 0
    searchWords = [WordNetLemmatizer().lemmatize(s.lower(), 'n') for s in searchText]
    for stxt in searchWords:
        for word in para.split():
            w1 = WordNetLemmatizer().lemmatize(word.lower(), 'n')
            if stxt.lower() == w1.lower():
                cnt = cnt + 1
                break
    return cnt
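# Hypothetical call to countexp_noun; the paragraph and search terms are invented.
from nltk.stem import WordNetLemmatizer

para = "The cats sat on the mats while other cats watched"
print(countexp_noun(para, ['cat', 'mat']))  # -> 2, at most one hit per search noun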
def clean(text):
    # Remove stop words, tokenize, lemmatize, and deduplicate.
    text = gensim.parsing.preprocessing.remove_stopwords(text)
    tokens = nltk.word_tokenize(text)
    stopwords = set(nltk.corpus.stopwords.words('english'))
    cleaned_string = []
    for i in tokens:
        i = WordNetLemmatizer().lemmatize(i.lower())
        if len(i) <= 2:
            continue
        elif i in stopwords:
            continue
        else:
            cleaned_string.append(i)
    return list(set(cleaned_string))
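# Example run of clean; ordering varies because of the final set().
print(clean("The runners were running quickly through the parks"))
# e.g. ['runner', 'running', 'quickly', 'park'] (the default lemmatizer POS is noun)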
def extractSubjectObject(self, sentence):
    sub = None
    obj = None
    for word in sentence:
        if sub == None and word[7] == "nsubj":
            sub = word[2].translate(string.maketrans("", ""), string.punctuation)
            sub = sub.lower()
            sub = WordNetLemmatizer().lemmatize(sub, pos="n")
        if obj == None and word[7] == "dobj":
            obj = word[2].translate(string.maketrans("", ""), string.punctuation)
            obj = obj.lower()
            obj = WordNetLemmatizer().lemmatize(obj, pos="n")
    # The sentence did not contain a verb, so we need to back-off to
    # using the tokens tagged with NN (word[4]). Brute-force take the first
    # two tokens tagged with NN.
    #
    # Example use : A big cow in a field. -> cow, field.
    if sub == None:
        for word in sentence:
            if word[4] == "NN":
                sub = word[2].translate(string.maketrans("", ""), string.punctuation)
                sub = sub.lower()
                sub = WordNetLemmatizer().lemmatize(sub, pos="n")
                break
    if obj == None:
        for word in sentence:
            if word[4] == "NN":
                proposal = word[2].translate(string.maketrans("", ""), string.punctuation)
                proposal = proposal.lower()
                proposal = WordNetLemmatizer().lemmatize(proposal, pos="n")
                if proposal != sub:
                    obj = proposal
                    break
    return sub, obj
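# Hedged sketch of the CoNLL-style rows this method expects: index 2 is the
# token, index 4 the POS tag, index 7 the dependency relation. The rows are
# invented, and the two-argument translate() means the method targets Python 2.
sentence = [
    [1, 'Cows',  'cows',  'NOUN', 'NNS', '_', 2, 'nsubj'],
    [2, 'eat',   'eat',   'VERB', 'VBP', '_', 0, 'root'],
    [3, 'grass', 'grass', 'NOUN', 'NN',  '_', 2, 'dobj'],
]
# extractor.extractSubjectObject(sentence)  # -> ('cow', 'grass')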
def to_word_list(query: str):
    """
    Builds a list of words from a string.
    The words are normalized with NLTK.
    Each word is checked against a dictionary of English words.
    :param query: data to process
    :return:
    """
    with open("usage_files/words.txt") as word_file:
        english_words = set(word.strip().lower() for word in word_file)
    tags = pos_tag((using_translate(query)))
    a = []
    for tag in tags:
        wn_tag = penn_to_wn(tag[1])
        word = WordNetLemmatizer().lemmatize(tag[0], wn_tag)
        if word.lower() in english_words:
            a.append(word)
    if len(a) == 0:
        raise ValueError("First 5000 words are not in english")
    return a
def Q3():
    p = porter.PorterStemmer()
    stopwords = []
    with open('stopwords.txt', 'r') as f:
        for line in f:
            stopwords.append(line.rstrip())
    # print(stopwords)
    temp = requests.get("https://www.bbc.com/news/world-us-canada-49871909")
    temp.encoding = 'utf-8'
    soup = BeautifulSoup(temp.content, 'html.parser')
    text_1 = soup.find('div', {'class': 'story-body__inner'}).findAll('p')
    # text_1.remove('<p>')
    text_1 = [part.get_text() for part in text_1]
    text_1 = [nltk.word_tokenize(sen) for sen in text_1]
    english_punctuations = [
        ',', '.', ':', ';', '?', '(', ')', '[', ']', '&', '!',
        '*', '@', '#', '$', '%', '"', '\'s', '``', "''", "-"
    ]
    text_1 = [[word for word in sens if word not in english_punctuations] for sens in text_1]
    text_1 = [[word for word in sens if word not in stopwords] for sens in text_1]
    text_1 = [nltk.pos_tag(sen) for sen in text_1]
    # print(text_1)
    result = []
    for sen in text_1:
        for word in sen:
            # Lemmatize verbs and nouns; fall back to the Porter stemmer otherwise.
            if "V" in word[1]:
                w = WordNetLemmatizer().lemmatize(word[0].lower(), 'v')
            elif "N" in word[1]:
                w = WordNetLemmatizer().lemmatize(word[0], 'n')
            else:
                w = p.stem(word[0])
            result.append(w.lower())
    # print(result)
    fdist = FreqDist(result)
    tops = fdist.most_common(40)
    print(tops)
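# Minimal FreqDist check mirroring the last step of Q3.
from nltk import FreqDist

print(FreqDist(['run', 'cat', 'run']).most_common(2))  # [('run', 2), ('cat', 1)]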
def analyzer(self, question):
    # Example question: "How do i view my course on Canvas"
    def is_noun(tag):
        return tag in ['NN', 'NNS', 'NNP', 'NNPS']

    def is_verb(tag):
        return tag in ['VB', 'VBD', 'VBG', 'VBN', 'VBP', 'VBZ']

    def is_adverb(tag):
        return tag in ['RB', 'RBR', 'RBS']

    def is_adjective(tag):
        return tag in ['JJ', 'JJR', 'JJS']

    def penn_to_wn(tag):
        if is_adjective(tag):
            return wn.ADJ
        elif is_noun(tag):
            return wn.NOUN
        elif is_adverb(tag):
            return wn.ADV
        elif is_verb(tag):
            return wn.VERB
        return wn.NOUN

    keywords_list = []
    tagged_sent = nltk.pos_tag(word_tokenize(question))
    for tag in tagged_sent:
        if tag[0].lower() not in self.stop_words and tag:
            wn_tag = penn_to_wn(tag[1])
            word = WordNetLemmatizer().lemmatize(tag[0], wn_tag)
            keywords_list.append(word.lower())
    print('------------------------------------')
    print(keywords_list, "keywords_list")
    print('------------------------------------')
    response = self.dbclient.findAll(keywords_list)
    return response
def reduced_form(word):
    '''Reduce a word to its root to adequately compare with words from cluster'''
    w = WordNetLemmatizer().lemmatize(word)
    return w.lower()
def find_lemma_aspect(word):
    word = WordNetLemmatizer().lemmatize(word, 'n')
    return word.lower()
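# Quick check of the two single-word helpers above.
from nltk.stem import WordNetLemmatizer

print(reduced_form('geese'))           # -> 'goose' (irregular plural, default noun POS)
print(find_lemma_aspect('batteries'))  # -> 'battery'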