def getMaybeWords(self, text_ls):
    ignoreWords = ["", "have", "her", "there", "the", "be", "to", "of", "and", "a", "in", "that",
                   "it", "for", "on", "with", "as", "at", "this", "but", "his", "by", "from",
                   "they", "or", "an", "will", "would", "so", "even", "is", "be", "am", "are"]
    word_ls = []
    for text in text_ls:
        word_ls += wordpunct_tokenize(text)
    frequencies = {}
    st = LancasterStemmer()
    for word in word_ls:
        if not word[0].isalpha():
            continue
        if word in ignoreWords:
            continue
        word_stem = st.stem(word)
        if word_stem in frequencies:
            frequencies[word_stem] += 1
        else:
            frequencies[word_stem] = 1
    sorted_frequencies = sorted(frequencies.iteritems(), key=operator.itemgetter(1), reverse=True)
    # print sorted_frequencies
    max_words = 30
    if len(sorted_frequencies) < max_words:
        max_words = len(sorted_frequencies)
    word_tuples = sorted_frequencies[0:max_words]
    words = [tuple[0] for tuple in word_tuples]
    print words
    return words
def build_analyzer(self):
    """ Return a callable that handles preprocessing and tokenization """
    preprocess = self.build_preprocessor()
    tokenize = self.build_tokenizer()
    stemmer = LancasterStemmer()
    filter_meta = lambda doc: ' '.join([w for w in doc.split() if not w.startswith('~')])
    parse_words = lambda doc: tokenize(preprocess(filter_meta(self.decode(doc))))
    stem_words = lambda doc: [stemmer.stem(t) for t in parse_words(doc)]
    meta_func = lambda prefix: lambda doc: (t for t in self.decode(doc).split() if t.startswith(prefix))
    feat_func_map = {
        'word': lambda doc: self._word_ngrams(parse_words(doc), self.get_stop_words()),
        'stem': lambda doc: self._word_ngrams(stem_words(doc), self.get_stop_words()),
        '1st': lambda doc: ('~T:1st' for i in parse_words(doc) if i in first_person_words),
        '3rd': lambda doc: ('~T:3rd' for i in parse_words(doc) if i in third_person_words),
        'tag': lambda doc: self._word_ngrams([t[1] for t in nltk.pos_tag(parse_words(doc))]),
        'length': lambda doc: ['~L:%d' % (len(parse_words(doc)) / 5)],
        'genre': meta_func('~G'),
        'rating': meta_func('~Ra'),
        'votes': meta_func('~V'),
        'lang': meta_func('~La'),
        'country': meta_func('~Co'),
        'year': meta_func('~Y'),
        'runtime': meta_func('~Rt'),
        'type': meta_func('~T')
    }
    func_list = [feat_func_map.get(flag.strip()) for flag in self.analyzer.split(':')] \
        if type(self.analyzer) is str else None
    if not func_list:
        raise ValueError('%s is not a valid tokenization scheme/analyzer' % self.analyzer)
    else:
        return lambda doc: itertools.chain.from_iterable(f(doc) for f in func_list if callable(f))
def prepare_corpus(raw_documents):
    # remove punctuation
    print "Removing Punctuation"
    import string
    exclude = set(string.punctuation)
    raw_documents = [''.join(ch for ch in s if ch not in exclude) for s in raw_documents]

    # remove common words
    print "Calculating Stoplist"
    stoplist = set([x.rstrip() for x in codecs.open("stop_list.txt", encoding='utf-8')
                    if not x.startswith("#")])
    stoplist = stoplist.union(set(nltk.corpus.stopwords.words("english")))
    # print stoplist

    print "Removing Stoplist and Stemming"
    from nltk.stem.lancaster import LancasterStemmer
    st = LancasterStemmer()
    texts = [[st.stem(word) for word in document.lower().split() if word not in stoplist]
             for document in raw_documents]

    # remove words that appear only once
    print "Removing Single Variables"
    all_tokens = sum(texts, [])
    tokens_once = set(word for word in set(all_tokens) if all_tokens.count(word) == 1)
    texts = [[word for word in text if word not in tokens_once] for text in texts]
    return texts
def tokenize_rest(text):
    wnl = WordNetLemmatizer()
    st = LancasterStemmer()
    words = nltk.word_tokenize(text)
    postag = nltk.pos_tag(words)
    tokens = []
    whfound = False
    for word in words:
        if word[0:2].lower() == 'wh' and not whfound:
            tokens.append({word.lower(): 'wh'})
            whfound = True
            continue
        elem = wnl.lemmatize(word)
        stem = st.stem(elem)
        synd = wn.synsets(stem)
        if not synd:
            stem = stemmer(elem)
            synd = wn.synsets(stem)
        if not synd:
            stem = elem
            synd = wn.synsets(stem)
        dbelement = detect(stem)
        if dbelement:
            for every_elem in dbelement:
                tokens.append({word: every_elem})
    print "\n Rest of possible Tokens"
    print tokens
    return tokens
def stem_tweet(tweet, stemmer_type="lancaster"):
    """
    :param tweet: string representing tweet
    :param stemmer_type: type of stemmer used (default value is lancaster)
    :return: stemmed tweet
    :type tweet: str
    :type stemmer_type: str
    """
    tokens = nltk.word_tokenize(tweet)
    stemmed_tokens = []
    if stemmer_type == "lancaster":
        stemmer = LancasterStemmer()
    elif stemmer_type == "snowball":
        stemmer = SnowballStemmer("english")
    elif stemmer_type == "porter":
        stemmer = PorterStemmer()
    elif stemmer_type == "regexp":
        stemmer = RegexpStemmer("english")  # note: RegexpStemmer expects a regex of suffixes to strip, not a language name
    else:
        return None
    for token in tokens:
        stemmed_tokens.append(stemmer.stem(token))
    ret_tw = "".join([" " + i if not i.startswith("'") and i not in string.punctuation else i
                      for i in stemmed_tokens]).strip()
    return ret_tw
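# A minimal usage sketch for stem_tweet() above (not from the original source); it assumes
# nltk, string and the stemmer classes used in the function are already imported and that
# the punkt tokenizer data is available. The sample tweet text is invented.
sample_tweet = "The runners were running quickly through the streets!"
for name in ("lancaster", "snowball", "porter"):
    print(name, "->", stem_tweet(sample_tweet, stemmer_type=name))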
def train_lsi_model(self, texts, num_of_toptics=10):
    texts_tokenized = [[word.lower() for word in word_tokenize(text)] for text in texts]

    # remove the stop words and punctuations
    english_stop_words = stopwords.words('english')
    english_punctuations = [',', '.', ':', '?', '(', ')', '[', ']', '@', '&', '!', '*', '#', '$', '%']
    texts_filtered = [[word for word in text_tokenized
                       if (not word in english_punctuations) and (not word in english_stop_words)]
                      for text_tokenized in texts_tokenized]

    # stem the word
    st = LancasterStemmer()
    texts_stemed = [[st.stem(word) for word in text_filtered] for text_filtered in texts_filtered]

    # drop stems that occur only once in the whole corpus
    all_stems = sum(texts_stemed, [])
    stem_once = set(stem for stem in set(all_stems) if all_stems.count(stem) == 1)
    cleaned_texts = [[stem for stem in text if stem not in stem_once] for text in texts_stemed]

    # build the dictionary, bag-of-words corpus, TF-IDF weights and the LSI model
    dictionary = corpora.Dictionary(cleaned_texts)
    corpus = [dictionary.doc2bow(text) for text in cleaned_texts]
    tfidf = models.TfidfModel(corpus)
    corpus_tfidf = tfidf[corpus]
    lsi = models.LsiModel(corpus_tfidf, id2word=dictionary, num_topics=num_of_toptics)
    result = lsi[corpus]
    return result
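# Hypothetical call to train_lsi_model() above (not from the original source): `model` stands in
# for whatever object the method is defined on, the documents are invented, and gensim/nltk are
# assumed to be installed. Each returned item is the LSI vector of one input document.
docs = [
    "Machine learning lets computers learn patterns from data.",
    "Deep learning is a branch of machine learning using neural networks.",
    "The stock market fell sharply after the machine learning report on Monday.",
]
for doc_vector in model.train_lsi_model(docs, num_of_toptics=2):
    print(doc_vector)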
def parse_raw_data(self, new_art):
    self.startClass = default_timer()
    tokenizer = RegexpTokenizer(r'\w+')
    tokens = tokenizer.tokenize(new_art.body)
    stemmer = LancasterStemmer()
    article_dic = new_art.words
    global_dic = self.raw_dictionary
    for word in tokens:
        word = word.lower()
        if (False == self.is_stop_word(word) and word.isnumeric() == False):
            s_word = stemmer.stem(word)
            # s_word = word
            ## it is not a stop word, check if the word
            ## is already part of the article dictionary.
            ## if yes, increment the count else add it.
            ## If you are adding check if it is part of
            ## the big corpus, if yes increment the count
            ## of number of articles with that word.
            self.globalWordCount += 1
            new_art.doc_len = new_art.doc_len + 1
            if (s_word in article_dic):
                article_dic[s_word].wrd_count += 1
                global_dic[s_word].wrd_count += 1
            else:
                article_dic[s_word] = local_word_attributes(1)
                if (s_word in global_dic):
                    global_dic[s_word].art_count += 1
                    global_dic[s_word].wrd_count += 1
                else:
                    global_dic[s_word] = global_word_attributes(1, 1, 1, 0)
def word_standardize(sentences):
    tokens = []
    sentences_st = []
    for sent in sentences:
        tokens.extend(word_tokenize(sent))
        sentences_st.append(word_tokenize(sent))

    words = tokens
    st = LancasterStemmer()
    words = [w.lower() for w in words]
    words = [w for w in words if not w in stopwords.words('english')]
    words = [w for w in words if not w in '!"#$%&\'()*+,-./:;<=>?@[\\]^_`{|}~']
    st_words = [st.stem(w) for w in words]

    sent_result = []
    for sent in sentences_st:
        sent = [w.lower() for w in sent]
        sent = [w for w in sent if not w in stopwords.words('english')]
        sent = [w for w in sent if not w in '!"#$%&\'()*+,-./:;<=>?@[\\]^_`{|}~']
        sent_result.append(sent)

    return st_words, sent_result
def tweetTokenizer(tweet_text):
    st = LancasterStemmer()
    twitterWords = tweet_text.split()

    # remove stop words using NLTK corpus
    twitterWords = [word.lower() for word in twitterWords]
    twitterWords = [w for w in twitterWords if not w in stopwords.words('english')]

    # remove custom list of stop words found by experimentation
    # (missing commas in the original merged "." with "make", "oh" with "tomorrow"
    #  and "bad" with "little"; they are separated here)
    noiseWords = ["i'm", "like", "get", "don't", "it's", "go", "lol", "got", "one", "know", "@",
                  "good", "want", "can't", "need", "see", "people", "going", "back", "really", "u",
                  "think", "right", "never", "day", "time", "never", "that's", "even", ",", ".",
                  "make", "wanna", "you're", "come", "-", "still", "much", "someone", "today",
                  "gonna", "new", "would", "take", "always", "im", "i'll", "best", "'", "feel",
                  "getting", "say", "tonight", "last", "ever", "better", "i've", "look", "f*****g",
                  "way", "could", "!", "oh", "tomorrow", "night", "first", "miss", "ain't", "thank",
                  "2", "bad", "little", "thanks", "something", "wait", "&", "`", "oh", "make",
                  "bad", "let", "stop", "well", "tell"]
    twitterWords = [w for w in twitterWords if not w in noiseWords]
    twitterWords = [st.stem(w) for w in twitterWords]
    return twitterWords
def predict_category_subcategory(book_name):
    data_set1 = pandas.Series(book_name.encode('ascii'))

    # Data Preprocessing
    data_set1 = data_set1.dropna(axis=0, how='any')
    data_set1 = data_set1.str.lower()

    # Manual removal list
    remove_list = ['edition', 'ed', 'edn', 'vol', 'vol.', '-', 'i']
    data_set1[0] = ' '.join([i for i in data_set1[0].split() if i not in remove_list])
    data_set1 = data_set1.apply(lambda x: re.sub(r'\w*\d\w*', '', x).strip())
    data_set1 = data_set1.apply(lambda x: re.sub(r'\([^)]*\)', ' ', x))
    data_set1 = data_set1.apply(lambda x: re.sub('[^A-Za-z0-9]+', ' ', x))
    # data_set['Category ID'] = data_set['Category ID']+"|"+data_set['Subcategory ID']

    # Stemming the book titles
    stemmer = LancasterStemmer()
    data_set1[0] = " ".join([stemmer.stem(i) for i in data_set1[0].split()])

    clf = joblib.load(os.path.join(BASE_DIR + "/learners/", 'category_predict.pkl'))
    ans = clf.predict(data_set1)
    sub_clf = joblib.load(os.path.join(BASE_DIR + "/learners/", 'subcategory_predict.pkl'))
    sub_ans = sub_clf.predict(data_set1)
    return [ans[0], sub_ans[0]]
def parse_validation(validation_path):
    validation_list = []
    with open(validation_path) as f:
        for line in f:
            strs = line.split('|')
            word_dict = {}
            validation_list.append(word_dict)
            word_dict["word"] = strs[0].strip()
            word_dict["real_sense"] = int(strs[1])
            sentence_list = []
            word_dict["sentence"] = sentence_list
            lmtzr = WordNetLemmatizer()
            ls = LancasterStemmer()
            single_words = re.findall("(\w+|%%)", strs[2])
            double_mod_found = False
            word_count = 0
            for single_word in single_words:
                if single_word == "%%":
                    if not double_mod_found:
                        word_dict["target_word_idx"] = word_count + 1
                        double_mod_found = True
                    continue
                lemmed = lmtzr.lemmatize(single_word)
                stemmed = ls.stem(lemmed)
                if not stemmed in glob_Lucene:
                    sentence_list.append(stemmed)
                    word_count += 1
    return validation_list
def lemmstem(sentences):
    '''
    Perform lemmatization and stemming of the words.
    Input: a list of trees containing the sentences; every word is classified by its NE type.
    Output: lemmatized/stemmed sentences.
    '''
    lmtzr = WordNetLemmatizer()
    st = LancasterStemmer()
    dic = {'VB': wordnet.VERB, 'NN': wordnet.NOUN, 'JJ': wordnet.ADJ, 'RB': wordnet.ADV}
    for sent in sentences:
        lvsidx = sent.treepositions('leaves')
        for pos in lvsidx:
            word = sent[pos][0]
            tag = sent[pos][1]
            rtag = tag[0:2]
            if rtag in dic:
                lemm = lmtzr.lemmatize(word, dic[rtag])
                stem = st.stem(lemm)
                # print word, lemm, stem  # cursed line
                sent[pos] = (word, tag, stem)
            else:
                sent[pos] = (word, tag, word)
    return sentences
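# Illustrative sketch (not from the original) of the lemmatize-then-stem step that lemmstem()
# applies to every leaf, shown here for a single (word, tag) pair.
from nltk.corpus import wordnet
from nltk.stem import WordNetLemmatizer
from nltk.stem.lancaster import LancasterStemmer

_lmtzr = WordNetLemmatizer()
_st = LancasterStemmer()
_dic = {'VB': wordnet.VERB, 'NN': wordnet.NOUN, 'JJ': wordnet.ADJ, 'RB': wordnet.ADV}

word, tag = "running", "VBG"
lemma = _lmtzr.lemmatize(word, _dic[tag[:2]])   # "running" -> "run"
print(_st.stem(lemma))                          # stem of the lemma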
def stemming(words):
    wordsAfterStemming = []
    st = LancasterStemmer()
    for x in words:
        y = st.stem(x)
        wordsAfterStemming.append(y)
    return wordsAfterStemming
def readText(textFile):
    examples = []
    count = 0
    lexicon_en = {}
    lexicon_ge = {}
    stem_en = LancasterStemmer()
    stem_ge = nltk.stem.snowball.GermanStemmer()
    for line in open(textFile):
        count += 1
        if count % 1000 == 0:
            print count
        lans = line.lower().strip().split("|||")
        # german = [stem_ge.stem(x.decode('utf-8')) for x in lans[0].strip().split(" ")]
        german = lans[0].strip().split(" ")
        german = process(german)
        for wordx in german:
            for word in wordx:
                if word not in lexicon_ge:
                    lexicon_ge[word] = 1
                else:
                    lexicon_ge[word] += 1
        eng = [stem_en.stem(x.decode('utf-8')) for x in lans[1].strip().split(" ")]
        # parse_en = pattern.en.parse(" ".join(eng))
        eng = lans[1].strip().split(" ")
        for word in eng:
            if word not in lexicon_en:
                lexicon_en[word] = 1
            else:
                lexicon_en[word] += 1
        examples.append(Example(german, eng))
    return examples, lexicon_en, lexicon_ge
def remove_stems(file):
    new_file = []
    punctuation = re.compile(r'[.,"?!:;]')
    lemmatizer = WordNetLemmatizer()
    stemmer = LancasterStemmer()
    for raw_post in file:
        post = raw_post[1]
        token = nltk.word_tokenize(post)
        token_tags = nltk.pos_tag(token)
        new_token = []
        for word in token_tags:
            # Removes punctuation and changes the word to lower case
            original_word = punctuation.sub("", word[0].lower())
            # Reduces each word to its root: lemmatize first, fall back to Lancaster stemming
            stemmed_word = lemmatizer.lemmatize(original_word)
            if original_word == stemmed_word:
                stemmed_word = stemmer.stem(stemmed_word)
            # Removes stopwords that are defined in the nltk library
            if stemmed_word not in nltk.corpus.stopwords.words('english') and stemmed_word != '':
                new_token.append((stemmed_word, word[1]))
        new_file.append((raw_post[0], new_token))
    return new_file
def get_pretrained_vector(session, word2vec_model, vocab_path, vocab_size, vectors):
    print(vectors)
    with gfile.GFile(vocab_path, mode="r") as vocab_file:
        st = LancasterStemmer()
        counter = 0
        counter_w2v = 0.0
        while counter < vocab_size:
            vocab_w = vocab_file.readline().replace("\n", "")
            # vocab_w = st.stem(vocab_w)
            # for each word in the vocabulary, check whether a w2v vector exists and inject it;
            # otherwise fall back to the stemmed form, and keep the random initialisation if neither is found.
            if word2vec_model and vocab_w and word2vec_model.__contains__(vocab_w) and counter > 3:
                w2w_word_vector = word2vec_model.get_vector(vocab_w)
                print("word:%s c:%i w2v size %i" % (vocab_w, counter, w2w_word_vector.size))
                vectors[counter] = w2w_word_vector
                counter_w2v += 1
            else:
                vocab_w_st = st.stem(vocab_w)
                if word2vec_model and vocab_w_st and word2vec_model.__contains__(vocab_w_st):
                    w2w_word_vector = word2vec_model.get_vector(vocab_w_st)
                    print("st_word:%s c:%i w2v size %i" % (vocab_w_st, counter, w2w_word_vector.size))
                    vectors[counter] = w2w_word_vector
                    counter_w2v += 1
                else:
                    if not vocab_w:
                        print("no more words.")
                        break
            counter += 1
        print("injected %f per cent" % (100 * counter_w2v / counter))
    print(vectors)
    return vectors
def processRawData(self, inputPath, outputPath):
    raw = pickle.load(open(inputPath, "r"))
    data = []
    genres = set([])
    count = 0
    st = LancasterStemmer()
    for key in raw.keys():
        movie = raw[key]
        # skip if no genre or synopsis data
        if 'genres' not in movie or 'synopsis' not in movie:
            continue
        if len(movie['genres']) == 0 or movie['synopsis'] == '':
            continue
        temp = {}
        temp['genres'] = movie['genres']
        for g in temp['genres']:
            genres.add(g)
        # trim out the punctuation and transform to lowercase
        # replace_punctuation = string.maketrans(string.punctuation, ' '*len(string.punctuation))
        s = str(movie['synopsis'])
        s = s.translate(string.maketrans("", ""), string.punctuation)
        s = re.sub(' +', ' ', s).strip()
        s = " ".join(st.stem(word) for word in s.split(" "))
        temp['synopsis'] = s.lower()
        data.append(temp)
        count += 1
    # output as a pickle file
    file = open(outputPath, 'wb')
    pickle.dump(data, file)
    print 'processed ' + str(count) + ' movies'
    return genres
def preprocess(reviews):
    import nltk
    from nltk.tokenize import word_tokenize
    review_tokenized = [[word.lower() for word in word_tokenize(review.decode('utf-8'))]
                        for review in reviews]
    # print "review tokenize done"

    # remove stop words
    from nltk.corpus import stopwords
    english_stopwords = stopwords.words('english')
    review_filterd_stopwords = [[word for word in review if not word in english_stopwords]
                                for review in review_tokenized]
    # print 'remove stop words done'

    # remove punctuations
    english_punctuations = [',', '.', ':', ';', '?', '(', ')', '&', '!', '@', '#', '$', '%']
    review_filtered = [[word for word in review if not word in english_punctuations]
                       for review in review_filterd_stopwords]
    # print 'remove punctuations done'

    # stemming
    from nltk.stem.lancaster import LancasterStemmer
    st = LancasterStemmer()
    review_stemmed = [[st.stem(word) for word in review] for review in review_filtered]
    # print 'stemming done'
    return review_stemmed
def overlapping_text(text_1, text_2):
    st = LancasterStemmer()
    cachedStopWords = get_stopwords()
    text_1_list = [st.stem(word) for word in text_1.split() if word not in cachedStopWords]
    text_2_list = [st.stem(word) for word in text_2.split() if word not in cachedStopWords]
    return jaccard_dist(text_1_list, text_2_list)
def preprocess(content):
    stopset = set(stopwords.words('english'))
    # replace punctuation and tags with spaces
    tokens = word_tokenize(re.sub(r'<p>|</p>|[^A-Za-z ]', ' ', content.lower()))
    pos_list = pos_tag(tokens)
    s_tokens = list()
    # noun and verb only
    for pos in pos_list:
        # print pos[1]
        # if pos[1] in ['NN', 'NNS', 'VB', 'VBD', 'VBG', 'VBN', 'VBP', 'VBZ']:
        if pos[1] in ['NN', 'NNS']:
            s_tokens.append(pos[0])
    wordfreq = FreqDist(s_tokens)
    stemfreq = dict()
    st = LancasterStemmer()
    for word, freq in wordfreq.items():
        # stopwords
        if word in stopset:
            del wordfreq[word]
            continue
        # tiny words
        if len(word) <= 2:
            del wordfreq[word]
            continue
        # stemmer
        stem = st.stem(word)
        try:
            stemfreq[stem] += freq
        except:
            stemfreq[stem] = freq
    return stemfreq
def simplify_old(s):
    res = ''
    st = LancasterStemmer()
    text = nltk.word_tokenize(s)
    tags = nltk.pos_tag(text)
    for tag in tags:
        word = tag[0]
        if f.checkPos(tag[1]):
            if word in model:
                word_stem = st.stem(word)
                top_words = model.most_similar(positive=[word], topn=20)
                candidate_list = [w[0] for w in top_words]
                freq_list = [fdist[w] for w in candidate_list]
                c_f_list = zip(candidate_list, freq_list)
                ordered_list = sorted(c_f_list, key=lambda c_f: c_f[1], reverse=True)
                word_freq = fdist[word]
                # synonmys = f.getSynonmys(word)  ## get synonyms from wordnet
                # print synonmys
                for w in ordered_list:
                    if not f.freq_diff(word_freq, w[1]):
                        ## break the loop if the candidate word frequency does not exceed the word frequency by a threshold
                        break
                    if st.stem(w[0]) != word_stem and f.samePos(word, w[0]):
                        ## exclude morphological derivations and require the same POS
                        word = w[0]
                        ### do not use wordnet
                        # if w[0] in synonmys:
                        #     word = w[0]
                        # else:
                        #     for syn in synonmys:
                        #         if st.stem(w[0]) == st.stem(syn):
                        #             word = w[0]
        res = res + word + ' '
    return res
def mapper():
    # list of fields in positional order expected in inbound
    # forum node data.
    fieldnames = ['id', 'title', 'tag_names', 'author_id', 'body', 'node_type', 'parent_id',
                  'abs_parent_id', 'added_at', 'score', 'state_string', 'last_edited_id',
                  'last_activity_by_id', 'last_activity_at', 'active_revision_id', 'extra',
                  'extra_ref_id', 'extra_count', 'marked']
    reader = csv.DictReader(sys.stdin, delimiter='\t', fieldnames=fieldnames)
    stemmer = LancasterStemmer()
    stopw = stopwords.words('english')
    split_pattern = re.compile('[\W.!?:;"()<>[\]#$=\-/]')
    for line in reader:
        pid = line['id']
        body = line['body']
        # split body into words
        words = split_pattern.split(body)
        # map the stemmer function across all the words,
        # and use the Counter to create a dict
        # of counted stems. Remove english stopwords.
        stem_counts = Counter((stemmer.stem(x) for x in words if x not in stopw))
        # emit the stem, count and node id
        # for reduction into the reverse index
        for stem, count in stem_counts.items():
            print "{stem}\t{node_id}\t{count}".format(stem=stem, node_id=pid, count=count)
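# Hedged sketch (not part of the original job) of a reducer that could consume the mapper's
# tab-separated "stem<TAB>node_id<TAB>count" lines from stdin and group them into a reverse
# index; written for Python 3 and purely illustrative.
import sys
from collections import defaultdict

reverse_index = defaultdict(list)
for line in sys.stdin:
    stem, node_id, count = line.rstrip("\n").split("\t")
    reverse_index[stem].append((node_id, int(count)))

for stem, postings in reverse_index.items():
    print("{0}\t{1}".format(stem, postings))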
def filt(string):
    ret = string
    # Filter all punctuation from the string
    for p in punctuation:
        ret = ret.replace(p, '')
    # Replace hyphens with spaces
    ret = ret.replace('-', ' ')
    oldret = ret
    ret = ""
    # Filter all stop words from the string
    for word in oldret.split():
        if (word in allStopWords) or len(word) <= 1:
            pass
        else:
            ret += word.lower() + " "
    st = LancasterStemmer()
    stemmed = ""
    for word in ret.split():
        try:
            stemmed += str(st.stem(word)) + " "
        except UnicodeDecodeError:
            pass
    return stemmed
def stem_text(text):
    stm = LancasterStemmer()
    tokens = text.split()
    words = [stm.stem(w) for w in tokens]
    snt = " ".join(words)
    return snt
def lemmatizer_newsheadlines():
    lancaster_stemmer = LancasterStemmer()
    frl = open("C:/Users/rajas/Downloads/csv_files-2014-12-10/csv files/lemma1.csv", "rU")
    fr = open("C:/Users/rajas/Downloads/csv_files-2014-12-10/csv files/sample.csv", "rU")
    fw = open("C:/Users/rajas/Downloads/csv_files-2014-12-10/csv files/lemmaheadlines.csv", "w")
    for headline in fr:
        if len(headline) > 0:
            headlinelist = headline.split(",")
            if len(headlinelist) == 3:
                headlinewords = headlinelist[1].split(" ")
                print(headlinewords)
                for word in headlinewords:
                    wordcor = (((word.replace("?", "")).replace(":", "")).replace("\"", ""))
                    headlineword = (lancaster_stemmer.stem(wordcor)).lower()
                    print(headlineword)
                    # for line in frl:
                    #     crimelist = line.split(",")
                    #     crimeword = ((crimelist[1].replace("\"", "")).strip()).lower()
                    #     print(crimeword + str(i))
                    #     i += 1
                    dictcrime = lemmadict()
                    if headlineword in dictcrime:
                        print(headlineword + "yipee")
                        fw.write(headlineword + "," + headlinelist[0] + "," + headlinelist[1] + "\n")
                        break
    frl.close()
    fw.close()
    fr.close()
def process(reviews):
    # tokenize each review
    from nltk.tokenize import word_tokenize
    review_tokenized = [[word.lower() for word in word_tokenize(review.decode('utf-8'))]
                        for review in reviews]

    # remove stop words
    from nltk.corpus import stopwords
    english_stopwords = stopwords.words('english')
    review_filterd_stopwords = [[word for word in review if not word in english_stopwords]
                                for review in review_tokenized]

    # remove punctuations
    english_punctuations = [',', '.', '...', ':', ';', '?', '(', ')', '&', '!', '@', '#', '$', '%']
    review_filtered = [[word for word in review if not word in english_punctuations]
                       for review in review_filterd_stopwords]

    # stemming
    from nltk.stem.lancaster import LancasterStemmer
    st = LancasterStemmer()
    review_stemmed = [[st.stem(word) for word in review] for review in review_filtered]

    # remove stems that appear only once in the whole corpus
    all_stems = sum(review_stemmed, [])
    stems_lt_three = set(stem for stem in set(all_stems) if all_stems.count(stem) == 1)
    final_review = [[stem for stem in text if stem not in stems_lt_three] for text in review_stemmed]
    return final_review
def extractRawTrainingData(text, stopWords, stemming=None):
    st = LancasterStemmer()
    rawData = []
    text.readline()
    print("********** Extract From Raw Training Data **********")
    if stopWords:
        sign = 'ON'
    else:
        sign = 'OFF'
    print("Stopwords:" + sign)
    if stemming:
        sign = 'ON'
    else:
        sign = 'OFF'
    print("Stemming:" + sign)
    prevId = 0
    print("Extracting...")
    for line in text:
        lineTokens = line.strip('\n').split('\t')
        sentenceId = int(lineTokens[1])
        '''
        if sentenceId > prevId:
            prevId = sentenceId
            sentenceStr = lineTokens[2]
            sentiment = int(lineTokens[3])
            sentenceTokens = re.sub("\s+", " ", sentenceStr).split(' ')
            if stemming:
                sentenceTokens = map(lambda x: unicode(st.stem(x).lower()), sentenceTokens)
            else:
                sentenceTokens = map(lambda x: unicode(x.lower()), sentenceTokens)
            sentenceTokens = stripWords(sentenceTokens, stopWords)
            entry = {"sentenceId": sentenceId, "sentence": sentenceTokens, "sentiment": sentiment}
            rawData.append(entry)
        '''
        sentenceStr = lineTokens[2]
        sentiment = int(lineTokens[3])
        sentenceTokens = re.sub("\s+", " ", sentenceStr).split(' ')
        if stemming:
            sentenceTokens = map(lambda x: unicode(st.stem(x).lower()), sentenceTokens)
        else:
            sentenceTokens = map(lambda x: unicode(x.lower()), sentenceTokens)
        sentenceTokens = stripWords(sentenceTokens, stopWords)
        entry = {"sentenceId": sentenceId, "sentence": sentenceTokens, "sentiment": sentiment}
        rawData.append(entry)
    print("Done")
    print(len(rawData))
    return rawData
def stem_funct(str):
    res = ''
    # Use NLTK's stemmer
    st = LancasterStemmer()
    # Stem each word and append the result to the string
    for word in str.split(' '):
        res += ' ' + st.stem(word)
    return res
def containKeywords(text, keywords):
    letters_only = re.sub("[^a-zA-Z0-9]", " ", text)
    lower_case = letters_only.lower()
    words = lower_case.split()
    words = [word for word in words if not word in stopwords.words("english")]
    st = LancasterStemmer()
    stemmed = [st.stem(word) for word in words]
    return any(i in stemmed for i in keywords)
def tokenizeRawText(self):
    for each in self.rawtext:
        st = LancasterStemmer()
        try:
            ev = st.stem(each.lower())
            self.rawTokens.append(ev)
        except UnicodeDecodeError as e:
            self.unicodeErrors = self.unicodeErrors + 1
# things we need in general
import sys
import pickle
import json
import ijson.backends.yajl2 as ijson

# things we need for NLP
import nltk
from nltk.stem.lancaster import LancasterStemmer
stemmer = LancasterStemmer()

# things we need for Tensorflow
import os
import numpy as np
import tflearn
import tensorflow as tf
import random

# get all the inputs
training_input = sys.argv[1]
training_logs = sys.argv[2]
model_output = sys.argv[3]
training_data_file = sys.argv[4]
words_file = str(training_data_file) + ".words"
classes_file = str(training_data_file) + ".classes"
documents_file = str(training_data_file) + ".documents"
training_text_file = str(training_data_file) + ".txt"

# helper methods
def load_json(filepath):
    return json.load(open(filepath, "r"))

def save_json(data, filepath):
def __init__(self):
    self.stemmer = LancasterStemmer()
    with open('intents_data.json', 'r') as json_data:
        self.intents = json.load(json_data)
def text_to_wordlist(text, remove_stop_words=True, stem_words=False, lemma=True): # Clean the text, with the option to remove stop_words and to stem words. # Clean the text text = text.rstrip('?') text = text.rstrip(',') text = re.sub(r"[^A-Za-z0-9]", " ", text) text = re.sub(r"what's", "", text) text = re.sub(r"What's", "", text) text = re.sub(r"\'s", " ", text) text = re.sub(r"\'ve", " have ", text) text = re.sub(r"can't", "cannot ", text) text = re.sub(r"n't", " not ", text) text = re.sub(r"I'm", "I am", text) text = re.sub(r" m ", " am ", text) text = re.sub(r"\'re", " are ", text) text = re.sub(r"\'d", " would ", text) text = re.sub(r"\'ll", " will ", text) text = re.sub(r"60k", " 60000 ", text) text = re.sub(r" e g ", " eg ", text) text = re.sub(r" b g ", " bg ", text) text = re.sub(r"\0s", "0", text) text = re.sub(r" 9 11 ", "911", text) text = re.sub(r"e-mail", "email", text) text = re.sub(r"\s{2,}", " ", text) text = re.sub(r"quikly", "quickly", text) text = re.sub(r" usa ", " America ", text) text = re.sub(r" USA ", " America ", text) text = re.sub(r" u s ", " America ", text) text = re.sub(r" uk ", " England ", text) text = re.sub(r" UK ", " England ", text) text = re.sub(r"india", "India", text) text = re.sub(r"switzerland", "Switzerland", text) text = re.sub(r"china", "China", text) text = re.sub(r"chinese", "Chinese", text) text = re.sub(r"imrovement", "improvement", text) text = re.sub(r"intially", "initially", text) text = re.sub(r"quora", "Quora", text) text = re.sub(r" dms ", "direct messages ", text) text = re.sub(r"demonitization", "demonetization", text) text = re.sub(r"actived", "active", text) text = re.sub(r"kms", " kilometers ", text) text = re.sub(r"KMs", " kilometers ", text) text = re.sub(r" cs ", " computer science ", text) text = re.sub(r" upvotes ", " up votes ", text) text = re.sub(r" iPhone ", " phone ", text) text = re.sub(r"\0rs ", " rs ", text) text = re.sub(r"calender", "calendar", text) text = re.sub(r"ios", "operating system", text) text = re.sub(r"gps", "GPS", text) text = re.sub(r"gst", "GST", text) text = re.sub(r"programing", "programming", text) text = re.sub(r"bestfriend", "best friend", text) text = re.sub(r"dna", "DNA", text) text = re.sub(r"III", "3", text) text = re.sub(r"the US", "America", text) text = re.sub(r"Astrology", "astrology", text) text = re.sub(r"Method", "method", text) text = re.sub(r"Find", "find", text) text = re.sub(r"banglore", "Banglore", text) text = re.sub(r" J K ", " JK ", text) # Remove punctuation from text text = ''.join([c for c in text if c not in punctuation]) if remove_stop_words: text = text.split() text = [w for w in text if not w in stop_words] text = " ".join(text) # Optionally, shorten words to their stems if stem_words: text = text.split() #stemmer = SnowballStemmer('english') #stemmed_words = [stemmer.stem(word) for word in text] stemmed_words = [nltk.PorterStemmer().stem_word(word.lower()) for word in text] text = " ".join(stemmed_words) # Return a list of words after lemma if lemma: text = text.split() lancaster_stemmer = LancasterStemmer() lemma_words = [lancaster_stemmer.stem(word.lower()) for word in text] text = " ".join(lemma_words) return(text)
import nltk
# nltk.download('punkt')
from nltk.stem.lancaster import LancasterStemmer
stemmer = LancasterStemmer()

import numpy
import tflearn
import tensorflow
import random
import json
import pickle

with open("intents.json") as file:
    data = json.load(file)

# skip the preprocessing if it has already been done and pickled
try:
    with open("data.pickle", "rb") as f:
        words, labels, training, output = pickle.load(f)
except:
    words = []    # tokenized words in patterns of all tags
    labels = []   # distinct tags
    docs_x = []   # list of all words
    docs_y = []   # list of tags associated with the words in docs_x
    ignore_letters = [",", "!", ".", ":", "?"]

    for intent in data["intents"]:
        for pattern in intent["patterns"]:
            wrds = nltk.word_tokenize(pattern)
            words.extend(wrds)
            docs_x.append(wrds)
'''
Stemming: words with the same meaning are treated as one word by reducing them to their stem.
Part-of-Speech Tagging: tag words based on whether they are a noun, verb, adjective, etc.
'''
import nltk
from nltk.tokenize import word_tokenize

# Stemmer used: Lancaster Stemmer
text = "Mary closed on closing night when she was in the mood to close."

from nltk.stem.lancaster import LancasterStemmer
st = LancasterStemmer()
stemmedWords = [st.stem(word) for word in word_tokenize(text)]
print(stemmedWords)

# Tagging - Noun, Verb, Adverb, Adjective, etc.
# NNP-Proper Noun, VBD-Verb, NN-Noun, PRP-Pronoun
# nltk.download('averaged_perceptron_tagger')
pos_taggs = nltk.pos_tag(word_tokenize(text))
print(pos_taggs)
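# Optional side-by-side check (not in the original): the Porter stemmer is usually less
# aggressive than the Lancaster stemmer, so stemming the same sentence with both makes the
# difference easy to see.
from nltk.stem.porter import PorterStemmer
ps = PorterStemmer()
print([ps.stem(word) for word in word_tokenize(text)])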
for i in range(0, len(x)): text = x[i] num_char_w = len(text) feature_set[i].append(num_char_w) #Number of characters without whitespace (6) for i in range(0, len(x)): text = x[i] num_char = 0 for j in range(0, len(text)): if text[j] != ' ': num_char = num_char + 1 feature_set[i].append(num_char) tkr = RegexpTokenizer('[a-zA-Z0-9@]+') stemmer = LancasterStemmer() tokenized_corpus = [] for i, news in enumerate(x): tokens = [stemmer.stem(t) for t in tkr.tokenize(news)] tokenized_corpus.append(tokens) #Number of unique words (7) for i in range(0, len(tokenized_corpus)): text = tokenized_corpus[i] s = set(text) unq = len(s) feature_set[i].append(len(s)) #Lexical Density or Complexity- Number of Unique Tokens Divided by total number of words (8)
def lanStem(self, token):
    ls = LancasterStemmer()
    for w in token:
        print(ls.stem(w))
# use natural language toolkit import json import nltk from nltk.stem.lancaster import LancasterStemmer # word stemmer stemmer = LancasterStemmer() training_data = [] #with open("new.txt", "r") as read_file: # data = read_file.readlines() #for i in data: # training_data.append(i) #training_data=training_data.split(",") #print(training_data[1]) training_data.append({ "Class": "Description", "Question": "What is Filename injection Path traversel ?" }) training_data.append({ "Class": "Description", "Question": "What does Filename injection Path traversel mean ?" }) training_data.append({ "Class": "Description", "Question": "Tell me something about Filename injection Path traversel ?" }) training_data.append({
class ChatBot(object): instance = None @classmethod def getBot(cls): if cls.instance is None: cls.instance = ChatBot() return cls.instance def __init__(self): print("Init") if self.instance is not None: raise ValueError("Did you forgot to call getBot function ? ") self.stemmer = LancasterStemmer() data = pickle.load(open(path.getPath('trained_data'), "rb")) self.words = data['words'] self.classes = data['classes'] train_x = data['train_x'] train_y = data['train_y'] with open(path.getJsonPath()) as json_data: self.intents = json.load(json_data) net = tflearn.input_data(shape=[None, len(train_x[0])]) net = tflearn.fully_connected(net, 8) net = tflearn.fully_connected(net, 8) net = tflearn.fully_connected(net, len(train_y[0]), activation='softmax') net = tflearn.regression(net) self.model = tflearn.DNN(net, tensorboard_dir=path.getPath('train_logs')) self.model.load(path.getPath('model.tflearn')) def clean_up_sentence(self, sentence): sentence_words = nltk.word_tokenize(sentence) sentence_words = [ self.stemmer.stem(word.lower()) for word in sentence_words ] return sentence_words def bow(self, sentence, words, show_details=False): sentence_words = self.clean_up_sentence(sentence) bag = [0] * len(words) for s in sentence_words: for i, w in enumerate(words): if w == s: bag[i] = 1 if show_details: print("found in bag: %s" % w) return np.array(bag) def classify(self, sentence): ERROR_THRESHOLD = 0.25 results = self.model.predict([self.bow(sentence, self.words)])[0] results = [[i, r] for i, r in enumerate(results) if r > ERROR_THRESHOLD] results.sort(key=lambda x: x[1], reverse=True) return_list = [] for r in results: return_list.append((self.classes[r[0]], r[1])) return return_list def response(self, sentence, userID='111', show_details=False): results = self.classify(sentence) context = {} if results: while results: for i in self.intents['intents']: if i['tag'] == results[0][0]: if 'context_set' in i: if show_details: print('context:', i['context_set']) context[userID] = i['context_set'] if not 'context_filter' in i or \ (userID in context and 'context_filter' in i and i['context_filter'] == context[ userID]): if show_details: print('tag:', i['tag']) return random.choice(i['responses']) return "I can't guess"
# For build the final list of words (for inverted index) we remove the columns: Preptime Cooktime Recipeyield f = open("ricette.csv", "r", encoding='utf-8-sig') ricette = [] for row in csv.reader(f, delimiter='\t'): if row: a = [] a.extend(row[:3]) a.extend(row[6:]) ricette.append(a) f.close() #%% # Define some function that we used succesively stop = stopwords.words('english') tokenizer = RegexpTokenizer('\w+|\$[\d\.]+|\S+') st = LancasterStemmer() #%% # We created a list with all the aliments that contains lactose Intol = [] f = open("intol.txt") for row in csv.reader(f, delimiter='\t'): Intol.append(row[0]) f.close() text_i = " ".join(Intol).lower() # Tokenization and stemming tokens_i = tokenizer.tokenize(text_i) Intol_stem = [] for w in tokens_i: Intol_stem.append(st.stem(w)) Intol_stem = set(Intol_stem) #%%
print() # LDA repeat(2, 6, cleaned_corpus) # remove words 'love' and 'like' sw = stopwords.words('english') + ['love', 'Love', 'like', 'Like'] repeat('LDA', 2, 4, cleaned_corpus, sw=sw) # stem stemmed_corpus = [] for poem in cleaned_corpus: stemmed_poem = [] for word in poem: stemmed_poem.append(LancasterStemmer().stem(word)) stemmed_corpus.append(''.join(stemmed_poem)) repeat('LDA', 2, 4, stemmed_corpus, sw=sw) repeat('LDA', 3, 5, stemmed_corpus, sw=(sw + ['one', 'know', 'would'])) # nouns only nouns_corpus = [] for poem in df['POS']: poem_nouns = [] for word in poem: if word[1] == 'NN': poem_nouns.append(word[0] + ' ') nouns_corpus.append(''.join(poem_nouns)) repeat('LDA', 2, 5, nouns_corpus, sw=sw)
import nltk
from nltk.stem.lancaster import LancasterStemmer
stemmer = LancasterStemmer()

import numpy as np
import scipy
import pandas
import tflearn
import tensorflow as tf
import random
import json
import speech_recognition as sr

# restore all of our data structures
import pickle
data = pickle.load(open("training_data", "rb"))
words = data['words']
classes = data['classes']
train_x = data['train_x']
train_y = data['train_y']

# import our chat-bot intents file
import json
import nltk nltk.download('punkt') from nltk.stem.lancaster import LancasterStemmer stemmer = LancasterStemmer() import tensorflow as tf import numpy as np import tflearn import random import json from google.colab import files files.upload() #import chat bot intents file with open('intents.json') as json_data: intents = json.load(json_data) #running intent file words = [] classes = [] documents = [] ignore = ['?'] #looping through each sentence in the json file's pattern for intent in intents['intents']: for pattern in intent['patterns']: #tokeninzing each word in the sentence w = nltk.word_tokenize(pattern)
def train_model(): with open('intents.json') as json_data: intents = json.load(json_data) words = [] #Design the Vocabulary (unique words) classes = [] documents = [] ignore_words = ['?'] # loop through each sentence in our intents patterns for intent in intents['intents']: for pattern in intent['patterns']: # tokenize each word in the sentence w = nltk.word_tokenize(pattern) # add to our words list words.extend(w) # add to documents in our corpus documents.append((w, intent['tag'])) # add to our classes list if intent['tag'] not in classes: classes.append(intent['tag']) stemmer = LancasterStemmer() # stem and lower each word and remove duplicates words = [stemmer.stem(w.lower()) for w in words if w not in ignore_words] words = sorted(list(set(words))) # remove duplicates classes = sorted(list(set(classes))) # create our training data training = [] # create an empty array for our output output_empty = [0] * len(classes) # training set, bag of words for each sentence for doc in documents: # initialize our bag of words bag = [] # list of tokenized words for the pattern (pattern = what user says) pattern_words = doc[0] # stem each word pattern_words = [stemmer.stem(word.lower()) for word in pattern_words] # create our bag of words array # mark the presence of words as a boolean value, 0 for absent, 1 for present. for w in words: bag.append(1) if w in pattern_words else bag.append(0) # output is a '0' for each tag and '1' for current tag output_row = list(output_empty) output_row[classes.index(doc[1])] = 1 training.append([bag, output_row]) # shuffle our features and turn into np.array random.shuffle(training) training = np.array(training) # create train and test lists train_x = list(training[:, 0]) train_y = list(training[:, 1]) # reset underlying graph data tf.reset_default_graph() # Build neural network net = tflearn.input_data(shape=[None, len(train_x[0])]) net = tflearn.fully_connected(net, 8) net = tflearn.fully_connected(net, 8) net = tflearn.fully_connected(net, len(train_y[0]), activation='softmax') net = tflearn.regression(net) # Define model and setup tensorboard model = tflearn.DNN(net, tensorboard_dir='tflearn_logs') # Start training (apply gradient descent algorithm) model.fit(train_x, train_y, n_epoch=1000, batch_size=8, show_metric=True) model.save('model.tflearn') # save all of our data structures import pickle pickle.dump( { 'words': words, 'classes': classes, 'train_x': train_x, 'train_y': train_y }, open("training_data", "wb")) #train_model()
import nltk

text = "strange lying saved discusses men builds"
print("Original text:")
print(text)

# tokenize before stemming and lemmatization
tokens = nltk.word_tokenize(text)

# Stemming
# import the Porter and Lancaster stemmer packages
from nltk.stem.porter import PorterStemmer
from nltk.stem.lancaster import LancasterStemmer

# instantiate a PorterStemmer object
porter_stemmer = PorterStemmer()
# instantiate a LancasterStemmer object
lancaster_stemmer = LancasterStemmer()

# stemmed_list and lancaster_list hold the PorterStemmer and LancasterStemmer results respectively
stemmed_list = []
lancaster_list = []
for token in tokens:
    stemmed_list.append(porter_stemmer.stem(token))
    lancaster_list.append(lancaster_stemmer.stem(token))

print("Stemming results:")
print("1. PorterStemmer:", stemmed_list)
print("2. LancasterStemmer:", lancaster_list)

# Lemmatization
# NLTK's lemmatization is based on WordNet, so import WordNetLemmatizer.
from nltk.stem import WordNetLemmatizer
wordnet_lemmatizer = WordNetLemmatizer()
for i in range(len(list_raw1)): list_tok1.append(nltk.word_tokenize(list_raw1[i].lower())) for i in range(len(list_raw2)): list_tok2.append(nltk.word_tokenize(list_raw2[i].lower())) list_pos1 = [] list_pos2 = [] for i in range(len(list_raw1)): list_pos1.append(nltk.pos_tag(list_tok1[i])) for i in range(len(list_raw2)): list_pos2.append(nltk.pos_tag(list_tok2[i])) st = LancasterStemmer() #print (pos) #grammar = "NP: {<DT>?<JJ>*<NN>}" #cp = nltk.RegexpParser(grammar) #result = cp.parse(pos) #print (result) #result.draw() list1 = [] list2 = [] for j in range(len(list_pos1)): list1 = [] for i in range(len(list_pos1[j])): if list_pos1[j][i][1] == 'NN': list1.append(list_pos1[j][i][0]) if list_pos1[j][i][1] == 'JJ':
def fit(self, module=None): if not module: module = self.module intents = {} for intent in module.intents: if intent.patterns: intents[intent.name] = {"patterns": []} for pattern in intent.patterns: intents[intent.name]['patterns'].append(pattern.text) garbage_training_intents = Intent().select().where( Intent.agent != module.id) intents['not_found'] = {"patterns": []} for intent in garbage_training_intents: if intent.patterns: for pattern in intent.patterns: intents['not_found']['patterns'].append(pattern.text) vocabulary = [] classes = [] documents = [] ignore_words = ['?'] for intent_name in intents: intent = intents[intent_name] for pattern in intent['patterns']: w = nltk.word_tokenize(pattern) vocabulary.extend(w) documents.append((w, intent_name)) if intent_name not in classes: classes.append(intent_name) stemmer = LancasterStemmer() vocabulary = [ stemmer.stem(w.lower()) for w in vocabulary if w not in ignore_words ] vocabulary = sorted(list(set(vocabulary))) classes = sorted(list(set(classes))) training = [] output_empty = [0] * len(classes) for doc in documents: bag = [] pattern_words = doc[0] pattern_words = [ stemmer.stem(word.lower()) for word in pattern_words ] for word in vocabulary: bag.append(1) if word in pattern_words else bag.append(0) output_row = list(output_empty) output_row[classes.index(doc[1])] = 1 training.append([bag, output_row]) random.shuffle(training) training = np.array(training) train_x = list(training[:, 0]) train_y = list(training[:, 1]) tf_model = Sequential() tf_model.add( Dense(128, input_shape=(len(train_x[0]), ), activation='relu')) tf_model.add(Dropout(0.5)) tf_model.add(Dense(64, activation='relu')) tf_model.add(Dropout(0.5)) tf_model.add(Dense(len(train_y[0]), activation='softmax')) sgd = SGD(lr=0.01, decay=1e-6, momentum=0.9, nesterov=True) tf_model.compile(loss='categorical_crossentropy', optimizer=sgd, metrics=['accuracy']) tf_model.fit(np.array(train_x), np.array(train_y), epochs=200, batch_size=5, verbose=1) save_model(tf_model, 'chat/' + module.name + '.h5', True) #converter = tf.lite.TFLiteConverter.from_keras_model_file('chat/model.h5') #tflite_model = converter.convert() #open("chat/model.tflite", "wb").write(tflite_model); with open("chat/" + module.name + ".pkl", "wb") as dataFile: pickle.dump( { 'vocabulary': vocabulary, 'classes': classes, 'train_x': train_x, 'train_y': train_y }, dataFile)
class ChatterBox(object): models = {} def __init__(self, agent=None, required_models=[], error_threshold=0.25, full_init=True): self.agent = agent self.required_models = required_models self.error_threshold = error_threshold self.stemmer = LancasterStemmer() self.graph = None self.session = None if full_init: self.initialize_agents() def initialize_agents(self): self.graph = tf.get_default_graph() self.session = tf.Session() set_session(self.session) for model in self.required_models: self.models[model] = ChatModule(model) def classify(self, model, sentence): with self.graph.as_default(): set_session(self.session) bag = [0] * len(model.vocabulary) for s in [ self.stemmer.stem(word.lower()) for word in nltk.word_tokenize(sentence) ]: for i, w in enumerate(model.vocabulary): if w == s: bag[i] = 1 results = model.model.predict( DataFrame([(np.array(bag))], dtype=float, index=['input']))[0] results = [[i, r] for i, r in enumerate(results) if r > self.error_threshold] results.sort(key=lambda x: x[1]) return results def intent(self, model, results): result = results.pop() intent_name = model.classes[result[0]] if intent_name == 'not_found': return ChatAgentResponse(intent_name, '', result[1]) intent_model = Intent().select().where( Intent.name == intent_name).get() if not intent_model.dialogs: response = Application().handle_request(intent_name) if response: response = ChatAgentResponse(intent_name, response, result[1]) else: intent_response = random.choice(intent_model.responses) response = ChatAgentResponse(intent_name, intent_response.text, result[1]) else: dialogs = [] for dialog in intent_model.dialogs: if not dialogs: dialog_intent_model = Intent().select().where( Intent.name == dialog.name).get() response = ChatAgentResponse( intent_name, random.choice(dialog_intent_model.responses).text, result[1], dialog.input_type) dialogs.append({ 'name': dialog.name, 'slot': dialog.slot, 'value': None, 'input_type': dialog.input_type }) session['intent'] = intent_model.name session['dialogs'] = dialogs session['dialog_step'] = 0 if intent_model.contexts: contexts = [] for context in intent_model.contexts: contexts.append(intent_model.text) session['context'] = " ".join(contexts) return response def dialog(self, input): self.store_input(input) if self.dialog_has_next_step(): response = self.dialog_next_step() else: response = self.complete_dialog() return response def dialog_has_next_step(self): return session.get('dialog_step') + 1 < len(session.get('dialogs')) def dialog_next_step(self): dialogs = session.get('dialogs') session['dialog_step'] += 1 intent_name = dialogs[session.get('dialog_step')]['name'] intent_model = Intent().select().where( Intent.name == intent_name).get() return ChatAgentResponse( intent_name, random.choice(intent_model.responses).text, input_type=dialogs[session.get('dialog_step')]['input_type']) def store_input(self, input): dialogs = session.get('dialogs') dialogs[session.get('dialog_step')]['value'] = input session['dialogs'] = dialogs def complete_dialog(self): dialogs = session.get('dialogs') intent_model = Intent().select().where( Intent.name == session.get('intent')).get() slots = {} for dialog in dialogs: slots[dialog['slot']] = dialog['value'] response = Application().handle_request(session.get('intent'), slots) response = ChatAgentResponse(session.get('intent'), response) self.clean_session() return response def chat(self, sentence): if not session.get('intent'): for model in self.models: results = self.classify(self.models[model], sentence) response = 
self.intent(self.models[model], results) if response.confidence > .85: if response.classification == 'not_found': continue else: break else: response = self.dialog(sentence) return response def clean_session(self): session['intent'] = None session['dialogs'] = None session['dialog_step'] = None
class BllipParser(Parser): """ Implementation of the bllipparser for parsing the each sentence in each part separately, finding dependencies, parts of speech tags, lemmas and head words for each entity. Uses preprocessed text :param nbest: the number of parse trees to obtain :type nbest: int :param overparsing: overparsing determines how much more time the parser will spend on a sentence relative to the time it took to find the first possible complete parse :type overparsing: int """ def __init__(self, nbest=10, overparsing=10, only_parse=False, stop_words=None): try: from bllipparser import RerankingParser # WARNING if only_parse=False, BllipParser depends on PyStanfordDependencies: pip install PyStanfordDependencies except ImportError: raise ImportError( 'BllipParser not installed, perhaps it is not supported on OS X yet' ) self.parser = RerankingParser.fetch_and_load('GENIA+PubMed', verbose=True) # WARNING this can take a long while. Install manually: `python -mbllipparser.ModelFetcher -i GENIA+PubMed` """create a Reranking Parser from BllipParser""" self.parser.set_parser_options(nbest=nbest, overparsing=overparsing) """set parser options""" self.only_parse = only_parse """whether features should be used from the BllipParser""" self.stemmer = LancasterStemmer() """an instance of LancasterStemmer from NLTK""" self.stop_words = stop_words if self.stop_words is None: self.stop_words = stopwords.words('english') def parse(self, dataset): outer_bar = Bar('Processing [Bllip]', max=len(list(dataset.parts()))) for part in dataset.parts(): outer_bar.next() if len(part.sentence_parse_trees) > 0: continue for index, sentence in enumerate(part.sentences): sentence = [token.word for token in part.sentences[index]] parse = self.parser.parse(sentence) parsed = parse[0] part.sentence_parse_trees.append(str(parsed.ptb_parse)) if not self.only_parse: tokens = parsed.ptb_parse.sd_tokens() for token in tokens: tok = part.sentences[index][token.index - 1] is_stop = False if tok.word.lower() in self.stop_words: is_stop = True tok.features = { 'id': token.index - 1, 'pos': token.pos, 'lemma': self.stemmer.stem(tok.word), 'is_punct': self._is_punct(tok.word), 'dep': token.deprel, 'is_stop': is_stop, 'dependency_from': None, 'dependency_to': [], 'is_root': False, } for token in tokens: tok = part.sentences[index][token.index - 1] self._dependency_path(token, tok, part, index) part.percolate_tokens_to_entities() part.calculate_token_scores() part.set_head_tokens() outer_bar.finish() def _dependency_path(self, bllip_token, token, part, index): if bllip_token.head - 1 >= 0: token.features['dependency_from'] = ( part.sentences[index][bllip_token.head - 1], bllip_token.deprel) else: token.features['dependency_from'] = ( part.sentences[index][token.features['id']], bllip_token.deprel) token_from = part.sentences[index][bllip_token.head - 1] if (bllip_token.index != bllip_token.head): token_from.features['dependency_to'].append( (token, bllip_token.deprel)) else: token.features['is_root'] = True def _is_punct(self, text): if text in ['.', ',', '-']: return True return False
import string
import unicodedata
import sys

# a table mapping every Unicode punctuation character to None
punct_tbl = dict.fromkeys(i for i in range(sys.maxunicode)
                          if unicodedata.category(chr(i)).startswith('P'))

# function to remove punctuation from a string
def remove_punctuation(text):
    return text.translate(punct_tbl)

# init the stemmer
stemmer = LancasterStemmer()

# read the json file and load the training data
with open('data.json', 'r') as json_data:
    data = json.load(json_data)

# list of all the categories to train for
categories = list(data.keys())
words = []
# a list of tuples with the words in the sentence and the respective category name
docs = []

for each_category in data.keys():
    for each_sentence in data[each_category]:
        # remove punctuation from the sentence
        print(each_sentence)
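# Quick illustrative check (not in the original) of the remove_punctuation() helper defined
# in the previous snippet.
print(remove_punctuation("Hello, world! It's a test..."))   # expected: "Hello world Its a test"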
class NLPImplementation: def __init__(self, intents_location): self.intents_ignore_words = ["?", "!", ".", ","] self.ERROR_THRESHOLD = 0.25 self.model = None self.intents_location = intents_location self.stemmer = LancasterStemmer() self.intents_words, self.intents_documents, self.intents_classes = self.apply_tokenization_on_intents() self.model_save_name = "chatbot_model.h5" self.spacy = spacy.load("en_core_web_sm") def clean_up_sentence(self, sentence): # tokenize the pattern sentence_words = nltk.word_tokenize(sentence) # stem each word sentence_words = [self.stemmer.stem(word.lower()) for word in sentence_words] return sentence_words def bag_of_words(self, sentence): """ return bag of words array: 0 or 1 for each word in the bag that exists in the sentence""" # tokenize the pattern sentence_words = self.clean_up_sentence(sentence) # bag of words bag = [0] * len(self.intents_words) for sw in sentence_words: for index, word in enumerate(self.intents_words): if word == sw: bag[index] = 1 return np.array(bag) def spacy_retrieve_nouns(self, text): """ Explain what spacy is """ doc = self.spacy(text) ents = [] for ent in doc.ents: ents.append(ent) return ents @staticmethod async def get_weather(location): client = python_weather.Client(format=python_weather.METRIC) weather = await client.find(location) current_temperature = int((weather.current.temperature - 32) * 5/9) return_text = f"Current temperature in {location} is {current_temperature}°C" \ f"\n\nThe forecast temperature for the next 5 days will be: \n" for forecast in weather.forecasts: temp = int((forecast.temperature-32)*5/9) return_text += f"Date: {forecast.date.date()}, Sky: {forecast.sky_text}, Temperature: {temp}°C\n" await client.close() return return_text @staticmethod def get_time_by_city(city_location): g = Nominatim(user_agent='twitter_chat_bot') location = g.geocode(city_location) obj = TimezoneFinder() result = obj.timezone_at(lng=location.longitude, lat=location.latitude) t = pytz.timezone(result) time = datetime.now(t).strftime('%Y:%m:%d %H:%M:%S') return str(time) def response(self, sentence): with open(self.intents_location) as json_data: intents = json.load(json_data) json_data.close() results = self.classify(sentence) # if classification exists then find the matching intent tag and return a response from the respective tag if results: # loop as long as there are matches to process while results: for i in intents['intents']: # find a tag matching the first result if i['tag'] == results[0]["intent"]: # return a random response from the intent # If question is for specific data such as Time, Weather, Wikipedia, etc, return specified info if i['tag'] == 'information': topic = re.search('tell me about (.*)', sentence.lower()) if topic: topic = topic.group(1) try: wiki = wikipedia.summary(topic) except (wikipedia.exceptions.PageError, wikipedia.exceptions.DisambiguationError) as e: wiki = str(e) return wiki return "For me to understand your wikipedia question, use the format 'tell me about *'" if i['tag'] == 'time': ents = self.spacy_retrieve_nouns(sentence) if len(ents) > 0: time = self.get_time_by_city(str(ents[0])) return f"The current time in {str(ents[0])} is {time}" if i['tag'] == 'weather': ents = self.spacy_retrieve_nouns(sentence) print(ents) if len(ents) > 0: loop = asyncio.get_event_loop() data_weather = loop.run_until_complete(self.get_weather(str(ents[0]))) return data_weather if i['tag'] == 'stocks': ticker = reticker.TickerExtractor().extract(sentence.upper()) print(ticker) return_text = "" for tick in 
ticker: yahoo_price = YahooFinancials(tick) if yahoo_price.get_current_price() is None: continue return_text += f"Current price of {tick} is {yahoo_price.get_currency()} " \ f"{yahoo_price.get_current_price()}\n" if len(return_text) > 0: return return_text return random.choice(i['response']) results.pop(0) def classify(self, sentence): # generate probabilities from the model self.load_model() bow = self.bag_of_words(sentence) results = self.model.predict(np.array([bow]))[0] # Filters out predictions below a threshold results = [[i, res] for i, res in enumerate(results) if res > self.ERROR_THRESHOLD] # sort by strength of probability results.sort(key=lambda x: x[1], reverse=True) return_list = [] for r in results: return_list.append({"intent": self.intents_classes[r[0]], "probability": r[1]}) # return dict of intent and probability print(return_list) return return_list def apply_tokenization_on_intents(self): documents = [] words = [] classes = [] with open(self.intents_location) as json_data: intents = json.load(json_data) json_data.close() for intent in intents["intents"]: for pattern in intent["patterns"]: # Tokenize each word word = nltk.word_tokenize(pattern) words.extend(word) # Add to documents in our corpus documents.append((word, intent["tag"])) # Add to classes list if intent["tag"] not in classes: classes.append(intent["tag"]) words = [self.stemmer.stem(w.lower()) for w in words if w not in self.intents_ignore_words] words = sorted(list(set(words))) # Removes duplicates classes = sorted(list(set(classes))) # print(f"Document Length: {len(documents)}") # print(f"Classes length: {len(classes)} contains: \n {classes}") # print(f"Number of unique stemmed words: {len(words)} contains: \n {words}") return words, documents, classes def create_training_data(self): training = [] # create an empty array for our output output_empty = [0] * len(self.intents_classes) # training set, bag of words for each sentence for doc in self.intents_documents: # initialize our bag of words bag = [] # list of tokenized words for the pattern pattern_words = doc[0] # stem each word pattern_words = [self.stemmer.stem(word.lower()) for word in pattern_words] # create our bag of words array for word in self.intents_words: bag.append(1) if word in pattern_words else bag.append(0) # output is a '0' for each tag and '1' for current tag output_row = list(output_empty) output_row[self.intents_classes.index(doc[1])] = 1 training.append([bag, output_row]) random.shuffle(training) training = np.array(training) train_x = list(training[:, 0]) train_y = list(training[:, 1]) return [train_x, train_y] def train_model(self): # Build neural network train_x, train_y = self.create_training_data() model = Sequential() model.add(Dense(128, input_shape=(len(train_x[0]),), activation='relu')) model.add(Dropout(0.5)) model.add(Dense(64, activation='relu')) model.add(Dropout(0.5)) model.add(Dense(len(train_y[0]), activation='softmax')) sgd = SGD(lr=0.01, decay=1e-6, momentum=0.9, nesterov=True) model.compile(loss="categorical_crossentropy", optimizer=sgd, metrics=["accuracy"]) model_fit = model.fit(np.array(train_x), np.array(train_y), epochs=2000, batch_size=5, verbose=1) model.save(self.model_save_name, model_fit) print("Training Complete") pickle.dump( { 'words': self.intents_words, 'classes': self.intents_classes, 'train_x': train_x, 'train_y': train_y}, open("training_data", "wb"), ) def load_model(self): """Makes sure that self.model is loaded to be used for predictions""" try: data = pickle.load(open("training_data", "rb")) 
words = data['words'] classes = data['classes'] train_x = data['train_x'] train_y = data['train_y'] self.model = load_model(self.model_save_name) except FileNotFoundError as e: print("Model was not trained yet. Now training model") self.train_model() self.model = load_model(self.model_save_name)
'''
To explain: stemming extracts the stem or root form of a word (which does not necessarily carry the full meaning).
NLTK provides the three most commonly used stemmer interfaces: the Porter stemmer, the Lancaster stemmer and the Snowball stemmer.
The Porter stemmer is based on the Porter stemming algorithm; for example:
'''
from nltk.stem.porter import PorterStemmer
porter_stemmer = PorterStemmer()
porter_stemmer.stem('maximum')
porter_stemmer.stem('presumably')
porter_stemmer.stem('multiply')
porter_stemmer.stem('provision')
porter_stemmer.stem('owed')

# The Lancaster stemmer is based on the Lancaster stemming algorithm; for example:
from nltk.stem.lancaster import LancasterStemmer
lancaster_stemmer = LancasterStemmer()
lancaster_stemmer.stem('maximum')
lancaster_stemmer.stem('presumably')
lancaster_stemmer.stem('multiply')
lancaster_stemmer.stem('provision')
lancaster_stemmer.stem('owed')

# The Snowball stemmer is based on the Snowball stemming algorithm; for example:
from nltk.stem import SnowballStemmer
snowball_stemmer = SnowballStemmer('english')
snowball_stemmer.stem('maximum')
snowball_stemmer.stem('presumably')
snowball_stemmer.stem('multiply')
snowball_stemmer.stem('provision')
snowball_stemmer.stem('owed')
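# A small comparison loop (not in the original) that prints the three stemmers built above
# side by side instead of discarding their return values.
for w in ['maximum', 'presumably', 'multiply', 'provision', 'owed']:
    print(w, '->', porter_stemmer.stem(w), lancaster_stemmer.stem(w), snowball_stemmer.stem(w))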
# -*- coding: utf-8 -*-
from nltk.stem.porter import PorterStemmer
from nltk.stem.lancaster import LancasterStemmer
from nltk.stem.snowball import SnowballStemmer

input_words = [
    'writing', 'calves', 'be', 'branded', 'horse', 'randomize',
    'possibly', 'provision', 'hospital', 'kept', 'scratchy', 'code'
]

porter = PorterStemmer()
lancaster = LancasterStemmer()
snowball = SnowballStemmer('english')

stemmer_names = ['INPUT WORD', 'PORTER', 'LANCASTER', 'SNOWBALL']
fmt = '{:>16}' * len(stemmer_names)
print(fmt.format(*stemmer_names))
print('=' * 68)

for word in input_words:
    output = [word, porter.stem(word), lancaster.stem(word), snowball.stem(word)]
    print(fmt.format(*output))
class Preprocessor:
    _stemmer = LancasterStemmer()

    @staticmethod
    def stem(word):
        return Preprocessor._stemmer.stem(word)
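# Illustrative use of the static helper above; the words are arbitrary.
print([Preprocessor.stem(w) for w in ["maximum", "running", "provision"]])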
# pass # pass matches = tool.check(sentence) if matches: i = 0 while i < len(matches): grammer_error.append(matches[i].context) i += 1 pass pass relevence_dict["business"] = tmp_bus relevence_dict['nonbusiness'] = tmp_non return relevence_dict, sen, mom_data, grammer_error stemmer = LancasterStemmer() training_data = [] training_data.append({"class": "greeting", "sentence": "how are you?"}) training_data.append({"class": "greeting", "sentence": "how is your day?"}) training_data.append({"class": "greeting", "sentence": "Hi, Vilas"}) training_data.append({ "class": "greeting", "sentence": "how is it going today?" }) training_data.append({"class": "greeting", "sentence": "I am doing good"}) training_data.append({"class": "goodbye", "sentence": "have a nice day"}) training_data.append({"class": "goodbye", "sentence": "see you later"}) training_data.append({"class": "goodbye", "sentence": "have a nice day"}) training_data.append({"class": "goodbye", "sentence": "talk to you soon"})
import nltk
from nltk.stem.lancaster import LancasterStemmer
stemmer = LancasterStemmer()

import numpy as np
import random
import json
import pickle
from keras.models import Sequential
from keras.layers import Dense, Activation, Dropout
from keras.optimizers import SGD
from keras.models import load_model

############################### learn this
with open("intents.json") as file:
    data = json.load(file)
##############################

try:
    with open("data.pickle", "rb") as f:
        words, labels, training, output = pickle.load(f)
    error  # undefined name from the original: it raises NameError, forcing the except branch to rebuild the data
except:
    words = []
    labels = []
    docs_x = []
    docs_y = []

    for intent in data["intents"]:
        for pattern in intent["patterns"]:
import nltk from nltk import word_tokenize, sent_tokenize from nltk.stem.lancaster import LancasterStemmer stemmer = LancasterStemmer() import numpy import tflearn import tensorflow import random import json import pickle with open("intents.json") as file: data = json.load(file) try: with open("data.pickle", "rb") as f: words, labels, training, output = pickle.load(f) except: words = [] labels = [] docs_x = [] docs_y = [] for intent in data["intents"]: for pattern in intent["patterns"]: wrds = nltk.word_tokenize(pattern) words.extend(wrds) docs_x.append(wrds) docs_y.append(intent["tag"]) if intent["tag"] not in labels:
import nltk from nltk.stem.lancaster import LancasterStemmer stemmer = LancasterStemmer() import time import difflib import numpy import webbrowser import tflearn import tensorflow import random from flask import Flask, render_template, request import json import pickle import os app = Flask(__name__) with open("intents.json") as file: data = json.load(file) try: with open("data.pickle", "rb") as f: words, labels, training, output = pickle.load(f) except: words = [] labels = [] docs_patt = [] docs_tag = [] for intent in data["intents"]: # below we fetch patterns from all intents in one place
# coding=utf-8
# compare several different ways of reducing a word to its base form
from nltk.stem.porter import PorterStemmer
from nltk.stem import SnowballStemmer
from nltk.stem import WordNetLemmatizer
from nltk.stem.lancaster import LancasterStemmer

string = "called"

porter_stemmer = PorterStemmer()
a = porter_stemmer.stem(string)
print a

wordnet_lemmatizer = WordNetLemmatizer()
b = wordnet_lemmatizer.lemmatize(string)
print b

snowball_stemmer = SnowballStemmer("english")
c = snowball_stemmer.stem(string)
print c

st = LancasterStemmer()
d = st.stem(string)
print d
import json
import numpy as np
import pandas as pd
from yellowbrick.text import PosTagVisualizer

import nltk
# nltk.download()
from nltk import tokenize
from nltk import sent_tokenize
from nltk import word_tokenize
from nltk import pos_tag

# Import Stemmers
from nltk.stem.porter import PorterStemmer
from nltk.stem.lancaster import LancasterStemmer
from nltk.stem.snowball import SnowballStemmer

stemmer_porter = PorterStemmer()
stemmer_lancaster = LancasterStemmer()
stemmer_snowball = SnowballStemmer('english')

# Load Reviews with the pandas library
def open_file(filename):
    data = []
    for line in open(filename, 'r'):
        data_ = json.loads(line)
        header = ['overall', 'reviewTime', 'reviewText', 'summary', 'unixReviewTime']
        line__ = [data_.get(h) for h in header]
        data.append(line__)
    df = pd.DataFrame(data, columns=header)
    return df

df = open_file('Cell_Phones_and_Accessories_5.json')
text_list = df['reviewText'].values.tolist()