def porter_list1(lista):
    stemmer = PorterStemmer()
    newlist = []
    for b in lista:
        b = stemmer.stem(b)
        newlist.append(b)
    return newlist
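# A hedged usage sketch for a list-stemming helper like porter_list1 above; the sample
# tokens are illustrative and only assume that NLTK is installed.
from nltk.stem import PorterStemmer

_stemmer = PorterStemmer()
print([_stemmer.stem(w) for w in ["running", "runner", "easily"]])  # e.g. ['run', 'runner', 'easili']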
def splitAndStem(inputfilename, outputfilename):
    '''
    For each ingredient split it into words, stem each word, construct a new recipe from those words
    :param inputfilename:
    :return:
    '''
    with open(outputfilename, 'w') as ff:
        ff.write('[\n')

    with open(inputfilename) as f:
        d = eval(f.read())

    stemmer = PorterStemmer()
    with open(outputfilename, 'a') as ff:
        for i in d:
            # print(i)
            new_item = {}
            new_ingredients = []
            for ingredient in i['ingredients']:
                tokens = word_tokenize(ingredient)
                clean_tokens = [re.subn('[^A-Za-z]', '', token)[0] for token in tokens]
                new_ingredients += [stemmer.stem(w).lower() for w in clean_tokens]
            new_item['cuisine'] = i['cuisine']
            new_item['id'] = i['id']
            new_item['ingredients'] = new_ingredients
            json_recipe = json.dumps(new_item)
            ff.write('%s,\n' % str(json_recipe))
def parseReviews(mypath):
    filelist = os.listdir(mypath)
    wordDict = {}
    negationList = ["no", "not", "never", "can't", "won't", "cannot", "didn't", "couldn't"]
    negationFlag = False
    stopwordList = set(stopwords.words("english"))
    stemmer = PorterStemmer()
    for file in filelist:
        with open(mypath + "/" + file, "r") as f:
            word_list = word_tokenize(f.read())
            for word in word_list:
                if word in negationList:
                    # double negative
                    if negationFlag:
                        negationFlag = False
                    else:
                        negationFlag = True
                    continue
                if not word.isalnum():
                    negationFlag = False
                if word.isalnum() and word not in stopwordList:
                    word = stemmer.stem(word)
                    if negationFlag:
                        word = "!" + word
                        negationFlag = False
                    if word not in wordDict:
                        wordDict[word] = 1
                    else:
                        wordDict[word] += 1
    return wordDict
def tokenizeTags(str, dict_items):
    # temp map (for getting the local term frequency)
    # for a sentence
    str = str.decode('ascii', 'ignore')
    # tokenizer = nltk.tokenize.treebank.TreebankWordTokenizer()
    # tokens = tokenizer.tokenize(str)
    tokens = str.split()
    # print tokens
    stemmer = PorterStemmer()
    # small set of stopwords (remove you, are, and, I those kinds of words)
    last = []
    # bigram_list = []
    for d in tokens:
        d = d.split('-')
        for c in d:
            c = re.compile('[%s]' % re.escape(string.punctuation)).sub('', c)  # regular expression -> strip punctuations
            if c != '' and c not in dict_items:
                try:
                    if int(c):
                        if len(c) != 4 and (c > 2015 or c < 1900):  # keep years
                            c = stemmer.stem('NUM')
                except Exception:
                    c = stemmer.stem(c.lower())
                    pass
                # c = stemmer.stem(c.lower())
                last.append(c)
                # bigram generation
                # index = len(last)
                # if index > 1:
                #     bigram = last[index-2] + ' ' + last[index-1]
                #     bigram_list.append(bigram)
    return last
def tokenize2_bigram(str, df_freq):
    temp_map = {}
    # for a sentence
    str = str.decode('ascii', 'ignore')
    tokens = str.split()
    # print tokens
    stemmer = PorterStemmer()
    last = []
    bigram_list = []
    for d in tokens:
        d = d.split('-')
        for c in d:
            c = re.compile('[%s]' % re.escape(string.punctuation)).sub('', c)  # regular expression -> strip punctuations
            if c != '':
                try:
                    if int(c):
                        if len(c) != 4 and (c > 2015 or c < 1900):  # keep years
                            c = stemmer.stem('NUM')
                except Exception:
                    c = stemmer.stem(c.lower())
                    pass
                # c = stemmer.stem(c.lower())
                last.append(c)
                # bigram generation: track the running length of `last` so that
                # bigrams are actually produced (the original reset index to 0 here)
                index = len(last)
                if index > 1:
                    bigram = last[index - 2] + ' ' + last[index - 1]
                    bigram_list.append(bigram)
                    updateDF(temp_map, df_freq, bigram)
    return bigram_list
def _stemmatize(self, word):
    lmtzr = WordNetLemmatizer()
    # the lemmatizer won't stem words ending in '-ing' unless you tell it it's a verb
    stemmer = PorterStemmer()
    if word.endswith('ing'):
        return stemmer.stem(word)
    return lmtzr.lemmatize(word)
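# Hedged illustration of why the '-ing' special case above can matter (assumes NLTK's
# WordNet data is available; the sample word is illustrative):
from nltk.stem import PorterStemmer, WordNetLemmatizer

lmtzr = WordNetLemmatizer()
stemmer = PorterStemmer()
print(lmtzr.lemmatize('running'))           # 'running' -- noun is the default POS
print(lmtzr.lemmatize('running', pos='v'))  # 'run'
print(stemmer.stem('running'))              # 'run'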
def prepare_data(reviews):
    # run porter stemmer on every word
    stemmer = PorterStemmer()
    stem_text = lambda x: {'class': x['class'], 'text': stemmer.stem(x['text'])}

    # clean text and remove empty items
    reviews = filter(lambda x: x != {}, reviews)
    reviews = map(stem_text, reviews)
    print('classification: ' + reviews[observed_element]['class'] + '\n\n------------------------------------\n\n')
    print('stemming: ' + reviews[observed_element]['text'] + '\n\n------------------------------------\n\n')

    # remove stopwords
    reviews = map(remove_stop_words, reviews)
    print('stopwords: ' + reviews[observed_element]['text'] + '\n\n------------------------------------\n\n')

    # remove undesired patterns
    reviews = map(clean_text, reviews)
    print('useless elements: ' + reviews[observed_element]['text'] + '\n\n------------------------------------\n\n')

    return reviews
def deleting_stop_words_and_punctuating(text):
    stop_words = set(stopwords.words('english'))
    ps = PorterStemmer()
    lemmatizer = WordNetLemmatizer()
    word_tokenize_text = word_tokenize(text)
    words = [ps.stem(lemmatizer.lemmatize(w)) for w in word_tokenize_text]
    return [w.lower() for w in words
            if not (w in stop_words or w in string.punctuation
                    or w in "''" or w in '``' or w in "the" or w in 'in' or w in "'s")]
def preprocess(text):
    stemmer = PorterStemmer()
    stop = stopwords.words('english')
    tokens = [tok for tok in word_tokenize(text.lower()) if tok not in stop]
    tokens_stemmed = [stemmer.stem(tok) for tok in tokens]
    return tokens_stemmed
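# Hedged, self-contained sketch of the same lowercase -> tokenize -> stopword-filter -> stem
# pipeline (assumes NLTK's 'punkt' and 'stopwords' data are installed; the sample text is illustrative):
from nltk.corpus import stopwords
from nltk.stem import PorterStemmer
from nltk.tokenize import word_tokenize

stemmer = PorterStemmer()
stop = set(stopwords.words('english'))
sample = "The runners were running quickly."
tokens = [tok for tok in word_tokenize(sample.lower()) if tok not in stop]
print([stemmer.stem(tok) for tok in tokens])  # e.g. ['runner', 'run', 'quickli', '.']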
class StemmedBagOfWordsFeatureGenerator(EdgeFeatureGenerator):
    """
    Generates stemmed Bag of Words representation for each sentence that contains
    an edge, using the function given in the argument. By default it uses Porter stemmer.

    :type feature_set: nala.structures.data.FeatureDictionary
    :type stemmer: nltk.stem.PorterStemmer
    :type stop_words: list[str]
    :type training_mode: bool
    """

    def __init__(self, feature_set, stop_words=[], training_mode=True):
        self.feature_set = feature_set
        """the feature set for the dataset"""
        self.training_mode = training_mode
        """whether the mode is training or testing"""
        self.stemmer = PorterStemmer()
        """an instance of the PorterStemmer"""
        self.stop_words = stop_words
        """a list of stop words"""

    def generate(self, dataset):
        for edge in dataset.edges():
            sentence = edge.part.sentences[edge.sentence_id]
            if self.training_mode:
                for token in sentence:
                    if self.stemmer.stem(token.word) not in self.stop_words \
                            and not token.features['is_punct']:
                        feature_name = '4_bow_stem_' + self.stemmer.stem(token.word) + '_[0]'
                        self.add_to_feature_set(edge, feature_name)
def preprocessing(text, debug=False):
    if debug:
        print text

    # lower case
    text = text.lower()
    if debug:
        print text

    # can't -> cannot, bya's -> bya is
    text = replacers.RegexpReplacer().replace(text)
    if debug:
        print text

    # word tokenize
    words = word_tokenize(text)
    if debug:
        print words

    # removing stopwords
    english_stops = set(stopwords.words('english'))
    english_stops_added = english_stops | {'.', ',', ':', ';'}
    words = [word for word in words if word not in english_stops_added]
    if debug:
        print words

    # stemming words
    stemmer = PorterStemmer()
    words_stemmed = list(map(lambda word: stemmer.stem(word), words))
    if debug:
        print words_stemmed

    return words, words_stemmed
def stem(string):
    """Stem a phrase"""
    global stemmer
    if not stemmer:
        stemmer = Stemmer()
    # words = string.split()
    # for i in range(len(words)):
    #     words[i] = self.stemmer.stem(words[i])
    # stemming last word only
    # string = self._reGlue(words)
    #
    # string2 = stemmer.stem(string)
    # if string2 not in stemdict:
    #     stemdict[string2] = string
    # FIX ME
    if string not in stemdict:
        if bad_unicode(string):
            ## added A. Meyers 8/28/15
            temp = stemmer.stem(remove_non_unicode(string))
        else:
            temp = stemmer.stem(string)
        if temp:
            stemdict[string] = temp
        if not temp:
            pass
        elif temp not in unstemdict:
            unstemdict[temp] = [string]
        elif string not in unstemdict[temp]:
            unstemdict[temp].append(string)
    else:
        temp = stemdict[string]
    return temp
def extract_clean_sentences(self):
    """
    Extracts sentences from plain text. Also applies the following cleaning operations:
    - Exclude all characters not recognized by 'utf-8' encoding
    - Exclude all characters not contained in [a-zA-Z0-9 '-]
    - Exclude common stopwords
    """
    text = self.raw_text

    exclude = re.compile('[^a-zA-Z0-9 \'-]')
    linebreaks = re.compile('\s')
    excess_space = re.compile('\s+')
    stemmer = PorterStemmer()

    sentences = sent_tokenize(text)
    out = []
    for sentence in sentences:
        sentence = linebreaks.sub(' ', sentence)
        sentence = exclude.sub(' ', sentence)
        sentence = excess_space.sub(' ', sentence)
        tokens = word_tokenize(sentence)
        tokens = [stemmer.stem(t.lower()) for t in tokens]
        out.append(tokens)
    return out
def stemming():
    ps = PorterStemmer()
    input_tweet = 'testing tests trying tries'
    words = word_tokenize(input_tweet)
    for w in words:
        print(ps.stem(w))  # stem each word, not the whole token list
def openfile(filename, output):
    print(filename)
    # starts run time
    start = timeit.default_timer()
    ps = PorterStemmer()
    file = open(filename, "r")
    tokens = []
    # Used for removing punctuation from the documents
    translate_table = dict((ord(char), None) for char in string.punctuation)
    start2 = timeit.default_timer()
    # splits the lines into words and removes the punctuation
    for line in file:
        tokens += word_tokenize(line.translate(translate_table))
    start3 = timeit.default_timer()
    print("tokenize")
    print(start3 - start2)
    # creates a set of stop words to be removed later
    stop_words = set(stopwords.words("english"))
    start6 = timeit.default_timer()
    # if a word is not a stop word it adds it to a list
    filtered_sentence = []
    for w in tokens:
        if w not in stop_words:
            filtered_sentence.append(w)
    start7 = timeit.default_timer()
    print("stop word removal")
    print(start7 - start6)
    startw = timeit.default_timer()
    # stems each word and adds it to the output file in csv form
    f = open(output, 'w')
    iterFilSen = iter(filtered_sentence)
    if output == "documents.csv":
        for w in filtered_sentence:
            if w == "I":
                f.write("\n")
            f.write(ps.stem(w))
            f.write(",")
    else:
        for w in iterFilSen:
            if w == "I":
                f.write("\n")
                # removes the I number W
                next(iterFilSen)
                next(iterFilSen)
            else:
                f.write(ps.stem(w))
                f.write(",")
    # ends run time
    stop = timeit.default_timer()
    print("writing")
    print(stop - startw)
    print("total: " + output)
    print(stop - start)
def buildVocab(self):
    '''Build a vocabulary for the selected documents (from dir database).'''
    ## Note: The source of text should be Lucene processed field values. Lucene tokenized
    ## the text, removed stop words, and may have taken other unknown steps.
    ## Right now the vocabulary is built on the raw text with NLTK based stopwords removal
    ## and tokenization. This should be improved.
    # collect contents from /database/ for each of these doc
    for pmid in self.pmidList:  # self.pmidList includes the query and the 99 most similar articles selected by BM25
        self.corpus.append(file(os.path.join(self.dbDir, pmid)).read())  # corpus contains raw text (MH, title*2, abstract)
    for text in self.corpus:
        sent_tokenize_list = sent_tokenize(text.strip().lower(), "english")  # tokenize an article text
        stemmed_text = []
        if sent_tokenize_list:  # if sent_tokenize_list is not empty
            porter_stemmer = PorterStemmer()
            for sent in sent_tokenize_list:
                words = TreebankWordTokenizer().tokenize(sent)  # tokenize the sentence
                words = [word.strip(string.punctuation) for word in words]
                words = [word for word in words if not word in stopwords.words("english")]
                words = [word for word in words if len(word) > 1]  # remove single letters and non alphabetic characters
                words = [word for word in words if re.search('[a-zA-Z]', word)]
                words = [porter_stemmer.stem(word) for word in words]  # apply Porter stemmer
                stemmed_text.append(" ".join(words))
                self.vocab += words
        self.stemmed_corpus.append(". ".join(stemmed_text))  # append a stemmed article text
    # save stemmed corpus
    pickle.dump(self.stemmed_corpus, file(os.path.join(self.stemmed_corpusDir, str(self.pmidList[0])), "w"))
    # remove low frequency tokens and redundant tokens
    tokenDist = Counter(self.vocab)
    lowFreqList = []
    for token, count in tokenDist.iteritems():
        if count < 2:
            lowFreqList.append(token)
    self.vocab = list(set(self.vocab) - set(lowFreqList))
    # save vocabulary
    pickle.dump(self.vocab, file(os.path.join(self.vocabDir, str(self.pmidList[0])), "w"))
def parseTranscript(transcript):
    assert isinstance(transcript, Transcript), \
        "transcript must be stored in custom namedtuple, not {}".format(type(transcript))

    text = transcript.prepared.append(transcript.QandA)
    id = "{ticker}-{year}-{month}-{day}".format(ticker=transcript.ticker.split(':')[-1],
                                                year=transcript.date.year,
                                                month=transcript.date.month,
                                                day=transcript.date.day)
    tokenizer = wordpunct_tokenize
    stemmer = PorterStemmer()

    index = dict()
    pos = 0
    for row in text:
        for i, token in enumerate(tokenizer(row.lower())):
            token = stemmer.stem(token)
            if token not in index and '|' not in token:
                index[token] = [id, [str(pos + i)]]
            elif '|' not in token:
                index[token][-1].append(str(pos + i))
        try:
            pos += (i + 1)
        except:
            pass
    return index
def get_english_vocab(lemmatize=False):
    vocab = (w.lower() for w in words.words())
    if lemmatize:
        stemmer = PorterStemmer()
        vocab = (stemmer.stem(w) for w in vocab)
    return set(vocab)
def testing():
    # - tokenize on sentence and word
    ex_txt = "hello there Mr. Bartuska, How are you? The weather is great and I enjoy Python. cheers!"
    print(sent_tokenize(ex_txt))
    print(word_tokenize(ex_txt, language='english'))

    # - stop words (pre-defined by nltk)
    stop_words = set(stopwords.words('english'))
    print(stop_words)
    words = word_tokenize(ex_txt)
    print(words)
    filtered_sent = []
    for w in words:
        if w not in stop_words:
            filtered_sent.append(w)
    print(filtered_sent)
    filtered_sent = [w for w in words if not w in stop_words]
    print(filtered_sent)

    # - stemming
    ps = PorterStemmer()
    example_words = ['python', 'pythoner', 'pythoning', 'pythoned', 'pythonly']
    # for w in example_words:
    #     print(ps.stem(w))
    new_text = "it is very important to be pothonly while you are pythoning with python. All pythoners have pythoned poorly at least once."
    words = word_tokenize(new_text)
    for w in words:
        print(ps.stem(w))
def stemText(text):
    ps = PorterStemmer()
    words = word_tokenize(text)
    all_words = []
    for w in words:
        all_words.append(ps.stem(w))
    return all_words
def preprocess_document(doc):
    stopset = set(stopwords.words('english'))
    stemmer = PorterStemmer()
    tokens = wordpunct_tokenize(doc)
    clean = [token.lower() for token in tokens if token.lower() not in stopset and len(token) > 2]
    final = [stemmer.stem(word) for word in clean]
    return final
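# Hedged usage sketch for a preprocess_document()-style pipeline (assumes NLTK's
# 'stopwords' corpus is available; the document string is illustrative only):
from nltk.corpus import stopwords
from nltk.stem import PorterStemmer
from nltk.tokenize import wordpunct_tokenize

stopset = set(stopwords.words('english'))
stemmer = PorterStemmer()
doc = "Information retrieval systems rank documents by relevance."
clean = [t.lower() for t in wordpunct_tokenize(doc) if t.lower() not in stopset and len(t) > 2]
print([stemmer.stem(w) for w in clean])  # e.g. ['inform', 'retriev', 'system', 'rank', 'document', 'relev']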
def tokenize2(str, df_freq):
    # temp map (for getting the local term frequency)
    temp_map = {}
    # for a sentence
    str = str.decode('ascii', 'ignore')
    # tokenizer = nltk.tokenize.treebank.TreebankWordTokenizer()
    # tokens = tokenizer.tokenize(str)
    tokens = str.split()
    # print tokens
    stemmer = PorterStemmer()
    # small set of stopwords (remove you, are, and, I those kinds of words)
    last = []
    # bigram_list = []
    for d in tokens:
        d = d.split('-')
        for c in d:
            c = re.compile('[%s]' % re.escape(string.punctuation)).sub('', c)  # regular expression -> strip punctuations
            if c != '':
                try:
                    if int(c):
                        if len(c) != 4 and (c > 2015 or c < 1900):  # keep years
                            c = stemmer.stem('NUM')
                except Exception:
                    c = stemmer.stem(c.lower())
                    pass
                last.append(c)
                updateDF(temp_map, df_freq, c)
def new_lesk(context_sentence, ambiguous_word, pos=None, stem=True, hyperhypo=True):
    ps = PorterStemmer()
    max_overlaps = 0
    lesk_sense = None
    context_sentence = context_sentence.split()
    for ss in wn.synsets(ambiguous_word):
        # If POS is specified.
        if pos and ss.pos is not pos:
            continue
        lesk_dictionary = []
        # Includes definition.
        lesk_dictionary += ss.definition.split()
        # Includes lemma_names.
        lesk_dictionary += ss.lemma_names
        # Optional: includes lemma_names of hypernyms and hyponyms.
        if hyperhypo == True:
            lesk_dictionary += list(chain(*[i.lemma_names for i in ss.hypernyms() + ss.hyponyms()]))
        if stem == True:
            # Matching exact words causes sparsity, so lets match stems.
            lesk_dictionary = [ps.stem(i) for i in lesk_dictionary]
            context_sentence = [ps.stem(i) for i in context_sentence]
        overlaps = set(lesk_dictionary).intersection(context_sentence)
        if len(overlaps) > max_overlaps:
            lesk_sense = ss
            max_overlaps = len(overlaps)
    return lesk_sense
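# Hedged sketch of the stem-then-overlap idea used above, shown outside of WordNet
# (the two word lists are illustrative; assumes only NLTK's PorterStemmer):
from nltk.stem import PorterStemmer

ps = PorterStemmer()
gloss = "a financial institution that accepts deposits".split()
context = "he deposited money at the bank".split()
overlap = set(ps.stem(w) for w in gloss) & set(ps.stem(w) for w in context)
print(overlap)  # e.g. {'deposit'}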
class IntermediateTokensFeatureGenerator(EdgeFeatureGenerator):
    """
    Generate the bag of words representation, masked text, stemmed text and parts of
    speech tag for each of the tokens present between two entities in an edge.

    :param feature_set: the feature set for the dataset
    :type feature_set: nala.structures.data.FeatureDictionary
    :param training_mode: indicates whether the mode is training or testing
    :type training_mode: bool
    """

    def __init__(self, feature_set, training_mode=True):
        self.feature_set = feature_set
        """the feature set for the dataset"""
        self.training_mode = training_mode
        """whether the mode is training or testing"""
        self.stemmer = PorterStemmer()
        """an instance of PorterStemmer"""

    def generate(self, dataset):
        for edge in dataset.edges():
            sentence = edge.part.sentences[edge.sentence_id]
            if edge.entity1.head_token.features['id'] < edge.entity2.head_token.features['id']:
                first = edge.entity1.head_token.features['id']
                second = edge.entity2.head_token.features['id']
                for i in range(first + 1, second):
                    token = sentence[i]
                    feature_name = '33_fwd_bow_intermediate_' + token.word + '_[0]'
                    self.add_to_feature_set(edge, feature_name)
                    feature_name = '34_fwd_bow_intermediate_masked_' + token.masked_text(edge.part) + '_[0]'
                    self.add_to_feature_set(edge, feature_name)
                    feature_name = '35_fwd_stem_intermediate_' + self.stemmer.stem(token.word) + '_[0]'
                    self.add_to_feature_set(edge, feature_name)
                    feature_name = '36_fwd_pos_intermediate_' + token.features['pos'] + '_[0]'
                    self.add_to_feature_set(edge, feature_name)
            else:
                first = edge.entity2.head_token.features['id']
                second = edge.entity1.head_token.features['id']
                for i in range(first + 1, second):
                    token = sentence[i]
                    feature_name = '37_bkd_bow_intermediate_' + token.word + '_[0]'
                    self.add_to_feature_set(edge, feature_name)
                    feature_name = '38_bkd_bow_intermediate_masked_' + token.masked_text(edge.part) + '_[0]'
                    self.add_to_feature_set(edge, feature_name)
                    feature_name = '39_bkd_stem_intermediate_' + self.stemmer.stem(token.word) + '_[0]'
                    self.add_to_feature_set(edge, feature_name)
                    feature_name = '40_bkd_pos_intermediate_' + token.features['pos'] + '_[0]'
                    self.add_to_feature_set(edge, feature_name)
            for i in range(first + 1, second):
                token = sentence[i]
                feature_name = '41_bow_intermediate_' + token.word + '_[0]'
                self.add_to_feature_set(edge, feature_name)
                feature_name = '42_bow_intermediate_masked_' + token.masked_text(edge.part) + '_[0]'
                self.add_to_feature_set(edge, feature_name)
                feature_name = '43_stem_intermediate_' + self.stemmer.stem(token.word) + '_[0]'
                self.add_to_feature_set(edge, feature_name)
                feature_name = '44_pos_intermediate_' + token.features['pos'] + '_[0]'
                self.add_to_feature_set(edge, feature_name)
def stemming(line):
    stemmer = PorterStemmer()
    line_array = line.split(" ")
    for word in line_array:
        replace_word = stemmer.stem(word)
        # print replace_word
        line = line.replace(word, replace_word)
    return line
def stemming(line):
    from nltk.stem import PorterStemmer
    lmtzr = PorterStemmer()
    words = []
    for word in line.split(' '):
        words.append(lmtzr.stem(word))
    line = ' '.join(words)
    return line
def create_test(self, test_sentence, all_words):
    ps = PorterStemmer()
    words_token = word_tokenize(test_sentence.lower())
    words_token = [ps.stem(w) for w in words_token]
    # convert numbers to keyword DIGIT
    words_tag = ['DIGIT' if word[1] == 'CD' else word[0] for word in pos_tag(words_token)]
    t = {word: (word in words_tag) for word in all_words}
    return t
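# Hedged sketch of the number-masking step above (assumes NLTK's 'punkt' and
# 'averaged_perceptron_tagger' data; the sentence is illustrative):
from nltk import pos_tag, word_tokenize

tokens = word_tokenize("the train leaves at 7 tomorrow")
masked = ['DIGIT' if tag == 'CD' else word for word, tag in pos_tag(tokens)]
print(masked)  # e.g. ['the', 'train', 'leaves', 'at', 'DIGIT', 'tomorrow']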
def stemming(x, logger):
    logger.info('Stemming')
    try:
        st = PorterStemmer()
        x = [st.stem(word) for word in x.split()]
        return " ".join(x)
    except:
        return x
def stem_text(sent, context=None):
    processed_tokens = []
    tokens = word_tokenize(sent)
    porter = PorterStemmer()
    for t in tokens:
        t = porter.stem(t)
        processed_tokens.append(t)
    return " ".join(processed_tokens)
def stem(self, tokens):
    """method for stemming the tokens using nltk"""
    stemmer = PorterStemmer()  # instantiating the stemmer
    stemmedTokens = set([])    # set for containing the stemmed tokens
    for token in tokens:
        stemmedTokens.add(stemmer.stem(token))
    return stemmedTokens
def dummywhere(text): print('Inside Where_Query') count = 0 ps = PorterStemmer() pre_cond_query = [] post_cond_query = [] select_query = [] comment1 = [text.lower()] #print(text) place = GeoText(text) #print("Printing comment",comment1) for i in range(0, len(comment1)): token_comment = word_tokenize(comment1[i]) #print(token_comment) if all(word not in 'who whose whom' for word in token_comment): if ('over' in token_comment): index1 = token_comment.index('over') #print(index1) token_comment.insert(index1, 'who') elif ('under' in token_comment): index1 = token_comment.index('under') #print(index1) token_comment.insert(index1, 'who') comment = [' '.join(map(str, token_comment))] for i in range(0, len(comment)): token_comment = word_tokenize(comment[i]) #print(token_comment) if ('whose' in token_comment): index = token_comment.index('whose') prequery = token_comment[:index] print("Prequery", prequery) postquery = token_comment[index:(len(token_comment))] print("Postquery", postquery) elif ('who' in token_comment): index = token_comment.index('who') prequery = token_comment[:index] print("Prequery", prequery) postquery = token_comment[index:(len(token_comment))] print("Postquery", postquery) for i in range(0, len(comment)): token_comment = word_tokenize(comment[i]) tagged_comment = pos_tag(token_comment) for i in range(0, len(prequery)): tagged_comment1 = pos_tag(prequery) for word, tag in tagged_comment1: if ((tag == 'NNP' or tag == 'NN' and word == 'show' or word == 'describe' or word == 'state') or word == 'show' or word == 'tell' or word == 'give'): pre_cond_query.append('Select') if word in AttrList: if (word == 'live'): word = 'city' if (word == 'female' or word == 'girls' or word == 'girl' or word == 'male' or word == 'boys' or word == 'boy'): word = 'gender' if (word == 'enrolled' or word == 'admission' or word == 'admitted'): word = 'registration_date' if (word == 'studying' or word == 'studies' or word == 'study'): word = 'class' if (word == 'roll' or word == 'serial' or word == 'no.'): word = 'roll_no' pre_cond_query.append(lemmatizer.lemmatize((word))) print(pre_cond_query) pre_cond_query.extend(['from', 'Student', 'where']) if (pre_cond_query[0] != 'Select'): pre_cond_query.insert(0, 'Select') print("Preconditional Query: ", pre_cond_query) for i in range(0, len(postquery)): tagged_comment2 = pos_tag(postquery) for word, tag in tagged_comment2: if word in AttrList: if (word == 'live'): word = 'city' if (word == 'female' or word == 'girls' or word == 'girl' or word == 'male' or word == 'boys' or word == 'boy'): word = 'gender' if (word == 'enrolled' or word == 'admission' or word == 'admitted'): word = 'registration_date' if (word == 'studying' or word == 'studies' or word == 'study'): word = 'class' if (word == 'roll' or word == 'serial' or word == 'no.'): word = 'roll_no' post_cond_query.append(lemmatizer.lemmatize((word))) if (word == 'age'): for word, tag in tagged_comment: if (word == 'less' or word == 'under'): post_cond_query.append('<') if (word == 'more' or word == 'over' or word == 'above'): post_cond_query.append('>') if (tag == 'CD'): post_cond_query.append(word) if ('>' not in post_cond_query and '<' not in post_cond_query): post_cond_query.insert(-1, '=') if (word == 'roll_no'): for word, tag in tagged_comment: if (word == 'less' or word == 'under'): post_cond_query.append('<') if (word == 'more' or word == 'over' or word == 'above'): post_cond_query.append('>') if (tag == 'CD'): post_cond_query.append(word) if ('>' not in post_cond_query and '<' not in 
post_cond_query): post_cond_query.insert(-1, '=') if (word == 'city'): post_cond_query.extend(['=']) if (len(place.cities) == 0): post_cond_query.append("'" + token_comment[-1] + "'") else: post_cond_query.append(str(place.cities).strip('[]')) if (word == 'gender'): post_cond_query.extend(['=']) if ('girls' in token_comment or 'girl' in token_comment or 'female' in token_comment): post_cond_query.append("'female'") else: post_cond_query.append("'male'") if (word == 'name'): post_cond_query.append("=") post_cond_query.append("'" + token_comment[-1] + "'") if (word == 'registration_date'): post_cond_query.extend(['=']) for word, tag in tagged_comment: if (tag == 'CD'): post_cond_query.append("'" + word + "'") print("PostConditional Query", post_cond_query) cond_query = pre_cond_query + post_cond_query condi_query = ' '.join(map(str, cond_query)) print('FINAL QUERY--> ' + condi_query) text1.insert(END, condi_query)
def form_whereallquery(text): print("From all") ps = PorterStemmer() cond_query = [] select_query = [] comment = [text.lower()] place = GeoText(text) for i in range(0, len(comment)): token_comment = word_tokenize(comment[i]) tagged_comment = pos_tag(token_comment) for word, tag in tagged_comment: if ((tag == 'NNP' or tag == 'NN' and word == 'show' or word == "give" or word == 'describe' or word == 'state') or word == 'show' or word == 'tell' or word == 'give'): select_query.append('Select') cond_query.append('Select') cond_query.append('*') if word in AttrList: if (word == 'live' or word == 'lives'): word = 'city' if (word == 'female' or word == 'girls' or word == 'girl' or word == 'male' or word == 'boys' or word == 'boy'): word = 'gender' if (word == 'enrolled' or word == 'admission'): word = 'registration_date' if (word == 'studying' or word == 'studies' or word == 'study'): word = 'class' if (word == 'roll' or word == 'serial' or word == 'no.'): word = 'roll_no' if (word == 'age'): cond_query.extend(['from', ' Student', 'where', word]) for word, tag in tagged_comment: if (word == 'less' or word == 'under'): cond_query.append('<') if (word == 'more' or word == 'over' or word == 'above'): cond_query.append('>') if (tag == 'CD'): cond_query.append(word) if ('>' not in cond_query and '<' not in cond_query): cond_query.insert(-1, '=') if (word == 'city' or word == 'residence' or word == 'live'): word = 'city' cond_query.extend(['from', 'Student', 'where', word, '=']) if (len(place.cities) == 0): cond_query.append("'" + token_comment[-1] + "'") else: cond_query.append(str(place.cities).strip('[]')) if (word == 'gender'): cond_query.extend(['from', 'Student', 'where', word, '=']) if ('girls' in token_comment or 'girl' in token_comment or 'female' in token_comment): cond_query.append("'female'") else: cond_query.append("'male'") if (word == 'registration_date'): for word, tag in tagged_comment: if (tag == 'CD'): cond_query.extend( ['from', 'Student', 'where', word, '=']) cond_query.append("'" + word + "'") if (word == 'name'): cond_query.extend(['from', 'Student', 'where', word, '=']) cond_query.append("'" + token_comment[-1] + "'") if (word == 'class'): cond_query.extend(['from', 'Student', 'where', word, '=']) cond_query.append("'" + token_comment[-1] + "'") if (word == 'roll_no'): cond_query.extend(['from', ' Student', 'where', word]) for word, tag in tagged_comment: if (word == 'less' or word == 'under'): cond_query.append('<') if (word == 'more' or word == 'over' or word == 'above'): cond_query.append('>') if (tag == 'CD'): cond_query.append(word) if ('>' not in cond_query and '<' not in cond_query): cond_query.insert(-1, '=') if (cond_query[0] != 'Select'): cond_query.insert(0, 'Select') condi_query = ' '.join(map(str, cond_query)) print('FINAL QUERY--> ' + condi_query + ';') text1.insert(END, condi_query)
def form_countquery(text): print("From Count Query") ps = PorterStemmer() place = GeoText(text) cond_query = [] select_query = [] comment = [text.lower()] #print(text) for i in range(0, len(comment)): token_comment = word_tokenize(comment[i]) #print(token_comment) tagged_comment = pos_tag(token_comment) #print('tagged comment', tagged_comment) #print([(word, tag) for word, tag in tagged_comment]) cond_query.append('Select') cond_query.append('COUNT(*)') for word, tag in tagged_comment: if (word == 'elder' or word == 'older' or word == 'under' or word == 'above' or word == 'below'): word = 'age' if word in AttrList: if (word == 'live' or word == 'lives'): word = 'city' if (word == 'female' or word == 'girls' or word == 'girl' or word == 'male' or word == 'boys' or word == 'boy'): word = 'gender' if (word == 'enrolled' or word == 'admission'): word = 'registration_date' if (word == 'studying' or word == 'studies' or word == 'study' or word == 'class'): word = 'class' #cond_query.append(lemmatizer.lemmatize((word))) if (word == 'roll' or word == 'serial' or word == 'no.'): word = 'roll_no' if (word == 'age'): cond_query.extend(['from', ' Student', 'where', word]) for word, tag in tagged_comment: if (word == 'less' or word == 'under'): cond_query.append('<') if (word == 'more' or word == 'over' or word == 'above' or word == 'older'): cond_query.append('>') if (tag == 'CD'): cond_query.append(word) if ('>' not in cond_query and '<' not in cond_query): cond_query.insert(-1, '=') if (word == 'city' or word == 'residence' or word == 'live'): word = 'city' cond_query.extend(['from', 'Student', 'where', word, '=']) if (len(place.cities) == 0): cond_query.append("'" + token_comment[-1] + "'") else: cond_query.append(str(place.cities).strip('[]')) if (word == 'gender'): cond_query.extend(['from', 'Student', 'where', word, '=']) if ('girls' in token_comment or 'girl' in token_comment or 'female' in token_comment): cond_query.append("'female'") else: cond_query.append("'male'") if (word == 'registration_date'): for word, tag in tagged_comment: if (tag == 'CD'): cond_query.extend( ['from', 'Student', 'where', word, '=']) cond_query.append("'" + word + "'") if (word == 'name'): cond_query.extend(['from', 'Student', 'where', word, '=']) cond_query.append("'" + token_comment[-1] + "'") if (word == 'class'): cond_query.extend(['from', 'Student', 'where', word, '=']) cond_query.append("'" + token_comment[-1] + "'") if (word == 'roll_no'): cond_query.extend(['from', ' Student', 'where', word]) for word, tag in tagged_comment: if (word == 'less' or word == 'under'): cond_query.append('<') if (word == 'more' or word == 'over' or word == 'above'): cond_query.append('>') if (tag == 'CD'): cond_query.append(word) if (cond_query[0] != 'Select'): cond_query.insert(0, 'Select') condi_query = ' '.join(map(str, cond_query)) if (len(condi_query) == 15): condi_query = condi_query + ' from Student;' print('FINAL QUERY--> ' + condi_query + ';') text1.insert(END, condi_query)
""" Name: Karan pankaj Makhija and Jeet Thakur Version: Python 2.7 Title : Multiclass_Classification with Feature Engineering """ from sklearn import preprocessing, svm from sklearn.feature_extraction.text import TfidfVectorizer, CountVectorizer from sklearn import ensemble import pandas as pd from sklearn.model_selection import train_test_split from nltk.corpus import stopwords from sklearn import metrics from nltk.stem import PorterStemmer #Finding the stem of the words ps = PorterStemmer() my_stopwords = stopwords.words('english') data = pd.read_json('Final1.json') #Converting to lower case data.iloc[:, 1] = data.iloc[:, 1].apply( lambda x: " ".join(x.lower() for x in x.split())) #Removing Punctuation data.iloc[:, 1] = data.iloc[:, 1].str.replace('[^\w\s]', '') #Removing StopWords data.iloc[:, 1] = data.iloc[:, 1].apply( lambda x: " ".join(ps.stem(x) for x in x.split())) data.iloc[:, 1] = data.iloc[:, 1].apply( lambda x: " ".join(x for x in x.split() if x not in my_stopwords)) X = data.iloc[:, 1].values
print(path + "/" + name) next = path + "/" + name nextnameL = [re.findall(r'[a-z]+', name)[0]] nextname = nextnameL[0] ## Keep just the name ListOfCompleteFilePaths.append(next) ListOfJustFileNames.append(nextname) # In[2]: #################################################### ## Create the Stemmer Function......... ###################################################### ## Instantiate it A_STEMMER = PorterStemmer() #---------------------------------------- # Use NLTK's PorterStemmer in a function - DEFINE THE FUNCTION #------------------------------------------------------- def MY_STEMMER(str_input): ## Only use letters, no punct, no nums, make lowercase... words = re.sub(r"[^A-Za-z\-]", " ", str_input).lower().split() words = [A_STEMMER.stem(word) for word in words] ## Use the Stemmer... return words ################################################################## ## Tokenize and Vectorize the text data from the corpus... ##############################################################
#!/usr/bin/env python
import textmining
import glob
from nltk.stem import PorterStemmer

tdm = textmining.TermDocumentMatrix()
ps = PorterStemmer()
files = glob.glob("*.txt")
print(files)
newcont = []
newst = ''
for f in files:
    content = open(f).read()
    content = content.replace('\n', ' ')
    c = content.split()
    for i in c:
        newcont.append(ps.stem(i))
    newst = " ".join(newcont)
    tdm.add_doc(newst)
tdm.write_csv('matrix.csv', cutoff=1)
#!/usr/bin/env python
'''
Description : This Script will use the simple Stemming technique from a Python List
'''
from nltk.stem import PorterStemmer
from nltk.stem import LancasterStemmer
from nltk.stem import WordNetLemmatizer

# Stemming is a technique for removing affixes from a word
my_list = ['going', 'cars', 'went', 'came', 'coming']

print "*****************************"
print "The Porter Stemmer Algorithm"
print "*****************************"
stemmer = PorterStemmer()
for i in my_list:
    print "%s => %s " % (i, stemmer.stem(i))
print

print "*******************************"
print "The Lancaster Stemmer Algorithm"
print "*******************************"
stemmer = LancasterStemmer()
for i in my_list:
    print "%s => %s " % (i, stemmer.stem(i))
print

print "*****************************"
print " Wordnet Lemmatization"
print "*****************************"
lem = WordNetLemmatizer()
class MounicaSelectorPhrasalTEMP: def __init__(self, ngram, google_freq_file=RESOURCES['en']['nrr']['google'], cutoff=0): google_freq = {} total = 0 nextone = 0 logger.debug("Loading google %d-gram frequencies..." % ngram) for line in open(google_freq_file, encoding='utf-8'): line_tokens = [t.strip() for t in line.strip().split('\t')] try: count = int(line_tokens[1]) if count > cutoff: google_freq[line_tokens[0]] = np.log10(count) total += 1 nextone = 0 except IndexError: logger.debug( "Error: the following has no corresponding word: " + str(line_tokens)) pass if (total % 1000 == 0 and nextone == 0): nextone = 1 logger.debug("N-gram count: " + str(total)) logger.info("Total n-grams loaded: " + str(total)) self.ngram = ngram self.google_freq = google_freq self.ps = PorterStemmer() self.lem = nltk.WordNetLemmatizer() def select(self, sent, so, eo, candidates): cand = list(candidates) scores = self.get_scores(sent, so, eo, cand) out = [] for i in range(0, len(scores)): if (scores[i] != 0): out.append(cand[i]) # This can filter out wrong tenses & duplicates before OR after ngram comparison out = self.filter_out_tense(sent, so, eo, out) return out def get_scores(self, sent, so, eo, candidates): t_b = word_tokenize(sent[:so]) t_a = word_tokenize(sent[eo:]) if len(t_b) < self.ngram - 1: t_b = ['<S>'] + t_b if len(t_a) < self.ngram - 1: t_a = t_a + ['</S>'] scores = [] for word in candidates: combos = t_b[-self.ngram + 1:] + [word] + t_a[:self.ngram - 1] scores.append(0) for j in range(0, len(combos) - self.ngram + 1): phrase = '' for word in combos[j:j + self.ngram]: phrase += word + ' ' phrase = phrase.lower() if phrase[:-1] in self.google_freq: scores[-1] += self.google_freq[phrase[:-1]] return scores def filter_out_tense(self, sent, so, eo, candidates): stems = [] out = [] word_tag = nltk.pos_tag([sent[so:eo]])[0][1] stems.append(self.ps.stem(sent[so:eo])) for word in candidates: cand_stem = self.ps.stem(word) if cand_stem not in stems: stems.append(cand_stem) try: cand_tag = self.tag_for_lemmatizer(word) if cand_tag is None: out.append( getInflection(self.lem.lemmatize(word, pos=cand_tag), tag=word_tag)[0]) else: out.append(word) except IndexError: # Lemminflect does not support all POS tags - lemminflect.readthedocs.io/en/latest/tags/ out.append(word) logger.debug( "ERROR: Lemminflect cannot convert {} with type {}, skipping" .format(word, word_tag)) return out def tag_for_lemmatizer(self, word): tag = nltk.pos_tag([word])[0][1][:2] if tag in ['VB']: return 'v' elif tag in ['JJ']: return 'a' elif tag in ['RB']: return 'r' else: return 'n'
def __init__(self):
    self.stemmer = NltkPorterStemmer()
        #emoji = resolveEmoji(text)
        lang = payload['lang']
        tstamp = time.mktime(tstmp)
        return {'id': int(id), 'words': words, 'hashtags': hashtags, 'checkins': checkins, 'mentions': mentions,
                'ctype': q, 'crds': crds, 'timestamp': tstamp, 'lang': lang, 'urls': urls + media_urls}
    except Exception as e:
        print 'could not process object', obj['_id'], e
        return None
    else:
        return None


def batch_process(workload):
    prc = [process(d) for d in workload]
    return prc


def build_tdelta(d=0, h=0):
    return (datetime.now() - timedelta(days=d, hours=h))


tok_regex = re.compile(r'(?u)[@|#]?\w+')
punkt_regex = re.compile('[%s]' % re.escape(string.punctuation))
url_regex = re.compile(
    r'http[s]?://(?:[a-zA-Z]|[0-9]|[$-_@.&+]|[!*\(\),]|(?:%[0-9a-fA-F][0-9a-fA-F]))+'
)
swords = load_stop_words()
lm = PorterStemmer()
from nltk.stem import PorterStemmer, WordNetLemmatizer
import re
from sklearn.feature_extraction.text import CountVectorizer, TfidfVectorizer
from sklearn.metrics import confusion_matrix, accuracy_score
import pickle

# Reading the Dataset
df = pd.read_csv('train.csv')

# Dropping the rows of null values
df = df.dropna()
messages = df.copy()
messages.reset_index(inplace=True)

ps = PorterStemmer()
lemma = WordNetLemmatizer()
corpus = []
for i in range(0, len(messages)):
    review = (re.sub('[^a-zA-Z]', ' ', messages['title'][i])).lower()
    review = review.split()
    review = [
        lemma.lemmatize(word, pos='n') for word in review
        if word not in stopwords.words('english')
    ]
    review = ' '.join(review)
    corpus.append(review)

# Extract features with CountVectorizer
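# Hedged sketch of the CountVectorizer step hinted at above; it reuses the `corpus`
# list built in the loop, and the max_features / ngram_range values are illustrative
# assumptions, not taken from the original script.
from sklearn.feature_extraction.text import CountVectorizer

cv = CountVectorizer(max_features=5000, ngram_range=(1, 3))
X = cv.fit_transform(corpus).toarray()
print(X.shape)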
def proccess_tweet(df, domain_terms, col='text', removeSlang=True, spellCorrection=False): ''' Function recieves - a dataframe with 'text' column to be tokenize - a set of domain saved terms to be skipped in the stop-removal (and slang) and stemming phase The function apply stemming, stop-words removal, slang correction and tokenization Spell correction is currently commented due to low performance ''' # Initialize a ProterStemmer object stemmer = PorterStemmer() stop_words = set(stopwords.words('english')) # Create a set of stop words, countries names, negation words and slang words (if needed) -> saved_words # These words won't be stemmed, due to uses of these words in later phases saved_words = get_saved_words(removeSlang) # Tokenization, stemming and stopword (and punctuation) removal - the result is a list of the terms of the tweet. # The '#' removed in the strip_punctuation function. Converted to normal word. # Convert text to lower case df['tokenized_text'] = df[col].apply(lambda text: text.lower()) print(' - Lower case completed') df['tokenized_text'] = df['tokenized_text'].apply( lambda x: x.replace('…', '')) df['tokenized_text'] = df['tokenized_text'].apply( lambda x: x.replace('"', '')) df['tokenized_text'] = df['tokenized_text'].apply( lambda x: x.replace('\n', '')) # Drop links and mentions # word_tokenizer(tweet) was replaced by .split(). Reason: probably done some extra pre-proccessing that cause damage df['tokenized_text'] = df['tokenized_text'].apply(lambda tweet: [ strip_punctuation(token) for token in tweet.split() if ((token not in stop_words) and ('http' not in token) and ('pic.twitter' not in token) and ('bitly.' not in token) and (token != 'rt') and (token != '…') and (token != '"') and ('bit.ly' not in token) and (not token.startswith('@'))) ]) print(' - Strip punctuation completed') print(' - Stop-words removal completed') print(' - Tokenization completed') if removeSlang: # Replace slang words slang_dict = slang.slang_words() df['tokenized_text'] = df['tokenized_text'].apply(lambda tweet: [ slang_dict[token] if token in slang_dict else token for token in tweet ]) print(' - Slang words correction completed') if spellCorrection: # Spell correction for tokens, unless they are domain terms df['tokenized_text'] = df['tokenized_text_no_spell'].apply( lambda tokens: [ str(TextBlob(token).correct()) if token not in domain_terms else token for token in tokens ]) # spelling correction feature df['num_spell_errors'] = df[[ 'tokenized_text_no_spell', 'tokenized_text' ]].apply(lambda x: spell_correction(x[0], x[1])) print(' - Spell correction completed') # Before using Porter stemmer - use domain stemmer and stem words that Porter stems badly. 
domainStemmer = domain_stemmer() # Stem it to words that are in domain_terms, so Porter stemmer will skip these words df['tokenized_text'] = df['tokenized_text'].apply(lambda tokens: [ domainStemmer[token] if token in domainStemmer else token for token in tokens ]) # Stem tokens, unless they are domain terms (or length = 1) or saved words df['tokenized_text'] = df['tokenized_text'].apply(lambda tokens: [ stemmer.stem(token) if (token not in domain_terms and token not in saved_words and len(token) > 1) else token for token in tokens ]) print(' - Stemming phase completed') # Remove empty tokens df['tokenized_text'] = df['tokenized_text'].apply( lambda tokens: [token for token in tokens if len(token) > 1]) print(' - Cleaning empty tokens completed') print('Final number of features: {}'.format(df.shape[0])) print('\nCOMPLETED: Pre-proccess') print('----------------------\n----------------------') return df
filtered_sentence = []
for w in word_tokens:
    if w not in stop_words:
        filtered_sentence.append(w)
filtered_sentence

filtered_sentence = [w for w in word_tokens if not w in stop_words]
print(word_tokens)
print(filtered_sentence)

from nltk.stem import PorterStemmer

ps = PorterStemmer()
example_words = ["shop", "shopping", "shops"]
for w in example_words:
    print(ps.stem(w))

new_text = "Raindrops are the size of bullets thundered on the castle windoes for days on end; the lake rose, the flower beds turned into muddy streams, and Hagrid's pumpkins swelled to the size of garden sheds."
words = word_tokenize(new_text)
for w in words:
    print(ps.stem(w))

from nltk.corpus import state_union
from nltk.tokenize import PunktSentenceTokenizer

train_text = state_union.raw("2005-GWBush.txt")
sample_text = state_union.raw("2006-GWBush.txt")
#!/usr/bin/env python3
# -*- coding: utf-8 -*-
"""
Created on Sat Oct 13 20:46:26 2018

An example of stemming, e.g. "writing" becomes "write" (the stem).
Basically normalising sentences.

@author: jay
"""
from nltk.stem import PorterStemmer
from nltk.tokenize import word_tokenize

ps = PorterStemmer()
words = ["python", "pythoner", "pythoning", "pythonly"]
for w in words:
    print(w, ' = ', ps.stem(w))
def preprocess(df): ''' Preprocess text from dataframe. Splits reviews into sentences, then each sentence is preprocessed. Returns dataframe with original and preprocessed text. ''' # Split into sentences nlp = spacy.load('en_core_web_sm') review_ids = [] text = [] # Assuming the reviews aren't split and that reviews columns are 'review', # for review_id, review in zip(df['review_id'], df['review']): # sentences= [i for i in nlp(review).sents] # for sentence in sentences: # review_ids.append(review_id) # text.append(str(sentence)) # No review IDs for review in df['review']: sentences= [i for i in nlp(review).sents] for sentence in sentences: # review_ids.append(review_id) text.append(str(sentence)) reviews_df = pd.DataFrame() # reviews_df['review_id'] = review_ids reviews_df['raw_text'] = text # Remove symbols,punctuations... reviews_df['clean_text'] = reviews_df['raw_text'].str.replace('[^\w\s]','') reviews_df['clean_text'] = reviews_df['clean_text'].str.replace('\d+', '') reviews_df['clean_text'] = reviews_df['clean_text'].str.lower() reviews_df['clean_text'] = reviews_df['clean_text'].str.replace('^https?:\/\/.*[\r\n]*', '') reviews_df['clean_text'].replace('', np.nan, inplace=True) drop = reviews_df[pd.isnull(reviews_df['clean_text'])].index reviews_df.drop(drop , inplace=True) reviews_df = reviews_df.reset_index(drop = True) def preprocess_aspect(df): ''' Preprocessing text for aspect extraction and classification. Returns tf-idf/corpus, LDA model. ''' def sent_to_words(sentences): for sentence in sentences: yield(gensim.utils.simple_preprocess(str(sentence), deacc=True)) # deacc=True removes punctuations data_words = list(sent_to_words(df['clean_text'])) # Build the bigram and trigram models bigram = gensim.models.Phrases(data_words, min_count=5, threshold=100) # higher threshold fewer phrases. 
trigram = gensim.models.Phrases(bigram[data_words], threshold=100) # Faster way to get a sentence clubbed as a trigram/bigram bigram_mod = gensim.models.phrases.Phraser(bigram) trigram_mod = gensim.models.phrases.Phraser(trigram) stop_words = stopwords.words('english') stop_words.extend(['from', 'subject', 're', 'edu', 'use']) # Define functions for stopwords, bigrams, trigrams and lemmatization def remove_stopwords(texts): return [[word for word in simple_preprocess(str(doc)) if word not in stop_words] for doc in texts] def make_bigrams(texts): return [bigram_mod[doc] for doc in texts] def make_trigrams(texts): return [trigram_mod[bigram_mod[doc]] for doc in texts] def lemmatization(texts, allowed_postags=['NOUN', 'ADJ', 'VERB', 'ADV']): """https://spacy.io/api/annotation""" texts_out = [] for sent in texts: doc = nlp(" ".join(sent)) texts_out.append([token.lemma_ for token in doc if token.pos_ in allowed_postags]) return texts_out # Remove Stop Words data_words_nostops = remove_stopwords(data_words) # Form Bigrams data_words_bigrams = make_bigrams(data_words_nostops) # Initialize spacy 'en' model, keeping only tagger component (for efficiency) # python3 -m spacy download en nlp = spacy.load('en', disable=['parser', 'ner']) # Do lemmatization keeping only noun, adj, vb, adv data_lemmatized = lemmatization(data_words_bigrams, allowed_postags=['NOUN', 'ADJ', 'VERB', 'ADV']) # Create Dictionary id2word = corpora.Dictionary(data_lemmatized) # Create Corpus texts = data_lemmatized # Term Document Frequency corpus = [id2word.doc2bow(text) for text in texts] return corpus, id2word porter = PorterStemmer() def stemSentence(sentence): token_words=word_tokenize(sentence) token_words stem_sentence=[] for word in token_words: stem_sentence.append(porter.stem(word)) stem_sentence.append(" ") return "".join(stem_sentence) stemmed = [] for sentence in reviews_df['clean_text']: stemmed.append(stemSentence(sentence)) reviews_df['stem_text'] = stemmed # corpus, id2word = preprocess_aspect(reviews_df) # Remove stop words # stop_words = set(stopwords.words('english')) # reviews_df['no_sw'] = reviews_df['clean_text'][:].apply(lambda x: ' '.join([word for word in x.split() if word not in (stop_words)])) return reviews_df #, corpus, id2word
from math import log
from nltk import WordNetLemmatizer
from nltk.corpus import stopwords
from string import punctuation
from nltk import word_tokenize
from nltk.stem import PorterStemmer
import codecs
import string
import math
from collections import defaultdict

'''
Lemmatization
e.g. dogs -> dog, cats -> cat
     doing -> do, done -> do
     better -> good
'''
wnl = WordNetLemmatizer()
stemmer = PorterStemmer()

#f = open("dict_RTS.txt","w", encoding = 'utf-8' )
#f.write( str(RTS))
#f.close()

# punctuation = (all) punctuation marks
stop_words = stopwords.words('english') + list(punctuation)
DN = float(len(reuters.fileids()))

# type=dict, RTS = Reuters corpus dictionary
RTS = {
    k: [
        wnl.lemmatize(w.lower()) for w in reuters.words(k)
        if w not in stop_words and not w.isdigit() and len(w.strip()) > 2
    ]
    for k in reuters.fileids()
# Author : Sujay <*****@*****.**>
# Last Modified By : Sujay <*****@*****.**>
from math import log as l
import nltk
from nltk.tokenize import word_tokenize
from nltk.corpus import stopwords
from nltk.stem import PorterStemmer
from bccDataLoad_for_density import *
import matplotlib.pyplot as plt  # extra, to be removed
import operator

stop_words = stopwords.words('english') + ['said', 'v', 'would', 'could']
ps = PorterStemmer()
business_file = entertainment_file = politics_file = sport_file = tech_file = []


# get the data
def getFreqDist(data):
    global tech_file, business_file, sport_file, entertainment_file, politics_file
    if data == "business":
        data = [
            ps.stem(w) for w in word_tokenize(getBusiness())
            if w not in stop_words
        ]
        business_file = set(data)
    elif data == "entertainment":
        data = [
            ps.stem(w) for w in word_tokenize(getEntertainment())
# reading input.txt file
text = open('input.txt', encoding="utf8").read()

# Tokenization
wtokens = word_tokenize(text)
stokens = sent_tokenize(text)
print("\n Word tokens:", wtokens)
print("\n Sentence tokens:", stokens)

# POS
pos = nltk.pos_tag(wtokens)
print("\n Parts of Speech:", pos)

# Stemming
pstem = PorterStemmer()
lstem = LancasterStemmer()
sstem = SnowballStemmer("english")
pstem_output = ' '.join([pstem.stem(w) for w in wtokens])
lstem_output = ' '.join([lstem.stem(w) for w in wtokens])
sstem_output = ' '.join([sstem.stem(w) for w in wtokens])
print("\n PorterStemmer Stemming:\n", pstem_output)
print("\n LancasterStemmer Stemming:\n", lstem_output)
print("\n SnowballStemmer Stemming:\n", sstem_output)

# Lemmatization
lemmatizer = WordNetLemmatizer()
lem_output = ' '.join([lemmatizer.lemmatize(w) for w in wtokens])
print("\n Lemmatization: \n", lem_output)

# Trigram
from nltk.stem import PorterStemmer, LancasterStemmer
from nltk.tokenize import sent_tokenize, word_tokenize
import string

totneg = 0
totpos = 0
porter = PorterStemmer()
lancaster = LancasterStemmer()

stopfile = open("stopwords.txt", 'r')
stopwords = stopfile.read()
stopwords = stopwords.split()

negfile = open("negative-words.txt", 'r', encoding="ISO-8859-1")
negwords = negfile.read()
negwords = negwords.split()

posfile = open("positive-words.txt", 'r', encoding="ISO-8859-1")
poswords = posfile.read()
poswords = poswords.split()

#comment = "it sends you away a believer again and quite cheered at just that"
exclude = set(string.punctuation)
comment = ''.join(ch for ch in comment if ch not in exclude)
comment = comment.split()
x = ' '.join(j for j in comment if j not in stopwords)
x = x.split()
def stream(self, records): if self.custom_stopwords: custom_stopwords = self.custom_stopwords.replace(' ','').split(',') for record in records: if self.keep_orig: record['orig_text'] = record[self.textfield] #URL removal if self.remove_urls: record[self.textfield] = self.f_remove_urls( record[self.textfield] ) #Tokenization if (self.base_word and self.base_type == 'lemma_pos') or self.force_nltk_tokenize: #lemma_pos - if option is lemmatization with POS tagging do cleaning and stopword options now if (self.base_word and self.base_type == 'lemma_pos'): record['pos_tuple'] = pos_tag( word_tokenize( record[self.textfield] ), tagset=self.pos_tagset ) if self.default_clean and self.remove_stopwords: if self.custom_stopwords: stopwords = set(stop_words.words('english') + custom_stopwords) else: stopwords = set(stop_words.words('english')) record['pos_tuple'] = [ [ re.sub(r'[\W\d]','',text[0]).lower(), text[1] ] for text in record['pos_tuple'] if re.sub(r'[\W\d]','',text[0]).lower() not in stopwords and not re.search(r'[\W]',text[0]) ] elif self.default_clean and not self.remove_stopwords: record['pos_tuple'] = [ [ re.sub(r'[\W\d]','',text[0]).lower(), text[1] ] for text in record['pos_tuple'] if not re.search(r'[\W]',text[0]) ] elif self.force_nltk_tokenize: record[self.textfield] = word_tokenize( record[self.textfield] ) elif self.default_clean or (self.base_word and self.base_type == 'lemma'): #https://stackoverflow.com/a/1059601 record[self.textfield] = re.split('\W+', record[self.textfield]) else: record[self.textfield] = record[self.textfield].split() #Default Clean if self.default_clean and not self.base_type == 'lemma_pos': record[self.textfield] = [ re.sub(r'[\W\d]','',text).lower() for text in record[self.textfield] ] #Lemmatization with POS tagging if self.base_word and self.base_type == 'lemma_pos': lm = WordNetLemmatizer() tuple_list = [] tag_list = [] record[self.textfield] = [] record['pos_tag'] = [] for text in record['pos_tuple']: keep_text = lm.lemmatize( text[0], self.get_wordnet_pos(text[1]) ) if keep_text: record[self.textfield].append(keep_text) tuple_list.append([keep_text,text[1]]) tag_list.append(text[1]) record['pos_tag'] = tag_list record['pos_tuple'] = tuple_list #Lemmatization or Stemming with stopword removal if self.remove_stopwords and self.base_word and self.base_type != 'lemma_pos': if self.custom_stopwords: stopwords = set(stop_words.words('english') + custom_stopwords) else: stopwords = set(stop_words.words('english')) if self.base_type == 'lemma': lm = WordNetLemmatizer() record[self.textfield] = [ lm.lemmatize(text) for text in record[self.textfield] if text not in stopwords ] if self.base_type == 'stem': ps = PorterStemmer() record[self.textfield] = [ ps.stem(text) for text in record[self.textfield] if text not in stopwords ] #Lemmatization or Stemming without stopword removal if not self.remove_stopwords and self.base_word: if self.base_type == 'lemma': lm = WordNetLemmatizer() record[self.textfield] = [ lm.lemmatize(text) for text in record[self.textfield] ] if self.base_type == 'stem': ps = PorterStemmer() record[self.textfield] = [ ps.stem(text) for text in record[self.textfield] ] #Stopword Removal without Lemmatization or Stemming if self.remove_stopwords and not self.base_word: if self.custom_stopwords: stopwords = set(stop_words.words('english') + custom_stopwords) else: stopwords = set(stop_words.words('english')) record[self.textfield] = [ text for text in record[self.textfield] if text not in stopwords ] #Minimum term length if self.term_min_len > 
0: record[self.textfield] = [ i for i in record[self.textfield] if len(i) >= self.term_min_len ] #ngram column creation (min_n,max_n) = self.ngram_range.split('-') #if max_n > 1 and max_n >= min_n: if int(max_n) > 1 and int(max_n) >= int(min_n): max_n = int(max_n) + 1 ngram_extract = self.ngram( [_f for _f in record[self.textfield] if _f], int(min_n), max_n ) if ngram_extract: for i in ngram_extract: if not self.ngram_mix: if 'ngrams_' + str(i[0]) not in record: record['ngrams_' + str(i[0])] = [] record['ngrams_' + str(i[0])].append(i[1]) else: if 'ngrams' not in record: record['ngrams'] = [] record['ngrams'].append(i[1]) else: if not self.ngram_mix: for n in list(range(int(min_n),int(max_n))): if n!=1: record['ngrams_' + str(n)] = [] else: if 'ngrams' not in record: record['ngrams'] = [] #Final Multi-Value Output if not self.mv: record[self.textfield] = ' '.join(record[self.textfield]) try: record['pos_tag'] = ' '.join(record['pos_tag']) except: pass yield record
print("*****************Sentence Tokenizer*******************") for s in senttokens: print(s) print("*****************Word Tokenizer*******************") for w in wordtokens: print(w) print("*****************POS Tagging*******************") print(nltk.pos_tag(wordtokens)) print("*****************Stemming*******************") ps = [] ls = [] ss = [] print("*****************Porter Stemming*******************") pStemmer = PorterStemmer() for w in wordtokens: ps.append(pStemmer.stem(w)) print(ps) print("*****************Lancaster Stemming*******************") lStemmer = LancasterStemmer() for w in wordtokens: ls.append(lStemmer.stem(w)) print(ls) print("*****************Snowball Stemming*******************") sStemmer = SnowballStemmer('english') for w in wordtokens: ss.append(sStemmer.stem(w)) print(ss) print("*****************Lemmatization*******************")
def match(query, method, instance_url): ''' search solr index using user query ''' q_list = [] q_string = '' solr = pysolr.Solr(instance_url) tokens = word_tokenize(query) stemmer = PorterStemmer() tagged_tokens = pos_tag(tokens) tagged_list = [tuple2str(t) for t in tagged_tokens] tokens_clean = deleteStopWords(tokens) lemmas = get_lemmatized_line(tagged_tokens) stem_line = [stemmer.stem(t) for t in tokens_clean] synonyms, hypernyms, hyponyms, meronyms, holonyms = get_semantic_features( tagged_tokens, tokens) # head_words = get_dependency_relations(lemmas, True) print('user input features', ' ===> \n', 'text: \n', query, '\n', 'tokens: \n', tokens, '\n', 'pos tag: \n', tagged_tokens, '\n', 'remove_stopWords: \n', tokens_clean, '\n', 'lemmatized: \n', lemmas, '\n', 'stemmed: \n', stem_line, '\n', 'synonyms: \n', synonyms, '\n', 'hypernyms: \n', hypernyms, '\n', 'hyponyms: \n', hyponyms, '\n', 'meronyms: \n', meronyms, '\n', 'holonymns: \n', holonyms, '\n\n') pos_tag_data = '&'.join(tagged_list) lemmas = '&'.join(lemmas.split()) stems = '&'.join(stem_line) synonyms = '&'.join(synonyms.split()) hypernyms = '&'.join(hypernyms.split()) hyponyms = '&'.join(hyponyms.split()) holonyms = '&'.join(holonyms.split()) meronyms = '&'.join(meronyms.split()) if method == 3: # head_words = '&'.join(head_words.split()) if tokens: q_list.append('text:' + '&'.join(tokens)) if tokens_clean: q_list.append('text_clean:' + '&'.join(tokens_clean)) if pos_tag_data: q_list.append('pos_tag:' + pos_tag_data) if lemmas: q_list.append('lemmas:' + lemmas) if stems: q_list.append('stems:' + stems) if synonyms: q_list.append('synonyms:' + synonyms) if hypernyms: q_list.append('hypernyms:' + hypernyms) if hyponyms: q_list.append('hyponyms:' + hyponyms) if meronyms: q_list.append('meronyms:' + meronyms) if holonyms: q_list.append('holonymns:' + holonyms) # if head_words: # q_list.append('head_word:' + head_words) if method == 4: if tokens: q_list.append('text:' + '&'.join(tokens) + '^0.5') if pos_tag_data: q_list.append('pos_tag:' + pos_tag_data + '^0.02') if lemmas: q_list.append('lemmas:' + lemmas + '^4') if tokens_clean: q_list.append('text_clean:' + '&'.join(tokens_clean) + '^5') if stems: q_list.append('stems:' + stems + '^1.5') if synonyms: q_list.append('synonyms:' + synonyms + '5') if hypernyms: q_list.append('hypernyms:' + hypernyms + '^5') # if hyponyms: # q_list.append('hyponyms:' + hyponyms + '^4') # if head_words: # q_list.append('head_word:' + head_words + '^0.5') # if meronyms: # q_list.append('meronyms:' + meronyms + '^1.4') # if holonyms: # q_list.append('holonymns:' + holonyms + '^1.4') q_string = ', '.join(q_list) print('The Solr query is q=%s, fl=\'id,text\'\n' % (q_string)) result = solr.search(q=q_string, fl='id,text') for r in result: print(r['id']) print(' '.join(r['text']))
import string

import nltk
from nltk.tokenize import word_tokenize
from nltk.corpus import stopwords, wordnet
from nltk.stem import PorterStemmer

nltk.download('punkt')
nltk.download('stopwords')
nltk.download('wordnet')

stemmer = PorterStemmer()
stop_words = stopwords.words('english')

inp = input("Enter String: ")
tokens = word_tokenize(inp)

# Work on a copy so that removing stopwords does not skip elements while
# iterating over the original token list.
clean_tokens = tokens[:]
for token in tokens:
    if token in stop_words:
        clean_tokens.remove(token)

# Strip punctuation from the stopword-filtered tokens.
table = str.maketrans('', '', string.punctuation)
clean_tokens = [word.translate(table) for word in clean_tokens]

# Collect every WordNet synonym of the remaining tokens.
all_synonyms = []
for word in clean_tokens:
    for syn in wordnet.synsets(word):
        for lemma in syn.lemmas():
            all_synonyms.append(lemma.name())
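# Follow-up sketch (not from the source) for the snippet above: it defines a
# PorterStemmer but never uses it, so this shows one way the collected synonyms
# could be deduplicated and stemmed with that stemmer.
unique_synonyms = sorted(set(all_synonyms))
stemmed_synonyms = [stemmer.stem(s.lower()) for s in unique_synonyms]
print("Synonyms:", unique_synonyms)
print("Stemmed synonyms:", stemmed_synonyms)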
import os

import pandas as pd
import nltk
from nltk.tokenize import word_tokenize
from nltk.corpus import stopwords
from nltk.stem import PorterStemmer, WordNetLemmatizer
from sklearn.feature_extraction.text import CountVectorizer
from imblearn.over_sampling import RandomOverSampler
import matplotlib.pyplot as plt
import seaborn as sns

my_lemmatizer = WordNetLemmatizer()
vector = CountVectorizer()
stemming = PorterStemmer()

nltk.download('stopwords')
nltk.download('punkt')
nltk.download('wordnet')

# Get the stopwords corpus
stopwords_list = stopwords.words('english')
print(stopwords_list[:5])

# Load the final merged dataset
youtube_train = pd.read_csv("C:/Users/prate/Desktop/ICT_solution/Data/final_data/final_data.csv", delimiter=',')
youtube_train_sen = youtube_train['video_title']
print(youtube_train_sen[1])

# Create an empty list for the processed sentences
final_sent = []

# Initialize variables
count = 0
# Transform label to binary
# data.label = data.label.map(dict(REAL=1, FAKE=0))

# Visualising the dataset
import seaborn as sb


def create_distribution(dataFile):
    return sb.countplot(x='Label', data=dataFile, palette='hls')


# create_distribution(data)
data = shuffle(data)

# Clean the text
eng_stemmer2 = PorterStemmer()
eng_stemmer = SnowballStemmer('english')
# nltk.corpus.stopwords.words('english').remove('not')
stopwords = set(nltk.corpus.stopwords.words('english'))
stopwords.remove('not')  # keep 'not' so negation is preserved


def stem_tokens(tokens, stemmer):
    stemmed = []
    for token in tokens:
        stemmed.append(stemmer.stem(token))
    return stemmed


# tokens = data.iloc[3, 0]
def process_data(tokens):
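# A minimal usage sketch (not from the source) for the stem_tokens helper
# defined in the snippet above, assuming a plain list of word tokens as input.
from nltk.stem import SnowballStemmer

eng_stemmer = SnowballStemmer('english')
sample_tokens = ['running', 'quickly', 'towards', 'the', 'stations']
print(stem_tokens(sample_tokens, eng_stemmer))  # e.g. ['run', 'quick', 'toward', 'the', 'station']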
import string

import pandas as pd
from nltk.corpus import stopwords
from nltk.tokenize import word_tokenize
from nltk.stem import PorterStemmer

file_index = open('label/index', encoding='utf-8')
stop_words = set(stopwords.words('english'))  # get stop word set
columns = ['label', 'word_count', 'text']
del_table = string.punctuation
str_table = str.maketrans('', '', '1234567890')
dict1 = {}
count = 0
file = open('output.txt', 'w')
spam_word = []
ps = PorterStemmer()
df = pd.DataFrame(columns=columns)

for line in file_index.readlines():
    tmp = line.split(' ')
    tmp_label = 1 if tmp[0] == 'spam' else 0  # mail label: spam -> 1, ham -> 0
    tmp_path = tmp[1]                         # mail's path
    tmp_path = tmp_path.replace('\n', '')     # remove '\n' from the path
    tmp_path = tmp_path.replace('../', '')    # remove '../' from the path
    if tmp_label == 1:
        try:
            count += 1
            print(count)
            mail_file = open(tmp_path, encoding='utf-8')
            mail_text = mail_file.read()
# (continuation of a notebook cell that accumulates file contents)
        else:
            contents[fileNum] = contents.get(fileNum) + content
            # print(contents[fileNum])

print(len(contents), '\n', fileNum)

# In[3]:
doc = open('classification.txt', 'rb')
classification2_text = doc.readlines()

# In[4]:
from nltk.stem import PorterStemmer
from nltk.tokenize import sent_tokenize, word_tokenize

ps = PorterStemmer()

# head of the HTML file
f = open('HTML/search_keywords.html', 'w')
f.write("<html>\n")
f.write("<head> Search Keywords\n")
f.write(" <script type=\"text/javascript\" src=\"mktree.js\"></script>\n")
f.write(" <link rel=\"stylesheet\" href=\"mktree.css\" type=\"text/css\">\n")
f.write("</head>\n\n")
f.write("<body>\n")
f.write("<ul class=\"mktree\">\n")

has_subdir = False
has_record = False
def preprocess_reviews(revs, word_process):
    # `labels`, `all_features` and `sampled` are module-level globals in the
    # original project.
    try:
        stop_words = set(stopwords.words("english"))
        processed_revs = []
        ps = PorterStemmer()
        lemmatizer = WordNetLemmatizer()
        print(len(revs))
        for rev in revs:
            # Skip empty records
            if rev == '':
                continue
            # Split the record into its fields
            rev = rev.split('|~|')
            pro = rev[0]
            con = rev[1]
            adv = rev[2]
            score = rev[3]
            pre_post = rev[4]
            rev_id = rev[5]
            company_id = rev[6]
            industry = rev[7]
            comp_good_bad = rev[8]
            # Tokenize the words
            pro_words = word_tokenize(pro)
            con_words = word_tokenize(con)
            adv_words = word_tokenize(adv)
            if word_process == "raw":
                # Option 1: Filter the stopwords out
                filtered_pros = [w.lower() for w in pro_words if w not in stop_words]
                filtered_cons = [w.lower() for w in con_words if w not in stop_words]
                filtered_adv = [w.lower() for w in adv_words if w not in stop_words]
            elif word_process == "stem":
                # Option 2: Filter the stopwords out, stem the words, and append the category
                filtered_pros = [ps.stem(w).lower() + ("_pro" if labels else "")
                                 for w in pro_words if w not in stop_words]
                filtered_cons = [ps.stem(w).lower() + ("_con" if labels else "")
                                 for w in con_words if w not in stop_words]
                filtered_adv = [ps.stem(w).lower() + ("_adv" if labels else "")
                                for w in adv_words if w not in stop_words]
            elif word_process == "lemma":
                # Option 3: Filter the stopwords out, lemmatize the words, and append the category
                filtered_pros = [lemmatizer.lemmatize(w).lower() + ("_pro" if labels else "")
                                 for w in pro_words if w not in stop_words]
                filtered_cons = [lemmatizer.lemmatize(w).lower() + ("_con" if labels else "")
                                 for w in con_words if w not in stop_words]
                filtered_adv = [lemmatizer.lemmatize(w).lower() + ("_adv" if labels else "")
                                for w in adv_words if w not in stop_words]
            elif word_process == "features":
                # Option 4: Only keep the words in the specified featureset
                filtered_pros = [ps.stem(w).lower() + ("_pro" if labels else "")
                                 for w in pro_words if w in all_features]
                filtered_cons = [ps.stem(w).lower() + ("_con" if labels else "")
                                 for w in con_words if w in all_features]
                filtered_adv = [ps.stem(w).lower() + ("_adv" if labels else "")
                                for w in adv_words if w in all_features]
            else:
                print("Invalid word processing type. Please enter \"raw\", \"stem\", \"lemma\", or \"features\". Exiting program...")
                exit()
            # Turn the filtered words back into a sentence and create a tuple
            # of the review text and its metadata
            rev = (' '.join(word for word in (filtered_pros + filtered_cons + filtered_adv)),
                   int(score), pre_post, sampled, rev_id, company_id, industry, comp_good_bad)
            # Append to the processed list
            processed_revs.append(rev)
        return processed_revs
    except KeyboardInterrupt:
        print('exit')
        exit(1)
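# Illustrative call to preprocess_reviews (not from the source). The
# '|~|'-delimited record layout and the module-level globals `labels` and
# `sampled` are assumptions inferred from how the function reads them above.
from nltk.corpus import stopwords
from nltk.tokenize import word_tokenize
from nltk.stem import PorterStemmer, WordNetLemmatizer

labels = True     # assumed global: append the _pro/_con/_adv suffixes
sampled = False   # assumed global: carried through into each output tuple

example_rev = "Great pay and benefits|~|Long hours on weekends|~|Improve communication|~|4|~|pre|~|101|~|55|~|Tech|~|good"
print(preprocess_reviews([example_rev], "stem"))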
# Expand contractions (can skip since they are removed in the next step)
doc = contractions.fix(doc).replace('  ', ' ')  # collapse the double spaces fix() can leave behind
print('\nExpanded contractions.')

# Tokenize
tokens = word_tokenize(doc)
print(f'\nTokenizing:\n{tokens[:40]}...')

# Remove stop words
stop_words = set(stopwords.words("english"))
tokens = [w for w in tokens if w not in stop_words]
print('\nRemoved stopwords.')

# Lemmatization
wordnet_lem = WordNetLemmatizer()
tokens_lem = [wordnet_lem.lemmatize(token) for token in tokens]

# Stemming (skip as word meaning is lost)
porter_stem = PorterStemmer()
tokens_stem = [porter_stem.stem(token) for token in tokens_lem]

print(f'\nAfter Lemmatization and removing Stop words:\n{tokens_lem[:40]}...')

# Word frequency
fdist = FreqDist(tokens_lem)
print(f'\nMost common words:\n{fdist}')
print(fdist.most_common(20))
fdist.plot(20, cumulative=False)

# END
from nltk.stem import PorterStemmer


def stemming(tokens):
    # Stem every token in the list with the Porter stemmer.
    ps = PorterStemmer()
    stemmed_words = [ps.stem(word) for word in tokens]
    return stemmed_words
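# A minimal usage sketch (not from the source) for the stemming() helper above.
example = stemming(['cats', 'running', 'easily', 'studies'])
print(example)  # expected output along the lines of ['cat', 'run', 'easili', 'studi']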