def getDomainUnigram(self, directory=None):
    collocations = set()   # collocation items
    ewordlists = list()    # list of lists of words
    # extract words from essays
    if directory is not None:
        doclist = os.listdir(directory)
        for essay in doclist:
            dir_essay = directory + '/' + essay
            etext = open(dir_essay, 'r').read()
            tokens = nltk.wordpunct_tokenize(etext)
            tokens = [word.lower() for word in tokens]
            # stemming
            if self._stemoption == True:
                st = PorterStemmer()
                tokens = [st.stem(t) for t in tokens]
            # extract the collocations for the given essay
            e_bigram = set(Mytext(tokens).collocations())
            collocations = collocations | e_bigram
            ewordlists.append(tokens)
    else:
        # use the mapped essays to calculate the candidate bigrams
        # (the mapessay function must be called first)
        for ins in self._data:
            if ins['essay'] is not None:
                etext = open(ins['essay'], 'r').read()
                tokens = nltk.wordpunct_tokenize(etext)
                tokens = [word.lower() for word in tokens]
                # stemming
                if self._stemoption == True:
                    st = PorterStemmer()
                    tokens = [st.stem(t) for t in tokens]
                # extract the collocations for the given essay
                e_bigram = set(Mytext(tokens).collocations())
                collocations = collocations | e_bigram
                ewordlists.append(tokens)
    # get the collection of all essays under the specified directory / associated essays
    collection_text = TextCollection(ewordlists)
    itemlist = list()
    for (a, b) in collocations:
        itemlist.append(a)
        itemlist.append(b)
    itemlist = list(set(itemlist))
    word_idf = []
    for i in range(len(itemlist)):
        word_idf.append((collection_text.idf(itemlist[i]), itemlist[i]))
    word_idf = sorted(word_idf, key=operator.itemgetter(0))
    ave = 0
    if len(word_idf) != 0:
        ave = sum(map(operator.itemgetter(0), word_idf)) / len(word_idf)
    wlist = [j for (i, j) in word_idf if i < ave]
    return wlist
def search(dictionary_file, postings_file, query_file, output_file):
    """ Entry point to the program """
    stemmer = PorterStemmer()
    with open(dictionary_file, "rb") as dfile:
        dictionary = pickle.loads(dfile.read())
    with open(query_file, "rb") as qfile:
        with open(postings_file, "rb") as pfile:
            for query in qfile:
                print "query: ", query
                prefix = parser.to_polish_notation(query)
                print "prefix: ", prefix
                processed = []
                for token in prefix:
                    if parser.is_operand(token):
                        token = stemmer.stem(token).lower()
                    processed.append(token)
                print "processed: ", processed
                query = parser.process_query(processed)
                print "query: ", query
                result = execute_query(query, dictionary, pfile)
                print result
def tokenStem(words):
    words = words.strip('[').strip(']').lower()    # remove brackets and lowercase
    words = re.sub('[(){}<>:,.!?\'"]', '', words)  # remove punctuation
    stemmer = PorterStemmer()
    stops = stopwords.words('english')
    output = [stemmer.stem(token) for token in wordpunct_tokenize(words)
              if token not in stops]               # stem non-stopword tokens
    return " ".join(output)                        # merge back into a string
def AddTopicUnigram(self, feaName, comName, data=None):
    # needs mapping first
    if data is None:
        data = self._data
    for i in range(len(data)):
        t_bigram = self.getEssayCollocation(data, i)
        t_uni = list()
        for (a, b) in t_bigram:
            t_uni.append(a)
            t_uni.append(b)
        t_uni = set(t_uni)
        comment = data[i][comName]
        tokens = nltk.wordpunct_tokenize(comment)
        tokens = [word.lower() for word in tokens]
        # stemming
        if self._stemoption == True:
            st = PorterStemmer()
            tokens = [st.stem(t) for t in tokens]
            t_uni = set([st.stem(t) for t in list(t_uni)])
        shared = [w for w in tokens if w in t_uni]
        # normalized
        data[i][feaName] = float(len(shared)) / (len(tokens) + 0.00001)
def cleanData(doc_list):
    # tokenize
    tokens = []
    for doc in doc_list:
        text_l = []
        ws_split = re.split(split_on, doc)
        for w in ws_split:
            # remove URLs and empty strings
            if not (url_pat.match(w) or w == u''):
                text_l.append(w)
        # rejoin text and 'properly' tokenize
        text = " ".join(text_l)
        text_l = nltk.word_tokenize(text)
        # stop words
        text_l = [w.lower() for w in text_l if w.lower() not in stops]
        # stemming
        p_stemmer = PorterStemmer()
        text_l = [p_stemmer.stem(t) for t in text_l]
        # append cleaned text to list
        tokens.append(text_l)
    return tokens
def extractFeatures(dataSet):
    vector1, vector2 = list(), list()
    stemmer = PorterStemmer()
    # Produces a list of all unique word stems in the titles in the dataset
    wordBag = list({stemmer.stem(word) for entry in dataSet
                    for word in entry[2].strip().split(" ")
                    if word not in stopwords.words('english')})
    for entry in dataSet:
        genre, isbn, title, authors = entry[0], entry[1].strip(), entry[2].strip(), entry[3].strip()
        wordList = [word for word in title.split(" ")]
        authorList = [author.strip() for author in authors.split(";")]
        sortedWords = sorted(wordList, key=lambda x: -1 * len(x))
        nonStopWords = [word for word in sortedWords if word not in stopwords.words('english')]
        stemmedWords = [stemmer.stem(word) for word in nonStopWords]
        # Quantitative data about the title
        shortestWord = len(nonStopWords[-1])
        longestWord = len(nonStopWords[0])
        meanWord = sum([len(word) for word in nonStopWords]) / len(nonStopWords)
        wordSD = (sum([(len(word) - meanWord) ** 2 for word in nonStopWords]) / len(nonStopWords)) ** .5
        vector1.append([(len(authorList), len(wordList), longestWord, shortestWord, meanWord, wordSD), genre])
        # Creates a vector storing whether each word in the dataset occurred in the title
        occurrences = tuple(1 if word in stemmedWords else 0 for word in wordBag)
        vector2.append([occurrences, genre])
    return (vector1, vector2)
def lda(data):
    data = get_only_text(data)
    only_tweet = data
    length = len(only_tweet)
    length = min(20, length)
    for i in xrange(0, length):
        print i
        print only_tweet[i]
    return  # NOTE: this early return makes the LDA code below unreachable (likely leftover debug code)
    tokenizer = RegexpTokenizer(r'\w+')
    en_stop = get_stop_words('en')
    p_stemmer = PorterStemmer()
    length = len(only_tweet)
    length = min(20, length)
    total_texts = []
    for i in xrange(0, length):
        print only_tweet[i]
        print
        to_lower = only_tweet[i].lower()
        tokens = tokenizer.tokenize(to_lower)
        stopped_tokens = [k for k in tokens if not k in en_stop]
        texts = [p_stemmer.stem(k) for k in stopped_tokens]
        total_texts.append(texts)
    dictionary = corpora.Dictionary(total_texts)
    corpus = [dictionary.doc2bow(text) for text in total_texts]
    ldamodel = gensim.models.ldamodel.LdaModel(corpus, num_topics=2, id2word=dictionary, passes=20)
    result = ldamodel.print_topics(num_topics=2, num_words=1)
    for i in result:
        print i
def createLDAModel(texts, n_topics, n_passes):
    """Generates an LDA model from an array of texts"""
    tokenizer = RegexpTokenizer(r'\w+')
    # Create EN stop words list
    en_stop = get_stop_words('en')
    # Create p_stemmer of class PorterStemmer
    p_stemmer = PorterStemmer()
    texts_ = []
    # loop through document list
    for i in texts:
        # clean and tokenize document string
        raw = i.lower()
        tokens = tokenizer.tokenize(raw)
        # remove stop words from tokens
        stopped_tokens = [i for i in tokens if not i in en_stop]
        # stem tokens
        stemmed_tokens = [p_stemmer.stem(i) for i in stopped_tokens]
        # add tokens to list
        texts_.append(stemmed_tokens)
    # turn our tokenized documents into an id <-> term dictionary
    dictionary = corpora.Dictionary(texts_)
    # convert tokenized documents into a document-term matrix
    corpus = [dictionary.doc2bow(text) for text in texts_]
    # generate LDA model
    ldamodel = gensim.models.ldamodel.LdaModel(corpus, num_topics=n_topics,
                                               id2word=dictionary, passes=n_passes)
    return ldamodel
def main():
    rake = RAKE.Rake('SmartStoplist.txt')
    fp = open(input_file, 'r')
    text = fp.read()
    text = text_clean(text)
    """wnl = WordNetLemmatizer()
    text = ' '.join([wnl.lemmatize(i.strip()) for i in nltk.word_tokenize(text)])"""
    porter_stemmer = PorterStemmer()
    text = ' '.join([porter_stemmer.stem(i.strip()) for i in nltk.word_tokenize(text)])
    keywords = rake.run(text)
    # print keywords
    with open(key_score_file, 'wb') as out:
        csv_out = csv.writer(out)
        csv_out.writerow(['KEYWORD', 'SCORE'])
        for row in keywords:
            if row[1] > 0:
                csv_out.writerow(row)
    unibitrigram_list = []
    unibitrigram_list = generate_unibitrigrams(key_score_file)
    # print unibitrigram_list
    ngram_freq = Counter(unibitrigram_list)
    sorted_ngram_freq = sorted(ngram_freq.items(), key=lambda x: x[1], reverse=True)
    print ngram_freq
    with open('bcom_ngramfr_stem.csv', 'wb') as nf_csv:
        csv_wr = csv.writer(nf_csv)
        for item in sorted_ngram_freq:
            if item[0] != '':
                csv_wr.writerow(item)
def Tokenize(TextData):
    tokenizer = RegexpTokenizer(r'\w+')
    tokens = list()
    # create English stop words list
    en_stop = get_stop_words('en')
    # create p_stemmer of class PorterStemmer
    p_stemmer = PorterStemmer()
    # clean and tokenize document string
    raw = TextData.lower()
    tokens = tokenizer.tokenize(raw)
    # remove stop words from tokens
    stopped_tokens = [i for i in tokens if not i in en_stop]
    # stem tokens
    stemmed_tokens = [p_stemmer.stem(i) for i in stopped_tokens]
    tokens = stemmed_tokens
    TOKENIZEDTEXT_FILE = path.join(os.pardir, "Resources/TokenizedTextFiles/Personal-Narration/Unbroken - Motivational Video.txt")
    fp = open(TOKENIZEDTEXT_FILE, "w")
    print(TOKENIZEDTEXT_FILE)
    # pickle.dump(tokens, fp)
    fp.write(str(tokens))
    fp.close()
def get_stemmed_separate(indeed_reviews_db, glassdoor_reviews_db):
    separate = get_separate_reviews(indeed_reviews_db, glassdoor_reviews_db)
    stemmer = PorterStemmer()
    stemmed_reviews = []
    for review in separate:
        stemmed_reviews.append(' '.join(
            [stemmer.stem(word)
             for sent in sent_tokenize(review)
             for word in word_tokenize(sent.lower())]))
    return stemmed_reviews
def create_bag_of_words(self):
    """Create a BagOfWords for the document.

    Performs named entity recognition, stemming and stopword removal.
    """
    stemmer = PorterStemmer()
    nes = []
    tagged_text = self.ner_tagger.get_entities(self.content.encode('utf-8'))
    for key in tagged_text.keys():
        if key != 'O':
            nes += tagged_text[key]
    for n in nes:
        self.bag_of_words.add_stem_word(n, n)
        Document.vocabulary.add_stem_word(n, n)
    wo_named = re.sub('|'.join(nes), '', self.content)
    words = re.findall(r'\w+', wo_named, flags=re.UNICODE | re.LOCALE)
    for wordo in words:
        word = wordo.rstrip(r'\n')
        if word.lower() not in stopwords:
            w = stemmer.stem(word.lower())
            self.bag_of_words.add_stem_word(w, word)
            Document.vocabulary.add_stem_word(w, word)
    for word in self.bag_of_words.get_all_words():
        if word in Document.document_word_frequency:
            Document.document_word_frequency[word] += 1
        else:
            Document.document_word_frequency[word] = 1
def evaluate(query):
    global DICTIONARY
    word_score = {}
    seek_pos = open(postings_file, 'r')
    seek_pos.seek(0, 0)
    words = query.split()
    stemmer = PorterStemmer()
    words = [element.lower() for element in words]
    for item in words:
        word = stemmer.stem(item)
        if word not in word_score:
            if word in DICTIONARY:
                seek_pointer = DICTIONARY[word]
                seek_pos.seek(int(seek_pointer))
                line = seek_pos.readline()
                seek_pos.seek(0, 0)
                post_list = line.split()
                score = score_documents(post_list)
                word_score[word] = score
            else:
                # not encountered, score of 0
                word_score[word] = []
        # else duplicate, skip word
    result = score_query(word_score)
    return result
def stemText(s):
    # expects an iterable of words; a bare string would be stemmed character by character
    ps = PorterStemmer()
    stemmedText = []
    for word in s:
        stemmedText.append(ps.stem(word))
    return stemmedText
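# A quick usage sketch for stemText (illustrative only; assumes the function
# above is in scope and that PorterStemmer has been imported):
words = "the cats were flying".split()
print(stemText(words))  # -> ['the', 'cat', 'were', 'fli']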
def parse_questions(self):
    stemmer = PorterStemmer()
    tokenizer = RegexpTokenizer(r'\w+')
    for questions_key in self.rawSamples:
        # Stem the question text
        question_text = self.rawSamples[questions_key][0]
        words_array = tokenizer.tokenize(question_text)
        question_text = ""
        for word in words_array:
            if word.isnumeric():
                continue
            if word not in text.ENGLISH_STOP_WORDS:
                word = stemmer.stem(word)
                word = stemmer.stem(word)  # stemmed twice in the original; Porter is not idempotent for every word
            question_text += (word + " ")
        self.rawSamples[questions_key][0] = question_text
        # Stem the topic names
        topics_text = self.rawSamples[questions_key][2]
        words_array = tokenizer.tokenize(topics_text)
        topics_text = ""
        for word in words_array:
            if word.isnumeric():
                continue
            if word not in text.ENGLISH_STOP_WORDS:
                word = stemmer.stem(word)
                word = stemmer.stem(word)  # same double stem as above
            topics_text += (word + " ")
        self.rawSamples[questions_key][2] = topics_text
def clean_split_stem(rawstring):
    stop = stopwords.words('english')
    out_str = rawstring.split()
    porter = PorterStemmer()
    out_str = [porter.stem(word) for word in out_str]
    out_str = [word for word in out_str if word not in stop]
    return out_str
def extract_entities(doc):
    print 'extracting entities from %s...' % doc.getFilename()
    nps = list(set([re.sub(' \.', '', re.sub(' -[A-Z]{3}-', '', np).lower())
                    for np in doc.getAllNodesOfType('NP')]))
    p = PorterStemmer()
    entities = []
    for np in nps:
        params = {'q': 'wam:[50 TO 100] AND iscontent:true AND lang:en AND (title_en:"%s" OR redirect_titles_mv_en:"%s")' % (np, np),
                  'fl': 'title_en,redirect_titles_mv_en',
                  'wt': 'json'}
        try:
            response = json.loads(requests.get(host + 'select', params=params).content)
        except requests.exceptions.ConnectionError:
            while True:
                time.sleep(15)
                print 'retrying connection...'
                try:
                    response = json.loads(requests.get(host + 'select', params=params).content)
                    break
                except requests.exceptions.ConnectionError:
                    continue
        docs = response[u'response'][u'docs']
        if len(docs) > 0:
            titles = [docs[0][u'title_en']] + docs[0].get(u'redirect_titles_mv_en', [])
        else:
            titles = []
        if len(titles) > 0:
            titles = [' '.join([p.stem(w.lower()) for w in t.split(' ')]) for t in titles]
            stem_np = ' '.join([p.stem(w) for w in np.split(' ')])
            for title in titles:
                if stem_np == title:
                    entities.append(np)
                    print np
                    break
    # print doc.getFilename(), entities
    return (doc.getFilename(), entities)
def preprocess_text(raw):
    lower_raw = raw.lower()
    tokens = nltk.word_tokenize(lower_raw)
    filtered_tokens = [word for word in tokens if word not in stopwords.words('english')]
    port = PorterStemmer()
    # Extracts the important root of a word, e.g. parsing -> pars.
    # Stem the filtered tokens; the original stemmed `tokens`, silently
    # discarding the stopword removal above.
    stemmed = [port.stem(item) for item in filtered_tokens]
    return stemmed
def tokenize(docs, norm, stop, ne, central_per=None, central_loc=None, central_org=None):
    if stop:
        with open("stopwords.txt", "r") as f:
            sw = set([word.strip().decode("utf-8").lower() for word in f])
    if norm == "stem":
        from nltk.stem.porter import PorterStemmer
        stemmer = PorterStemmer()
    all_toks = []
    for doc in docs:
        toks = []
        for sent in doc:
            if norm == "lemma":
                stoks = [unicode(tok.lem).lower() for tok in sent]
            elif norm == "stem":
                stoks = [stemmer.stem(unicode(tok).lower()) for tok in sent]
            else:
                stoks = [unicode(tok).lower() for tok in sent]
            if stop:
                toks.extend([tok for tok in stoks if tok not in sw])
            else:
                toks.extend(stoks)
        toks = [tok for tok in toks if len(tok) < 50]
        # if len(toks) == 0: continue
        string = u" ".join(toks).encode("utf-8")
        # print string
        all_toks.append(string)
    return all_toks
def PreProcessing(line):
    unigrams = line.split()
    word_list = [word.lower() for word in unigrams if word.lower() not in stopwords]
    st = PorterStemmer()
    word_list = [st.stem(word) for word in word_list if word]
    vocab = [word for word in word_list if word not in stopwords]
    return vocab
def text_process(text):
    '''
    Takes in a string of text, then performs the following:
    1. Tokenizes and removes punctuation
    2. Removes stopwords
    3. Stems
    4. Returns the cleaned text as a single space-joined string
    '''
    if pd.isnull(text):
        return []
    # Tokenize
    tokenizer = RegexpTokenizer(r'\w+')
    text_processed = tokenizer.tokenize(text)
    # Remove any stopwords
    text_processed = [word.lower() for word in text_processed
                      if word.lower() not in stopwords.words('english')]
    # Stem
    porterStemmer = PorterStemmer()
    text_processed = [porterStemmer.stem(word) for word in text_processed]
    # Drop the stray 'b' token left over from byte-string literals, if present
    try:
        text_processed.remove('b')
    except ValueError:
        pass
    return " ".join(text_processed)
def pre_processing(resume):
    unigrams = resume.split()
    word_list = [word.lower() for word in unigrams if word.lower() not in stopwords]
    st = PorterStemmer()
    word_list = [st.stem(word) for word in word_list if word]
    vocab = [word for word in word_list if word not in stopwords]
    return vocab
def issue_analysis(df):
    df_sub = df[['Issue']]
    df_sub.insert(0, 'count', 1)
    Issue_List = []
    for i in range(0, 50):
        Issue_List.append(df_sub.groupby(['Issue']).sum().sort_index(by='count', ascending=False).ix[i].name)
    tokenizer = RegexpTokenizer(r'[A-Za-z0-9\']+')  # set tokenizer regex
    en_stop = get_stop_words('en')                  # create English stop words list
    p_stemmer = PorterStemmer()                     # create p_stemmer of class PorterStemmer
    texts = []                                      # list for tokenized documents in loop
    text_view = ''
    # loop through document list
    for i in Issue_List:
        # clean and tokenize document string
        raw = i.lower()
        tokens = tokenizer.tokenize(raw)
        # remove stop words from tokens
        stopped_tokens = [i for i in tokens if not i in en_stop]
        # stem tokens and add them to list
        stemmed_tokens = [p_stemmer.stem(i) for i in stopped_tokens]
        texts.append(stemmed_tokens)
        # print ' '.join(stemmed_tokens)
        text_view += ' '.join(stemmed_tokens)
        text_view += ' '
    wordcloud = WordCloud().generate(text_view)
    fig = plt.figure(figsize=(8, 6))
    fig1 = fig.add_subplot(1, 1, 1)
    fig1.set_title("Top issued words", fontdict={'fontsize': 25})
    fig1.imshow(wordcloud)
    fig1.axis("off")
    # plt.savefig('ComplainCount_WC.png')
    plt.savefig('ComplainCount_WC_2016.png')
    # turn our tokenized documents into an id <-> term dictionary
    dictionary = corpora.Dictionary(texts)
    # convert tokenized documents into a document-term matrix
    corpus = [dictionary.doc2bow(text) for text in texts]
    # generate LDA model
    ldamodel = gensim.models.ldamodel.LdaModel(corpus, num_topics=25, id2word=dictionary)
    LDAText = ldamodel.print_topics(num_topics=5, num_words=3)
    # print "\n Topic analysis result for top 25 issues with LDA"
    # print(LDAText)
    vis_data = gensimvis.prepare(ldamodel, corpus, dictionary)
    # pyLDAvis.show(vis_data)
    # pyLDAvis.save_html(vis_data, "issue_lda.html")
    # pyLDAvis.save_json(vis_data, "issue_lda.json")
    pyLDAvis.save_html(vis_data, "issue_lda_2016.html")
    pyLDAvis.save_json(vis_data, "issue_lda_2016.json")
    return 0
def processing(raw_review):
    word1 = []
    # 1. Remove HTML
    review_text = BeautifulSoup(raw_review).get_text()
    # 2. Remove punctuation
    letters_only = remove_punctuations(review_text)
    # 3. Convert to lower case, split into individual words
    for words in letters_only:
        wordset = [word.lower() for word in words]
        word1.append(wordset)
    # 4. Handle double negation
    negated_words = negation_handling(word1)
    # 5. Keep only verbs, adjectives, adverbs, interjections (descriptive words)
    meaningful_words = descriptive_words(negated_words)
    # 6. Remove Time, Location, Organization, Person, Money, Percent, Date using NER
    # removed_words = remove_names(meaningful_words)
    # 7. Remove stop words
    stops = open(r'C:\Users\PSarka\Desktop\sentimentanalysis\stopwords.txt', 'r')
    stops = set([word[:-1] for word in stops])
    meaningful_words_new = [w for w in meaningful_words if not w in stops]
    # 8. Stem using the Porter stemmer (lemmatization could also be used; check which is more effective)
    st = PorterStemmer()
    stemmed_words = [st.stem(words) for words in meaningful_words_new]
    # 9. Join the words back into one string separated by spaces and return the result
    print stemmed_words
    return " ".join(stemmed_words)
def tweet_stemming(tweet, token_freqs):
    """
    Stems tweet words and counts diversity

    :param tweet: the tweet to analyze
    :type tweet: str or unicode
    :param token_freqs: counter of word frequencies
    :type token_freqs: Counter
    :returns: number of words added to token_freqs
    :rtype: int
    """
    pattern_url = '((https?:\/\/)|www\.)([\da-z\.-]+)\.([\/\w \.-]*)( |$)'
    regex_punctuation = re.compile('[%s]' % re.escape(string.punctuation))
    porter = PorterStemmer()
    counter_tokens = 0
    tweet_url_removed = re.sub(pattern_url, '', tweet, flags=re.MULTILINE)  # remove URLs
    tweet_url_removed_tokenized = word_tokenize(tweet_url_removed)          # tokenize tweet
    tweet_url_removed_tokenized_cleaned_stemming = []  # cleaned of URLs and hashes, and stemmed
    for token in tweet_url_removed_tokenized:
        new_token = regex_punctuation.sub(u'', token)  # remove punctuation and hash
        if not new_token == u'':
            new_token_stemming = porter.stem(new_token)
            tweet_url_removed_tokenized_cleaned_stemming.append(new_token_stemming)
            token_freqs[new_token_stemming] += 1
            counter_tokens += 1
    return counter_tokens
def query(new_doc, doc_topic, topic_word, dictionary, LSH, num_topic):
    tokens = []
    token = get_tokens(new_doc)
    stopped_tokens = [i for i in token if not i in en_stop]
    p_stemmer = PorterStemmer()
    stemed_tokens = []
    for i in stopped_tokens:
        try:
            temp_token = str(p_stemmer.stem(i))
            stemed_tokens.append(temp_token)
        except IndexError:
            pass
    tokens = stemed_tokens
    new_corpus = dictionary.doc2bow(tokens)
    new_corpus = to_gibbs_corpus([new_corpus])[0]  # convert
    new_topic_vector = np.zeros(num_topic)
    for t in new_corpus:
        mult_par = topic_word[:, t[0]] + 1
        mult_par = mult_par / np.sum(mult_par)
        new_topic_vector += np.random.multinomial(t[1], mult_par)
        # print mult_par
        # print topic_word[:, t[0]]
    new_topic_vector = new_topic_vector / np.sum(new_topic_vector)
    dist, indices = LSH.kneighbors(new_topic_vector, n_neighbors=20)
    print indices + 1
def compare_english_simple(article_title):
    """Given the title of an article, returns the number of tokens, types, and
    stems in both the English version and the Simple English version."""
    english = extract_wikipedia_page(article_title, "en")
    simple = extract_wikipedia_page(article_title, "simple")
    num_tokens_english = len(english)
    num_tokens_simple = len(simple)
    types_english = count_words(get_words(english))
    types_simple = count_words(get_words(simple))
    porter_stemmer = PorterStemmer()
    stem_english = defaultdict(int)
    stem_simple = defaultdict(int)
    for key in types_english.keys():
        stem_english[porter_stemmer.stem(key)] += 1
    for key in types_simple.keys():
        stem_simple[porter_stemmer.stem(key)] += 1
    print("Number of Tokens in English " + article_title + ": %d" % num_tokens_english)
    print("Number of Tokens in Simple English " + article_title + ": %d" % num_tokens_simple)
    print("Number of Types in English " + article_title + ": %d" % len(types_english))
    print("Number of Types in Simple English " + article_title + ": %d" % len(types_simple))
    print("Number of Stems in English " + article_title + ": %d" % len(stem_english))
    print("Number of Stems in Simple English " + article_title + ": %d" % len(stem_simple))
def destem(self, stemmed_term, corpus):
    '''
    Given a stemmed term, we look through the text of every document in
    corpus, determine the most common "parent" version of the given
    stemmed term, and return it.
    '''
    destemmed_term = ""
    min_num_terms = 5000
    min_percentage = 0.20
    candidates = {}
    stemmer = PorterStemmer()
    num_terms_checked = 0
    num_docs_checked = 0
    total_matches = 0
    for doc in corpus:
        # matches is the list of all terms in the current text that are
        # "ancestor" versions of the stemmed term.
        matches = [term for term in doc.split_text
                   if stemmer.stem(term) == stemmed_term]
        num_terms_checked += len(doc.split_text)
        num_docs_checked += 1
        total_matches += len(matches)
        if not matches:
            continue
        # we keep a tally of the number of times each "ancestor"
        # appears in our text
        for match in matches:
            if match in candidates:
                candidates[match] += 1
            else:
                candidates[match] = 1
        # sort potential destemmed versions in descending order by frequency
        sorted_candidates = sorted(candidates.keys(),
                                   key=lambda term: candidates[term],
                                   reverse=True)
        if num_docs_checked == self.num_corpus_docs:
            # we've run through every doc, so the most frequent ancestor
            # of the stemmed term is the best destemmed result.
            destemmed_term = sorted_candidates[0]
            break
        # if we've reviewed enough total words, we can start trying
        # to find a suitable destemmed term from what we have so far
        if min_num_terms <= num_terms_checked:
            # this is the most frequent ancestor of the stemmed term
            possible_match = sorted_candidates[0]
            test_percentage = candidates[possible_match] / float(total_matches)
            # if the potential destemmed version accounts for a
            # sufficient percentage of the total matches, we can
            # decide that it's a suitable destemmed result.
            if min_percentage <= test_percentage:
                destemmed_term = possible_match
                break
    print("Destemmed: {0} --> {1}".format(stemmed_term, destemmed_term))
    return destemmed_term
def read_class_data(path, label=None):
    ''' Label may come from the data itself, or may be assigned at run time '''
    if os.path.exists(path):
        if os.path.isdir(path):
            paths = [os.path.join(path, f) for f in os.listdir(path)]
        else:
            paths = [path]
    else:
        print 'Given path does not exist.'
        return
    doc = doc_file()
    stemmer = PorterStemmer()
    instances = []
    for p in paths:
        doc.path = p
        for raw_record in doc:
            record = unpack(raw_record, ',')
            text = record[3].strip('"')
            inst = {'tokens': [], 'label': ''}
            for t in wordpunct_tokenize(text):
                stem_t = stemmer.stem(t.lower())
                if stem_t[0].islower():
                    inst['tokens'].append(stem_t)
                else:
                    continue
            inst['label'] = label
            instances.append(inst)
    return instances
def processEmail(email_contents):
    vocabList = getVocabList()
    word_indices = []
    # Preprocess email
    email_contents = email_contents.lower()
    email_contents = re.sub('<[^<>]+>', ' ', email_contents)
    email_contents = re.sub('[0-9]+', 'number', email_contents)
    email_contents = re.sub('(http|https)://[^\s]*', 'httpaddr', email_contents)
    email_contents = re.sub('[^\s]+@[^\s]+', 'emailaddr', email_contents)
    email_contents = re.sub('[$]+', 'dollar', email_contents)
    print('==== Processed Email ====')
    pattern = '[\s' + re.escape("@$/#.-:&*+=[]?!(){},'\">_<;%") + ']'
    all_words = re.split(pattern, email_contents)
    all_words = [x for x in all_words if x != '']
    stemmer = PorterStemmer()
    for w in all_words:
        w = re.sub('[^a-zA-Z0-9]', '', w)
        w = w.strip()
        w = stemmer.stem(w)
        # ============= YOUR CODE HERE =============
        # Instructions: Fill in this function to add the index of str to
        #               word_indices if it is in the vocabulary.
        try:
            idx = vocabList.index(w)
        except ValueError:
            idx = -1
        if idx != -1:  # the original `idx is not -1` compares identity, not value
            word_indices.append(idx)
        # ===========================================
    return word_indices
    # delete punctuation
    for c in string.punctuation:
        msg = re.sub(r'\{}'.format(c), '', msg)
    # delete separators, i.e. \n and \t
    msg = ' '.join(msg.split())
    return msg

# In[15]:

nltk.download('words')
# pull Thai stopwords (word bags)
th_stop = tuple(thai_stopwords())
en_stop = tuple(get_stop_words('en'))
p_stemmer = PorterStemmer()

# In[16]:

def split_word(text):
    tokens = word_tokenize(text, engine='newmm')
    # Remove stop words
    tokens = [i for i in tokens if not i in th_stop and not i in en_stop]
    # Find the word roots for Thai and for English
    # English
    tokens = [p_stemmer.stem(i) for i in tokens]
def main():
    # storm = 'private'
    # data = read_data(storm)
    # print("Length of Data: {length}".format(length=len(data)))
    num_topics = 3
    tokenizer = RegexpTokenizer(r'[a-z0-9\']+')
    p_stemmer = PorterStemmer()

    print("~~~~~~~~~~~~~~~ Running Sandy ~~~~~~~~~~~~~~~")
    data_sandy = read_data("sandy")
    try:
        with open("storm_extracts/dict_sandy", 'rb') as f:
            dict_sandy = pickle.load(f)
        with open("storm_extracts/counts_sandy", 'rb') as f:
            counts_sandy = pickle.load(f)
    except:
        dict_sandy, _, counts_sandy = parse_text(data_sandy, "sandy", tokenizer, en_stop, p_stemmer)
        print("Length of Data: {length}".format(length=len(data_sandy)))
        with open("storm_extracts/dict_sandy", "wb") as fp:  # pickling
            pickle.dump(dict_sandy, fp)
        with open("storm_extracts/counts_sandy", "wb") as fp:  # pickling
            pickle.dump(counts_sandy, fp)
    lda_sandy = run_model(data_sandy, "sandy", num_topics)

    print("~~~~~~~~~~~~~~~ Running Harvey ~~~~~~~~~~~~~~~")
    data_harvey = read_data("harvey")
    try:
        with open("storm_extracts/dict_storm", 'rb') as f:
            dict_harvey = pickle.load(f)
        with open("storm_extracts/counts_storm", 'rb') as f:
            counts_harvey = pickle.load(f)
    except:
        # the original mixed data_storm/dict_storm names here; unified to *_harvey
        dict_harvey, _, counts_harvey = parse_text(data_harvey, "harvey", tokenizer, en_stop, p_stemmer)
        print("Length of Data: {length}".format(length=len(data_harvey)))
        with open("storm_extracts/dict_storm", "wb") as fp:  # pickling
            pickle.dump(dict_harvey, fp)
        with open("storm_extracts/counts_storm", "wb") as fp:  # pickling
            pickle.dump(counts_harvey, fp)
    lda_harvey = run_model(data_harvey, "harvey", num_topics)

    print("~~~~~~~~~~~~~~~ Running Florence ~~~~~~~~~~~~~~~")
    data_Florence = read_data("Florence")
    dict_Florence, _, counts_Florence = parse_text(data_Florence, "Florence", tokenizer, en_stop, p_stemmer)
    lda_florence = run_model(data_Florence, "Florence", num_topics)

    print("~~~~~~~~~~~~~~~ Running Lane ~~~~~~~~~~~~~~~")
    data_Lane = read_data("Lane")
    dict_Lane, _, counts_Lane = parse_text(data_Lane, "Lane", tokenizer, en_stop, p_stemmer)
    lda_lane = run_model(data_Lane, "Lane", num_topics)

    print("~~~~~~~~~~~~~~~ Running Michael ~~~~~~~~~~~~~~~")
    data_Michael = read_data("Michael")
    dict_Michael, _, counts_Michael = parse_text(data_Michael, "Michael", tokenizer, en_stop, p_stemmer)
    lda_michael = run_model(data_Michael, "Michael", num_topics)

    print("~~~~~~~~~~~~~~~ Running Bonnie ~~~~~~~~~~~~~~~")
    data_bonnie = read_data("bonnie")
    dict_bonnie, _, counts_bonnie = parse_text(data_bonnie, "bonnie", tokenizer, en_stop, p_stemmer)
    lda_bonnie = run_model(data_bonnie, "bonnie", num_topics)

    print("~~~~~~~~~~~~~~~ Running private ~~~~~~~~~~~~~~~")
    data_private = read_data("private")
    dict_private, _, counts_private = parse_text(data_private, "private", tokenizer, en_stop, p_stemmer)
    lda_private = run_model(data_private, "private", num_topics)

    print("~~~~~~~~~~~~~~~ Running noise ~~~~~~~~~~~~~~~")
    data_noise = read_data("noise")
    try:
        with open("storm_extracts/dict_noise", 'rb') as f:
            dict_noise = pickle.load(f)
        with open("storm_extracts/counts_noise", 'rb') as f:
            counts_noise = pickle.load(f)
    except:
        dict_noise, _, counts_noise = parse_text(data_noise, "noise", tokenizer, en_stop, p_stemmer)
        print("Length of Data: {length}".format(length=len(data_noise)))
        with open("storm_extracts/dict_noise", "wb") as fp:  # pickling
            pickle.dump(dict_noise, fp)
        with open("storm_extracts/counts_noise", "wb") as fp:  # pickling
            pickle.dump(counts_noise, fp)
    lda_noise = run_model(data_noise, "noise", num_topics)

    models = [lda_sandy, lda_harvey, lda_florence, lda_lane,
              lda_michael, lda_bonnie, lda_private, lda_noise]
    model_names = ["lda_sandy", "lda_harvey", "lda_florence", "lda_lane",
                   "lda_michael", "lda_bonnie", "lda_private", "lda_noise"]

    print("Printing sorted scores per model...")
    scores = []
    for i in range(len(model_names)):
        print("COMPARING DATASETS TO: {a}".format(a=model_names[i]))
        per_model = []
        for j in range(len(model_names)):
            if i == j:
                continue
            dist = compare_models(models[i], models[j])
            scores.append((i, j, dist))
            per_model.append((i, j, dist))
        per_model = sorted(per_model, key=operator.itemgetter(2))
        # NOTE: this inner loop reuses (and overwrites) the outer loop's i, as in the original
        for i, j, dist in per_model:
            print("Comparing {a} with {b}".format(a=model_names[i], b=model_names[j]))
            print("Hellinger distance:", dist)
        print("\n")

    print("Printing total sorted scores...")
    scores = sorted(scores, key=operator.itemgetter(2))
    for i, j, dist in scores:
        print("Comparing {a} with {b}".format(a=model_names[i], b=model_names[j]))
        print("Hellinger distance:", dist)
import nltk, math
from nltk.tokenize import RegexpTokenizer
from nltk.corpus import stopwords
from nltk.stem.porter import PorterStemmer
from math import log10, sqrt
from collections import Counter
import os
import io

stemmer = PorterStemmer()
tokenizer = RegexpTokenizer(r'[a-zA-Z]+')
corpusroot = './presidential_debates'  # my subdirectory name
vectors = {}         # tf-idf vectors for all documents
df = Counter()       # storage for document frequency
tfs = {}             # permanent storage for tfs of all tokens in all documents
lengths = Counter()  # used for calculating lengths of documents
postings_list = {}   # postings-list storage for each token in the corpus
st_tokens = []

for filename in os.listdir(corpusroot):
    file = io.open(os.path.join(corpusroot, filename), "r", encoding='UTF-8')
    doc = file.read()
    file.close()
    doc = doc.lower()  # given code for reading files and lowercasing
    tokens = tokenizer.tokenize(doc)  # tokenize each document
    sw = stopwords.words('english')
    tokens = [stemmer.stem(token) for token in tokens if token not in sw]  # remove stopwords and stem
    tf = Counter(tokens)
    df += Counter(list(set(tokens)))
    tfs[filename] = tf.copy()  # copy tf into tfs for this filename
    tf.clear()                 # clear tf so the next document starts empty
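# The excerpt above stops after collecting raw term frequencies; `vectors`,
# `lengths`, and `postings_list` are declared but never filled. A minimal
# sketch of how the tf-idf step could continue, assuming a log-tf * idf
# weighting with cosine normalization (the weighting scheme is an assumption,
# not the original author's code):
N = len(tfs)  # number of documents in the corpus
for fname, tf_counts in tfs.items():
    vec = {token: (1 + log10(count)) * log10(N / float(df[token]))
           for token, count in tf_counts.items() if df[token] < N}
    norm = sqrt(sum(w * w for w in vec.values())) or 1.0
    lengths[fname] = norm
    for token, w in vec.items():
        vec[token] = w / norm  # cosine-normalize
        postings_list.setdefault(token, []).append((fname, vec[token]))
    vectors[fname] = vec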
see ourselves as a developed nation, self-reliant and self-assured. Isn’t this incorrect?
I have a third vision. India must stand up to the world. Because I believe that unless India
stands up to the world, no one will respect us. Only strength respects strength. We must be
strong not only as a military power but also as an economic power. Both must go hand-in-hand.
My good fortune was to have worked with three great minds. Dr. Vikram Sarabhai of the Dept.
of space, Professor Satish Dhawan, who succeeded him and Dr. Brahm Prakash, father of nuclear
material. I was lucky to have worked with all three of them closely and consider this the
great opportunity of my life. I see four milestones in my career"""

# Cleaning the texts
import re
from nltk.corpus import stopwords
from nltk.stem.porter import PorterStemmer
from nltk.stem import WordNetLemmatizer

ps = PorterStemmer()
wordnet = WordNetLemmatizer()
sentences = nltk.sent_tokenize(paragraph)
corpus = []
for i in range(len(sentences)):
    review = re.sub('[^a-zA-Z]', ' ', sentences[i])
    review = review.lower()
    review = review.split()
    review = [wordnet.lemmatize(word) for word in review
              if not word in set(stopwords.words('english'))]
    review = ' '.join(review)
    corpus.append(review)

# Creating the TF-IDF model
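# The excerpt ends at the comment announcing the TF-IDF step without the code
# itself. A minimal sketch of how that step commonly looks with scikit-learn,
# assuming the `corpus` built above (the vectorizer settings are assumptions):
from sklearn.feature_extraction.text import TfidfVectorizer
tfidf = TfidfVectorizer()
X = tfidf.fit_transform(corpus).toarray()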
# Importing the Dataset
dataset = pd.read_csv('Restaurant_Reviews.tsv', delimiter='\t', quoting=3)

# Cleaning the texts
import re
import nltk
nltk.download('stopwords')
from nltk.corpus import stopwords
from nltk.stem.porter import PorterStemmer

corpus = []
for i in range(0, 1000):
    review = re.sub('[^a-zA-Z]', ' ', dataset['Review'][i])
    review = review.lower()
    review = review.split()
    ps = PorterStemmer()
    review = [ps.stem(word) for word in review
              if not word in set(stopwords.words('english'))]
    review = ' '.join(review)
    corpus.append(review)

'''
To clean HTML tags:

import re

def cleanhtml(raw_html):
    cleanr = re.compile('<.*?>')
    cleantext = re.sub(cleanr, '', raw_html)
    return cleantext
'''
class EmoClf:
    def __init__(self):
        self.model_path = "./model.joblib"
        self.stemmer = PorterStemmer()
        self.clf, self.cv = self.trainer()

    def trainer(self):
        # import pdb; pdb.set_trace()
        if not os.path.exists(self.model_path):
            reviews_train = self.data_loader()
            reviews_train_clean = self.clean(reviews_train)
            stemmed_reviews_list = self.get_stemmed_text(reviews_train_clean)
            clf, cv = self.stemmed_review(stemmed_reviews_list)
            self.feat_to_coeff(clf, cv)
            joblib.dump(clf, self.model_path)
            with open('./storage.bin', 'wb') as f:
                cPickle.dump(cv, f)
        else:
            clf = joblib.load(self.model_path)
            with open('storage.bin', 'rb') as f:
                cv = cPickle.load(f)
        return clf, cv

    def tester(self, query):
        reviews_train_clean = self.clean([query])
        stemmed_reviews_list = self.get_stemmed_text(reviews_train_clean)
        vectorized_test = self.cv.transform(stemmed_reviews_list)
        pred_proba = self.clf.predict_proba(vectorized_test)
        # print(pred_proba)
        emotion_dct = {"positive": pred_proba, "negative": 1 - pred_proba}
        return emotion_dct

    def data_loader(self):
        reviews_train = []
        with open('./data/training/full_train.txt', 'r') as fp:
            reviews_train = [each_line.strip() for each_line in fp.readlines()[:]]
        return reviews_train

    def preprocess_reviews(self, reviews):
        REPLACE_NO_SPACE = re.compile(
            r"(\.)|(\;)|(\:)|(\!)|(\')|(\?)|(\,)|(\")|(\()|(\))|(\[)|(\])|(\d+)")
        REPLACE_WITH_SPACE = re.compile(r"(<br\s*/><br\s*/>)|(\-)|(\/)")
        NO_SPACE = ""
        SPACE = " "
        reviews = [REPLACE_NO_SPACE.sub(NO_SPACE, line.lower()) for line in reviews]
        reviews = [REPLACE_WITH_SPACE.sub(SPACE, line) for line in reviews]
        return reviews

    def clean(self, reviews_list):
        reviews_list_clean = self.preprocess_reviews(reviews_list)
        return reviews_list_clean

    def get_stemmed_text(self, corpus):
        return [' '.join([self.stemmer.stem(word) for word in review.split()])
                for review in corpus]

    def stemmed_review(self, stemmed_reviews_list):  # reviews_train
        cv = CountVectorizer(binary=True)
        cv.fit(stemmed_reviews_list)
        X = cv.transform(stemmed_reviews_list)
        target = [1 if i < 12500 else 0 for i in range(25000)]
        # NOTE: the train/validation split below is never used; the classifier
        # is fitted on the full data set.
        X_train, X_val, y_train, y_val = train_test_split(X, target, train_size=0.75)
        clf = LogisticRegression(C=0.05)
        clf.fit(X, target)
        # print("Final Accuracy: %s" % accuracy_score(target, clf.predict(X_test)))
        return clf, cv

    def feat_to_coeff(self, clf, vectorized):
        feature_to_coef = {
            word: coef
            for word, coef in zip(vectorized.get_feature_names(), clf.coef_[0])
        }
        for best_positive in sorted(feature_to_coef.items(),
                                    key=lambda x: x[1], reverse=True)[:30]:
            print(best_positive)
        print("\n\n")
        for best_negative in sorted(feature_to_coef.items(),
                                    key=lambda x: x[1])[:30]:
            print(best_negative)
def processQueryToDoStemming(self, words):
    stems = []
    stemmer = PorterStemmer()
    for word in words:
        stems.append(stemmer.stem(word))
    return stems
import matplotlib.pyplot as plt
import nltk
from nltk.stem.porter import PorterStemmer
from nltk.stem import WordNetLemmatizer
from nltk.corpus import stopwords
import re

df = pd.read_csv("train.csv", encoding="ISO-8859-1")
df.isnull().sum()
y = df["Sentiment"]
message = df["SentimentText"]
ps = PorterStemmer()

# -------------------- removing @username from tweets --------------------
message = message.str.replace("@[\w]*", "")
# -------------------- removing hyperlinks from tweets --------------------
message = message.str.replace('https?:\/\/\S+', '')
# -------------------- removing RT from tweets --------------------
message = message.str.replace('RT[\s]+', '')

# -------------------- removing unwanted symbols and stopwords --------------------
corpus = []
for i in range(len(message)):
    review = re.sub("[^a-zA-Z]", " ", message[i])
import nltk
import string
from nltk.stem.porter import PorterStemmer
from nltk.stem import WordNetLemmatizer
# from gensim.summarization import keywords
# from textblob import TextBlob
import matplotlib.pyplot as plt

state = 'Georgia'
# n6download_shell()
with open('tweets_{}.json'.format(state), 'r') as f:
    tweets = json.load(f)

ps = PorterStemmer()
# wnl = WordNetLemmatizer()
stem_tweet = []
stopwords = nltk.corpus.stopwords.words('english')
p = string.punctuation
d = string.digits
table_p = string.maketrans(p, len(p) * " ")
table_d = string.maketrans(d, len(d) * " ")
wordcloud_tweet = []
txt1 = ''
txt2 = ''
for twt in tweets:
    tx = unicodedata.normalize('NFKD', twt).encode('ascii', 'ignore')
    txt1 = tx.translate(table_p)
import dataset
from nltk.tokenize import RegexpTokenizer
from nltk.stem.porter import PorterStemmer
from stop_words import get_stop_words

db = dataset.connect('sqlite:///news.db')
articles = []
tokenizer = RegexpTokenizer(r'\w+')
stop_words = get_stop_words('en')
p_stemmer = PorterStemmer()

for article in db['articles'].all():
    text = article['title'].lower().strip()
    text += " " + article['textContent'].lower().strip()
    if not text:
        continue
    # Tokenize
    tokens = tokenizer.tokenize(text)
    # Remove stop words and small words
    clean_tokens = [i for i in tokens if not i in stop_words]
    clean_tokens = [i for i in clean_tokens if len(i) > 2]
    # Stem tokens
    stemmed_tokens = [p_stemmer.stem(i) for i in clean_tokens]
    # Add to list
    articles.append((article['title'], stemmed_tokens))

print(articles[0])

from gensim import corpora
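# The excerpt stops right after importing gensim's corpora module. A minimal
# sketch of the usual next step in this pattern -- building an id <-> term
# dictionary and a bag-of-words corpus from the stemmed tokens (this
# continuation is an assumption, not the original author's code):
texts = [tokens for (_title, tokens) in articles]
dictionary = corpora.Dictionary(texts)
corpus = [dictionary.doc2bow(text) for text in texts]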
dataset.hist(column='length', by='feedback', bins=50, figsize=(10, 4))

# importing the dataset again
data = pd.read_csv('data.tsv', delimiter='\t', quoting=3)

# cleaning and stemming the texts
import re
import nltk
from nltk.corpus import stopwords
from nltk.stem.porter import PorterStemmer

corpus = []
for i in range(0, 3150):
    review = re.sub('[^a-zA-Z]', ' ', data['verified_reviews'][i])
    review = review.lower()
    review = review.split()
    ps = PorterStemmer()
    review = [ps.stem(word) for word in review
              if not word in set(stopwords.words('english'))]
    review = ' '.join(review)
    corpus.append(review)

# creating the Bag of Words model
from sklearn.feature_extraction.text import CountVectorizer
cv = CountVectorizer(max_features=1500)
X = cv.fit_transform(corpus).toarray()
y = data.iloc[:, 4].values

# Splitting the dataset into the Training set and Test set
from sklearn.model_selection import train_test_split
X_train, X_test, y_train, y_test = train_test_split(X, y, test_size=0.20, random_state=0)

# Feature Scaling
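# The excerpt ends at the "Feature Scaling" comment with the code missing.
# A minimal sketch of that step with scikit-learn's StandardScaler, assuming
# the X_train/X_test split above (whether scaling count features helps here
# is a judgment call):
from sklearn.preprocessing import StandardScaler
sc = StandardScaler()
X_train = sc.fit_transform(X_train)  # fit on the training data only
X_test = sc.transform(X_test)        # reuse the training statistics on the test set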
file.close()

df4['abstract'] = df4['abstract'].apply(
    lambda x: " ".join(x for x in str(x).split()
                       if not x.isdigit() and not x.isspace()))
df4['abstract'] = df4['abstract'].str.replace('[^\w\s,]', '')
# df4['abstract'] = df4['abstract'].str.lower()

# Topic modeling with LDA and Gensim
tokenizer = RegexpTokenizer(r'\w+')

# create English stop words list
en_stop = get_stop_words('en')
stop_plus = ['word', 'count', 'text', 'all', 'right', 'no', 'without',
             'abstract', 'no', 'reuse', 'without', 'abstract', 'nan']

# Create PorterStemmer
p_stemmer = PorterStemmer()

# create list of documents
abstract_set = []
for abstract in df1['abstract'].dropna():
    abstract_set.append(abstract)
for abstract in df2['abstract'].dropna():
    abstract_set.append(abstract)
for abstract in df3['abstract'].dropna():
    abstract_set.append(abstract)
for abstract in df4['abstract'].dropna():
    abstract_set.append(abstract)

# list for tokenized documents in loop
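# The excerpt cuts off at the comment introducing the tokenization loop.
# A minimal sketch of how that loop typically proceeds in this gensim/LDA
# pattern, reusing the names defined above (the continuation is an assumption):
texts = []
for abstract in abstract_set:
    tokens = tokenizer.tokenize(abstract.lower())
    # drop generic English stopwords plus the domain-specific ones
    stopped = [t for t in tokens if t not in en_stop and t not in stop_plus]
    texts.append([p_stemmer.stem(t) for t in stopped])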
# Cleaning the texts
import re
# nltk.download('stopwords')
from nltk.corpus import stopwords
from nltk.stem.porter import PorterStemmer

corpus = []
for i in range(0, 3896):
    text = dataset['text'][i]
    # 'not' is replaced by 'nots' so that it will not be removed as a stopword
    text = re.sub("not", "nots", text)
    # keep letters only (the original discarded this step by re-reading the raw text afterwards)
    text = re.sub('[^a-zA-Z]', ' ', text)
    text = text.lower()
    text = text.split()
    ps = PorterStemmer()
    text = [ps.stem(word) for word in text
            if not word in set(stopwords.words('english'))]
    text = ' '.join(text)
    corpus.append(text)

# Comparison models
from sklearn.tree import DecisionTreeClassifier
from sklearn.svm import SVC
from sklearn.ensemble import RandomForestClassifier
from sklearn.linear_model import LogisticRegression
from sklearn.metrics import accuracy_score
from sklearn.metrics import confusion_matrix
from scipy.sparse import lil_matrix
from sklearn.metrics import classification_report
from matplotlib import pyplot as plt
from sklearn.feature_extraction.text import CountVectorizer
def do_stemming(filtered):
    stemmer = PorterStemmer()  # create once, not per token
    stemmed = []
    for f in filtered:
        stemmed.append(stemmer.stem(f))
    return stemmed
# Importing the dataset
dataset = pd.read_csv('Restaurant_Reviews.tsv', delimiter='\t', quoting=3)

# Cleaning the texts
import re
import nltk
nltk.download('stopwords')
from nltk.corpus import stopwords
from nltk.stem.porter import PorterStemmer

corpus = []
for i in range(0, 1000):
    review = re.sub('[^a-zA-Z]', ' ', dataset['Review'][i])
    review = review.lower()
    review = review.split()
    ps = PorterStemmer()
    all_stopwords = stopwords.words('english')
    all_stopwords.remove('not')  # keep 'not' so negations survive cleaning
    review = [ps.stem(word) for word in review if not word in set(all_stopwords)]
    review = ' '.join(review)
    corpus.append(review)

# Creating the Bag of Words model
from sklearn.feature_extraction.text import CountVectorizer
cv = CountVectorizer(max_features=1500)
X = cv.fit_transform(corpus).toarray()
y = dataset.iloc[:, -1].values

# Splitting the dataset into the Training set and Test set
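# The excerpt stops at the comment announcing the train/test split. A minimal
# sketch of that step, mirroring the sibling snippets in this collection
# (the 80/20 ratio and seed are assumptions):
from sklearn.model_selection import train_test_split
X_train, X_test, y_train, y_test = train_test_split(X, y, test_size=0.20,
                                                    random_state=0)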
import nltk
nltk.download('wordnet')
from nltk.stem.porter import PorterStemmer
porter_stemmer = PorterStemmer()
from nltk.stem import WordNetLemmatizer
wordnet_lemmatizer = WordNetLemmatizer()

# stemming
word_data = "It originated from the idea that there are readers who prefer learning new skills from the comforts of their drawing rooms"
# First, word tokenization
nltk_tokens = nltk.word_tokenize(word_data)
# Next, find the root of each word
for w in nltk_tokens:
    print("Actual: %s  Stem: %s" % (w, porter_stemmer.stem(w)))

print('-----------------------------')
print('lemmatization')

# lemmatization
word_data2 = "It originated from the idea that there are readers who prefer learning new skills from the comforts of their drawing rooms"
nltk_tokens = nltk.word_tokenize(word_data2)
for w in nltk_tokens:
    print("Actual: %s  Lemma: %s" % (w, wordnet_lemmatizer.lemmatize(w)))
from sklearn.metrics.pairwise import cosine_similarity

# Stemming and lemmatisation
from nltk.stem.porter import PorterStemmer
from nltk.stem.wordnet import WordNetLemmatizer
import re
import nltk
from nltk.corpus import stopwords
from nltk.tokenize import RegexpTokenizer

# Get corpus and CountVectorizer
from sklearn.feature_extraction.text import CountVectorizer

nltk.download('wordnet')
nltk.download('stopwords')

lem = WordNetLemmatizer()
stem = PorterStemmer()
stop_words = set(stopwords.words("english"))
new_words = ['not_the']
stop_words = stop_words.union(new_words)  # should 'because' be added too?

def preprocess(df, reset_list=[',', '.', '?', ';', 'however', 'but']):
    corpus = []
    for i in tqdm(range(df.shape[0])):
        text = df['review_text'][i]
        change_flg = 0
        # Convert to lowercase
        text = text.lower()
        # Convert the review text from string to list, then loop through it
import re
import nltk
nltk.download('stopwords')
from nltk.corpus import stopwords
from nltk.stem.porter import PorterStemmer

corpus = []
for index in dataset.index:
    review = re.sub('[^a-zA-Z]', ' ', dataset['Review'][index])
    review = review.lower()
    review = review.split()
    # Remove words not useful for NLP, like 'the', 'and', 'or', etc.
    review = [word for word in review
              if not word in set(stopwords.words('english'))]
    # Stemming: finding the roots of the words
    porterStemmer = PorterStemmer()
    review = [porterStemmer.stem(wrd) for wrd in review]
    review = ' '.join(review)
    corpus.append(review)

# Creating the Bag of Words model
from sklearn.feature_extraction.text import CountVectorizer
countVectorizer = CountVectorizer(max_features=1500)
X = countVectorizer.fit_transform(corpus).toarray()
y = dataset.iloc[:, 1].values

# Splitting the data into train and test
# (sklearn.cross_validation was removed in newer scikit-learn; model_selection is the current home)
from sklearn.model_selection import train_test_split
X_train, X_test, y_train, y_test = train_test_split(
    X, y, test_size=0.20, random_state=0)  # the excerpt was cut off mid-call; these split parameters are assumed
paragraphs = list(map(lambda p: " ".join(p.split("\r\n")), paragraphs))

# 1.4 Tokenize
paragraphs = list(map(lambda p: p.split(), paragraphs))

# 1.5 Remove punctuation
import string
paragraphs = list(
    map(lambda p: list(map(lambda w: w.strip(string.punctuation).lower(), p)),
        paragraphs))
paragraphs = list(
    map(lambda p: list(filter(lambda w: len(w) != 0, p)), paragraphs))

# 1.6 Stem words
from nltk.stem.porter import PorterStemmer
stemmer = PorterStemmer()
paragraphs = list(
    map(lambda p: list(map(lambda w: stemmer.stem(w), p)), paragraphs))  # bottleneck

# 2.0 Build a dictionary
import gensim
dictionary = gensim.corpora.Dictionary(paragraphs)

# 2.1 Filter away stopwords
stopwords = "a,able,about,across,after,all,almost,also,am,among,an,and,any,are,as,at,be,because,been,but,by,can,cannot,could,dear,did,do,does,either,else,ever,every,for,from,get,got,had,has,have,he,her,hers,him,his,how,however,i,if,in,into,is,it,its,just,least,let,like,likely,may,me,might,most,must,my,neither,no,nor,not,of,off,often,on,only,or,other,our,own,rather,said,say,says,she,should,since,so,some,than,that,the,their,them,then,there,these,they,this,tis,to,too,twas,us,wants,was,we,were,what,when,where,which,while,who,whom,why,will,with,would,yet,you,your".split(",")
token2id = dictionary.token2id
stop_ids = [token2id[s] for s in stopwords if s in token2id]
dictionary.filter_tokens(stop_ids)
def __init__(self, filenames, parser, batch_size=32, stemmer=PorterStemmer(),
             max_sentence_length=None, build_voc=False, voc_path='voc',
             voc_threshold=1):
    self.filenames = filenames
    self.parser = parser
    self.batch_size = batch_size
    self.stemmer = stemmer
    self.len_doc = 0
    self.voc_threshold = voc_threshold
    if build_voc:
        print(self.filenames)
        vb = VocBuilder(self.filenames, self.parser, voc_path=voc_path,
                        voc_threshold=self.voc_threshold, stemmer=stemmer)
        vb.build_vocab()
        # max_sentence_length = vb.max_sentence_length
        # print("max sentence length {}".format(max_sentence_length))
    voc_components = ['index2word.json', 'word2index.json', 'voc_summary.json']
    # sanity check to see if all vocab files are present
    try:
        for item in voc_components:
            assert item in os.listdir(voc_path)
    except FileNotFoundError:
        raise Exception(
            "vocabulary has not been created; set build_voc=True in the "
            "BatchGenerator constructor")
    # count the total number of lines in the input documents
    for filename in self.filenames:
        number_of_lines = 0
        f = codecs.open(filename, 'r', encoding="utf8", errors='ignore')
        for line in f:
            number_of_lines += 1
        self.len_doc += number_of_lines
    if self.batch_size is None:
        self.batch_size = self.len_doc
    print("loading voc data...")
    with open(os.path.join(os.getcwd(), voc_path, 'index2word.json')) as data_file:
        self.index2word = json.load(data_file)
    self.index2word[len(self.index2word)] = '<PAD>'
    with open(os.path.join(os.getcwd(), voc_path, 'word2index.json')) as data_file:
        self.word2index = json.load(data_file)
    self.word2index['<PAD>'] = 0
    with open(os.path.join(os.getcwd(), voc_path, 'voc_summary.json')) as data_file:
        self.voc_summary = json.load(data_file)
    if max_sentence_length is None:
        self.sequence_len = self.voc_summary['max_sequence_len']
    elif max_sentence_length > self.voc_summary['max_sequence_len']:
        self.sequence_len = self.voc_summary['max_sequence_len']
    else:
        self.sequence_len = max_sentence_length
    self.vocab_size = len(self.index2word)
    print("...voc data loaded")
def tokenize(self, text):
    stemmer = PorterStemmer()
    tokens = nltk.word_tokenize(text)
    stems = self.stem_tokens(tokens, stemmer)
    return stems
example = "Automation automatic automated automotive"
example_lower = example.lower().split()
print(example_lower)

# In[14]:

# stemming
# import the stemmer
from nltk.stem.porter import PorterStemmer
ps = PorterStemmer()

# In[15]:

# for-loop applying the Porter stemmer
for word in example_lower:
    stemmed_word = ps.stem(word)
    print(stemmed_word)

# #### Lemmatization:
specify that the delimiter is a tab... and quoting is used to ignore "" quotes'''

# cleaning the text
import re
import nltk
nltk.download('stopwords')  # the stopwords corpus contains words that are not relevant in reviews
from nltk.corpus import stopwords
from nltk.stem.porter import PorterStemmer  # used so that loved = loving = love

corpus = []
for i in range(0, 1000):
    # '^' negates the class, so everything except alphabetic characters is removed
    reviews = re.sub('[^a-zA-Z]', ' ', dt['Review'][i])
    reviews = reviews.lower()  # lowercase all letters
    reviews = reviews.split()
    ps = PorterStemmer()
    # set() is used only to speed up the lookup for large reviews
    reviews = [ps.stem(word) for word in reviews
               if not word in set(stopwords.words('english'))]
    reviews = ' '.join(reviews)
    corpus.append(reviews)

# Bag of Words model
from sklearn.feature_extraction.text import CountVectorizer
# a stopwords parameter could also be passed to CountVectorizer and similar functions;
# max_features drops the least relevant words, reducing the features from 1565 to 1500
cv = CountVectorizer(max_features=1500)
X = cv.fit_transform(corpus).toarray()
y = dt.iloc[:, 1].values

# fit the Naive Bayes model
from sklearn.model_selection import train_test_split
X_train, X_test, y_train, y_test = train_test_split(X, y, test_size=0.2, random_state=0)
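# The comment above announces a Naive Bayes fit, but the excerpt stops after
# the split. A minimal sketch of that step (the choice of GaussianNB is an
# assumption; MultinomialNB is also common for count features):
from sklearn.naive_bayes import GaussianNB
classifier = GaussianNB()
classifier.fit(X_train, y_train)
y_pred = classifier.predict(X_test)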
label_y = LabelEncoder()
y_cat1 = label_y.fit_transform(y)
onehotencoder = OneHotEncoder(categorical_features=[0])
y_cat = onehotencoder.fit_transform(y_cat1.reshape(-1, 1)).toarray()
y_cat = y_cat[:, 1:13]

##################################################################################
############         train neural networks         ############
ps = PorterStemmer()
corpus = []
for i in range(0, 40000):
    # the original pattern '[^a-zA-z]' also matched [ \ ] ^ _ ` by accident; fixed to '[^a-zA-Z]'
    test_train = re.sub('[^a-zA-Z]', ' ', dataset['content'][i])
    test_train = test_train.lower()
    test_train = test_train.split()
    test_train = [ps.stem(word) for word in test_train]
    test_train = ' '.join(test_train)
    corpus.append(test_train)

cv = CountVectorizer(max_features=30000)
x = cv.fit_transform(corpus).toarray()
# y_cat = y_cat[0:2000]

##################################################################################
####### splitting
from sklearn.model_selection import train_test_split
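# The excerpt cuts off at the splitting step. A minimal sketch, assuming the
# same 80/20 convention as the other snippets here (parameters are assumptions):
x_train, x_test, y_train, y_test = train_test_split(x, y_cat, test_size=0.2,
                                                    random_state=0)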
def data_cleaning():
    tokenized_collection = []
    # loop through all xml files
    for file_name in range(5000):
        e = xml.etree.ElementTree.parse('./data/' + str(file_name + 1) + '.xml').getroot()
        # check if empty
        if not e.text:
            tokenized_collection.append([])
        else:
            xml_text = word_tokenize(e.text)
            xml_text_stop = []
            # remove empty words, and remove punctuation
            xml_text_stop = list(filter(None, xml_text))
            for index, w in enumerate(xml_text_stop):
                xml_text_stop[index] = re.sub(r'[^\w\s]', '', w)
            # stop word processing
            xml_text_stop_fin = []
            stopWords = set(stopwords.words('english'))
            for w in xml_text_stop:
                if w not in stopWords:
                    xml_text_stop_fin.append(w)
            # stemming
            ps = PorterStemmer()
            xml_text_stop_fin = [ps.stem(a) for a in xml_text_stop_fin]
            # remove words shorter than 3 characters
            xml_text_stop_fin = [word for word in xml_text_stop_fin if len(word) >= 3]
            # remove duplicates
            xml_text_stop_cleaned = list(set(xml_text_stop_fin))
            # remove numbers and add a hasNum feature
            # (NOTE: deleting from the list while enumerating it can skip elements, as in the original)
            for index, w in enumerate(xml_text_stop_cleaned):
                if str(w).isdigit():
                    del xml_text_stop_cleaned[index]
                    if 'hasNum' not in xml_text_stop_cleaned:
                        xml_text_stop_cleaned.append('hasNum')
            # print(xml_text_stop_fin)
            tokenized_collection.append(xml_text_stop_cleaned)
    # remove features that are mentioned fewer than 5 times in the dataset
    # create the feature set
    feature_set = set()
    for item in tokenized_collection:
        for w in item:
            if w not in feature_set:
                feature_set.add(w)
    # look up occurrence frequencies
    for feature in feature_set.copy():
        feature_count = 0
        for tokenized_collection_item in tokenized_collection:
            feature_count += tokenized_collection_item.count(feature)
        if feature_count < 5:
            for tokenized_collection_item_second in tokenized_collection:
                while feature in tokenized_collection_item_second:
                    tokenized_collection_item_second.remove(feature)
            feature_set.remove(feature)
    print(len(feature_set))
    # print(tokenized_collection)
    return feature_set, tokenized_collection
tokens = preprocess(txt, lowercase=True)
punctuation = list(string.punctuation)
en_stop = get_stop_words('en')
stop = stopwords.words('english') + punctuation + [
    'http', 'html', 'com', ':/', 'rt', 'via', "https", "com"
]
terms_stop = [term for term in tokens
              if term not in stop and len(term) > 2 and not term in en_stop]
stopped_tokens = [i for i in terms_stop if not i in en_stop]

# In[8]:

p_stemmer = PorterStemmer()
texts = [p_stemmer.stem(i) for i in terms_stop]
print len(texts)

# In[22]:

# with open('processed_texts.txt', 'w') as fp:
#     fp.write(str(texts))

# In[34]:

# f = open(mydir + "processed_texts.txt", 'r')
# texts = f.read()
# texts = texts.split()
# len(texts)
from nltk.stem.wordnet import WordNetLemmatizer
lem = WordNetLemmatizer()

from nltk.stem.porter import PorterStemmer
stem = PorterStemmer()

word = 'flying'
stemmed = stem.stem(word)
print(f'Lemmatized Word: {lem.lemmatize(word, "v")}')  # -> fly
print(f'Stemmed Word: {stemmed}')                      # -> fli
# In[5]:

from nltk.corpus import stopwords
sw = stopwords.words('english')
clean_tokens = [token for token in tokens if token not in sw]

# In[6]:

clean_tokens
# ['Citizens', 'India', 'known', 'Indians']

# In[7]:

from nltk.stem.porter import PorterStemmer
pstemmer = PorterStemmer()
[pstemmer.stem(token) for token in clean_tokens]
# ['citizen', 'india', 'known', 'indian']

# In[8]:

from nltk.stem import WordNetLemmatizer
lemmatizer = WordNetLemmatizer()
[lemmatizer.lemmatize(token) for token in clean_tokens]
# ['Citizens', 'India', 'known', 'Indians']
def __init__(self, stemmer=PorterStemmer()):
    self._stemmer = stemmer