def find_valid_pair(self, path='./ratings2.txt', final_path='./ratings_final.txt'):
    # ps is assumed throughout these snippets to be nltk.stem.PorterStemmer imported as ps
    st = ps()
    cp_collection = []
    docs = []
    new_docs = []
    new_word_pair = []
    key = self.wordvector_dict.keys()
    for line in open(path):
        docs.append(line)
    for index in range(len(self.word_pair)):
        self.word_pair[index] = (st.stem(self.word_pair[index][0]), st.stem(self.word_pair[index][1]))
        # Keep the pair only if both stems appear in their documents, in the word map, and in the vector dictionary
        if (self.word_pair[index][0] in docs[index * 2] and self.word_pair[index][1] in docs[index * 2 + 1]
                and self.word_pair[index][0] in self.wordmap_collection
                and self.word_pair[index][1] in self.wordmap_collection
                and self.word_pair[index][0] in key and self.word_pair[index][1] in key):
            self.index_collection.append(index)
            cp_collection.append(self.collection[index])
    #return index_collection
    for i in self.index_collection:
        new_docs.append(docs[i * 2])
        new_docs.append(docs[i * 2 + 1])
        new_word_pair.append(self.word_pair[i])
    self.word_pair = new_word_pair
    # Write the surviving documents preceded by their count; strip the trailing newline of the last one
    file = open(final_path, 'w')
    file.write(str(len(new_docs)) + '\n')
    for item in range(len(new_docs) - 1):
        file.write(new_docs[item])
    pat = re.compile('\n')
    new_docs[-1] = re.sub(pat, '', new_docs[-1])
    file.write(new_docs[-1])
    file.close()
    self.collection = cp_collection
def avg_calc(self, mtopic, nword, ntopic):
    st = ps()
    for item in range(len(self.collection)):
        word1 = st.stem(self.collection[item][1])
        word2 = st.stem(self.collection[item][3])
        word1_3 = self.pzIwc(2 * item, word1, topic_range=200)
        word2_3 = self.pzIwc(2 * item + 1, word2, topic_range=200)
        W1 = []
        W2 = []
        # Build one vector per candidate topic for each word of the pair
        for itemx in word1_3[0]:
            W1.append(self.buildFinalVector(mtopic, nword, ntopic, word1, itemx, [0, 0]))
        for itemx in word2_3[0]:
            W2.append(self.buildFinalVector(mtopic, nword, ntopic, word2, itemx, [0, 0]))
        #W1.append(self.buildFinalVector(mtopic,nword,ntopic,word1,0,word1_3[1]))
        #W2.append(self.buildFinalVector(mtopic,nword,ntopic,word2,0,word2_3[1]))
        # Probability-weighted average similarity over all topic-vector combinations
        avg_sim = 0
        for item1 in range(len(W1)):
            for item2 in range(len(W2)):
                avg_sim += (self.P[word1 + '_' + str(2 * item)][item1]
                            * self.P[word2 + '_' + str(2 * item + 1)][item2]
                            * self.similarity(W1[item1], W2[item2]))
        self.avg_sim_total.append(avg_sim)
        file = open('./final_avg.txt', 'a')
        file.write(str(avg_sim) + ' ' + str(self.collection[item][7]) + ' '
                   + str(self.collection[item][1]) + ' ' + str(self.collection[item][3]) + '\n')  # 7(avg),18(max),19(min)
        file.close()
        print('Iteration %d completed!' % item)
    print('AVGSim completed!')
    return self.avg_sim_total
def Modify(suffix='./wikicorp.201004.m2.txt',
           stopWords=['i', 'a', 'about', 'an', 'as', 'are', 'at', 'be', 'by', 'com', 'de', 'en', 'for', 'from',
                      'how', 'in', 'is', 'it', 'la', 'of', 'on', 'or', 'that', 'the', 'this', 'to', 'was',
                      'what', 'when', 'where', 'who', 'will', 'with', 'www']):
    newFileName = 'wikicorp.201004.m3.txt'
    newFile = open(newFileName, 'w')
    n = 0
    #st=ls()
    st2 = ps()
    d = enchant.Dict("en_US")
    for line in open(suffix):
        n += 1
        x = line.split()
        for item in range(len(x)):
            # Drop stop words; stem words the dictionary recognises, blank out the rest
            if x[item] in stopWords:
                x[item] = ''
                continue
            if d.check(x[item]):
                x[item] = st2.stem(x[item])
            else:
                x[item] = ''
                #x[item]=st.stem(x[item])
        x = sorted(x)
        for xline in range(len(x) - 1):
            if x[xline] != '':
                newFile.write(x[xline] + ' ')
        if x[-1] != '':
            newFile.write(x[-1])
        else:
            # Back up over the trailing space just written; an end-relative seek like
            # this only works on a file opened in binary mode (or under Python 2)
            newFile.seek(-1, 2)
        if n != 3035070:  # 3035070 appears to be the corpus line count; skip the final newline
            newFile.write('\n')
    newFile.close()
def Modify(suffix='./ratings1.txt',
           stopWords=['i', 'a', 'about', 'an', 'as', 'are', 'at', 'be', 'by', 'com', 'de', 'en', 'for', 'from',
                      'how', 'in', 'is', 'it', 'la', 'of', 'on', 'or', 'that', 'the', 'this', 'to', 'was',
                      'what', 'when', 'where', 'who', 'will', 'with', 'www']):
    newFileName = 'ratings2.txt'
    newFile = open(newFileName, 'w')
    n = 0
    #st=ls()
    #pat=re.compile('[a-zA-Z\s]')
    pat = r'[^a-zA-Z\s]'  # any character that is not a letter or whitespace
    st2 = ps()
    d = enchant.Dict("en_US")
    for line in open(suffix):
        n += 1
        x = line.split()
        for item in range(len(x)):
            if x[item] in stopWords:
                x[item] = ''
                continue
            # Keep only dictionary words made of letters; lowercase and stem them
            if d.check(x[item]) and x[item] not in string.punctuation and re.match(pat, x[item]) is None:
                x[item] = st2.stem(x[item].lower())
            else:
                x[item] = ''
                #x[item]=st.stem(x[item])
        x = sorted(x)
        for xline in range(len(x) - 1):
            if x[xline] != '':
                newFile.write(x[xline] + ' ')
        if x[-1] != '':
            newFile.write(x[-1])
        else:
            # Back up over the trailing space; requires binary mode (or Python 2) for an end-relative seek
            newFile.seek(-1, 2)
        if n != 4006:  # 4006 appears to be the line count of ratings1.txt; skip the final newline
            newFile.write('\n')
    newFile.close()
def version2():
    # Data cleaning in NLP Model
    corpus = []
    stemmer = ps()
    english_stopwords = set(sw.words('english'))
    for i in range(0, 527383):
        # Keep only letters in the review, lowercase, drop stop words, and stem each token
        review = re.sub('[^a-zA-Z]', ' ', df.iloc[i, 1])
        review = review.lower().split()
        review = [stemmer.stem(word) for word in review if word not in english_stopwords]
        corpus.append(" ".join(review))
    labels = df.iloc[:, -1]
    # Split the raw text, then fit the vectorizer on the training fold only;
    # random_state=100 is an assumption for the bare positional 100
    features_train, features_test, labels_train, labels_test = train_test_split(corpus, labels, random_state=100)
    vectorizer = cv()
    features_train_vectorized = vectorizer.fit_transform(features_train)
    features_test_vectorized = vectorizer.transform(features_test)
    model = lr().fit(features_train_vectorized, labels_train)
    predictions = model.predict(features_test_vectorized)
    print(ras(labels_test, predictions))
    print(cm(labels_test, predictions))
    return model
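# Hypothetical harness for version2() above (an assumption, not part of the snippet):
# the short aliases look like NLTK / scikit-learn imports, and `df` is presumed to be
# a pandas DataFrame of 527,383 reviews with the text in column 1 and the label in the
# last column. The file name below is illustrative only.
import re
import pandas as pd
from nltk.corpus import stopwords as sw
from nltk.stem import PorterStemmer as ps
from sklearn.feature_extraction.text import CountVectorizer as cv
from sklearn.linear_model import LogisticRegression as lr
from sklearn.model_selection import train_test_split
from sklearn.metrics import roc_auc_score as ras, confusion_matrix as cm

df = pd.read_csv('reviews.csv')  # hypothetical input file
model = version2()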
def word_diff(self):
    """
    This function preserves the ending of each word, since it
    gets destroyed after stemming.
    """
    stmr = ps()
    word_dict = {stmr.stem(word): [] for word in self.words}
    for word in self.words:
        stemmed = stmr.stem(word)
        st_temp = stmr.stem(word)
        # Trim the stem until it is a true prefix of the original word (or empty)
        while not word.startswith(st_temp) and len(st_temp) != 0:
            st_temp = st_temp[:-1]
        # The removed ending is whatever follows that prefix
        word_dict[stemmed].append(word[len(st_temp):])
    return word_dict
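# Minimal usage sketch for word_diff() (assumed harness, not from the original code):
# any object exposing a `words` attribute will do, so a SimpleNamespace stands in for
# the host class here.
from types import SimpleNamespace
from nltk.stem import PorterStemmer as ps

endings = word_diff(SimpleNamespace(words=['running', 'runs', 'easily']))
print(endings)  # e.g. {'run': ['ning', 's'], 'easili': ['y']}, depending on the stemmer version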
def tok_tweet(tweet):
    stemmer = ps()
    tweet = tweet.strip()
    words = tweet.split()
    tokenlist = []
    exclude = set(string.punctuation)
    punc = string.punctuation
    punc = punc.replace('#', '')  # we have special handling for #tags
    exclude_punc = set(punc)
    for word in words:
        word = word.strip()
        word = word.lower()
        # Replace URLs with @http and then with blank -- think about this later (phase 2)
        if word.startswith('www') or word.startswith('http') or word.startswith("@") or word.isdigit():
            continue  # ignore if word is a URL, @mention, or contains only numbers
        if ''.join(ch for ch in word if ch not in exclude) == '':
            continue  # remove word if it is a sequence of punctuation characters
        nword = ''.join(ch for ch in word if ch not in exclude_punc)
        tokenlist.append(stemmer.stem(nword))
    return ' '.join(tokenlist)
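# Illustrative call for tok_tweet() above; the exact tokens depend on the NLTK Porter
# stemmer version, but URLs, @mentions and pure punctuation are dropped while #tags
# keep their hash.
import string
from nltk.stem import PorterStemmer as ps

print(tok_tweet('Loving the #weather today!!! http://t.co/abc @friend'))
# -> roughly 'love the #weather today'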
def computeSentiment(document):
    # Pickled files must be opened in binary mode
    senti = pickle.load(open('sentiWordNet.p', 'rb'))
    updatedSenti = {}
    stemmer = ps()
    for word in senti:
        # Strip the trailing two characters (apparently a POS tag such as '#a') and key on the stem
        updatedSenti[stemmer.stem(word[:-2])] = senti[word]
    pos = 0
    neg = 0
    neu = 0
    count = 0
    for word in document.split():
        if word in updatedSenti:
            pos += updatedSenti[word]['posScore']
            #neg += updatedSenti[word]['negScore']
            #neu += updatedSenti[word]['neuScore']
            count += 1
        else:
            pass
            #print word
    #print (float)(pos)/count
    #print (float)(neg)/count
    #print (float)(neu)/count
    return float(pos) / count
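# Hedged usage sketch for computeSentiment(): the code implies that sentiWordNet.p
# holds a dict mapping SentiWordNet-style keys (e.g. 'good#a') to score dicts with
# 'posScore', 'negScore' and 'neuScore' entries; that layout is inferred, not confirmed.
import pickle
from nltk.stem import PorterStemmer as ps

score = computeSentiment('this movie was surprisingly good and enjoyable')
print(score)  # average positive score over the words found in the lexicon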
def tok_tweet(tweet):
    stemmer = ps()
    tweet = tweet.strip()
    words = tweet.split()
    tokenlist = []
    exclude = set(string.punctuation)
    punc = string.punctuation
    punc = punc.replace('#', '')
    exclude_punc = set(punc)
    for word in words:
        word = word.strip()
        word = word.lower()
        if word in stopwords.words('english'):
            continue
        #Replace URLs with @http and then with blank
        if word.startswith('www') or word.startswith('http') or word.startswith("@") or word.isdigit() or word == 'rt':
            continue  # ignore if word is a url, @mention or contains only numbers or is a stopword
        nword = ''.join(ch for ch in word if ch not in exclude_punc)
        tokenlist.append(stemmer.stem(nword))
    return ' '.join(tokenlist)
def tok_tweet(tweet):
    stemmer = ps()
    tweet = tweet.strip()
    words = tweet.split()
    tokenlist = []
    exclude = set(string.punctuation)
    punc = string.punctuation
    punc = punc.replace('#', '')  # we have special handling for #tags
    exclude_punc = set(punc)
    for word in words:
        word = word.strip()
        word = word.lower()
        # Replace URLs with @http and then with blank -- think about this later (phase 2)
        if word.startswith('www') or word.startswith('http') or word.startswith("@") or word.isdigit():
            continue  # ignore if word is a URL, @mention, or contains only numbers
        if ''.join(ch for ch in word if ch not in exclude) == '':
            continue  # remove word if it is a sequence of punctuation characters
        nword = ''.join(ch for ch in word if ch not in exclude_punc)
        tokenlist.append(stemmer.stem(nword))
    return ' '.join(tokenlist)
def stemmer(self):
    # Stems a list of words using Porter's algorithm
    stmr = ps()
    return [stmr.stem(word) for word in self.words]
def stemming(obj):
    stem = ps()
    return {stem.stem(word).lower() for word in obj}
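# Small example for stemming() above; PorterStemmer is the assumed class behind the
# `ps` alias used throughout these snippets.
from nltk.stem import PorterStemmer as ps

print(stemming(['Running', 'runner', 'ran', 'easily']))
# -> a set along the lines of {'run', 'runner', 'ran', 'easili'}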