Example #1
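Every snippet in this listing calls ps() without showing the import; presumably it is NLTK's Porter stemmer bound to a short alias, e.g.:

from nltk.stem import PorterStemmer as ps  # presumed alias behind every ps() call below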
def find_valid_pair(self, path='./ratings2.txt', final_path='./ratings_final.txt'):
    # Keep only the word pairs whose stemmed members occur in their two
    # context documents, in the word map, and in the word-vector dictionary.
    st = ps()
    cp_collection = []
    docs = open(path).readlines()
    new_docs = []
    new_word_pair = []
    keys = self.wordvector_dict.keys()
    for index in range(len(self.word_pair)):
        w1 = st.stem(self.word_pair[index][0])
        w2 = st.stem(self.word_pair[index][1])
        self.word_pair[index] = (w1, w2)
        if (w1 in docs[index * 2] and w2 in docs[index * 2 + 1]
                and w1 in self.wordmap_collection and w2 in self.wordmap_collection
                and w1 in keys and w2 in keys):
            self.index_collection.append(index)
            cp_collection.append(self.collection[index])
    for i in self.index_collection:
        new_docs.append(docs[i * 2])
        new_docs.append(docs[i * 2 + 1])
        new_word_pair.append(self.word_pair[i])
    self.word_pair = new_word_pair
    # Write the document count, then the documents; the last document is
    # written without its trailing newline.
    with open(final_path, 'w') as out:
        out.write(str(len(new_docs)) + '\n')
        for doc in new_docs[:-1]:
            out.write(doc)
        out.write(new_docs[-1].replace('\n', ''))
    self.collection = cp_collection
Example #2
def avg_calc(self, mtopic, nword, ntopic):
    # For each word pair, build one vector per topic and accumulate the
    # pairwise similarities, weighted by the topic probabilities in self.P.
    st = ps()
    for item in range(len(self.collection)):
        word1 = st.stem(self.collection[item][1])
        word2 = st.stem(self.collection[item][3])
        word1_3 = self.pzIwc(2 * item, word1, topic_range=200)
        word2_3 = self.pzIwc(2 * item + 1, word2, topic_range=200)
        W1 = []
        W2 = []
        for itemx in word1_3[0]:
            W1.append(self.buildFinalVector(mtopic, nword, ntopic, word1, itemx, [0, 0]))
        for itemx in word2_3[0]:
            W2.append(self.buildFinalVector(mtopic, nword, ntopic, word2, itemx, [0, 0]))
        # Variant (disabled): also include the context-weighted vector.
        # W1.append(self.buildFinalVector(mtopic, nword, ntopic, word1, 0, word1_3[1]))
        # W2.append(self.buildFinalVector(mtopic, nword, ntopic, word2, 0, word2_3[1]))
        avg_sim = 0
        for item1 in range(len(W1)):
            for item2 in range(len(W2)):
                avg_sim += (self.P[word1 + '_' + str(2 * item)][item1]
                            * self.P[word2 + '_' + str(2 * item + 1)][item2]
                            * self.similarity(W1[item1], W2[item2]))
        self.avg_sim_total.append(avg_sim)
        with open('./final_avg.txt', 'a') as out:
            # columns: similarity, human avg rating (index 7; 18 = max, 19 = min),
            # and the two words
            out.write(str(avg_sim) + ' ' + str(self.collection[item][7]) + ' '
                      + str(self.collection[item][1]) + ' '
                      + str(self.collection[item][3]) + '\n')
        print('Iteration %d completed!' % item)
    print('AVGSim completed!')
    return self.avg_sim_total
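The nested loop above computes a probability-weighted average similarity over the per-topic vectors, roughly

    avg_sim(w1, w2) = sum_i sum_j P(z_i | w1, c1) * P(z_j | w2, c2) * sim(v_i(w1), v_j(w2))

which resembles the AvgSimC metric used to evaluate multi-prototype word representations (the P(z|w,c) reading follows the pzIwc helper's name).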
Example #3
def Modify(suffix='./wikicorp.201004.m2.txt',
           stopWords=['i', 'a', 'about', 'an', 'as', 'are', 'at', 'be', 'by',
                      'com', 'de', 'en', 'for', 'from', 'how', 'in', 'is',
                      'it', 'la', 'of', 'on', 'or', 'that', 'the', 'this',
                      'to', 'was', 'what', 'when', 'where', 'who', 'will',
                      'with', 'www']):
    # Clean the Wikipedia corpus: drop stop words and tokens the spell
    # checker rejects, stem what remains, and write one sorted,
    # space-separated line per document.
    newFile = open('wikicorp.201004.m3.txt', 'w')
    n = 0
    st2 = ps()
    d = enchant.Dict("en_US")
    stop = set(stopWords)
    for line in open(suffix):
        n += 1
        x = []
        for token in line.split():
            if token not in stop and d.check(token):
                x.append(st2.stem(token))
        newFile.write(' '.join(sorted(x)))
        if n != 3035070:  # presumably the corpus line count: no newline after the last line
            newFile.write('\n')
    newFile.close()
Example #4
def Modify(suffix='./ratings1.txt',
           stopWords=['i', 'a', 'about', 'an', 'as', 'are', 'at', 'be', 'by',
                      'com', 'de', 'en', 'for', 'from', 'how', 'in', 'is',
                      'it', 'la', 'of', 'on', 'or', 'that', 'the', 'this',
                      'to', 'was', 'what', 'when', 'where', 'who', 'will',
                      'with', 'www']):
    # Clean the ratings corpus like the Wikipedia corpus above, but also
    # lowercase tokens and reject any token containing a non-letter.
    newFile = open('ratings2.txt', 'w')
    n = 0
    pat = r'[^a-zA-Z\s]'
    st2 = ps()
    d = enchant.Dict("en_US")
    stop = set(stopWords)
    for line in open(suffix):
        n += 1
        x = []
        for token in line.split():
            # re.search (not re.match) so a non-letter anywhere in the
            # token rejects it; this also covers pure punctuation.
            if token not in stop and d.check(token) and re.search(pat, token) is None:
                x.append(st2.stem(token.lower()))
        newFile.write(' '.join(sorted(x)))
        if n != 4006:  # presumably the line count of ratings1.txt: no final newline
            newFile.write('\n')
    newFile.close()
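Why re.search rather than re.match in the filter above: re.match anchors at the start of the token, so it only rejects tokens whose first character is a non-letter. A quick check:

import re
pat = r'[^a-zA-Z\s]'
print(re.match(pat, "can't"))   # None: the leading 'c' is a letter, so the token slips through
print(re.search(pat, "can't"))  # a Match object: the apostrophe is caught wherever it sits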
Example #5
def version2():  # data cleaning for a bag-of-words NLP model
    corpus = []

    stops = set(sw.words('english'))  # build once, not per review
    stemmer = ps()
    for i in range(0, 527383):
        # Keep letters only, lowercase, drop stop words, stem.
        review = re.sub('[^a-zA-Z]', ' ', df.iloc[i, 1])
        review = review.lower().split()
        review = [word for word in review if word not in stops]
        review = [stemmer.stem(word) for word in review]
        corpus.append(" ".join(review))

    labels = df.iloc[:, -1]

    # Split the raw documents, fit the vectorizer on the training split
    # only, and reuse the same fitted vectorizer for the test split.
    features_train, features_test, labels_train, labels_test = train_test_split(
        corpus, labels, random_state=100)  # assumption: the original passed 100 positionally

    vectorizer = cv()
    features_train_vectorized = vectorizer.fit_transform(features_train)
    features_test_vectorized = vectorizer.transform(features_test)

    model = lr().fit(features_train_vectorized, labels_train)
    predictions = model.predict(features_test_vectorized)
    print(ras(labels_test, predictions))  # ROC AUC
    print(cm(labels_test, predictions))   # confusion matrix

    return model
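The aliases in Example #5 are not shown by the listing; a plausible preamble, inferred from usage (the df source is a guess):

import re
import pandas as pd
from nltk.corpus import stopwords as sw
from nltk.stem import PorterStemmer as ps
from sklearn.feature_extraction.text import CountVectorizer as cv
from sklearn.linear_model import LogisticRegression as lr
from sklearn.metrics import roc_auc_score as ras, confusion_matrix as cm
from sklearn.model_selection import train_test_split

df = pd.read_csv('reviews.csv')  # hypothetical file holding the 527,383 reviews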
Example #6
def word_diff(self):
    """
    Map each stem to the word endings that stemming removed,
    since the Porter stemmer destroys them.
    """
    stmr = ps()
    word_dict = {stmr.stem(word): [] for word in self.words}

    for word in self.words:
        stemmed = stmr.stem(word)
        # The stem is not always a prefix of the word (e.g. 'happy' ->
        # 'happi'), so shorten it until it is.
        st_temp = stemmed
        while not word.startswith(st_temp) and len(st_temp) != 0:
            st_temp = st_temp[:-1]
        # Whatever follows the shortened stem is the lost ending.
        word_dict[stemmed].append(word[len(st_temp):])
    return word_dict
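A minimal sketch of the mapping, with a hypothetical host class (Demo) supplying self.words:

class Demo:
    def __init__(self, words):
        self.words = words

Demo.word_diff = word_diff  # attach the function above as a method
print(Demo(['running', 'runs', 'happy', 'happiness']).word_diff())
# -> {'run': ['ning', 's'], 'happi': ['y', 'ness']}
# ('happi' is not a prefix of 'happy', so the loop shortens it to 'happ'
#  before taking the ending 'y')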
Example #7
def tok_tweet(tweet):
    stemmer = ps()
    words = tweet.strip().split()
    tokenlist = []
    exclude = set(string.punctuation)
    punc = string.punctuation.replace('#', '')  # keep '#': hashtags get special handling
    exclude_punc = set(punc)
    for word in words:
        word = word.strip().lower()
        # Ignore URLs, @mentions and purely numeric tokens.
        # TODO (phase 2): replace URLs instead of dropping them.
        if word.startswith('www') or word.startswith('http') or word.startswith('@') or word.isdigit():
            continue
        # Drop tokens that are nothing but punctuation.
        if ''.join(ch for ch in word if ch not in exclude) == '':
            continue
        nword = ''.join(ch for ch in word if ch not in exclude_punc)
        tokenlist.append(stemmer.stem(nword))
    return ' '.join(tokenlist)
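For instance (the stems shown are what NLTK's Porter stemmer produces and may vary slightly across versions):

print(tok_tweet('RT @user Loving the new #Python release!!! http://t.co/x 2024'))
# -> 'rt love the new #python releas'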
Example #8
def computeSentiment(document):
    # Load SentiWordNet scores; each key presumably carries a two-character
    # POS suffix (e.g. 'good#a'), which is dropped before stemming.
    senti = pickle.load(open('sentiWordNet.p', 'rb'))
    updatedSenti = {}
    stemmer = ps()
    for word in senti:
        updatedSenti[stemmer.stem(word[:-2])] = senti[word]

    pos = 0
    count = 0
    for word in document.split():
        if word in updatedSenti:
            pos += updatedSenti[word]['posScore']
            # negScore / neuScore could be accumulated the same way.
            count += 1
    # Mean positive score per scored word; guard against documents with
    # no sentiment-bearing words.
    return float(pos) / count if count else 0.0
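The lookups imply that the pickled file holds a dict mapping POS-suffixed words to score dicts; a hypothetical instance (values made up):

senti = {'good#a': {'posScore': 0.75, 'negScore': 0.0},
         'bad#a':  {'posScore': 0.0,  'negScore': 0.625}}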
Example #9
def tok_tweet(tweet):
    stemmer = ps()
    words = tweet.strip().split()
    tokenlist = []
    punc = string.punctuation.replace('#', '')  # keep '#' for hashtag handling
    exclude_punc = set(punc)
    stops = set(stopwords.words('english'))  # build the set once, not per word

    for word in words:
        word = word.strip().lower()
        if word in stops:
            continue
        # Ignore URLs, @mentions, retweet markers and purely numeric tokens.
        if (word.startswith('www') or word.startswith('http')
                or word.startswith('@') or word.isdigit() or word == 'rt'):
            continue
        nword = ''.join(ch for ch in word if ch not in exclude_punc)
        tokenlist.append(stemmer.stem(nword))
    return ' '.join(tokenlist)
Example #10
(Duplicate of Example #7.)
Example #11
def stemmer(self):
    # Stem a list of words using Porter's algorithm.
    stmr = ps()
    return [stmr.stem(word) for word in self.words]

def stemming(obj):
    stem = ps()
    return {stem.stem(word).lower() for word in obj}
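Example usage (caresses/ponies/cats are the classic cases from Porter's paper):

print(stemming(['Caresses', 'ponies', 'cats']))
# -> {'caress', 'poni', 'cat'}  (set order is arbitrary)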