def abbrivation_list():
    """Return a flat {abbreviation: expansion} dict from Mongo.

    Reads every document of the 'abbrivations.list' collection (with the
    '_id' field projected out), coerces each key and value to ``str``,
    and merges all pairs into a single dict.

    Fix: the original rewrote every fetched document in place and
    rebound the loop variable ``key`` mid-iteration; this version builds
    the result without mutating the source documents.
    """
    projection = {'_id': False}  # drop Mongo's object id from each doc
    abbrivations_dict = {}
    res = load_from_mongo('abbrivations', 'list', projection=projection)
    for doc in res:
        for key, value in doc.items():
            abbrivations_dict[str(key)] = str(value)
    return abbrivations_dict
def abbrivation_list():
    """Build an abbreviation -> expansion mapping from Mongo.

    Walks every document of 'abbrivations.list' (excluding '_id'),
    stringifies each key and value in place, and collects the pairs
    into one flat dict.
    """
    lookup = {}
    documents = mongo.load_from_mongo('abbrivations', 'list',
                                      projection={'_id': False})
    for doc in documents:
        for k in doc.keys():
            k = str(k)
            doc[k] = str(doc[k])  # documents are stringified in place, as before
            lookup[k] = doc[k]
    return lookup
# NOTE(review): fragment — the enclosing "def" line lies outside this chunk,
# so the indentation below (and the bare "return") is reconstructed from the
# collapsed source; confirm against the full file.
for key in accum_term.keys():
    # per-term accumulators; only total_sum feeds the result, the others
    # appear unused in this view
    num_pos=0
    num_neg =0
    adjusted_score=0
    term_value=0
    total_sum =0
    for score in accum_term[key]:
        total_sum = total_sum + score
    # mean score for the term, truncated toward zero by int()
    term_value = (total_sum)/len(accum_term[key])
    term_value=int(term_value)
    add_to_senti_file[key]=term_value
#add to new words to database or file
with open ('new_words.txt','w') as f:
    # one "word<TAB>score" line per newly scored term
    for key,value in add_to_senti_file.items():
        f.write(key+'\t'+str(value)+'\n')
        #f.write('\t'.join(tupl))
# NOTE(review): "score" here is whatever the inner loop left behind —
# presumably the caller expects a tweet-level score; verify.
return score,actual_tweet,tweet_id

# driver: score every tweet in input.data and persist the results
res=mongo.load_from_mongo('input','data')
for i in res:
    a,b,c= sentiment_of_tweet(i,abb_list,sentiment)
    g={}
    g['_id']= c
    g['text'] = b
    g['sentiment'] = a
    mongo.save_to_mongo(g,'output_final','with_ourscores')
from mongo import load_from_mongo
import numpy as np
import re
import lda

# Build a bag-of-words count vector for each document in
# hindu_modified.docs1, using the newline-separated word list in
# vocab.txt as the feature vocabulary.
feature_array=[]
result=load_from_mongo("hindu_modified","docs1")
f=open('vocab.txt','r')
feature_vector=f.read().split('\n')
#print len(feature_vector)
len_of_feature_vector=len(feature_vector)
#print feature_vector[:10]
for each in result:
    text=each["text"]
    #print text
    # keep only alphanumerics and spaces; everything else becomes a space
    text = re.sub(r'[^a-zA-Z0-9 ]',' ',text)
    text_tokens=text.split(" ")
    # lower-cased term-frequency map for this document
    text_dist={}
    for each in text_tokens:  # NOTE(review): shadows the outer loop variable "each"
        if each.lower() in text_dist.keys():
            text_dist[each.lower()] =text_dist[each.lower()]+1
        else:
            text_dist[each.lower()] = 1
    vector=[0]*len_of_feature_vector
    for each in text_dist.keys():
        #print feature_vector.index(each)
        # NOTE(review): .index() raises ValueError if a token is absent from
        # vocab.txt — presumably the vocab is built from these same docs; verify.
        vector[feature_vector.index(each)]=text_dist[each]
# NOTE(review): chunk ends here — "vector" is presumably appended to
# feature_array in code beyond this view.
from __future__ import division ##True class A (TA) - correctly classified into class A ##False class A (FA) - incorrectly classified into class A ##True class B (TB) - correctly classified into class B ##False class B (FB) - incorrectly classified into class B from mongo import load_from_mongo from pylab import * #estimated=load_from_mongo('input','data') observed_ourscores=load_from_mongo('tes','coll_copy') TA=0 FA=0 TB=0 FB=0 #labels = 'positive','negative' for i in observed_ourscores: tweet_id=i['_id'] #print 'id',tweet_id senti_ob=i['sentiment'] #print senti_ob senti=load_from_mongo('train','coll_copy',criteria={'_id':tweet_id},projection={'_id':0,'sentiment':1}) print tweet_id senti_es = str(senti[0].values()[0]) #print senti_es #print senti_es if senti_ob>0 and senti_es=='1': #print 'positive'
from __future__ import division ##True class A (TA) - correctly classified into class A ##False class A (FA) - incorrectly classified into class A ##True class B (TB) - correctly classified into class B ##False class B (FB) - incorrectly classified into class B from mongo import load_from_mongo from pylab import * #estimated=load_from_mongo('input','data') observed_outscores=load_from_mongo('output_final','without_scores') TA=0 FA=0 TB=0 FB=0 labels = 'positive','negative' for i in observed_outscores: tweet_id=i['_id'] #print 'id',tweet_id senti_ob=i['sentiment'] #print senti_ob senti=load_from_mongo('input','data',criteria={'_id':tweet_id},projection={'_id':0,'sentiment':1}) senti_es = str(senti[0].values()[0]) #print senti_es #print senti_es if senti_ob>0 and senti_es=='positive': #print 'positive' TA =TA+1
# NOTE(review): fragment — the matching "try:" and the enclosing "def" are
# outside this chunk; indentation below is reconstructed from the collapsed
# source.
except:
    # bare except: presumably guards division by zero, but it also hides
    # any other error — confirm intent
    avg_word_length = 0.0
try:
    avg_para_length = num_sentences / float(num_para)
except:
    avg_para_length = 0.0
try:
    avg_sent_length = sum(length) / float(num_sentences)
except:
    avg_sent_length = 0.0
# the three averages are returned as strings rounded to two decimals
return ("%.2f" % avg_para_length), ("%.2f" % avg_sent_length), ("%.2f" % avg_word_length)

# driver: compute per-email stylometric features for each stored email
email_content = load_from_mongo('email_content', 'coll_ten')
for i in email_content:
    name = i['name']
    text = i['email']
    token = i['email_number']
    _id = i['_id']
    ## fdist=nltk.FreqDist(ngrams(text)[0])
    ## fdist1=nltk.FreqDist(ngrams(text)[1])
    ## with open('test.txt','a') as f:
    ##     for k,v in fdist1.items():
    ##         for iter in k:
    ##             f.write(iter)
    ##             if iter!=k[-1]:
    ##                 f.write(",")
    ##         f.write('\n')
    # letter frequencies sorted alphabetically (Python 2 iteritems)
    let_f = sorted(letter_freq(text).iteritems())
    # NOTE(review): chunk truncated here — rest of the loop body lies
    # beyond this view.
# NOTE(review): fragment — the enclosing "def sentiment(...)" begins outside
# this chunk; indentation below is reconstructed from the collapsed source.
# Negation handling: if the tweet contains "not", flag it and strip the word
# so the final score can be flipped at the end.
regex = re.compile(r"not\b")
if regex.search(tweet):
    negated = True
    tweet = re.sub(r"not\b", "", tweet)
#preprocessing may increase accuracy
tweet_id = text['_id']
# strip @mentions and URLs
tweet = re.sub(r"(?:\@|https?\://)\S+", "", tweet)
# strip standalone numbers
tweet = re.sub(r"\b\d+\b", "", tweet)
tweet = tweet.strip().lower()
word_list = tweet.split()
# +1 for each positive-lexicon word, -1 for each negative-lexicon word
for k in word_list:
    if any(k == s for s in pos_words):
        score = score + 1
    elif any(k == s for s in neg_words):
        score = score - 1
# invert the total when the tweet was negated by "not"
if negated:
    score = -score
return score, actual_text, tweet_id

# driver: score every tweet in input.data and persist the results
res = mongo.load_from_mongo('input', 'data')
for i in res:
    a, b, c = sentiment(i, abb_list)
    g = {}
    g['_id'] = c
    g['text'] = b
    g['sentiment'] = a
    mongo.save_to_mongo(g, 'output_final', 'without_scores')
from mongo import load_from_mongo
from mongo import save_to_mongo

# Strip stopwords from every document in hindu.docs1 and save the cleaned
# copies into hindu_modified.docs1.
stopwords_file = 'stopwords.txt'
stop_lis = []       # public list kept for backward compatibility
_stop_set = set()   # set mirror so membership tests are O(1), not O(len(stop_lis))


def stopwords_list(filename):
    """Load one stopword per line from *filename* into ``stop_lis``.

    Trailing newlines are stripped.  Duplicates are kept in the list
    (as before) but naturally collapse in the lookup set.
    """
    with open(filename, 'r') as f:
        for line in f:
            word = line.rstrip('\n')
            stop_lis.append(word)
            _stop_set.add(word)


stopwords_list(stopwords_file)


def remove_stopwords(text):
    """Return *text* with whitespace-delimited stopwords removed.

    Tokens are compared exactly (case-sensitive) and runs of whitespace
    collapse to single spaces — same output as the original list scan,
    but linear instead of quadratic thanks to the set lookup.
    """
    return ' '.join(word for word in text.split() if word not in _stop_set)


docs_before = load_from_mongo("hindu", "docs1")
for each in docs_before:
    each["text"] = remove_stopwords(each["text"])
    save_to_mongo(each, "hindu_modified", "docs1")
from mongo import load_from_mongo

# Load the hindu.docs1 documents.  Everything below was one-off scaffolding
# for dumping article titles to a "titles" file and is kept commented out.
results=load_from_mongo("hindu","docs1")
##f=open("titles",'w')
##count=0
##print len(results)
##for each,i in results,range(len(results)):
##    text=''
##    text=text+str(count)+" "+each['HD']
##
##    #print text
##    f.write(text)
##    #f.write('\n')
##    #count=count+1
##    #print count
##
##g=open("titles",'r')
##lis=g.read()
##print lis
##print len(lis)
##titles=tuple(lis)
##print len(titles)
##print titles
# FIX: the trailing "print titles" referenced a name defined only in the
# commented-out code above, so the script always crashed with NameError.
# Commented out along with the rest of the scaffolding.
##print titles