Example #1
def getTweets(topics,PATH_TO_RAW_DATA,MAX_TWEETS) :
	auth=tweepy.OAuthHandler(API_KEY, API_SECRET)
	auth.set_access_token(ACCESS_TOKEN, ACCESS_TOKEN_SECRET)
	api = tweepy.API(auth,proxy="http://*****:*****@HOSTNAME:PORT/") # if using a proxy-based server, add an argument proxy="http://*****:*****@host:port/"
	
	for topic in topics :
		print(topic)
		outFile=PATH_TO_RAW_DATA+'/'+topic+".txt" 
		ctr=0
		while ctr < MAX_TWEETS :
			data=tweepy.Cursor(api.search, q=topic,lang='en').items(500)
			try:
				tweets = [tweet.text.lower().encode('utf-8') for tweet in data ]
				ctr=ctr+len(tweets)
				util.listTotxt(outFile,tweets,"a+")
				
			except tweepy.error.TweepError :	
				print("Waiting for 15 mins : Rate Limit Restriction "+str(ctr)+" Tweets Extracted\n") 
				time.sleep(60*15) # Rate Limit Restriction on no. of tweets extracted in one 15-min window
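
util.listTotxt (and its csv counterpart util.listTocsv) are project-local helpers that are not shown on this page. A minimal sketch consistent with how listTotxt is called here (path, list of items, file mode) could look like the following; the repository's actual implementation may differ:

def listTotxt(filepath, items, mode):
	# assumed helper: write each item on its own line ("w+" overwrites, "a+" appends)
	with open(filepath, mode) as f:
		for item in items:
			f.write(str(item) + "\n")
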
Example #2
def Clustering(outfile,tweets,n_clusters,topic,k) :
	np.random.seed(0)
	vectorizer = TfidfVectorizer(stop_words='english')
	X = vectorizer.fit_transform(tweets)
	model = KMeans(n_clusters, init='k-means++', max_iter=500, n_init=20)
	model.fit(X)
	
	#Check for empty clusters if any
	Tweetclusters=list(model.predict(X))
	nonempty=[ i for i in range(0,n_clusters) if Tweetclusters.count(i) > 0 ]
	empty=list(set(range(0,n_clusters))-set(nonempty)) 
	print("Empty Clusters :"+str(topic)+" TOP "+str(k)+" KMEANS "+str(n_clusters)+" "+str(empty))

	#Write Tweetwise Clusters to File
	outfile1=os.path.dirname(outfile)+"/TWEET_CLUSTER_"+str(topic)+"_TOP_"+str(k)+"_KMEANS_"+str(n_clusters)+".txt"
	util.listTotxt(outfile1,Tweetclusters,"w+")
	
	#Get top ranked tweets from cluster 
	ind=int(k)//int(n_clusters) # number of top tweets to select from each cluster
	TopTweet ,ClusterAllIndex ,AllTopTweet = [] , [] , []
	for i in range(n_clusters) :
		ClusterTweets , ClusterIndex , TopClusterTweet=[] , [] ,[]
		for j in range(0,len(Tweetclusters)):
			if Tweetclusters[j]==i :
				ClusterTweets.append(tweets[j])
				ClusterIndex.append(j)
		ClusterAllIndex.append(ClusterIndex)
		TopClusterTweet=rank_by_val(ClusterTweets,topic,ind)
		TopTweet.append(TopClusterTweet)
		AllTopTweet.extend(TopClusterTweet)
	outfile1=os.path.dirname(outfile)+"/INDEX_"+str(topic)+"_TOP_"+str(k)+"_KMEANS_"+str(n_clusters)+".txt"
	util.listTotxt(outfile1,ClusterAllIndex,"w+")
	
	with open(outfile,"w+") as f:
		for j in AllTopTweet :
			f.write(str(tweets[j]).encode("utf-8")+"\n")

	outfile1=os.path.dirname(outfile)+"/CLUSTER_TWEETS_"+str(topic)+"_TOP_"+str(k)+"_KMEANS_"+str(n_clusters)+".txt"
	util.listTocsv(outfile1,TopTweet,"w+")
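
Clustering writes several side-output files next to outfile and asks rank_by_val for roughly k/n_clusters top tweets per cluster. A hypothetical call, with a placeholder output path and parameter values, might look like this:

# illustrative invocation only; the path and values are placeholders, not from the repository
outfile = PATH_TO_RESULTS + "/" + topic + "/SUMMARY_" + topic + "_TOP_50_KMEANS_10.txt"
Clustering(outfile, tweets, 10, topic, 50)
# with k=50 and n_clusters=10, rank_by_val is asked for 50//10 = 5 top tweets per cluster
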
Example #3
def process(inPath,outPath,topics) :
	for topic in topics :
		inFile=inPath+'/'+topic+".csv" 
		tweets=util.csvTolist(inFile)
		tweets= [ str(tweet).strip("[']") for tweet in tweets ] # each csv row is read back as a one-element list; strip the "['...']" wrapper
	
		print("No. of Tweets extracted "+str(topic)+"\t\t\t"+str(len(tweets)))
		tweets=make_lowercase(tweets)
		tweets=remove_repetition(tweets)
		tweets=remove_newline(tweets)
		tweets=if_not_topic(tweets,topic.lower())

		#POS-Tagging of tweets
		pos_tweets=tagger.runtagger_parse(tweets) #[[[tw1_token1,postag,confidence],[tw1_token2,postag,confidence]],[[tw2_token1,postag,confidence]]]
		tweets=common_except_url(pos_tweets)
		pos_tweets=tagger.runtagger_parse(tweets)
		
		print("No. of Tweets after cleaning :"+str(topic)+"\t\t\t"+str(len(tweets)))
		
		outFile=outPath+'/data_'+topic+".txt" 
		util.listTotxt(outFile,tweets,"w+") 
		outFile=outPath+'/POS_'+topic+".csv" 
		util.listTocsv(outFile,pos_tweets,"w+") 
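
make_lowercase, remove_newline and remove_repetition are defined elsewhere in the repository; the sketches below show plausible minimal behaviour (lower-casing, stripping line breaks, dropping duplicate tweets) and are assumptions rather than the project's actual code. In particular, remove_repetition could equally target repeated characters within a tweet:

def make_lowercase(tweets):
	# assumed: lower-case every tweet
	return [tweet.lower() for tweet in tweets]

def remove_newline(tweets):
	# assumed: replace line breaks so each tweet stays on one line
	return [tweet.replace("\n", " ").replace("\r", " ") for tweet in tweets]

def remove_repetition(tweets):
	# assumed: drop exact duplicate tweets while preserving order
	seen, unique = set(), []
	for tweet in tweets:
		if tweet not in seen:
			seen.add(tweet)
			unique.append(tweet)
	return unique
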
Example #4
def GreedyNormal(outfile,TweetTokens,aspects,tweets,limit):
	TopInd , ctr =[] , []
	left=[aspect for aspect in aspects]
	
	for i in range(0,len(TweetTokens)) :	
		x=[ intersect(TweetTokens[i],left) , i ] 
		x.append(len(x[0]))
		ctr.append(x)

	ctr=sorted(ctr,key=lambda x: int(x[2]),reverse=True)
	while len(left)>0 and len(TopInd)<limit :
		TweetAsp=ctr[0][0] #aspects for that tweet
		TopInd.append(ctr[0][1])#index for tweet selected
		left=[ token for token in left if token not in TweetAsp ]
		ctr.pop(0) # drop the selected tweet from the candidate list
		# remove the covered aspects from the remaining tweets' aspect lists as well
		for i in range(0,len(ctr)) :	
			ctr[i][0]=intersect(ctr[i][0], left) 
			ctr[i][2]=len(ctr[i][0])					
		ctr=sorted(ctr,key=lambda x: int(x[2]),reverse=True)

	results=[ tweets[i] for i in TopInd ]
	util.listTotxt( outfile, results , "w+")	
	return results
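
GreedyNormal is a greedy set-cover style selection: at each step it keeps the tweet whose tokens cover the most still-uncovered aspects. The intersect helper it relies on is not shown here; an order-preserving list intersection, as sketched below, would match how it is used, but this is an assumption about the repository's code:

def intersect(tokens, aspects):
	# assumed helper: tokens of one tweet that also appear in the remaining aspect list
	return [token for token in tokens if token in aspects]
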
Example #5
def getTweets(topics, PATH_TO_RAW_DATA, MAX_TWEETS):
    auth = tweepy.OAuthHandler(API_KEY, API_SECRET)
    auth.set_access_token(ACCESS_TOKEN, ACCESS_TOKEN_SECRET)
    api = tweepy.API(auth, proxy="http://*****:*****@HOSTNAME:PORT/")  # if using a proxy-based server, add an argument proxy="http://*****:*****@host:port/"

    for topic in topics:
        print(topic)
        outFile = PATH_TO_RAW_DATA + '/' + topic + ".txt"
        ctr = 0
        while ctr < MAX_TWEETS:
            data = tweepy.Cursor(api.search, q=topic, lang='en').items(500)
            try:
                tweets = [tweet.text.lower().encode('utf-8') for tweet in data]
                ctr = ctr + len(tweets)
                util.listTotxt(outFile, tweets, "a+")

            except tweepy.error.TweepError:
                print("Waiting for 15 mins : Rate Limit Restriction " +
                      str(ctr) + " Tweets Extracted\n")
                time.sleep(60 * 15)  # Rate Limit Restriction on no. of tweets extracted in one 15-min window
Example #6
def ranker(rfile,data,topic,ind) :
	TopIndex=rank_by_val(data,topic,ind)
	TopData=[ data[j].encode("utf-8") for j in TopIndex ]
	util.listTotxt(rfile,TopData,"w+")
	return TopData
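
rank_by_val is called here and in Example #2 but is not included on this page. From its call sites it takes a list of tweets, the topic and a count, and returns the indices of the top-ranked items. Its ranking criterion is not visible, so the sketch below uses a purely assumed score (topic-word overlap) as a stand-in:

def rank_by_val(data, topic, ind):
	# placeholder ranking, assumed for illustration: score each tweet by how many
	# topic words it contains; the repository's real criterion is not shown here
	topic_words = set(str(topic).lower().split())
	scores = []
	for j, tweet in enumerate(data):
		score = sum(word in str(tweet).lower() for word in topic_words)
		scores.append((score, j))
	scores.sort(reverse=True)
	return [ j for score, j in scores[:int(ind)] ]
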
Example #7
        topk = "TOP_" + str(ki)
        CosineSimilarityVSM = []

        method = "ALL_TWEETS"
        outPath = PATH_TO_RESULTS + "/" + topic + "/" + topk + "/" + method
        util.createFilePath(outPath)
        val = measures.entropy(tweets)
        print(topic + topk + method + " Entropy : " + str(val))

        method = "RANDOM_TWEETS"
        outPath = PATH_TO_RESULTS + "/" + topic + "/" + topk + "/" + method
        util.createFilePath(outPath)
        results = tweets[0:ki]  # take the first ki tweets as the RANDOM_TWEETS baseline
        rfile = outPath + "/" + topic + "_" + topk + "_" + method + ".txt"
        print(rfile)
        util.listTotxt(rfile, results, "w+")

        val = measures.entropy(results)
        print(topic + " " + topk + method + " Entropy : " + str(val))
        measures.get_ParaphraseSim(tweets, rfile, outPath, topic, ki)
        CosineSimilarityVSM.append(measures.get_VSMsim(rfile, tweets, results))
        outFile = outPath + "/" + topic + "_" + topk + "_" + method + "_VSMSimilarityMatrix.csv"
        measures.writeCosineSimMatrix(outFile, tweets, results)
        ind = outPath.rfind("/")
        outFile = outPath[0:ind] + "/" + topic + "_" + topk + "_" + method + "_Doc2vecSimilarityMatrix.csv"
        measures.writeDoc2vecSimMatrix(outFile, tweets, results, True)

        # UNSUPERVISED CLUSTERING USING KMEANS
        n_clusters = [5, 10, 25, 50, 100]
Example #8
		topk="TOP_"+str(ki)
		CosineSimilarityVSM=[]

		method="ALL_TWEETS"
		outPath=PATH_TO_RESULTS+"/"+topic+"/"+topk+"/"+method
		util.createFilePath(outPath)
		val=measures.entropy(tweets)
		print(topic+topk+method+" Entropy : "+str(val))
	
		method="RANDOM_TWEETS"
		outPath=PATH_TO_RESULTS+"/"+topic+"/"+topk+"/"+method
		util.createFilePath(outPath)
		results=tweets[0:ki] # take the first ki tweets as the RANDOM_TWEETS baseline
		rfile=outPath+"/"+topic+"_"+topk+"_"+method+".txt"
		print(rfile)
		util.listTotxt(rfile,results,"w+")

		val=measures.entropy(results)
		print(topic+" "+topk+method+" Entropy : "+str(val))
		measures.get_ParaphraseSim(tweets,rfile,outPath,topic,ki)
		CosineSimilarityVSM.append(measures.get_VSMsim(rfile,tweets,results))
		outFile=outPath+"/"+topic+"_"+topk+"_"+method+"_VSMSimilarityMatrix.csv"
		measures.writeCosineSimMatrix(outFile,tweets,results)
		ind=outPath.rfind("/")
		outFile=outPath[0:ind]+"/"+topic+"_"+topk+"_"+method+"_Doc2vecSimilarityMatrix.csv"
		measures.writeDoc2vecSimMatrix(outFile,tweets,results,True)
		
		# UNSUPERVISED CLUSTERING USING KMEANS		
		n_clusters=[5,10,25,50,100]
		for n in n_clusters :
			if ki>=n :
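
Example #8 breaks off inside the KMeans block. One way the loop could hand each cluster count to the Clustering routine from Example #2 is sketched below; the file naming and the call itself are assumptions, not code taken from the repository:

# hypothetical sketch, not the repository's code
for n in n_clusters :
	if ki >= n :
		method = "KMEANS_" + str(n)
		outPath = PATH_TO_RESULTS + "/" + topic + "/" + topk + "/" + method
		util.createFilePath(outPath)
		rfile = outPath + "/" + topic + "_" + topk + "_" + method + ".txt"
		Clustering(rfile, tweets, n, topic, ki)
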