def Clustering(outfile, tweets, n_clusters, topic, k):
    """Cluster tweets with KMeans over TF-IDF features and write results to disk.

    Args:
        outfile: Path of the main output file; sibling files (tweet-cluster map,
            per-cluster index lists, top-tweet CSV) are written to its directory.
        tweets: Sequence of tweet strings to cluster.
        n_clusters: Number of KMeans clusters.
        topic: Topic label, used only in printed messages and output file names.
        k: Total number of top tweets to select across all clusters.

    Side effects: writes four files and prints the list of empty clusters.
    """
    np.random.seed(0)  # fixed seed so cluster assignments are reproducible
    vectorizer = TfidfVectorizer(stop_words='english')
    X = vectorizer.fit_transform(tweets)
    model = KMeans(n_clusters, init='k-means++', max_iter=500, n_init=20)
    model.fit(X)

    # Check for empty clusters, if any. A set difference replaces the original
    # O(n * n_clusters) list.count() scan; sorted() matches the original's
    # printed order (sets of small ints iterate in ascending order).
    Tweetclusters = list(model.predict(X))
    empty = sorted(set(range(n_clusters)) - set(Tweetclusters))
    print("Empty Clusters :" + str(topic) + " TOP " + str(k) + " KMEANS " + str(n_clusters) + " " + str(empty))

    # Write tweet-wise cluster assignments to file.
    outfile1 = os.path.dirname(outfile) + "/TWEET_CLUSTER_" + str(topic) + "_TOP_" + str(k) + "_KMEANS_" + str(n_clusters) + ".txt"
    util.listTotxt(outfile1, Tweetclusters, "w+")

    # Tweets to take per cluster. Floor division restores the Python 2 '/'
    # semantics this code was written for (plain '/' yields a float on Py3).
    ind = int(k) // int(n_clusters)

    # Collect each cluster's member tweets/indices and its top-ranked tweets.
    TopTweet, ClusterAllIndex, AllTopTweet = [], [], []
    for i in range(n_clusters):
        ClusterTweets, ClusterIndex = [], []
        for j, cluster in enumerate(Tweetclusters):
            if cluster == i:
                ClusterTweets.append(tweets[j])
                ClusterIndex.append(j)
        ClusterAllIndex.append(ClusterIndex)
        TopClusterTweet = rank_by_val(ClusterTweets, topic, ind)
        TopTweet.append(TopClusterTweet)
        AllTopTweet.extend(TopClusterTweet)

    outfile1 = os.path.dirname(outfile) + "/INDEX_" + str(topic) + "_TOP_" + str(k) + "_KMEANS_" + str(n_clusters) + ".txt"
    util.listTotxt(outfile1, ClusterAllIndex, "w+")

    # Write the selected tweets. Opening with encoding= and writing str fixes
    # the original bytes+str TypeError (str.encode() result added to "\n" on a
    # text-mode file) while producing the same UTF-8 bytes on disk.
    with open(outfile, "w+", encoding="utf-8") as f:
        for j in AllTopTweet:
            # NOTE(review): j is treated as an index into the full tweets list,
            # but rank_by_val ranked the per-cluster subset — confirm rank_by_val
            # returns global indices.
            f.write(str(tweets[j]) + "\n")

    outfile1 = os.path.dirname(outfile) + "/CLUSTER_TWEETS_" + str(topic) + "_TOP_" + str(k) + "_KMEANS_" + str(n_clusters) + ".txt"
    util.listTocsv(outfile1, TopTweet, "w+")
def process(inPath, outPath, topics):
    """For each topic, load its raw tweet CSV, clean the text, POS-tag it, and
    write both the cleaned tweets and their POS tags under outPath.

    Args:
        inPath: Directory containing one "<topic>.csv" file per topic.
        outPath: Directory receiving "data_<topic>.txt" and "POS_<topic>.csv".
        topics: Iterable of topic names to process.
    """
    for topic in topics:
        raw = util.csvTolist(inPath + '/' + topic + ".csv")
        # csvTolist wraps each row; strip the "['...']" decoration around the text.
        cleaned = [str(row).strip("[']") for row in raw]
        print("No. of Tweets extracted " + str(topic) + "\t\t\t" + str(len(cleaned)))

        # Normalisation pipeline: lowercase, drop duplicates, remove newlines,
        # then discard tweets unrelated to the topic.
        cleaned = remove_newline(remove_repetition(make_lowercase(cleaned)))
        cleaned = if_not_topic(cleaned, topic.lower())

        # First tagging pass feeds common_except_url (cleaning guided by POS);
        # the second pass re-tags the cleaned text so tags match the final tweets.
        # Tagger output shape: [[[token, postag, confidence], ...] per tweet].
        tagged = tagger.runtagger_parse(cleaned)
        cleaned = common_except_url(tagged)
        tagged = tagger.runtagger_parse(cleaned)
        print("No. of Tweets after cleaning :" + str(topic) + "\t\t\t" + str(len(cleaned)))

        util.listTotxt(outPath + '/data_' + topic + ".txt", cleaned, "w+")
        util.listTocsv(outPath + '/POS_' + topic + ".csv", tagged, "w+")