def createQueriesDictionary(Data):
    """Build a dictionary ``qid -> [query_text, query_hash]`` and pickle it.

    Each record in ``Data`` carries two (qid, query) pairs at fixed positions:
    ``(data[1], data[3])`` and ``(data[2], data[4])``.  For every unique qid the
    query text is tokenized, each token is hashed and TF-IDF weighted, and the
    combined query hash is appended.  The result is dumped to 'Queries.txt'.

    Parameters
    ----------
    Data : iterable of indexable records
        Rows with query ids at positions 1-2 and query texts at positions 3-4.

    Side effects
    ------------
    Writes the pickled dictionary to 'Queries.txt' in the working directory.
    """
    from collections import Counter  # local import: file's import block is not in view

    InvertedIndex, Queries = dm.readInvertedIndex(), {}
    N = 537933  # total number of queries (corpus-level constant used for IDF)
    for data in Data:
        # Two (qid, query) pairs per record: (data[1], data[3]) and (data[2], data[4]).
        for i in range(1, 3):
            qid, query = data[i], [data[i + 2]]
            if qid not in Queries:
                Queries[qid] = query
    for qid in Queries:
        Words = TextBlob(Queries[qid][0]).lower().words
        if not Words:
            # Corrupted/empty query text: nothing to hash.  (The original used a
            # bare `except: continue` around max(); the only realistic failure
            # there was max() over an empty sequence.)
            continue
        # word -> frequency, computed once instead of O(n^2) repeated .count() calls
        counts = Counter(Words)
        maxf = max(counts.values())  # max frequency of any term in the query
        Hashes, Weights = [], []
        for w in Words:
            Hashes.append(hashFunction(w, 64))
            # NOTE(review): a word absent from InvertedIndex raises KeyError here,
            # same as the original — confirm the index covers the query vocabulary.
            f, n = counts[w] / maxf, len(InvertedIndex[w])  # f(t), n(t)
            idf = math.log(N / n) / math.log(N)  # normalized IDF(t)
            Weights.append(f * idf)
        queryHash = HashQuery(Hashes, Weights)
        Queries[qid].append(queryHash)
    with open('Queries.txt', 'wb') as file:
        pickle.dump(Queries, file)
def tweet_processor(path, part, freq=1):
    """Load tweets from a CSV, clean them, score sentiment, and compute word frequencies.

    Parameters
    ----------
    path : str
        Path to a CSV file with a "text" column.
    part : int
        Which half of the tweets to process (1 or 2); forced to 1 when "May"
        appears in ``path``.
    freq : int, optional
        1 (default): frequencies via ``pfreq_dist`` over the *processed* tweets.
        0: frequencies counted over the *raw* corpus tokens.

    Returns
    -------
    tuple
        ``(processed_tweets, freqs, compound_sent)`` where ``compound_sent`` is
        an ``(n, 4)`` array of VADER [neg, pos, neu, compound] scores.
    """
    from collections import Counter  # local import: file's import block is not in view

    myFile = pd.read_csv(path, sep=',')
    tweets = myFile["text"]
    if "May" in path:
        part = 1
    # Keep only the requested half of the rows.
    tweets = tweets[int(len(tweets) * (part - 1) * 0.5):int(len(tweets) * part * 0.5)]
    # Raw-corpus token list.  Kept under its own name: the original rebound
    # `blob` to a TextBlob inside the loop, which broke the freq == 0 branch
    # (it counted words against the last tweet instead of the corpus).
    corpus_words = " ".join(myFile["text"]).split(" ")
    processed_tweets = []
    compound_sent = []
    print("n tweets: ", len(tweets))
    sid = SentimentIntensityAnalyzer()
    for tweet in tweets:
        cleaned_tweet = p.clean(tweet.lower())
        filtered_tweet = clean_tweets(cleaned_tweet)
        ss = sid.polarity_scores(filtered_tweet)
        cur_sent = [ss['neg'], ss['pos'], ss['neu'], ss['compound']]
        # len > 2 already implies non-empty (original also tested != "").
        if len(filtered_tweet) > 2:
            processed_tweets.append(filtered_tweet)
            compound_sent.append(cur_sent)
    compound_sent = np.asarray(compound_sent)
    freqs = []
    all_words = " ".join(processed_tweets).split(" ")
    print("number of words: ", len(all_words))
    print("unique words: ", len(set(all_words)))
    if freq == 0:
        # Count over the raw corpus; one Counter pass instead of O(n^2)
        # repeated .count() calls.
        counts = Counter(corpus_words)
        freqs = [[word, counts[word]]
                 for word in set(corpus_words)
                 if word != "" and len(word) > 2]
        freqs = np.asarray(freqs)
        # Cast counts to int before sorting: the array is of strings, and a
        # lexicographic sort would rank "9" above "10".
        freqs = freqs[np.argsort(freqs[:, 1].astype(int))][::-1]
    if freq == 1:  # NLTK-style frequency distribution over processed tweets
        freqs = pfreq_dist(all_words)
        freqs = np.asarray(freqs)
    return processed_tweets, freqs, compound_sent