def get_sentiment_data():
    locations = lc.get_locations()
    ref_data_path = locations['REF_DATA_PATH']
    sentiment_file_path = os.path.join(ref_data_path, 'sentiment_data/sentiment_lexicon.txt')

    sentiment_dict = defaultdict(dict)

    ## Reading and storing emoticon words
    with open(sentiment_file_path, 'r') as f:
        emoticon_data = f.readlines()

    ## Creating sentiment dictionary: token -> [mean, variance]
    for line in emoticon_data:
        line_split = line.split('\t')
        emoticon = line_split[0]
        mean_variance = [float(line_split[1]), float(line_split[2])]
        sentiment_dict[emoticon] = mean_variance

    return sentiment_dict
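## Example usage (illustrative): assumes the lexicon file exists under REF_DATA_PATH and that each
## row is "<token>\t<mean>\t<variance>", which is what the parser above expects; ':)' is only a
## sample key and may not be present in the actual lexicon.
# sentiment_dict = get_sentiment_data()
# mean, variance = sentiment_dict[':)']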
def get_mpqa_data():
    locations = lc.get_locations()
    ref_data_path = locations['REF_DATA_PATH']
    mpqa_file_path = os.path.join(ref_data_path, 'subjectivity_clues_hltemnlp05/subjclueslen1-HLTEMNLP05.tff')
    sentiment_map = lc.get_sentiment_map()

    with open(mpqa_file_path, 'r') as f:
        data = f.readlines()

    mpqa_dict = {}

    ## Each MPQA line is a sequence of key=value fields; keep the word (word1) and encode its
    ## prior polarity as a one-hot vector indexed by the sentiment map
    for line in data:
        elem_bag = line.strip('\n').split(' ')
        for elem in elem_bag:
            item = elem.split('=')
            if item[0] == 'word1':
                word = item[1]
            elif item[0] == 'priorpolarity':
                binary_output = [0] * len(sentiment_map)
                if sentiment_map.get(item[1]) is not None:
                    binary_output[sentiment_map.get(item[1])] = 1
                mpqa_dict[word] = binary_output

    return mpqa_dict
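## Example usage (illustrative): the one-hot vector is indexed by lc.get_sentiment_map(); with a
## hypothetical map {'positive': 0, 'negative': 1, 'neutral': 2}, an entry with
## priorpolarity=negative would be encoded as [0, 1, 0].
# mpqa_dict = get_mpqa_data()
# mpqa_dict.get('abandon')   # e.g. [0, 1, 0] under the hypothetical map above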
def main():
    locations = lrf_config.get_locations()
    INTERMED_DATA_PATH = locations['INTERMED_DATA_PATH']

    intermedJsonPath = os.path.join(INTERMED_DATA_PATH, 'intermed_dict.json')
    tweets_classified_path = os.path.join(INTERMED_DATA_PATH, 'tweets_classified.txt')
    refRiskCatPath = os.path.join(INTERMED_DATA_PATH, 'risk_category_file.json')

    with open(intermedJsonPath, 'r') as f:
        intermed_data = json.load(f)

    with open(refRiskCatPath, 'r') as f:
        risk_data = json.load(f)

    ## Reading data into the dictionaries again
    tweet_dict = dict(intermed_data['tweet_dict'])
    tweet_cmplt = dict(intermed_data['tweet_cmplt'])

    processTweets(tweet_dict, risk_data, tweet_cmplt, tweets_classified_path, vector_type='word_embeddings')

    print('DONE')
def main():
    locations = lrf_config.get_locations()
    ref_data_dir = locations['REF_DATA_PATH'] + 'sentiment_data'
    intermed_data_dir = locations['INTERMED_DATA_PATH']
    x_filename = 'tweets.txt'

    ## Load and process samples
    print('start loading and processing samples...')

    tweets = []
    more_features = []
    with open(os.path.join(ref_data_dir, x_filename)) as f:
        for i, line in enumerate(f):
            tweet_meta_features = {}
            tweet_obj = json.loads(line.strip())

            # Twitter text contents
            content = tweet_obj['text'].replace("\n", " ")

            postprocessed_tweet, microblogging_features, lexicon_features = pre_process(content)

            tweets.append(postprocessed_tweet)
            tweet_meta_features['microblogging_features'] = microblogging_features
            tweet_meta_features['lexicon_features'] = lexicon_features
            more_features.append(tweet_meta_features)

    # Write processed tweet text to file
    with open(os.path.join(ref_data_dir, 'tweets_processed.txt'), 'w') as f:
        for tweet in tweets:
            f.write('%s\n' % tweet)

    # Write additional tweet features to file
    with open(os.path.join(ref_data_dir, 'more_tweet_features.txt'), 'w', encoding='utf-8') as f:
        f.write(json.dumps(more_features, ensure_ascii=False))

    print("Preprocessing is completed")
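## Shape of the outputs written above (illustrative): tweets_processed.txt holds one processed
## tweet per line, and more_tweet_features.txt holds a single JSON list whose i-th element carries
## the extra features of the i-th tweet; the sketch below only shows how that JSON could be read back.
# with open(os.path.join(ref_data_dir, 'more_tweet_features.txt'), encoding='utf-8') as f:
#     more_features = json.load(f)
# more_features[0]   # -> {'microblogging_features': [...], 'lexicon_features': [...]}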
def get_news_data(folder_name, file_name):
    locations = lc.get_locations()
    NEWS_DATA_PATH = os.path.join(locations['REF_DATA_PATH'], folder_name + '/' + file_name)

    news_data = pd.read_csv(NEWS_DATA_PATH)
    news_data = news_data.drop(["Unnamed: 0"], axis=1).set_index('Unnamed: 0.1')

    news_data['category'] = prepare_data(news_data['category'], 'y_data', list_type=int)
    news_data['text'] = prepare_data(news_data['text'], 'x_data')
    news_data = news_data[news_data.text != 'None']

    return news_data
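## Example usage (illustrative): the folder and file names are placeholders; the CSV is expected to
## contain 'Unnamed: 0', 'Unnamed: 0.1', 'category' and 'text' columns, matching the drop/set_index
## calls above.
# news_data = get_news_data('news_data', 'news_articles.csv')
# news_data[['category', 'text']].head()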
def get_twitter_abbreviations_data():
    locations = lc.get_locations()
    ref_data_path = locations['REF_DATA_PATH']
    abbr_file_path = os.path.join(ref_data_path, 'twitter_slang/twitter_slang_data.txt')

    data = pd.read_csv(abbr_file_path, sep='|', names=['abbr', 'meaning'])
    data_len = len(data['abbr'])
    abbr_list = data['abbr'].values
    meaning_list = data['meaning'].values

    slang_dict = {}
    for ind in range(data_len):
        slang_dict[abbr_list[ind]] = meaning_list[ind]

    return slang_dict
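## Example usage (illustrative): a simple slang-expansion pass over a tokenised tweet; the sample
## tokens are made up and are only expanded if they appear in twitter_slang_data.txt.
# slang_dict = get_twitter_abbreviations_data()
# tokens = ['omw', 'to', 'the', 'game']
# expanded = [slang_dict.get(tok, tok) for tok in tokens]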
def get_tweet_data(file_type, file_name):
    locations = lc.get_locations()

    if file_type == 'json':
        TWEETS_DATA_PATH = os.path.join(locations['INTERMED_DATA_PATH'], file_name)
        tweet_data = pd.read_json(TWEETS_DATA_PATH, orient='records', convert_axes=False)
        tweet_data['tweet_cmplt'] = prepare_data(tweet_data['tweet_cmplt'], 'x_data')
        return tweet_data

    elif file_type == 'txt':
        TWEETS_DATA_PATH = os.path.join(locations['INTERMED_DATA_PATH'], file_name)

        if file_name == 'tweets_classified.txt':
            tweet_data = pd.read_csv(
                TWEETS_DATA_PATH, sep='|',
                names=['tweet_id', 'class_pos', 'class_both', 'tweet_word_list', 'tweet_cmplt']
            ).drop_duplicates().set_index('tweet_id')
        elif file_name == 'tweet_truth.txt':
            tweet_data = pd.read_csv(TWEETS_DATA_PATH, sep='|').drop_duplicates(subset='tweet_id').set_index('tweet_id')
            tweet_data['class_annotated'] = prepare_data(tweet_data['class_annotated'], 'y_data', list_type=str)

        tweet_data['tweet_cmplt'] = prepare_data(tweet_data['tweet_cmplt'], 'x_data')
        tweet_data['tweet_word_list'] = prepare_data(tweet_data['tweet_word_list'], 'word_bag')
        tweet_data['class_pos'] = prepare_data(tweet_data['class_pos'], 'y_data')
        tweet_data['class_both'] = prepare_data(tweet_data['class_both'], 'y_data')

        return tweet_data
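## Example usage (illustrative): the two txt file names match the branches above; the json file
## name is a placeholder.
# classified = get_tweet_data('txt', 'tweets_classified.txt')   # indexed by tweet_id
# truth = get_tweet_data('txt', 'tweet_truth.txt')
# tweets_json = get_tweet_data('json', 'intermed_tweets.json')  # hypothetical file name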
def main():
    locations = lrf_config.get_locations()
    glove_data_dict = get_glove_dict(locations['INTERMED_DATA_PATH'] + 'glove_key_subset.json')
import argparse
import os
import json

import numpy as np
from scipy.sparse import coo_matrix, hstack
from sklearn.preprocessing import scale
from sklearn.feature_selection import chi2, SelectKBest
from sklearn.metrics import classification_report, precision_score, recall_score, f1_score

from lrf import utility, sentiment_ml_classifier
from lrf import lrf_config

############################ GLOBAL Variables ##########################

## Data paths
locations = lrf_config.get_locations()
data_dir = locations['REF_DATA_PATH'] + 'sentiment_data'
tweets_data = 'tweets_processed.txt'
labels_data = 'labels.txt'
more_tweet_data = 'more_tweet_features.txt'

MORE_FEAT_FLAG = True


######################## select_top_k_features(data, labels, n_components=1700) ########################
def select_top_k_features(data, labels, n_components=1700):
    data = SelectKBest(chi2, k=n_components).fit_transform(data, labels)
    return data


############################### load_and_process ###############################
def load_and_process(data_file, label_file):
def main():
    locations = lrf_config.get_locations()
    ref_data_dir = locations['REF_DATA_PATH']
    x_filename = 'sentiment_data/tweets.txt'
    y_filename = 'sentiment_data/labels.txt'

    ## Load and process samples
    print('start loading and processing samples...')

    tweets = []
    microblog_features = []
    lexicon_features = []
    tweets_lst = []

    with open(os.path.join(ref_data_dir, x_filename)) as f:
        for i, line in enumerate(f):
            tweet_obj = json.loads(line.strip())

            # Twitter text contents
            content = tweet_obj['text'].replace("\n", " ")
            tweets_lst.append(pre_process_lst(content))

            postprocessed_tweet, microblogging_features, mpqa_sentiment_score = pre_process(content)

            tweets.append(postprocessed_tweet)
            microblog_features.append(microblogging_features)
            lexicon_features.append(mpqa_sentiment_score)

    lexicon_features = np.asarray(lexicon_features)
    microblog_features = np.asarray(microblog_features)

    tf_idf_vectorizer = utility.get_tf_idf_vectorizer(tweets_lst, ngram_range=2)
    transformed_data_rahul = tf_idf_vectorizer.fit_transform(tweets_lst)

    # tf_idf_vectorizer = utility.get_tf_idf_vectorizer(tweets, ngram_range=2)
    # transformed_data_mine = tf_idf_vectorizer.fit_transform(tweets)

    with open(os.path.join(ref_data_dir, y_filename)) as f:
        y_data = f.readlines()
    y_data = [y.strip('\n') for y in y_data]
    y_data = np.asarray(y_data)

    ## Sweep over the number of chi2-selected features and record 5-fold accuracies
    num_of_features = 50
    accuracy_in_each_turn = []
    while num_of_features <= 3000:
        X_new = SelectKBest(chi2, k=num_of_features).fit_transform(transformed_data_rahul, y_data)

        extended_features_1 = np.append(X_new.toarray(), lexicon_features, axis=1)
        extended_features_2 = np.append(extended_features_1, microblog_features, axis=1)

        sentiment_map = lrf_config.get_sentiment_map()
        inv_sentiment_map = {str(v): k for k, v in sentiment_map.items()}

        X_data = X_new.toarray()

        kf = KFold(n_splits=5)
        kf.get_n_splits(X_data)

        train_list = []
        test_list = []
        for train_index, test_index in kf.split(X_data):
            X_train = X_data[train_index]
            Y_train = y_data[train_index]
            X_test = X_data[test_index]
            Y_test = y_data[test_index]

            Y_pred, train_acc, test_acc = classifier.classify(
                'svc',
                X_train=X_train, Y_train=Y_train,
                X_test=X_test, Y_test=Y_test,
                class_map=inv_sentiment_map,
                is_X_text=False)

            # print(train_acc)
            # print(test_acc)
            train_list.append(train_acc)
            test_list.append(test_acc)

        # print('Train_Acc : ', np.mean(train_list))
        # print('Test_Acc : ', np.mean(test_list))
        accuracy_in_each_turn.append([np.mean(train_list), np.mean(test_list)])

        num_of_features += 50  # assumed sweep step (50..3000)

    for elem in accuracy_in_each_turn:
        print(elem)
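## A minimal, self-contained sketch of the chi2 feature-selection step used in main() above,
## on a made-up toy corpus (the demo function name and all data are illustrative only).
def _chi2_selection_demo():
    from sklearn.feature_extraction.text import TfidfVectorizer
    from sklearn.feature_selection import SelectKBest, chi2

    docs = ['good game today', 'bad loss again', 'great win', 'terrible defeat']
    labels = ['pos', 'neg', 'pos', 'neg']

    # TF-IDF features, then keep the k columns with the highest chi2 score w.r.t. the labels
    tfidf = TfidfVectorizer().fit_transform(docs)
    reduced = SelectKBest(chi2, k=3).fit_transform(tfidf, labels)
    return reduced.shape  # (4 documents, 3 selected features)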
def glove_classification(self, data_dict, keywords, keyword_type, glove_data_file, glove_key_file):
    print('ENTERED GLOVE_CLASSIFICATION')

    locations = lrf_config.get_locations()
    glove_data_dict = utility.get_glove_dict(locations['INTERMED_DATA_PATH'] + glove_data_file)
    glove_key_dict = utility.get_glove_dict(locations['INTERMED_DATA_PATH'] + glove_key_file)

    ## Map positive keywords to their GloVe vectors and risk categories
    glove_crux_pos = utility.getWordToCategMap(keywords, glove_key_dict, 'pos')
    pos_key_glove_arr = glove_crux_pos['key_glove_arr']
    inv_pos_key_index = glove_crux_pos['inv_key_index']
    pos_risk_dict = glove_crux_pos['risk_dict']

    if keyword_type == 'both':
        glove_crux_neg = utility.getWordToCategMap(keywords, glove_key_dict, 'neg')
        neg_key_glove_arr = glove_crux_neg['key_glove_arr']
        inv_neg_key_index = glove_crux_neg['inv_key_index']
        neg_risk_dict = glove_crux_neg['risk_dict']

    pos_predictions = {}
    both_predictions = {}

    for id in data_dict.keys():
        ## Collect GloVe vectors for the words of this tweet
        data_lst = []
        for word in data_dict.get_value(id):
            if word in glove_data_dict:
                data_lst.append(glove_data_dict[word][0])

        ## Preparing tweet array
        data_arr = np.asarray(data_lst)

        if len(data_arr) != 0:
            ## Calculating cosine similarity against the positive keyword vectors
            pos_cos_similarity = cosine_similarity(data_arr, pos_key_glove_arr)
            pos_nearest_neighbors = np.argsort(pos_cos_similarity, axis=1)[:, -10:]
            pos_tweet_neighbors = [item for sublist in pos_nearest_neighbors for item in sublist]

            membership_count = {}
            membership_count_pos = utility.getMembershipCount(
                pos_tweet_neighbors, inv_pos_key_index, pos_risk_dict, membership_count)

            v_pos = list(membership_count_pos.values())
            k_pos = list(membership_count_pos.keys())
            output_pos = k_pos[v_pos.index(max(v_pos))]

            if keyword_type == 'both':
                neg_cos_similarity = cosine_similarity(data_arr, neg_key_glove_arr)
                neg_nearest_neighbors = np.argsort(neg_cos_similarity, axis=1)[:, :10]
                neg_tweet_neighbors = [item for sublist in neg_nearest_neighbors for item in sublist]

                membership_count_both = utility.getMembershipCount(
                    neg_tweet_neighbors, inv_neg_key_index, neg_risk_dict, membership_count_pos.copy())

                v_both = list(membership_count_both.values())
                k_both = list(membership_count_both.keys())
                output_both = k_both[v_both.index(max(v_both))]

            pos_predictions[id] = [output_pos]
            both_predictions[id] = [output_both] if keyword_type == 'both' else None

    return pos_predictions, both_predictions
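## A minimal, self-contained sketch of the nearest-neighbour voting used in glove_classification
## above: each tweet-word vector votes for the categories of its most similar keyword vectors.
## The vectors, category labels, and neighbour count below are made up for illustration.
def _glove_voting_demo():
    import numpy as np
    from collections import Counter
    from sklearn.metrics.pairwise import cosine_similarity

    rng = np.random.default_rng(0)
    tweet_vectors = rng.normal(size=(3, 50))       # 3 tweet words, 50-d embeddings
    keyword_vectors = rng.normal(size=(6, 50))     # 6 keywords, 50-d embeddings
    keyword_categories = ['crime', 'crime', 'health', 'health', 'traffic', 'traffic']

    # For every tweet word, take its 2 most similar keywords and let them vote for a category
    sims = cosine_similarity(tweet_vectors, keyword_vectors)
    neighbours = np.argsort(sims, axis=1)[:, -2:]
    votes = Counter(keyword_categories[k] for row in neighbours for k in row)
    return votes.most_common(1)[0][0]              # predicted category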