import os

import numpy as np
from sklearn.preprocessing import scale

# `utility` is a repo-local module and `data_dir` a module-level path; both
# are assumed to be in scope.


def load_and_process(data_file, label_file):
    """Load text samples and labels from `data_dir`, tf-idf transform, and scale."""
    with open(os.path.join(data_dir, data_file), 'r') as f:
        x = f.readlines()
    with open(os.path.join(data_dir, label_file), 'r') as f:
        y = np.array([line.strip() for line in f])

    tfidf_vectorizer = utility.get_tf_idf_vectorizer(data=x, ngram_range=2)
    x_feats = tfidf_vectorizer.transform(x)
    # Sparse matrices cannot be mean-centered, hence with_mean=False.
    x_feats = scale(x_feats, with_mean=False)

    return x_feats, y
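# `utility.get_tf_idf_vectorizer` is defined elsewhere in this repo. The
# sketch below is an assumption about its behaviour, not the actual
# implementation: it presumably fits a scikit-learn TfidfVectorizer on the
# corpus and returns the fitted instance, with `ngram_range=2` read as
# "unigrams through bigrams".
from sklearn.feature_extraction.text import TfidfVectorizer


def get_tf_idf_vectorizer_sketch(data, ngram_range=1):
    """Hypothetical stand-in for utility.get_tf_idf_vectorizer."""
    vectorizer = TfidfVectorizer(ngram_range=(1, ngram_range))
    vectorizer.fit(data)
    return vectorizer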
def main():
    fs = FeatureSelection()
    data = datasets.get_news_data('keyword_data', 'annotator_data_dump_with_text')
    X_data, Y_data = data['text'].values, data['category'].values

    tf_idf_vectorizer = utility.get_tf_idf_vectorizer(X_data)
    X_data_tf = tf_idf_vectorizer.transform(X_data)
    Y_binary = utility.binarize_data(data=Y_data)

    # Keep the 50 best features by chi-square, then print the top 20 results.
    res = fs.ChiSquare.chiSquare(X_data_tf, Y_binary, 50)
    for i in range(20):
        print('%%%%%%%%%%%%%%%%%%%%%%%%%%%')
        print(res[i])
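# `FeatureSelection.ChiSquare.chiSquare` is assumed to wrap scikit-learn's
# chi-square scoring; a minimal equivalent sketch (hypothetical, for
# illustration only -- the repo's wrapper may return a different structure):
from sklearn.feature_selection import SelectKBest, chi2


def chi_square_sketch(X, y, k):
    """Return the top-k features by chi-square score (assumed behaviour)."""
    selector = SelectKBest(chi2, k=k)
    X_new = selector.fit_transform(X, y)
    return X_new, selector.scores_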
def main():
    locations = lrf_config.get_locations()
    ref_data_dir = locations['REF_DATA_PATH']

    x_filename = 'sentiment_data/tweets.txt'
    y_filename = 'sentiment_data/labels.txt'

    # Load and preprocess the samples.
    print('start loading and processing samples...')
    tweets = []
    microblog_features = []
    lexicon_features = []
    tweets_lst = []
    with open(os.path.join(ref_data_dir, x_filename)) as f:
        for line in f:
            tweet_obj = json.loads(line.strip())
            # Tweet text content, flattened to a single line.
            content = tweet_obj['text'].replace('\n', ' ')
            tweets_lst.append(pre_process_lst(content))
            postprocessed_tweet, microblogging_features, mpqa_sentiment_score = pre_process(content)
            tweets.append(postprocessed_tweet)
            microblog_features.append(microblogging_features)
            lexicon_features.append(mpqa_sentiment_score)

    lexicon_features = np.asarray(lexicon_features)
    microblog_features = np.asarray(microblog_features)

    tf_idf_vectorizer = utility.get_tf_idf_vectorizer(tweets_lst, ngram_range=2)
    transformed_data_rahul = tf_idf_vectorizer.fit_transform(tweets_lst)
    # tf_idf_vectorizer = utility.get_tf_idf_vectorizer(tweets, ngram_range=2)
    # transformed_data_mine = tf_idf_vectorizer.fit_transform(tweets)

    with open(os.path.join(ref_data_dir, y_filename)) as f:
        y_data = np.asarray([y.strip('\n') for y in f])

    sentiment_map = lrf_config.get_sentiment_map()
    inv_sentiment_map = {str(v): k for k, v in sentiment_map.items()}

    # Evaluate SVC accuracy as the number of chi-square-selected features grows.
    num_of_features = 50
    accuracy_in_each_turn = []
    while num_of_features <= 3000:
        X_new = SelectKBest(chi2, k=num_of_features).fit_transform(transformed_data_rahul, y_data)

        # Extended feature sets (currently unused by the classifier below).
        extended_features_1 = np.append(X_new.toarray(), lexicon_features, axis=1)
        extended_features_2 = np.append(extended_features_1, microblog_features, axis=1)

        X_data = X_new.toarray()
        kf = KFold(n_splits=5)

        train_list = []
        test_list = []
        for train_index, test_index in kf.split(X_data):
            X_train, Y_train = X_data[train_index], y_data[train_index]
            X_test, Y_test = X_data[test_index], y_data[test_index]
            Y_pred, train_acc, test_acc = classifier.classify(
                'svc',
                X_train=X_train, Y_train=Y_train,
                X_test=X_test, Y_test=Y_test,
                class_map=inv_sentiment_map,
                is_X_text=False)
            train_list.append(train_acc)
            test_list.append(test_acc)

        accuracy_in_each_turn.append([np.mean(train_list), np.mean(test_list)])
        num_of_features += 50  # grow the feature budget each iteration (step of 50 assumed)

    for elem in accuracy_in_each_turn:
        print(elem)
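# Self-contained illustration of the core loop above (chi-square feature
# selection followed by 5-fold cross-validation) on synthetic data, with a
# plain scikit-learn LinearSVC standing in for the repo's
# `classifier.classify` wrapper:
import numpy as np
from sklearn.feature_selection import SelectKBest, chi2
from sklearn.model_selection import KFold
from sklearn.svm import LinearSVC

rng = np.random.RandomState(0)
X_demo = rng.poisson(1.0, size=(100, 500)).astype(float)  # chi2 requires non-negative features
y_demo = rng.randint(0, 3, size=100)

X_sel = SelectKBest(chi2, k=50).fit_transform(X_demo, y_demo)
scores = []
for train_idx, test_idx in KFold(n_splits=5).split(X_sel):
    clf = LinearSVC().fit(X_sel[train_idx], y_demo[train_idx])
    scores.append(clf.score(X_sel[test_idx], y_demo[test_idx]))
print(np.mean(scores))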
def tf_idf_classification(self, data_dict, keywords, keyword_type='both'):
    # Flatten each record into a single string and remember its original key.
    data_arr = []
    data_index = {}
    for i, ind in enumerate(data_dict):
        record = data_dict[ind]
        data_index[i] = ind
        if isinstance(record, list):
            data_arr.append(' '.join(record))
        elif isinstance(record, str):
            data_arr.append(record)

    # Build one "document" per category from its positive (and optionally
    # negative) keywords.
    pos_risk = []
    neg_risk = []
    category_index = {}
    for ind, category in enumerate(keywords):
        pos_risk.append(' '.join(keywords[category]['pos'].keys()))
        category_index[ind] = category
        if keyword_type == 'both':
            neg_risk.append(' '.join(keywords[category]['neg'].keys()))

    tf_idf_vectorizer = utility.get_tf_idf_vectorizer(data_arr)
    data_tfidf = tf_idf_vectorizer.transform(data_arr)

    # Assign each record to the category with the highest cosine similarity
    # against the positive-keyword documents.
    pos_category_tfidf = tf_idf_vectorizer.transform(pos_risk)
    cos_sim_pos = cosine_similarity(data_tfidf, pos_category_tfidf)
    pos_res = np.argmax(cos_sim_pos, axis=1)

    if keyword_type == 'both':
        neg_category_tfidf = tf_idf_vectorizer.transform(neg_risk)
        cos_sim_neg = cosine_similarity(data_tfidf, neg_category_tfidf)
        # Similarity to a category's negative keywords counts against it.
        cos_sim_both = cos_sim_pos - cos_sim_neg
        both_res = np.argmax(cos_sim_both, axis=1)

    pos_result = {}
    both_result = {}
    for i in data_index:
        pos_result[data_index[i]] = category_index[pos_res[i]]
        if keyword_type == 'both':
            both_result[data_index[i]] = category_index[both_res[i]]

    return pos_result, both_result
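# A compact, self-contained illustration of the same idea as
# `tf_idf_classification`: map each document to the keyword "category
# document" it is most cosine-similar to. Data and category names are
# invented for demonstration.
import numpy as np
from sklearn.feature_extraction.text import TfidfVectorizer
from sklearn.metrics.pairwise import cosine_similarity

docs = ["stock prices fell sharply", "the team won the final match"]
categories = {"finance": "stock market prices shares",
              "sports": "team match win final"}

vec = TfidfVectorizer().fit(docs + list(categories.values()))
sim = cosine_similarity(vec.transform(docs), vec.transform(list(categories.values())))
labels = [list(categories)[i] for i in np.argmax(sim, axis=1)]
print(labels)  # expected: ['finance', 'sports']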
def classify(classifier_type, X_train, Y_train=None, X_test=None, Y_test=None,
             keywords=None, class_map=None, is_X_text=True):
    if classifier_type in ['svc', 'lr', 'ada_boost']:
        if Y_train is None:
            raise ValueError('%s is a supervised algorithm, pass training labels ...' % classifier_type)
        elif X_test is None and Y_test is None:
            print('Since no TEST data was provided, splitting the given data into train and test')
            X_train, X_test, Y_train, Y_test = sklearn.model_selection.train_test_split(X_train, Y_train)

        # Binarize the labels; the fitted binarizer is kept so predictions can
        # be mapped back to the original label names if needed.
        if class_map is not None:
            fitted_binarizer, Y_train_binary = utility.binarize_data(Y_train, class_mapping=class_map)
        else:
            fitted_binarizer, Y_train_binary = utility.binarize_data(Y_train)
        if Y_test is not None:
            _, Y_test_binary = utility.binarize_data(Y_test, class_mapping=class_map)

        if is_X_text:
            X_train = utility.get_str_from_list(X_train)
            X_test = utility.get_str_from_list(X_test)
            tf_idf_vectorizer = utility.get_tf_idf_vectorizer(X_train)
            X_train_tf_idf = tf_idf_vectorizer.transform(X_train)
            X_test_tf_idf = tf_idf_vectorizer.transform(X_test)
        else:
            X_train_tf_idf = X_train
            X_test_tf_idf = X_test

        if classifier_type == 'svc':
            clf = supervised.SupervisedClassifier.SvcClassifier()
        elif classifier_type == 'lr':
            clf = supervised.SupervisedClassifier.LogisticRClassifier()
        else:
            clf = supervised.SupervisedClassifier.AdaBoostClassifier()

        if Y_test is not None:
            Y_pred, train_acc, test_acc = clf.classify(
                X_train_tf_idf, Y_train_binary, X_test_tf_idf, Y_test_binary)
            return Y_pred, train_acc, test_acc
        else:
            Y_pred, train_acc = clf.classify(X_train_tf_idf, Y_train_binary, X_test_tf_idf)
            return Y_pred, train_acc

    elif classifier_type == 'cosineSim':
        cosine_sim_class = unsupervised.UnsupervisedClassifiers.CosineSimilarity()
        Y_pred_pos, Y_pred_both = cosine_sim_class.classify(X_train, keywords, vector_type='word_embeddings')
        return Y_pred_pos, Y_pred_both
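# Hypothetical usage of `classify` with the 'svc' backend (the texts and
# labels below are invented for illustration, and the repo's `utility` and
# `supervised` modules are assumed to be importable):
texts = ["market crash wipes out gains", "striker scores twice in derby"]
labels = ["finance", "sports"]
Y_pred, train_acc, test_acc = classify('svc',
                                       X_train=texts * 10,
                                       Y_train=labels * 10,
                                       X_test=texts,
                                       Y_test=labels)
print(train_acc, test_acc)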