def process_tweets(classifier_name='svc', train_data_src='tweets_and_news', keyword_classifier_name='svc'):
    """Train a classifier on the chosen source(s) and label the unlabeled tweets.

    Parameters
    ----------
    classifier_name : str
        'svc', 'lr' or 'ada_boost' for supervised classification, or
        'cosineSim' for the keyword/cosine-similarity path.
    train_data_src : str
        Which training data to use: 'news', 'tweets' or 'tweets_and_news'.
    keyword_classifier_name : str
        Classifier the keyword generator uses on the 'cosineSim' path.

    Returns
    -------
    Predicted labels for the unlabeled tweets; for 'cosineSim', a
    (Y_predicted_both, Y_predicted_pos) pair.
    """
    # Local import: only needed to concatenate the two training series.
    import pandas as pd

    ## News Data
    news_data = datasets.get_news_data(
        folder_name='keyword_data', file_name='annotator_data_dump_with_text')
    news_fields = ['text', 'category']

    ## Tweets Data
    training_tweet_data = datasets.get_tweet_data(file_type='txt', file_name='tweet_truth.txt')
    unlabeled_tweet_data = datasets.get_tweet_data(
        file_type='json', file_name='intermed_dict.json')
    tweet_fields = ['tweet_cmplt', 'class_annotated']

    X_test = unlabeled_tweet_data[tweet_fields[0]]

    # BUG FIX: these branches previously compared `classifier_name` (which is
    # 'svc'/'lr'/'ada_boost'/'cosineSim') against data-source names, so they
    # could never match and X_train/Y_train were only set for the
    # 'tweets_and_news' case. The intent is clearly to select by source.
    if train_data_src == 'news':
        train_data = news_data
        X_train, Y_train = train_data[news_fields[0]], train_data[news_fields[1]]
    elif train_data_src == 'tweets':
        train_data = training_tweet_data
        X_train, Y_train = train_data[tweet_fields[0]], train_data[tweet_fields[1]]
    elif train_data_src == 'tweets_and_news':
        X_train_news, Y_train_news = news_data[news_fields[0]], news_data[news_fields[1]]
        X_train_tweets = training_tweet_data[tweet_fields[0]]
        Y_train_tweets = training_tweet_data[tweet_fields[1]]
        # pd.concat replaces Series.append (removed in pandas 2.0).
        X_train = pd.concat([X_train_tweets, X_train_news])
        Y_train = pd.concat([Y_train_tweets, Y_train_news])

    ## Classify Below
    train_acc = []
    if classifier_name in ['svc', 'lr', 'ada_boost']:
        Y_predicted, curr_train_acc = classifier.classify(
            classifier_name, X_train, Y_train, X_test)
        train_acc.append(curr_train_acc)
        # Moved before the return: the original printed this after both
        # returns, making it unreachable dead code.
        if classifier_name == 'svc':
            print('SVC train Acc : ', mean(train_acc))
        return Y_predicted
    elif classifier_name == 'cosineSim':
        keywords = keyword_generator.keyword_driver(keyword_classifier_name,
                                                    X_train, Y_train,
                                                    num_of_keywords=50)
        Y_predicted_pos, Y_predicted_both = classifier.classify(
            classifier_name, X_test, keywords=keywords)
        return Y_predicted_both, Y_predicted_pos
def main():
    """Run chi-square feature selection over the annotated news data."""
    selector = FeatureSelection()
    news = datasets.get_news_data('keyword_data', 'annotator_data_dump_with_text')
    texts, labels = news['text'].values, news['category'].values
    vectorizer = utility.get_tf_idf_vectorizer(texts)
    tf_idf_matrix = vectorizer.transform(texts)
    binarized_labels = utility.binarize_data(data=labels)
    scores = selector.ChiSquare.chiSquare(tf_idf_matrix, binarized_labels, 50)
    # Print the first chi-square result 20 times, each preceded by a banner
    # (preserved from the original; the loop index is unused).
    for _ in range(20):
        print('%%%%%%%%%%%%%%%%%%%%%%%%%%%')
        print(scores[0])
def validation_model():
    """K-fold cross-validation of the tweet classifiers.

    Builds train/test splits from the configured source(s), runs either a
    supervised classifier ('svc'/'lr'/'ada_boost') or the keyword-based
    'cosineSim' classifier on each fold, and prints mean accuracy (and, for
    cosineSim, F-measure) over the folds. Returns None; results are printed.
    """
    # Local import: only needed to concatenate training series.
    import pandas as pd

    ## Classifier Name
    classifier_name = 'svc'
    keyword_classifier_name = 'svc'
    train_data_src = 'tweets_and_news'
    num_splits = 5

    ## News Data
    news_data = datasets.get_news_data(
        folder_name='keyword_data', file_name='annotator_data_dump_with_text')
    ## Tweet Data
    tweet_data = datasets.get_tweet_data(file_type='txt', file_name='tweet_truth.txt')

    if train_data_src == 'news':
        data = news_data
        field_names = ['text', 'category']
    elif train_data_src == 'tweets':
        data = tweet_data
        field_names = ['tweet_cmplt', 'class_annotated']
    elif train_data_src == 'tweets_and_news':
        # Folds are drawn from the tweets; the news data is appended whole to
        # every training fold (never used for testing).
        data = tweet_data
        data_extra = news_data
        field_names = ['tweet_cmplt', 'class_annotated']
        field_names_extra = ['text', 'category']

    kf = KFold(n_splits=num_splits)
    kf.get_n_splits(data)

    train_acc = []
    test_acc = []
    pos_f_measure = []
    both_f_measure = []
    pos_acc_list = []
    both_acc_list = []

    for train_index, test_index in kf.split(data):
        X_train = data[field_names[0]].iloc[train_index]
        Y_train = data[field_names[1]].iloc[train_index]
        X_test = data[field_names[0]].iloc[test_index]
        Y_test = data[field_names[1]].iloc[test_index]

        if train_data_src == 'tweets_and_news':
            X_extra = data_extra[field_names_extra[0]]
            Y_extra = data_extra[field_names_extra[1]]
            # FIX: pd.concat replaces Series.append (removed in pandas 2.0).
            X_train = pd.concat([X_train, X_extra])
            Y_train = pd.concat([Y_train, Y_extra])

        if classifier_name in ['svc', 'lr', 'ada_boost']:
            Y_predicted, curr_train_acc, curr_test_acc = classifier.classify(
                classifier_name, X_train, Y_train, X_test, Y_test)
            train_acc.append(curr_train_acc)
            test_acc.append(curr_test_acc)
        elif classifier_name == 'cosineSim':
            keywords = keyword_generator.keyword_driver(
                keyword_classifier_name, X_train, Y_train, num_of_keywords=50)
            Y_predicted_pos, Y_predicted_both = classifier.classify(
                classifier_name, X_test, keywords=keywords)
            # Align truth and predictions by the test series' index labels.
            Y_test_list = []
            Y_pred_both_list = []
            Y_pred_pos_list = []
            for i in Y_test.keys():
                # FIX: Series.at replaces Series.get_value (removed in pandas 1.0).
                Y_test_list.append(Y_test.at[i])
                Y_pred_pos_list.append(Y_predicted_pos[i])
                Y_pred_both_list.append(Y_predicted_both[i])
            Y_test_binary = utility.binarize_data(Y_test_list)
            Y_pred_pos_binary = utility.binarize_data(Y_pred_pos_list)
            Y_pred_both_binary = utility.binarize_data(Y_pred_both_list)
            both_acc_list.append(
                ca.calculate_accuracy(Y_predicted_both, Y_test))
            both_f_measure.append(
                cf.calculate_f_measure(Y_test_binary[1], Y_pred_both_binary[1]))
            pos_acc_list.append(ca.calculate_accuracy(Y_predicted_pos, Y_test))
            pos_f_measure.append(
                cf.calculate_f_measure(Y_test_binary[1], Y_pred_pos_binary[1]))

    # Summary over all folds.
    if classifier_name == 'svc':
        print('SVC train Acc : ', mean(train_acc))
        print('SVC test Acc : ', mean(test_acc))
    elif classifier_name == 'cosineSim':
        print('cosineSim POS Acc : ', mean(pos_acc_list))
        print('cosineSim BOTH Acc : ', mean(both_acc_list))
        print('cosineSim POS F : ', mean(pos_f_measure))
        print('cosineSim BOTH F : ', mean(both_f_measure))
plt.show() if __name__ == '__main__': from lrf.utilities import datasets, utility import numpy as np from lrf.keyword_generator import keyword_generator from lrf.configs import lrf_config import os import json locations = lrf_config.get_locations() risk_cat_file = os.path.join(locations['INTERMED_DATA_PATH'], 'risk_category_file.json') news_data = datasets.get_news_data('keyword_data', 'annotator_data_dump_with_text') train_data_news, test_data_news = utility.split_data(news_data) field_names_news = ['text', 'category'] tweet_data = datasets.get_tweet_data('txt', 'tweet_truth.txt') train_data_tweets, test_data_tweets = utility.split_data(tweet_data) field_names_tweets = ['tweet_cmplt', 'class_annotated'] X_train_data = np.append(train_data_news[field_names_news[0]].values, train_data_tweets[field_names_tweets[0]].values) Y_train_data = np.append(train_data_news[field_names_news[1]].values, train_data_tweets[field_names_tweets[1]].values) category_keywords = keyword_generator.keyword_driver('svc', X_train_data, Y_train_data,
    # NOTE(review): the two lines below are the tail of a function whose `def`
    # is outside this view; indentation is reconstructed — confirm against the
    # full file.
    print('############ CLASSIFICATION COMPLETE ##################')
    return classified_tweets


#################### Main ############################
if __name__=='__main__':
    from lrf.utilities import datasets

    # NOTE(review): file_type/file_name are assigned but the loaders below are
    # called with hard-coded arguments — these two appear unused.
    file_type = 'txt'
    file_name = 'tweet'
    raw_tweets = datasets.get_tweet_data(file_type='txt', file_name='tweet_truth.txt')
    raw_news = datasets.get_news_data(folder_name='keyword_data',file_name='annotator_data_dump_with_text')

    # Both paths point at the same directory: models are read from and results
    # written to classifier_data_n_model/.
    model_dump_path = '../../classifier_data_n_model/'
    output_path = '../../classifier_data_n_model/'

    ## Training the Model
    # train_model(raw_tweets,raw_news,model_dump_path,classifier_type='unsupervised')

    ## Classify new tweets data
    classified_tweets = classify_tweets(raw_tweets['tweet_cmplt'],model_dump_path, output_path,classifier_type='unsupervised')

    ## Dumping the results
    # NOTE(review): sklearn.externals.joblib was removed in scikit-learn 0.23;
    # modern code imports joblib directly. Left unchanged here.
    from sklearn.externals import joblib
    result = joblib.load(os.path.join(output_path,'classified_tweets.pkl'))
def main():
    """K-fold experiment driver comparing ada_boost / SVC / cosineSim on tweets.

    NOTE(review): this function contains leftover debugging scaffolding — an
    ``exit(0)`` fires inside the first fold, so everything after it never runs.
    Indentation below is reconstructed from a whitespace-mangled source;
    confirm loop boundaries against the original file.
    """
    news_dict = datasets.get_news_data(
        folder_name='keyword_data', file_name='annotator_data_dump_with_text')
    category_names = ['tweet_cmplt', 'class_annotated']
    category_names_news = ['text', 'category']
    twitter_dict = datasets.get_tweet_data('txt', 'tweet_truth.txt')

    # 5-fold split over the tweet data; news data is appended to every
    # training fold below.
    kf = KFold(n_splits=5)
    kf.get_n_splits(twitter_dict)

    # Accumulators for per-fold metrics.
    some_dict = {}
    train_acc = []
    test_acc = []
    acc_both = []
    f_both = []
    acc_pos = []
    f_pos = []
    ada_test_list = []
    ada_train_list = []
    news_train = news_dict['text']
    news_class = news_dict['category']

    for train_index, test_index in kf.split(twitter_dict):
        print('%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%')
        X_train = twitter_dict['tweet_cmplt'].iloc[train_index]
        Y_train = twitter_dict['class_annotated'].iloc[train_index]
        X_test = twitter_dict['tweet_cmplt'].iloc[test_index]
        Y_test = twitter_dict['class_annotated'].iloc[test_index]

        # NOTE(review): Series.append was removed in pandas 2.0; this only
        # works on older pandas.
        some_dict['tweet_cmplt'] = X_train.append(news_train)
        some_dict['class_annotated'] = Y_train.append(news_class)

        ada_predicted, ada_train_acc, ada_test_acc = classify(
            'ada_boost', some_dict['tweet_cmplt'],
            some_dict['class_annotated'], X_test, Y_test)
        ada_train_list.append(ada_train_acc)
        ada_test_list.append(ada_test_acc)

        # NOTE(review): debug short-circuit — terminates the process during
        # the first fold; all code below is unreachable as written.
        exit(0)

        print('ada_train_list : ', ada_train_list)
        print('ada_test_list : ', ada_test_list)

        keywords = keyword_generator.keyword_driver(
            'svc', some_dict['tweet_cmplt'], some_dict['class_annotated'],
            num_of_keywords=50)
        for item in keywords:
            print(item, ' : ', keywords[item])

        predicted, curr_train_acc, curr_test_acc = classify(
            'svc', some_dict['tweet_cmplt'], some_dict['class_annotated'],
            X_test, Y_test)
        train_acc.append(curr_train_acc)
        test_acc.append(curr_test_acc)
        print('train_acc SVC: ', train_acc)
        print('test_acc SVC: ', test_acc)

        # NOTE(review): the call producing Y_pred_pos/Y_pred_both is commented
        # out, so the loop below would raise NameError if it were reachable.
        # Y_pred_pos, Y_pred_both = classify('cosineSim', X_test ,keywords = keywords)
        Y_test_list = []
        Y_pred_both_list = []
        Y_pred_pos_list = []
        for i in Y_test.keys():
            # NOTE(review): Series.get_value was removed in pandas 1.0.
            Y_test_list.append(Y_test.get_value(i))
            Y_pred_pos_list.append(Y_pred_pos[i])
            Y_pred_both_list.append(Y_pred_both[i])
        Y_test_binary = utility.binarize_data(Y_test_list)
        Y_pred_pos_binary = utility.binarize_data(Y_pred_pos_list)
        Y_pred_both_binary = utility.binarize_data(Y_pred_both_list)
        acc_both.append(ca.calculate_accuracy(Y_pred_both, Y_test))
        f_both.append(
            cf.calculate_f_measure(Y_test_binary[1], Y_pred_both_binary[1]))
        acc_pos.append(ca.calculate_accuracy(Y_pred_pos, Y_test))
        f_pos.append(
            cf.calculate_f_measure(Y_test_binary[1], Y_pred_pos_binary[1]))

    # Fold-averaged summaries (unreachable due to the exit(0) above).
    print('################################ BOTH')
    print('acc_both : ', mean(acc_both))
    print('f_both : ', mean(f_both))
    print('################################ POS')
    print('acc_pos : ', mean(acc_pos))
    print('f_pos : ', mean(f_pos))
    print('############################### SVC')
    print('Train_Accuracy : ', mean(train_acc))
    print('Test_Accuracy : ', mean(test_acc))
    print('############################### ADA_Boost')
    print('Train_Accuracy : ', mean(ada_train_list))
    print('Test_Accuracy : ', mean(ada_test_list))
    exit(0)

    # TWEET DATA
    # NOTE(review): dead code after the second exit(0) — a simple
    # train/test-split run of the 'lr' classifier.
    twitter_dict = datasets.get_tweet_data('txt', 'tweet_truth.txt')
    train_data, test_data = utility.split_data(twitter_dict)
    category_names = ['tweet_cmplt', 'class_annotated']
    #category_names_tweet = ['tweet_word_list', 'class_annotated']
    predicted_data, train_acc, test_acc = classify(
        'lr', train_data[category_names[0]], train_data[category_names[1]],
        test_data[category_names[0]], test_data[category_names[1]])
    #predicted_data, train_acc, test_acc = classify('svc', news_dict[category_names_news[0]], news_dict[category_names_news[1]],
    #                                               twitter_dict[category_names_tweet[0]], twitter_dict[category_names_tweet[1]])
    # print(predicted_data)
    print('train_acc : ', train_acc)
    print('test_acc : ', test_acc)