def classify(classifier_type, X_train, Y_train=None, X_test=None, Y_test=None, keywords=None, class_map=None, is_X_text=True):
    """Binarize the train/test labels and classify with the SVC classifier.

    Returns the predicted labels plus train and test accuracy.

    NOTE: despite accepting ``classifier_type``, ``keywords`` and
    ``is_X_text``, this implementation always runs the SVC classifier and
    ignores those parameters (see ``classify_bkp`` for the fuller variant).
    """
    # Only the binarized label matrices are kept; the fitted binarizers
    # are discarded here.
    _, Y_train = utility.binarize_data(Y_train, class_mapping=class_map)
    _, Y_test = utility.binarize_data(Y_test, class_mapping=class_map)
    model = supervised.SupervisedClassifier.SvcClassifier()
    return model.classify(X_train, Y_train, X_test, Y_test)
def keyword_driver(classifier_type, X_train, Y_train, num_of_keywords=50):
    """Fit a classifier of the given type on TF-IDF features of ``X_train``
    and return its top ``num_of_keywords`` keywords per class.
    """
    # If the documents arrive pre-tokenized (lists of words), join them
    # back into plain strings before vectorizing.
    if all(isinstance(doc, list) for doc in X_train):
        X_train = utility.get_str_from_list(X_train)

    label_binarizer = utility.get_multilabel_binarizer(lrf_config.get_class_map())
    binarized = utility.binarize_data(Y_train)

    vectorizer = utility.get_tf_idf_vectorizer(X_train)
    features = vectorizer.transform(X_train)

    # binarize_data returns (fitted_binarizer, label_matrix); the model
    # trains on the label matrix only.
    fitted_model = classifier.get_classification_model(classifier_type, features, binarized[1])
    return get_keywords(X_train, fitted_model, label_binarizer, num_of_keywords=num_of_keywords)
def main():
    """Rank TF-IDF features of the news corpus by chi-square score and
    print the top entry repeatedly (quick eyeball check)."""
    selector = FeatureSelection()
    corpus = datasets.get_news_data('keyword_data', 'annotator_data_dump_with_text')
    texts, labels = corpus['text'].values, corpus['category'].values

    vectorizer = utility.get_tf_idf_vectorizer(texts)
    features = vectorizer.transform(texts)
    labels_binary = utility.binarize_data(data=labels)

    ranked = selector.ChiSquare.chiSquare(features, labels_binary, 50)
    # Prints the same first-ranked entry 20 times, separated by markers.
    for _ in range(20):
        print('%%%%%%%%%%%%%%%%%%%%%%%%%%%')
        print(ranked[0])
def main():
    """Train an SVC on the news corpus and print its extracted keywords."""
    print("main_code goes here")
    model_kind = 'svc'

    corpus = datasets.get_news_data('keyword_data', 'annotator_data_dump_with_text')
    train_split, test_split = utility.split_data(corpus)
    X_train = train_split['text']
    Y_train = train_split['category']

    label_binarizer = utility.get_multilabel_binarizer(lrf_config.get_class_map())
    binary_labels = utility.binarize_data(Y_train)

    vectorizer = utility.get_tf_idf_vectorizer(X_train)
    train_features = vectorizer.transform(X_train)

    # NOTE: the full binarize_data result (a tuple) is passed through here,
    # unlike keyword_driver which passes only element [1].
    fitted = classifier.get_classification_model(model_kind, train_features, binary_labels)
    extracted = get_keywords(X_train, fitted, label_binarizer)
    print(extracted)
def validation_model():
    """K-fold cross-validate a classifier on tweet and/or news data.

    Configuration is hard-coded at the top of the function: the classifier
    to evaluate, the classifier used for keyword generation (cosineSim
    path), the training-data source, and the number of folds.  Prints mean
    accuracies (and F-measures for the cosineSim path) over the folds.
    """
    ## Classifier Name
    classifier_name = 'svc'
    keyword_classifier_name = 'svc'
    train_data_src = 'tweets_and_news'
    num_splits = 5

    ## News Data
    news_data = datasets.get_news_data(
        folder_name='keyword_data', file_name='annotator_data_dump_with_text')

    ## Tweet Data
    tweet_data = datasets.get_tweet_data(file_type='txt', file_name='tweet_truth.txt')

    if train_data_src == 'news':
        data = news_data
        field_names = ['text', 'category']
    elif train_data_src == 'tweets':
        data = tweet_data
        field_names = ['tweet_cmplt', 'class_annotated']
    elif train_data_src == 'tweets_and_news':
        # K-fold split runs over tweets; the whole news corpus is appended
        # to each fold's training set.
        data = tweet_data
        data_extra = news_data
        field_names = ['tweet_cmplt', 'class_annotated']
        field_names_extra = ['text', 'category']

    kf = KFold(n_splits=num_splits)
    kf.get_n_splits(data)

    train_acc = []
    test_acc = []
    pos_f_measure = []
    both_f_measure = []
    pos_acc_list = []
    both_acc_list = []

    for train_index, test_index in kf.split(data):
        X_train = data[field_names[0]].iloc[train_index]
        Y_train = data[field_names[1]].iloc[train_index]
        X_test = data[field_names[0]].iloc[test_index]
        Y_test = data[field_names[1]].iloc[test_index]

        if train_data_src == 'tweets_and_news':
            X_extra = data_extra[field_names_extra[0]]
            Y_extra = data_extra[field_names_extra[1]]
            # FIX: Series.append was deprecated in pandas 1.4 and removed in
            # 2.0; pd.concat is the drop-in replacement.
            X_train = pd.concat([X_train, X_extra])
            Y_train = pd.concat([Y_train, Y_extra])

        if classifier_name in ['svc', 'lr', 'ada_boost']:
            Y_predicted, curr_train_acc, curr_test_acc = classifier.classify(
                classifier_name, X_train, Y_train, X_test, Y_test)
            train_acc.append(curr_train_acc)
            test_acc.append(curr_test_acc)
        elif classifier_name == 'cosineSim':
            keywords = keyword_generator.keyword_driver(
                keyword_classifier_name, X_train, Y_train, num_of_keywords=50)
            Y_predicted_pos, Y_predicted_both = classifier.classify(
                classifier_name, X_test, keywords=keywords)

            Y_test_list = []
            Y_pred_both_list = []
            Y_pred_pos_list = []
            for i in Y_test.keys():
                # FIX: Series.get_value was removed in pandas 1.0; .loc is
                # the documented replacement for label-based scalar access.
                Y_test_list.append(Y_test.loc[i])
                Y_pred_pos_list.append(Y_predicted_pos[i])
                Y_pred_both_list.append(Y_predicted_both[i])

            # binarize_data returns (fitted_binarizer, label_matrix);
            # only the matrices ([1]) feed the F-measure computation.
            Y_test_binary = utility.binarize_data(Y_test_list)
            Y_pred_pos_binary = utility.binarize_data(Y_pred_pos_list)
            Y_pred_both_binary = utility.binarize_data(Y_pred_both_list)

            both_acc_list.append(ca.calculate_accuracy(Y_predicted_both, Y_test))
            both_f_measure.append(
                cf.calculate_f_measure(Y_test_binary[1], Y_pred_both_binary[1]))
            pos_acc_list.append(ca.calculate_accuracy(Y_predicted_pos, Y_test))
            pos_f_measure.append(
                cf.calculate_f_measure(Y_test_binary[1], Y_pred_pos_binary[1]))

    if classifier_name == 'svc':
        print('SVC train Acc : ', mean(train_acc))
        print('SVC test Acc : ', mean(test_acc))
    elif classifier_name == 'cosineSim':
        print('cosineSim POS Acc : ', mean(pos_acc_list))
        print('cosineSim BOTH Acc : ', mean(both_acc_list))
        print('cosineSim POS F : ', mean(pos_f_measure))
        print('cosineSim BOTH F : ', mean(both_f_measure))
# FIX: this block contained unresolved git merge conflict markers
# (<<<<<<< HEAD / ======= / >>>>>>> 9af9d4b) which made the file
# unparseable.  Both sides were identical except for whitespace around
# arguments; the PEP8-spaced HEAD side is kept.  The dangling
# `if __name__=='__main__':` guard at the end had no suite (also a syntax
# error); it now calls main(), matching the module's entry-point pattern.
print("main_code goes here")
classifier_type = 'svc'
news_data = datasets.get_news_data('keyword_data', 'annotator_data_dump_with_text')
train_data, test_data = utility.split_data(news_data)
X_train = train_data['text']
Y_train = train_data['category']
binarizer = utility.get_multilabel_binarizer(lrf_config.get_class_map())
Y_train_binary = utility.binarize_data(Y_train)
tfidf_vectorizer = utility.get_tf_idf_vectorizer(X_train)
X_tfidf = tfidf_vectorizer.transform(X_train)
model = classifier.get_classification_model(classifier_type, X_tfidf, Y_train_binary)
h = get_keywords(X_train, model, binarizer)
print(h)

#############################
if __name__ == '__main__':
    main()
def classify_bkp(classifier_type, X_train, Y_train=None, X_test=None, Y_test=None, keywords=None, class_map=None, is_X_text=True):
    """Dispatch classification to a supervised or unsupervised classifier.

    Supervised types ('svc', 'lr', 'ada_boost') require ``Y_train``; if no
    test data is supplied, the training data is split into train/test.
    Text input (``is_X_text``) is TF-IDF vectorized before training.

    Returns:
        Supervised, with Y_test:    (Y_pred, train_acc, test_acc)
        Supervised, without Y_test: (Y_pred, train_acc)
        'cosineSim':                (Y_pred_pos, Y_pred_both)

    Raises:
        ValueError: if a supervised type is requested without Y_train.
    """
    if (classifier_type in ['svc', 'lr', 'ada_boost']):
        if Y_train is None:
            raise ValueError(
                classifier_type,
                ' is a Supervised Algorithm, pass training labels ...')
        elif X_test is None and Y_test is None:
            train_data = zip(X_train, Y_train)
            train_data, test_data = sklearn.model_selection.train_test_split(
                pd.DataFrame.from_records(train_data))
            X_train, Y_train = train_data[0], train_data[1]
            X_test, Y_test = test_data[0], test_data[1]
            print(
                'Since no TEST Data provided, splitting given data into train and test'
            )
        # Join tokenized documents back into strings for vectorization.
        X_train = utility.get_str_from_list(X_train)
        X_test = utility.get_str_from_list(X_test)

        if class_map is not None:
            fitted_binarizer, Y_train_binary = utility.binarize_data(
                Y_train, class_mapping=class_map)
        else:
            fitted_binarizer, Y_train_binary = utility.binarize_data(Y_train)
        if Y_test is not None:
            f, Y_test_binary = utility.binarize_data(Y_test, class_mapping=class_map)

        if is_X_text == True:
            # Vectorizer is fitted on the training text only.
            tf_idf_vectorizer = utility.get_tf_idf_vectorizer(X_train)
            X_train_tf_idf = tf_idf_vectorizer.transform(X_train)
            X_test_tf_idf = tf_idf_vectorizer.transform(X_test)
        else:
            # Caller already supplies feature matrices.
            X_train_tf_idf = X_train
            X_test_tf_idf = X_test

        if classifier_type == 'svc':
            svc_class = supervised.SupervisedClassifier.SvcClassifier()
            if Y_test is not None:
                Y_pred, train_acc, test_acc = svc_class.classify(
                    X_train_tf_idf, Y_train_binary, X_test_tf_idf, Y_test_binary)
                return Y_pred, train_acc, test_acc
            else:
                Y_pred, train_acc = svc_class.classify(X_train_tf_idf,
                                                       Y_train_binary,
                                                       X_test_tf_idf)
                return Y_pred, train_acc
            # FIX: removed unreachable
            # `return fitted_binarizer.inverse_transform(Y_pred), train_acc, test_acc`
            # — both branches above already return, so predictions are
            # always returned in binarized form.
        elif classifier_type == 'lr':
            lr_class = supervised.SupervisedClassifier.LogisticRClassifier()
            if Y_test is not None:
                Y_pred, train_acc, test_acc = lr_class.classify(
                    X_train_tf_idf, Y_train_binary, X_test_tf_idf, Y_test_binary)
                return Y_pred, train_acc, test_acc
            else:
                Y_pred, train_acc = lr_class.classify(X_train_tf_idf,
                                                      Y_train_binary,
                                                      X_test_tf_idf)
                return Y_pred, train_acc
        elif classifier_type == 'ada_boost':
            ada_boost_class = supervised.SupervisedClassifier.AdaBoostClassifier()
            if Y_test is not None:
                Y_pred, train_acc, test_acc = ada_boost_class.classify(
                    X_train_tf_idf, Y_train_binary, X_test_tf_idf, Y_test_binary)
                return Y_pred, train_acc, test_acc
            else:
                Y_pred, train_acc = ada_boost_class.classify(
                    X_train_tf_idf, Y_train_binary, X_test_tf_idf)
                return Y_pred, train_acc
    elif classifier_type == 'cosineSim':
        cosine_sim_class = unsupervised.UnsupervisedClassifiers.CosineSimilarity()
        Y_pred_pos, Y_pred_both = cosine_sim_class.classify(
            X_train, keywords, vector_type='word_embeddings')
        return Y_pred_pos, Y_pred_both
def main():
    """K-fold experiment over tweet data, augmenting each fold's training
    set with the full news corpus; compares AdaBoost and SVC (and, in the
    currently dead tail, cosine-similarity keyword classification).

    NOTE(review): this function is experimental scaffolding — an exit(0)
    inside the first fold stops the process, so everything after it never
    runs.  Left byte-identical; comments only.
    """
    news_dict = datasets.get_news_data(
        folder_name='keyword_data', file_name='annotator_data_dump_with_text')
    # train_data, test_data = utility.split_data(news_dict)
    category_names = ['text', 'category']
    category_names_news = ['text', 'category']
    # X = news_dict[category_names[0]]
    # y = news_dict[category_names[1]]
    twitter_dict = datasets.get_tweet_data('txt', 'tweet_truth.txt')
    kf = KFold(n_splits=5)
    kf.get_n_splits(twitter_dict)
    # Per-fold accumulators.
    some_dict = {}
    train_acc = []
    test_acc = []
    acc_both = []
    f_both = []
    acc_pos = []
    f_pos = []
    ada_test_list = []
    ada_train_list = []
    for train_index, test_index in kf.split(twitter_dict):
        print('%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%')
        X_train = twitter_dict['tweet_cmplt'].iloc[train_index]
        Y_train = twitter_dict['class_annotated'].iloc[train_index]
        X_test = twitter_dict['tweet_cmplt'].iloc[test_index]
        Y_test = twitter_dict['class_annotated'].iloc[test_index]
        news_train = news_dict['text']
        news_class = news_dict['category']
        # Augment the tweet training fold with the whole news corpus.
        # NOTE(review): Series.append was removed in pandas 2.0 — this
        # requires pandas < 2 (pd.concat would be the modern equivalent).
        some_dict['tweet_cmplt'] = X_train.append(news_train)
        some_dict['class_annotated'] = Y_train.append(news_class)
        ada_predicted, ada_train_acc, ada_test_acc = classify(
            'ada_boost', some_dict['tweet_cmplt'],
            some_dict['class_annotated'], X_test, Y_test)
        ada_train_list.append(ada_train_acc)
        ada_test_list.append(ada_test_acc)
        print('ada_train_list : ', ada_train_list)
        print('ada_test_list : ', ada_test_list)
        # NOTE(review): keyword_driver is called here with a dict and a
        # list of field names, but its signature is
        # (classifier_type, X_train, Y_train, num_of_keywords) — verify
        # which keyword_driver variant this was written against.
        keywords = keyword_generator.keyword_driver(
            'svc', some_dict, ['tweet_cmplt', 'class_annotated'],
            num_of_keywords=50)
        for item in keywords:
            print(item, ' : ', keywords[item])
        predicted, curr_train_acc, curr_test_acc = classify(
            'svc', some_dict['tweet_cmplt'], some_dict['class_annotated'],
            X_test, Y_test)
        train_acc.append(curr_train_acc)
        test_acc.append(curr_test_acc)
        # NOTE(review): hard stop after the first fold — all code below is
        # unreachable as written.
        exit(0)
        print('train_acc SVC: ', train_acc)
        print('test_acc SVC: ', test_acc)
        # Y_pred_pos, Y_pred_both = classify('cosineSim', X_test ,keywords = keywords)
        Y_test_list = []
        Y_pred_both_list = []
        Y_pred_pos_list = []
        for i in Y_test.keys():
            # NOTE(review): Series.get_value was removed in pandas 1.0;
            # also Y_pred_pos / Y_pred_both are undefined because the
            # cosineSim classify call above is commented out — this dead
            # code would raise NameError if ever reached.
            Y_test_list.append(Y_test.get_value(i))
            Y_pred_pos_list.append(Y_pred_pos[i])
            Y_pred_both_list.append(Y_pred_both[i])
        Y_test_binary = utility.binarize_data(Y_test_list)
        Y_pred_pos_binary = utility.binarize_data(Y_pred_pos_list)
        Y_pred_both_binary = utility.binarize_data(Y_pred_both_list)
        acc_both.append(ca.calculate_accuracy(Y_pred_both, Y_test))
        f_both.append(
            cf.calculate_f_measure(Y_test_binary[1], Y_pred_both_binary[1]))
        acc_pos.append(ca.calculate_accuracy(Y_pred_pos, Y_test))
        f_pos.append(
            cf.calculate_f_measure(Y_test_binary[1], Y_pred_pos_binary[1]))
    print('################################ BOTH')
    print('acc_both : ', mean(acc_both))
    print('f_both : ', mean(f_both))
    print('################################ POS')
    print('acc_pos : ', mean(acc_pos))
    print('f_pos : ', mean(f_pos))
    print('############################### SVC')
    print('Train_Accuracy : ', mean(train_acc))
    print('Test_Accuracy : ', mean(test_acc))
    print('############################### ADA_Boost')
    print('Train_Accuracy : ', mean(ada_train_list))
    print('Test_Accuracy : ', mean(ada_test_list))
    exit(0)
    # TWEET DATA  (unreachable: second exit(0) above)
    twitter_dict = datasets.get_tweet_data('txt', 'tweet_truth.txt')
    train_data, test_data = utility.split_data(twitter_dict)
    category_names = ['tweet_cmplt', 'class_annotated']
    #category_names_tweet = ['tweet_word_list', 'class_annotated']
    predicted_data, train_acc, test_acc = classify(
        'lr', train_data[category_names[0]], train_data[category_names[1]],
        test_data[category_names[0]], test_data[category_names[1]])
    #predicted_data, train_acc, test_acc = classify('svc', news_dict[category_names_news[0]], news_dict[category_names_news[1]],
    #                                               twitter_dict[category_names_tweet[0]], twitter_dict[category_names_tweet[1]])
    # print(predicted_data)
    print('train_acc : ', train_acc)
    print('test_acc : ', test_acc)