def process_tweets(classifier_name='svc',
                   train_data_src='tweets_and_news',
                   keyword_classifier_name='svc'):
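    """Label the unlabeled tweets in ``intermed_dict.json``.

    The training data are chosen via ``train_data_src`` ('news', 'tweets', or
    'tweets_and_news'). For 'svc'/'lr'/'ada_boost' the predicted labels are
    returned directly; for 'cosineSim', category keywords are generated first
    (using ``keyword_classifier_name``) and both prediction variants are
    returned.
    """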

    ## News Data
    news_data = datasets.get_news_data(
        folder_name='keyword_data', file_name='annotator_data_dump_with_text')
    news_fields = ['text', 'category']

    ## Tweets Data
    training_tweet_data = datasets.get_tweet_data(file_type='txt',
                                                  file_name='tweet_truth.txt')
    unlabeled_tweet_data = datasets.get_tweet_data(
        file_type='json', file_name='intermed_dict.json')
    tweet_fields = ['tweet_cmplt', 'class_annotated']

    X_test = unlabeled_tweet_data[tweet_fields[0]]

    if train_data_src == 'news':
        train_data = news_data
        X_train, Y_train = train_data[news_fields[0]], train_data[
            news_fields[1]]

    elif train_data_src == 'tweets':
        train_data = training_tweet_data
        X_train, Y_train = train_data[tweet_fields[0]], train_data[
            tweet_fields[1]]

    elif train_data_src == 'tweets_and_news':
        X_train_news, Y_train_news = news_data[news_fields[0]], news_data[
            news_fields[1]]
        X_train_tweets, Y_train_tweets = training_tweet_data[
            tweet_fields[0]], training_tweet_data[tweet_fields[1]]

        X_train = X_train_tweets.append(X_train_news)
        Y_train = Y_train_tweets.append(Y_train_news)

    train_acc = []

    ## Classify Below
    if classifier_name in ['svc', 'lr', 'ada_boost']:
        Y_predicted, curr_train_acc = classifier.classify(
            classifier_name, X_train, Y_train, X_test)
        train_acc.append(curr_train_acc)

        if classifier_name == 'svc':
            print('SVC train Acc : ', mean(train_acc))

        return Y_predicted

    elif classifier_name == 'cosineSim':

        keywords = keyword_generator.keyword_driver(keyword_classifier_name,
                                                    X_train,
                                                    Y_train,
                                                    num_of_keywords=50)
        Y_predicted_pos, Y_predicted_both = classifier.classify(
            classifier_name, X_test, keywords=keywords)

        return Y_predicted_both, Y_predicted_pos


def main():
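    """Score the TF-IDF features of the news data against the binarized
    categories with a chi-square test and print a sample of the results."""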

    fs = FeatureSelection()

    data = datasets.get_news_data('keyword_data',
                                  'annotator_data_dump_with_text')

    X_data, Y_data = data['text'].values, data['category'].values

    tf_idf_vectorizer = utility.get_tf_idf_vectorizer(X_data)

    X_data_tf = tf_idf_vectorizer.transform(X_data)

    Y_binary = utility.binarize_data(data=Y_data)

    res = fs.ChiSquare.chiSquare(X_data_tf, Y_binary, 50)

    for i in range(20):
        print('%%%%%%%%%%%%%%%%%%%%%%%%%%%')
        print(res[i])


def validation_model():
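    """Cross-validate the configured classifier on the labeled tweets.

    Splits the tweet data with k-fold, optionally augments each training fold
    with the news data, and prints mean train/test accuracy (or accuracy and
    F-measure for the cosine-similarity classifier).
    """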
    ## Classifier Name
    classifier_name = 'svc'
    keyword_classifier_name = 'svc'
    train_data_src = 'tweets_and_news'
    num_splits = 5

    ## News Data
    news_data = datasets.get_news_data(
        folder_name='keyword_data', file_name='annotator_data_dump_with_text')

    ## Tweet Data
    tweet_data = datasets.get_tweet_data(file_type='txt',
                                         file_name='tweet_truth.txt')

    if train_data_src == 'news':
        data = news_data
        field_names = ['text', 'category']

    elif train_data_src == 'tweets':
        data = tweet_data
        field_names = ['tweet_cmplt', 'class_annotated']

    elif train_data_src == 'tweets_and_news':
        data = tweet_data
        data_extra = news_data
        field_names = ['tweet_cmplt', 'class_annotated']
        field_names_extra = ['text', 'category']

    kf = KFold(n_splits=num_splits)
    kf.get_n_splits(data)
    train_acc = []
    test_acc = []

    pos_f_measure = []
    both_f_measure = []
    pos_acc_list = []
    both_acc_list = []

    for train_index, test_index in kf.split(data):
        X_train = data[field_names[0]].iloc[train_index]
        Y_train = data[field_names[1]].iloc[train_index]
        X_test = data[field_names[0]].iloc[test_index]
        Y_test = data[field_names[1]].iloc[test_index]

        if train_data_src == 'tweets_and_news':
            X_extra = data_extra[field_names_extra[0]]
            Y_extra = data_extra[field_names_extra[1]]

            X_train = X_train.append(X_extra)
            Y_train = Y_train.append(Y_extra)

        if classifier_name in ['svc', 'lr', 'ada_boost']:
            Y_predicted, curr_train_acc, curr_test_acc = classifier.classify(
                classifier_name, X_train, Y_train, X_test, Y_test)
            train_acc.append(curr_train_acc)
            test_acc.append(curr_test_acc)

        elif classifier_name == 'cosineSim':

            keywords = keyword_generator.keyword_driver(
                keyword_classifier_name, X_train, Y_train, num_of_keywords=50)
            Y_predicted_pos, Y_predicted_both = classifier.classify(
                classifier_name, X_test, keywords=keywords)

            Y_test_list = []
            Y_pred_both_list = []
            Y_pred_pos_list = []

            for i in Y_test.keys():
                Y_test_list.append(Y_test.loc[i])
                Y_pred_pos_list.append(Y_predicted_pos[i])
                Y_pred_both_list.append(Y_predicted_both[i])

            Y_test_binary = utility.binarize_data(Y_test_list)
            Y_pred_pos_binary = utility.binarize_data(Y_pred_pos_list)
            Y_pred_both_binary = utility.binarize_data(Y_pred_both_list)

            both_acc_list.append(
                ca.calculate_accuracy(Y_predicted_both, Y_test))
            both_f_measure.append(
                cf.calculate_f_measure(Y_test_binary[1],
                                       Y_pred_both_binary[1]))
            pos_acc_list.append(ca.calculate_accuracy(Y_predicted_pos, Y_test))
            pos_f_measure.append(
                cf.calculate_f_measure(Y_test_binary[1], Y_pred_pos_binary[1]))

    if classifier_name == 'svc':
        print('SVC train Acc : ', mean(train_acc))
        print('SVC test Acc : ', mean(test_acc))

    elif classifier_name == 'cosineSim':

        print('cosineSim POS Acc : ', mean(pos_acc_list))
        print('cosineSim BOTH Acc : ', mean(both_acc_list))
        print('cosineSim POS F : ', mean(pos_f_measure))
        print('cosineSim BOTH F : ', mean(both_f_measure))


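# Example entry point: combine the news and tweet training data and generate
# per-category keywords with the SVC-based keyword generator.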
if __name__ == '__main__':
    from lrf.utilities import datasets, utility
    import numpy as np
    from lrf.keyword_generator import keyword_generator
    from lrf.configs import lrf_config
    import os
    import json

    locations = lrf_config.get_locations()
    risk_cat_file = os.path.join(locations['INTERMED_DATA_PATH'],
                                 'risk_category_file.json')

    news_data = datasets.get_news_data('keyword_data',
                                       'annotator_data_dump_with_text')
    train_data_news, test_data_news = utility.split_data(news_data)
    field_names_news = ['text', 'category']

    tweet_data = datasets.get_tweet_data('txt', 'tweet_truth.txt')
    train_data_tweets, test_data_tweets = utility.split_data(tweet_data)
    field_names_tweets = ['tweet_cmplt', 'class_annotated']

    X_train_data = np.append(train_data_news[field_names_news[0]].values,
                             train_data_tweets[field_names_tweets[0]].values)
    Y_train_data = np.append(train_data_news[field_names_news[1]].values,
                             train_data_tweets[field_names_tweets[1]].values)

    category_keywords = keyword_generator.keyword_driver('svc',
                                                         X_train_data,
                                                         Y_train_data,
                                                         num_of_keywords=50)


def classify_tweets(tweet_texts, model_dump_path, output_path, classifier_type='unsupervised'):
    # Signature inferred from the call in the main block below; parameter names
    # are assumed and the classification step itself is not shown here.
    classified_tweets = ...
    print('############ CLASSIFICATION COMPLETE ##################')

    return classified_tweets




#################### Main ############################
if __name__ == '__main__':
    import os
    from lrf.utilities import datasets

    file_type = 'txt'
    file_name = 'tweet_truth.txt'

    raw_tweets = datasets.get_tweet_data(file_type=file_type, file_name=file_name)
    raw_news = datasets.get_news_data(folder_name='keyword_data', file_name='annotator_data_dump_with_text')
    model_dump_path = '../../classifier_data_n_model/'
    output_path = '../../classifier_data_n_model/'

    ## Training the Model

    # train_model(raw_tweets,raw_news,model_dump_path,classifier_type='unsupervised')

    ## Classify new tweets data

    classified_tweets = classify_tweets(raw_tweets['tweet_cmplt'], model_dump_path, output_path, classifier_type='unsupervised')

    ## Dumping the results
    import joblib
    result = joblib.load(os.path.join(output_path, 'classified_tweets.pkl'))


def main():
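    """Cross-validate AdaBoost, SVC and cosine-similarity classifiers on the
    labeled tweets, augmenting each training fold with the news articles, and
    print the averaged accuracies and F-measures."""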
    news_dict = datasets.get_news_data(
        folder_name='keyword_data', file_name='annotator_data_dump_with_text')

    category_names = ['tweet_cmplt', 'class_annotated']
    category_names_news = ['text', 'category']

    twitter_dict = datasets.get_tweet_data('txt', 'tweet_truth.txt')

    kf = KFold(n_splits=5)
    kf.get_n_splits(twitter_dict)

    some_dict = {}
    train_acc = []
    test_acc = []

    acc_both = []
    f_both = []
    acc_pos = []
    f_pos = []

    ada_test_list = []
    ada_train_list = []

    news_train = news_dict['text']
    news_class = news_dict['category']

    for train_index, test_index in kf.split(twitter_dict):
        print('%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%')

        X_train = twitter_dict['tweet_cmplt'].iloc[train_index]
        Y_train = twitter_dict['class_annotated'].iloc[train_index]
        X_test = twitter_dict['tweet_cmplt'].iloc[test_index]
        Y_test = twitter_dict['class_annotated'].iloc[test_index]

        some_dict['tweet_cmplt'] = X_train.append(news_train)
        some_dict['class_annotated'] = Y_train.append(news_class)

        ada_predicted, ada_train_acc, ada_test_acc = classify(
            'ada_boost', some_dict['tweet_cmplt'],
            some_dict['class_annotated'], X_test, Y_test)

        ada_train_list.append(ada_train_acc)
        ada_test_list.append(ada_test_acc)


        print('ada_train_list : ', ada_train_list)
        print('ada_test_list : ', ada_test_list)

        keywords = keyword_generator.keyword_driver(
            'svc',
            some_dict['tweet_cmplt'],
            some_dict['class_annotated'],
            num_of_keywords=50)

        for item in keywords:
            print(item, ' : ', keywords[item])

        predicted, curr_train_acc, curr_test_acc = classify(
            'svc', some_dict['tweet_cmplt'], some_dict['class_annotated'],
            X_test, Y_test)

        train_acc.append(curr_train_acc)
        test_acc.append(curr_test_acc)

        print('train_acc SVC: ', train_acc)
        print('test_acc SVC: ', test_acc)

        Y_pred_pos, Y_pred_both = classify('cosineSim', X_test, keywords=keywords)

        Y_test_list = []
        Y_pred_both_list = []
        Y_pred_pos_list = []

        for i in Y_test.keys():
            Y_test_list.append(Y_test.loc[i])
            Y_pred_pos_list.append(Y_pred_pos[i])
            Y_pred_both_list.append(Y_pred_both[i])

        Y_test_binary = utility.binarize_data(Y_test_list)
        Y_pred_pos_binary = utility.binarize_data(Y_pred_pos_list)
        Y_pred_both_binary = utility.binarize_data(Y_pred_both_list)

        acc_both.append(ca.calculate_accuracy(Y_pred_both, Y_test))
        f_both.append(
            cf.calculate_f_measure(Y_test_binary[1], Y_pred_both_binary[1]))
        acc_pos.append(ca.calculate_accuracy(Y_pred_pos, Y_test))
        f_pos.append(
            cf.calculate_f_measure(Y_test_binary[1], Y_pred_pos_binary[1]))

    print('################################ BOTH')
    print('acc_both : ', mean(acc_both))
    print('f_both : ', mean(f_both))
    print('################################ POS')
    print('acc_pos : ', mean(acc_pos))
    print('f_pos : ', mean(f_pos))
    print('############################### SVC')
    print('Train_Accuracy : ', mean(train_acc))
    print('Test_Accuracy : ', mean(test_acc))
    print('############################### ADA_Boost')
    print('Train_Accuracy : ', mean(ada_train_list))
    print('Test_Accuracy : ', mean(ada_test_list))
    exit(0)

    # TWEET DATA
    twitter_dict = datasets.get_tweet_data('txt', 'tweet_truth.txt')
    train_data, test_data = utility.split_data(twitter_dict)
    category_names = ['tweet_cmplt', 'class_annotated']
    #category_names_tweet = ['tweet_word_list', 'class_annotated']

    predicted_data, train_acc, test_acc = classify(
        'lr', train_data[category_names[0]], train_data[category_names[1]],
        test_data[category_names[0]], test_data[category_names[1]])
    #predicted_data, train_acc, test_acc = classify('svc', news_dict[category_names_news[0]], news_dict[category_names_news[1]],
    #                                                   twitter_dict[category_names_tweet[0]], twitter_dict[category_names_tweet[1]])
    # print(predicted_data)

    print('train_acc : ', train_acc)

    print('test_acc : ', test_acc)