Example #1
def logistic_regression_grid(x_train, y_train, x_test, y_test, class_ratio,
                        make_feature_analysis=False, feature_names=None, top_features=0, plot_name="coeff"):
    utils.print_model_title("Logistic Regression")
    C_range = [0.001, 0.01, 0.1, 1, 10, 100]
    parameters = {'C': C_range}
    log_regr = LogisticRegression(C=1.0, class_weight=class_ratio, penalty='l2')
    grid_classifier(x_train, y_train, x_test, y_test, log_regr, parameters,
                    make_feature_analysis, feature_names, top_features, plot_name)
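
Note: Example #1 and the other grid-search snippets below hand a scikit-learn estimator plus a parameter dictionary to the project helper grid_classifier. The excerpts omit their imports; the estimators themselves come from scikit-learn, so the missing header presumably looks roughly like the sketch below (treating grid_classifier as a thin wrapper around GridSearchCV is purely an assumption).

# Imports these grid-search snippets presumably rely on (not shown in the excerpts)
from sklearn.linear_model import LogisticRegression
from sklearn.svm import LinearSVC, SVC
from sklearn.model_selection import GridSearchCV  # assumed backend of the grid_classifier helper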
Example #2
def linear_svm_grid(x_train, y_train, x_test, y_test, class_ratio,
               make_feature_analysis=False, feature_names=None, top_features=0, plot_name="coeff"):
    utils.print_model_title("Linear SVM")
    C_range = get_regularization_params()
    parameters = {'C': C_range}
    linear_svm = LinearSVC(C=1.0, class_weight=class_ratio, penalty='l2')
    grid_classifier(x_train, y_train, x_test, y_test, linear_svm, parameters,
                    make_feature_analysis, feature_names, top_features, plot_name)
Example #3
def nonlinear_svm_grid(x_train, y_train, x_test, y_test, class_ratio,
                  make_feature_analysis=False, feature_names=None, top_features=0, plot_name="coeff"):
    utils.print_model_title("Nonlinear SVM")
    C_range = get_regularization_params(a=-1, b=0, c=2, d=1, e=5)
    gamma_range = get_regularization_params(a=-2, b=-1, c=2, d=1, e=5)
    parameters = {'kernel': ['rbf'], 'C': C_range, 'gamma': gamma_range}
    nonlinear_svm = SVC(class_weight=class_ratio)
    grid_classifier(x_train, y_train, x_test, y_test, nonlinear_svm, parameters,
                    make_feature_analysis, feature_names, top_features, plot_name)
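
Note: get_regularization_params is a project helper whose body is not shown in these excerpts. A plausible reading of its signature, offered purely as an assumption, is that it builds a small search grid spanning a few powers of ten; a minimal sketch under that assumption (the default values are hypothetical):

import numpy as np

def get_regularization_params(a=-3, b=2, c=2, d=1, e=5):
    # Hypothetical reconstruction: combine every power of ten in [10**a, 10**b]
    # with c evenly spaced multipliers between d and e. For a=-1, b=0, c=2,
    # d=1, e=5 this yields [0.1, 0.5, 1.0, 5.0]; the real helper may differ.
    decades = [10.0 ** i for i in range(a, b + 1)]
    multipliers = np.linspace(d, e, c)
    return sorted(float(m * p) for p in decades for m in multipliers)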
Example #4
def logistic_regression(x_train,
                        y_train,
                        x_test,
                        y_test,
                        class_ratio='balanced'):
    utils.print_model_title("Logistic Regression")
    regr = LogisticRegression(C=0.01, class_weight=class_ratio, penalty='l2')
    regr.fit(x_train, y_train)
    y_hat = regr.predict(x_test)
    utils.print_statistics(y_test, y_hat)
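
Note: Example #4 is the simplest snippet in this set: fit an L2-regularized logistic regression with a fixed C and report test-set statistics through the project helper utils.print_statistics. A self-contained equivalent using only scikit-learn, with toy data and classification_report standing in for the helper (what the helper actually prints is an assumption):

from sklearn.datasets import make_classification
from sklearn.linear_model import LogisticRegression
from sklearn.metrics import classification_report
from sklearn.model_selection import train_test_split

# Toy data in place of the tweet features used by the original code
X, y = make_classification(n_samples=500, n_features=20, random_state=0)
x_train, x_test, y_train, y_test = train_test_split(X, y, random_state=0)

regr = LogisticRegression(C=0.01, class_weight='balanced', penalty='l2')
regr.fit(x_train, y_train)
y_hat = regr.predict(x_test)
print(classification_report(y_test, y_hat))  # rough stand-in for utils.print_statistics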
Example #5
def baseline(tweets_train, train_labels, tweets_test, test_labels):
    # Import the subjectivity lexicon
    subj_dict = data_proc.get_subj_lexicon()

    types_of_features = ['1', '2', '3', 'ngrams']
    for t in types_of_features:
        start = time.time()
        utils.print_model_title("Classification using feature type " + t)
        if t == '1':
            x_train_features = extract_baseline_features.get_features1(
                tweets_train, subj_dict)
            x_test_features = extract_baseline_features.get_features1(
                tweets_test, subj_dict)

        if t == '2':
            x_train_features = extract_baseline_features.get_features2(
                tweets_train, subj_dict)
            x_test_features = extract_baseline_features.get_features2(
                tweets_test, subj_dict)

        if t == '3':
            x_train_features = extract_baseline_features.get_features3(
                tweets_train, subj_dict)
            x_test_features = extract_baseline_features.get_features3(
                tweets_test, subj_dict)

        if t == 'ngrams':
            ngram_map, x_train_features = extract_baseline_features.get_ngram_features(
                tweets_train, n=1)
            x_test_features = extract_baseline_features.get_ngram_features_from_map(
                tweets_test, ngram_map, n=1)

        # Get the class ratio
        class_ratio = utils.get_classes_ratio_as_dict(train_labels)

        # Train on a Linear Support Vector Classifier
        print("\nEvaluating a linear SVM model...")
        classifiers.linear_svm(x_train_features, train_labels, x_test_features,
                               test_labels, class_ratio)

        # Train on a Logistic Regression Classifier
        print("\nEvaluating a logistic regression model...")
        classifiers.logistic_regression(x_train_features, train_labels,
                                        x_test_features, test_labels,
                                        class_ratio)
        end = time.time()
        print(
            "Completion time of the baseline model with feature type %s: %.3f s = %.3f min"
            % (t, (end - start), (end - start) / 60.0))
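
Note: the 'ngrams' branch above depends on two project helpers that are not shown: get_ngram_features fits an n-gram vocabulary on the training tweets, and get_ngram_features_from_map reuses that vocabulary on the test tweets so both matrices share the same columns. A minimal sketch of that contract built on scikit-learn's CountVectorizer; the return values and internals are assumptions:

from sklearn.feature_extraction.text import CountVectorizer

def get_ngram_features(tweets, n=1):
    # Fit an n-gram vocabulary on the training tweets and return it
    # together with the training feature matrix (assumed contract).
    vectorizer = CountVectorizer(ngram_range=(1, n))
    features = vectorizer.fit_transform(tweets)
    return vectorizer.vocabulary_, features

def get_ngram_features_from_map(tweets, ngram_map, n=1):
    # Reuse the vocabulary learnt on the training data so the test
    # matrix is aligned column-for-column with the training matrix.
    vectorizer = CountVectorizer(ngram_range=(1, n), vocabulary=ngram_map)
    return vectorizer.transform(tweets)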
Example #6
def baseline(tweets_train, train_labels, tweets_test, test_labels):
    subj_dict = dproc.get_subj_lexicon('hindi_lexicon.tff')
    types_of_features = ['1', '2', 'ngrams']  # '3' is removed

    for t in types_of_features:

        start = time.time()
        utils.print_model_title("Classification using features type " + t)
        if t == '1':
            x_train_features = extract_baseline_features.get_features1(
                tweets_train, subj_dict)
            x_test_features = extract_baseline_features.get_features1(
                tweets_test, subj_dict)

        if t == '2':
            x_train_features = extract_baseline_features.get_features2(
                tweets_train, subj_dict)
            x_test_features = extract_baseline_features.get_features2(
                tweets_test, subj_dict)

        #if t is '3':
        #	x_train_features = extract_baseline_features.get_feature3(tweets_train, subj_dict)
        #	x_test_features = extract_baseline_features.get_feature3(tweets_test, subj_dict)

        if t == 'ngrams':
            ngram_map, x_train_features = extract_baseline_features.get_ngram_features(
                tweets_train, n=1)
            x_test_features = extract_baseline_features.get_ngram_features_from_map(
                tweets_test, ngram_map, n=1)

        # Get the class ratio
        class_ratio = utils.get_classes_ratio_as_dict(train_labels)

        # Train on a linear Support Vector Classifier
        print('\nEvaluating a linear SVM model...')
        classifiers.linear_svm(x_train_features, train_labels, x_test_features,
                               test_labels, class_ratio)

        # Train on a logistic regression classifier
        classifiers.logistic_regression(x_train_features, train_labels,
                                        x_test_features, test_labels,
                                        class_ratio)
        end = time.time()

        print(
            "Completion time of the baseline model with feature type %s: %.3f s = %.3f min"
            % (t, (end - start), (end - start) / 60.0))
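
Note: both baseline variants pass class_ratio, produced by utils.get_classes_ratio_as_dict, as the class_weight of the scikit-learn estimators. The helper's body is not shown; a plausible sketch (an assumption) that yields the {label: weight} dictionary scikit-learn expects:

import numpy as np
from sklearn.utils.class_weight import compute_class_weight

def get_classes_ratio_as_dict(labels):
    # Weight each class inversely to its frequency, as class_weight='balanced'
    # would, but returned as an explicit dictionary.
    labels = np.asarray(labels)
    classes = np.unique(labels)
    weights = compute_class_weight(class_weight='balanced', classes=classes, y=labels)
    return dict(zip(classes, weights))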
Example #7
def logistic_regression_grid(x_train,
                             y_train,
                             x_test,
                             y_test,
                             class_ratio,
                             make_feature_analysis=False,
                             feature_names=None,
                             top_features=0,
                             plot_name='coeff'):
    utils.print_model_title("Logistic Regression")
    C_range = [0.001, 0.01, 0.1, 1, 10, 100]
    #gamma_range = get_regularization_params(a=-2,b=-1,c=2,d=1,e=5)
    parameters = {'C': C_range}
    regr = LogisticRegression(C=1.0, class_weight=class_ratio, penalty='l2')
    grid_classifier(x_train, y_train, x_test, y_test, regr, parameters,
                    make_feature_analysis, feature_names, top_features,
                    plot_name)
Example #8
def run_dl_analysis(train_tweets,
                    test_tweets,
                    y_train,
                    y_test,
                    path,
                    shuffle=True,
                    max_tweet_length=40,
                    emb_type='glove',
                    trainable=True,
                    plot=True,
                    dnn_models=None,
                    epochs=50,
                    batch_size=32,
                    embedding_dim=300,
                    hidden_units=256,
                    dropout=0.5):
    if shuffle:
        train_tweets = utils.shuffle_words(train_tweets)
        test_tweets = utils.shuffle_words(test_tweets)

    # Convert all tweets into sequences of word indices
    tokenizer, train_indices, test_indices = utils.encode_text_as_word_indexes(
        train_tweets, test_tweets, lower=True)
    word_to_index = tokenizer.word_index
    print('There are %s unique tokens.' % len(word_to_index))

    # Pad sequences with 0s
    x_train = pad_sequences(train_indices,
                            maxlen=max_tweet_length,
                            padding='post',
                            truncating='post',
                            value=0.)
    x_test = pad_sequences(test_indices,
                           maxlen=max_tweet_length,
                           padding='post',
                           truncating='post',
                           value=0.)

    print("Shape of the x train set ", x_train.shape)
    print("Shape of the x test set ", x_test.shape)

    # Compute class weights from the training labels
    ratio = utils.get_classes_ratio(y_train)

    # Define the embedding layer (which will be the same for all the models)
    embedding_layer = build_embedding_layer(word_to_index, emb_type,
                                            embedding_dim, max_tweet_length,
                                            trainable)

    # Build the model
    for dnn_model in dnn_models:
        start = time.time()

        # Build the deep neural network architecture
        utils.print_model_title(dnn_model)
        model = build_model(max_tweet_length,
                            embedding_layer,
                            hidden_units,
                            dropout,
                            dnn_architecture=dnn_options(dnn_model))

        # Compile the model
        my_optimizer = Adam(lr=0.0001, beta_1=0.9, beta_2=0.99, decay=0.01)
        model.compile(loss='categorical_crossentropy',
                      optimizer=my_optimizer,
                      metrics=['categorical_accuracy', utils.f1_score])

        # Print the model summary
        print(model.summary())

        if plot:  # save an image of the current architecture
            plot_model(model,
                       to_file=path + '/models/dnn_models/' +
                       dnn_model.lower() + '_model_summary.png',
                       show_shapes=True,
                       show_layer_names=True)

        # Save the json representation of the model
        open(
            path + '/models/dnn_models/model_json/' + dnn_model.lower() +
            '_model.json', 'w').write(model.to_json())

        # Prepare the callbacks
        save_best = ModelCheckpoint(
            monitor='val_categorical_accuracy',
            save_best_only=True,
            mode='auto',
            filepath=path + '/models/dnn_models/best/' + dnn_model.lower() +
            '_model.json.hdf5')
        reduceLR = ReduceLROnPlateau(monitor='val_categorical_accuracy',
                                     factor=0.1,
                                     patience=3,
                                     verbose=1)
        early_stopping = EarlyStopping(monitor='val_categorical_accuracy',
                                       patience=20,
                                       verbose=1)

        # Fit the model on the training data
        history = model.fit(x_train,
                            y_train,
                            batch_size=batch_size,
                            epochs=epochs,
                            shuffle=True,
                            class_weight=ratio,
                            callbacks=[save_best, reduceLR, early_stopping],
                            validation_split=0.1,
                            verbose=1)

        if plot:
            utils.plot_training_statistics(history,
                                           "/plots/dnn_models/" + dnn_model,
                                           also_plot_validation=False,
                                           acc_mode='categorical_accuracy',
                                           loss_mode='loss')

        # Load the best model
        model = utils.load_model(
            json_name=path + '/models/dnn_models/model_json/' +
            dnn_model.lower() + '_model.json',
            h5_weights_name=path + '/models/dnn_models/best/' +
            dnn_model.lower() + '_model.json.hdf5')

        # Make prediction and evaluation
        predict(model, x_test, y_test)
        end = time.time()
        print(
            "==================================================================\n"
        )
        print("%s model analysis completion time: %.3f s = %.3f min" %
              (dnn_model, (end - start), (end - start) / 60.0))
        print(
            "==================================================================\n"
        )
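
Note: Example #8 ends by handing the reloaded best model to a predict helper that is not part of the excerpt. Since the network is compiled with categorical_crossentropy, the labels are presumably one-hot encoded, so the helper most likely does something close to the sketch below (the name, signature and reporting are assumptions):

import numpy as np
from sklearn.metrics import classification_report

def predict(model, x_test, y_test):
    # Assumed evaluation helper: take the arg-max over the softmax outputs
    # and compare against the (assumed one-hot) ground-truth labels.
    probabilities = model.predict(x_test)
    y_hat = np.argmax(probabilities, axis=-1)
    y_true = np.argmax(y_test, axis=-1) if np.ndim(y_test) > 1 else np.asarray(y_test)
    print(classification_report(y_true, y_hat))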
Example #9
# Load the labels
y_train = [
    int(l) for l in utils.load_file(path + "/res/datasets/ghosh/labels_" +
                                    train_filename)
]
y_test = [
    int(l) for l in utils.load_file(path + "/res/datasets/ghosh/labels_" +
                                    test_filename)
]

modes = ['binary', 'count', 'tfidf', 'freq']
results = DataFrame()

# For each encoding mode, run a BoW analysis using both SVMs and a simple feed-forward NN
for mode in modes:
    utils.print_model_title("BoW Analysis for Mode %s" % mode)
    tokenizer, x_train, x_test = utils.encode_text_as_matrix(train_tweets,
                                                             test_tweets,
                                                             mode,
                                                             lower=True)
    word_to_indices = tokenizer.word_index
    index_to_word = {i: w for w, i in word_to_indices.items()}
    start = time.time()
    run_supervised_learning_models(x_train,
                                   y_train,
                                   x_test,
                                   y_test,
                                   make_feature_analysis=True,
                                   feature_names=index_to_word,
                                   top_features=20,
                                   plot_name="/bow_models/bow_%s_" % mode)
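
Note: encode_text_as_matrix is another project helper; given that the loop iterates over the modes 'binary', 'count', 'tfidf' and 'freq', it is very likely a thin wrapper around the Keras Tokenizer's texts_to_matrix, which supports exactly those modes. A sketch under that assumption (fitting on the training tweets only is also an assumption):

from keras.preprocessing.text import Tokenizer

def encode_text_as_matrix(train_texts, test_texts, mode, lower=True):
    # Fit the tokenizer on the training tweets, then build document-term
    # matrices for both splits in the requested weighting mode.
    tokenizer = Tokenizer(lower=lower)
    tokenizer.fit_on_texts(train_texts)
    x_train = tokenizer.texts_to_matrix(train_texts, mode=mode)
    x_test = tokenizer.texts_to_matrix(test_texts, mode=mode)
    return tokenizer, x_train, x_test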
Example #10
    utils.initialize_writer(to_write_filename)

    train_filename = "train_sample.txt"
    test_filename = "test_sample.txt"
    tokens_filename = "clean_original_"
    data_path = path + "/res/tokens/tokens_"
    pos_path = path + "/res/pos/pos_"

    # Load data tokens and pos tags
    train_tokens = utils.load_file(data_path + tokens_filename +
                                   train_filename)
    test_tokens = utils.load_file(data_path + tokens_filename + test_filename)
    train_pos = utils.load_file(pos_path + tokens_filename + train_filename)
    test_pos = utils.load_file(pos_path + tokens_filename + test_filename)

    # Load the labels
    train_labels = [
        int(l) for l in utils.load_file(path + "/res/datasets/ghosh/labels_" +
                                        train_filename)
    ]
    test_labels = [
        int(l) for l in utils.load_file(path + "/res/datasets/ghosh/labels_" +
                                        test_filename)
    ]

    feature_sets = ['pragmatic', 'sentiment', 'syntactic', 'topic']
    for feature_set in feature_sets:
        utils.print_model_title("Current feature: %s" % feature_set)
        build_model(train_tokens, train_pos, train_labels, test_tokens,
                    test_pos, test_labels, feature_set)
Example #11
def linear_svm(x_train, y_train, x_test, y_test, class_ratio='balanced'):
    utils.print_model_title("Linear SVM")
    svm = LinearSVC(C=0.01, class_weight=class_ratio, penalty='l2')
    svm.fit(x_train, y_train)
    y_hat = svm.predict(x_test)
    utils.print_statistics(y_test, y_hat)
Example #12

path = os.getcwd()[:os.getcwd().rfind('/')]
to_write_filename = path + '/stats/key_features_analysis_rule_based.txt'
utils.initialize_writer(to_write_filename)

train_filename = "train.txt"
test_filename = "test.txt"
tokens_filename = "clean_original_"
data_path = path + "/res/tokens/tokens_"
vocab_filename = path + "/res/vocabulary/vocabulary.txt"

# Load the data
train_tweets = utils.load_file(data_path + tokens_filename + train_filename)
test_tweets = utils.load_file(data_path + tokens_filename + test_filename)

# Load the labels
train_labels = [
    int(l) for l in utils.load_file(path + "/res/datasets/ghosh/labels_" +
                                    train_filename)
]
test_labels = [
    int(l) for l in utils.load_file(path + "/res/datasets/ghosh/labels_" +
                                    test_filename)
]

# A rule-based approach is used here to analyse the key features that are actually learnt in a (non-)sarcastic context
utils.print_model_title("Rule-based analysis")
rule_based_comparison(train_tweets, train_labels, test_tweets, test_labels,
                      vocab_filename)