Example #1
import json


def compute_features_vector(file='FEATURES_test.json', answers=None):
    # a call in a default argument is evaluated once, at definition time,
    # so fetch the answers inside the function instead
    if answers is None:
        answers = question.get_question(0)

    # accumulator for the feature rows (a module-level dict in the original)
    data = {'data': []}

    for index, answer in enumerate(answers):
        score = answer['score_1']
        tokens = tokenizer(answer['text'])
        text = remove_stops(tokens)
        len1 = len(text)    # token count after stopword removal
        len2 = len(tokens)  # token count before stopword removal

        feature = feature_extraction(text)

        cosin = feature.cosine_similarity()
        keywords = feature.keywords_similarity()
        _, _, _, LSA = feature.lsa_similarity()
        part_words, _ = feature.partial_words_similarity()
        language = feature.language_similirity()
        lda = feature.lda_similarity()
        al1, al2 = feature.align_similarity()
        c1, c2 = feature.corpus_similarity()

        lda_ext = feature.lda_extract()
        cos_sc = feature.cos_score()
        bingo_sc = feature.bingo_score()

        jacc = feature.jaccard()
        dic_sim = feature.dice()
        keys_norm = feature.keywords_norm()
        lsi_query = feature.LSI_query()
        bleu_score = feature.bleuscore()
        keyf = feature.keyf_score()
        fgram = feature.fgram_score()
        holo = feature.holo_score()

        info = [
            cosin, keywords, LSA, part_words, language, lda, al1, c1, lda_ext,
            cos_sc, bingo_sc, jacc, dic_sim, keys_norm, lsi_query, bleu_score,
            keyf, fgram, holo, score
        ]
        print(info)
        info = [float(x) for x in info]

        data['data'].append(info)
        # checkpoint after every answer so a crash does not lose progress
        with open(file, 'w') as f:
            json.dump(data, f, indent=2)
        print("Index = ", index)

    # final write of all features to the file
    with open(file, 'w') as f:
        json.dump(data, f, indent=2)
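The file written above can then be read back as a training matrix. A minimal sketch, assuming the {"data": [...]} layout produced by the function, with the supervised score in the last column:

import json

import numpy as np

with open('FEATURES_test.json') as f:
    data = json.load(f)

matrix = np.array(data['data'])
X, y = matrix[:, :-1], matrix[:, -1]  # 19 feature columns, score as target
print(X.shape, y.shape)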
Example #2
def main():
    """
    The main function. Takes no arguments.

    Loads the training and test data, extracts TF-IDF features, fits five
    classifiers, and evaluates each of them on the test data.
    """

    train_data = data.load_training_data()  # load training data
    test_data = data.load_test_data()  # load test data
    count = CountVectorizer()  # initialize the count vectorizer
    tfidf_transformer = TfidfTransformer()  # initialize a TF-IDF transformer

    # extract TF-IDF features from the training data
    train_tfidf = features.feature_extraction(train_data, count,
                                              tfidf_transformer)

    # fit each classifier and collect the models in a dictionary,
    # keyed by a display name
    models_dict = {}
    models_dict['Naive Bayes'] = naive_bayes(train_data, train_tfidf)
    models_dict['SVM'] = svm_classifier(train_data, train_tfidf)
    models_dict['Random Forest'] = random_forest_classifier(train_data,
                                                            train_tfidf)
    models_dict['Logistic Regression'] = logistic_regression_classifier(
        train_data, train_tfidf)
    models_dict['Decision Tree'] = decision_tree_classifier(train_data,
                                                            train_tfidf)

    predict_test_data(train_data, test_data, models_dict, count,
                      tfidf_transformer, train_tfidf)
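The classifier helpers (naive_bayes, svm_classifier, and so on) are defined elsewhere in the project. As a hedged illustration of the pattern they follow, here is what naive_bayes could look like with scikit-learn's MultinomialNB; the .target attribute on train_data is taken from Example #3, the rest is an assumption:

from sklearn.naive_bayes import MultinomialNB


def naive_bayes(train_data, train_tfidf):
    # fit a multinomial Naive Bayes model on the TF-IDF feature matrix;
    # train_data.target holds the class labels
    model = MultinomialNB()
    model.fit(train_tfidf, train_data.target)
    return model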
Example #3
def predict_test_data(train_data, test_data, models_dict, count,
                      tfidf_transformer, train_tfidf):
    """
    Predict the class labels for the test data with each fitted model
    and report accuracy on the test set and under 10-fold cross validation.
    Arguments:
        1. train_data: train data, an object with features and class
        labels (.target).
        2. test_data: test data with the same structure.
        3. models_dict: dictionary mapping a model name to its fitted
        classifier.
        4. count, tfidf_transformer: the fitted count vectorizer and
        TF-IDF transformer from training.
        5. train_tfidf: the TF-IDF matrix of the training data.
    """

    # the test TF-IDF matrix does not depend on the model, so build it once
    test_tfidf = features.feature_extraction(test_data,
                                             count,
                                             tfidf_transformer,
                                             data='test')

    for model in models_dict:
        # iterate through each model
        print(model)
        # predicted labels for the held-out test data
        predicted = models_dict[model].predict(test_tfidf)
        # 10-fold cross-validated predictions on the training data
        # (the cross_validation module was replaced by model_selection
        # in newer scikit-learn releases)
        predicted_cv = cross_validation.cross_val_predict(models_dict[model],
                                                          train_tfidf,
                                                          train_data.target,
                                                          cv=10)
        # accuracy with cross validation
        accuracy_cv = np.mean(predicted_cv == train_data.target) * 100
        # accuracy on the test set
        accuracy = np.mean(predicted == test_data.target) * 100
        print_results(test_data, predicted, accuracy, model)
        print_results(train_data, predicted_cv, accuracy_cv,
                      model + ' with cv')
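print_results is also defined elsewhere. A minimal sketch of what it might do, using scikit-learn's metrics module; the signature mirrors the calls above, the body is an assumption:

from sklearn import metrics


def print_results(data, predicted, accuracy, model_name):
    # report one model's predictions against the true labels
    print('%s accuracy: %.2f%%' % (model_name, accuracy))
    print(metrics.classification_report(data.target, predicted))
    print(metrics.confusion_matrix(data.target, predicted))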
Example #4
#########################################
# Signal conditioning filter design
#########################################
if silence == 0: print("Designing filter with %d taps" % numtaps)
# Set filter specs (the Hz argument was deprecated in favor of fs in
# newer SciPy releases)
lpf = remez(numtaps=numtaps, bands=cutoff, desired=[1.0, 0.0], Hz=fs)

#########################################
# Data filtering and Feature extraction for Training data
#########################################
if silence == 0: print("Passing the dataset through the filter")
filtered_train, valid_labels_train = utils.data_filtering(
    X_train_raw, y_train_raw, lpf)

if silence == 0: print("Calculating features...")
if silence == 0: print(features)
X_train, y_train = fe.feature_extraction(features, filtered_train,
                                         valid_labels_train, fs, window)

#########################################
# Overrides for handcrafted test data, for demo
#########################################
if cheat_test == 1:
    print("Handcrafting test set, for demonstration. You're cheating")
    np.random.seed(1)  # for reproducibility of results
    X_test_raw = np.concatenate((np.random.random(n_handcraft) * 3 - 3,
                                 np.random.random(n_handcraft) * 500 - 500))
    X_test_raw = X_test_raw.reshape((len(X_test_raw), 1))
    y_test_raw = np.concatenate((np.zeros((n_handcraft, 1)),
                                 np.ones((n_handcraft, 1))))
    print("Overriding window size and sampling frequency")
    window = 32
    fs = 32
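utils.data_filtering is not shown in the snippet. A rough sketch of what it plausibly does, applying the FIR low-pass filter with scipy.signal.lfilter; the NaN-based label masking here is purely an assumption:

import numpy as np
from scipy.signal import lfilter


def data_filtering(X_raw, y_raw, lpf):
    # apply the FIR filter to each signal; for an FIR design the
    # denominator of the transfer function is simply 1.0
    filtered = lfilter(lpf, 1.0, X_raw, axis=0)
    # keep only samples with usable labels, keeping X and y aligned
    valid = ~np.isnan(y_raw).ravel()
    return filtered[valid], y_raw[valid]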
Example #5
import numpy as np
from features import feature_extraction

dataObj = feature_extraction()
train, validation, test = dataObj.process_text(dataObj.data)
# calculate the most common words in data set
# will output a text file (common_words.txt)
dataObj.calc_common_words(dataObj.data)

# calculate the frequency of common-word occurrences per comment:
# each comment ends up with the frequency of every common word in the corpus
common_words = dataObj.calc_freq_words(dataObj.data)  # returns an array
print("done")
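As a quick sanity check on the result (assuming, per the comments above, one row per comment and one column per common word):

freq_matrix = np.asarray(common_words)
print(freq_matrix.shape)  # (number of comments, number of common words)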