예제 #1
0
    def test_classification_test_dataset(self):
        """Fit Pegasos on the training split and report accuracy on the test split.

        After printing the accuracy, the vocabulary is ranked with
        ``utils.most_explanatory_word`` using the learned weights, and the
        first ten and last ten entries of that ranking are printed.

        Reads the module-level ``train_bow_features``, ``train_labels``,
        ``test_bow_features``, ``test_labels`` and ``dictionary``.
        Returns ``None``.
        """
        # Hyperparameters selected from the validation accuracies.
        hyperparams = {"T": 25, "L": 0.01}
        theta, theta_0 = p1.pegasos(
            feature_matrix=train_bow_features,
            labels=train_labels,
            **hyperparams
        )

        predictions = p1.classify(test_bow_features, theta, theta_0)
        accuracy = p1.accuracy(predictions, test_labels)
        print(f'Accuracy on test data : {accuracy}')

        # Only the weights are ranked; the bias theta_0 is deliberately left out.
        best_theta = theta

        # Order the words by their dictionary value (presumably the feature
        # index) so the word list is aligned with the weight vector.
        indexed_words = sorted((index, word) for word, index in dictionary.items())
        wordlist = [word for _, word in indexed_words]

        ranked_words = utils.most_explanatory_word(best_theta, wordlist)
        print("Most Explanatory Word Features")
        print(ranked_words[:10])
        print("Least Explanatory Word Features")
        print(ranked_words[-10:])
예제 #2
0
    for i, text in enumerate(reviews):
        word_list = extract_words(text)
        for word in word_list:
            if word in dictionary:
                feature_matrix[i, dictionary[word]] = word_list.count(word)
    return feature_matrix
#pragma: coderesponse end

# Train Pegasos on bag-of-words features of the training reviews, then print
# the 20 words at each end of the ranking returned by most_explanatory_word.
train_data = utils.load_data('reviews_train.tsv')
train_texts, train_labels = zip(*((sample['text'], sample['sentiment']) for sample in train_data))
dictionary = bag_of_words(train_texts)
train_bow_features = extract_bow_feature_vectors(train_texts, dictionary)
theta, theta0 = pegasos(train_bow_features, train_labels, T=25, L=0.01)
# Order the words by their dictionary value (presumably the feature index) so
# the list lines up with the entries of theta.
wordlist = [word for (idx, word) in sorted(zip(dictionary.values(), dictionary.keys()))]
sorted_word_features = utils.most_explanatory_word(theta, wordlist)
print(" *** Most Positive Word Features ***")
print(sorted_word_features[0:20])
print(" *** Most Negative Word Features ***")
# Open-ended slice: an end index of -1 would exclude the final entry and
# print only 19 words instead of the 20 shown for the positive side.
print(sorted_word_features[-20:])

# Draw ten words from each end of the ranking as text-shaped scatter markers:
# the tail of the ranking in a red column at x = -1, the head in a green
# column at x = +1, stacked at y = 1..10.
x = np.repeat([-1, 1], 10)
y = np.tile(np.arange(1, 11), 2)
fig, ax = plt.subplots()
# Take the last ten entries with an open-ended slice; an end index of -1
# would leave the very last word of the ranking out of the plot.
wlist = sorted_word_features[-10:] + sorted_word_features[0:10]
colors = ['g' if label == 1 else 'r' for label in x]
for i, word in enumerate(wlist):
    # Marker area grows with word length so longer words stay legible.
    ax.scatter(x[i], y[i], s=700 * len(word), c=colors[i],
               marker=r"$ {} $".format(word), edgecolors='none')
# Put the left spine on x = 0 so it separates the two columns; hide the right one.
ax.spines['left'].set_position('zero')
ax.spines['right'].set_color('none')
예제 #3
0
#-------------------------------------------------------------------------------
T = 25
L = 0.0100
# The evaluation split passed here is the *test* data, so the second value is
# presumably a test accuracy. The variable name is kept unchanged because code
# outside this section may read it; only the printed label is corrected.
avg_peg_train_accuracy, avg_peg_val_accuracy = \
    p1.classifier_accuracy(p1.pegasos, train_bow_features, test_bow_features, train_labels, test_labels, T=T, L=L)
print("{:50} {:.4f}".format("Training accuracy for Pegasos:", avg_peg_train_accuracy))
print("{:50} {:.4f}".format("Test accuracy for Pegasos:", avg_peg_val_accuracy))
# Train again to obtain the parameters themselves; index 0 is used as the
# weight vector below.
thetas_pegasos = p1.pegasos(train_bow_features, train_labels, T, L)
#-------------------------------------------------------------------------------
# Assign to best_theta, the weights (and not the bias!) learned by your most
# accurate algorithm with the optimal choice of hyperparameters.
#-------------------------------------------------------------------------------

best_theta = thetas_pegasos[0]
# Order the words by their dictionary value (presumably the feature index) so
# the list lines up with the entries of best_theta.
wordlist = [word for (idx, word) in sorted(zip(dictionary.values(), dictionary.keys()))]
# Pass the ordered word list itself. Rebuilding a dictionary from it with
# bag_of_words would hand the ranking helper a mapping instead of the list
# that is aligned with best_theta.
sorted_word_features = utils.most_explanatory_word(best_theta, wordlist)
print("Most Explanatory Word Features")
print(sorted_word_features[:10])
#-------------------------------------------------------------------------------










예제 #4
0
# Report the accuracies returned by average_passive_aggressive_accuracy.
# print(...) with a single argument parses on Python 3 and produces the same
# output on Python 2, unlike the bare print statement.
print("(train accuracy, test accuracy) before modification")
print(p1.average_passive_aggressive_accuracy(train_bow_features, test_bow_features, train_labels, test_labels, T, L))

#-------------------------------------------------------------------------------
#
#-------------------------------------------------------------------------------
# 
#
# Assign to best_theta, the weights (and not the bias!) learned by the most
# accurate algorithm with the optimal choice of hyperparameters.
#-------------------------------------------------------------------------------

# Fit on the training split: learning the weights from the test data would
# leak the held-out set into the model whose features are being inspected.
# Index 0 of the result is used as the weight vector (the bias is excluded).
best_theta = p1.average_passive_aggressive(train_bow_features, train_labels, best_T, best_L)[0]
# Order the words by their dictionary value (presumably the feature index) so
# the list lines up with the entries of best_theta.
wordlist = [word for (idx, word) in sorted(zip(dictionary.values(), dictionary.keys()))]
sorted_word_features = utils.most_explanatory_word(best_theta, wordlist)
print("Most Explanatory Word Features")
print(sorted_word_features[:10])

#-------------------------------------------------------------------------------
#
#-------------------------------------------------------------------------------
# 
#
# Assessing performance on the validation set.
# 
#-------------------------------------------------------------------------------
# Build the modified dictionary from the training texts only, then extract
# the final features for the training and validation texts with that same
# dictionary.
dictionary_mod = p1.modified_bag_of_words(train_texts)

train_final_features, val_final_features = (
    p1.extract_final_features(train_texts, dictionary_mod),
    p1.extract_final_features(val_texts, dictionary_mod),
)
예제 #5
0
파일: main.py 프로젝트: yshen4/pymal
def problem9b(T = 25, L = 0.01):
    """Print the first ten words of the ranking for a Pegasos model.

    Trains ``p1.pegasos`` on the module-level ``train_bow_features`` and
    ``train_labels`` with the given ``T`` and ``L``, ranks the vocabulary with
    ``utils.most_explanatory_word`` and prints the first ten entries.

    Args:
        T: Passed through to ``p1.pegasos`` as its third argument.
        L: Passed through to ``p1.pegasos`` as its fourth argument.

    Returns:
        None. The result is only printed.
    """
    trained = p1.pegasos(train_bow_features, train_labels, T, L)
    # Index 0 of the result is used as the weight vector.
    best_theta = trained[0]

    # Order the words by their dictionary value (presumably the feature
    # index) so the list lines up with the entries of best_theta.
    indexed_words = sorted((index, word) for word, index in dictionary.items())
    wordlist = [word for _, word in indexed_words]

    ranked_words = utils.most_explanatory_word(best_theta, wordlist)
    print("Most Explanatory Word Features")
    print(ranked_words[:10])