Exemplo n.º 1
0
from sklearn import svm
from clean_data import CleanData

__author__ = "Yacine Sibous"

# CleanData instance built with its default arguments.
cd = CleanData()

# Load the bag-of-words rows for the cleaned training file.
print("Getting the training data.")
training_data = cd.bag_of_words(in_file="data/clean_train_input.csv")
print("Done collecting data.")

# Index 1 of every row is passed to the classifier as the sample, index 2 as
# its target.
X = [row[1] for row in training_data]
y = [row[2] for row in training_data]

# Show the first five samples and targets as a quick sanity check.
print(X[:5])
print(y[:5])

# Fit a support vector classifier using the library defaults.
clf = svm.SVC()
clf.fit(X, y)
Exemplo n.º 2
0
def plot_feature_size(num_iter):
    """Tests various feature sizes and plots the error.

    For each feature count in a fixed list, a NaiveBayes model is trained
    num_iter times on a train/validation split (test_size=0.4) and the mean
    validation and training errors are recorded. Both curves are then plotted
    against the number of features (validation with the '-ro' style, training
    with the '-bo' style).

    Runs that raise MemoryError are skipped and excluded from the mean, so a
    failed run no longer drags the average toward zero. If no run completes
    for a feature count, NaN is recorded for it so that both curves stay the
    same length as the x axis.

    Args:
        num_iter: Number of times to test for each point.
    """
    points = [100, 500, 1000, 2000, 3000, 4000, 5000, 6000, 7000,
              8000, 9000, 10000]
    errors = []
    train_errors = []

    # Iterate over all points defined.
    for point in points:
        print("Testing for point {0} features.".format(point))
        # Float accumulators so the mean is never floor-divided.
        error = 0.0
        train_error = 0.0
        completed = 0  # Runs that finished without a MemoryError.

        # Repeat the test the desired number of times.
        # NOTE(review): the split uses random_state=0, so repeated runs only
        # differ if CleanData itself is non-deterministic -- confirm.
        for _ in range(num_iter):
            cd = CleanData(tfidf=True, max_train_size=25000,
                           max_features=point)

            try:
                # Get and train data.
                training_data = cd.bag_of_words(
                    in_file="data/clean_train_input.csv")

                ids, X, y = get_numpy_matrices(training_data)

                # Drop intermediates as soon as possible to limit peak memory.
                del training_data

                X_train, X_test, y_train, y_test = train_test_split(
                    X, y, test_size=0.4, random_state=0)

                del X, y, ids

                nb = NaiveBayes()
                nb.train(X_train, y_train)

                # Calculate training and validation errors.
                run_error = nb.compute_error(nb.classify(X_test), y_test)
                run_train_error = nb.compute_error(nb.classify(X_train),
                                                   y_train)
            except MemoryError:
                print("Memory error. Continuing.")
                continue

            # Only accumulate once both errors were computed, so the two
            # means are always taken over the same set of runs.
            error += run_error
            train_error += run_train_error
            completed += 1

            del X_train, X_test, y_train, y_test

        if completed:
            errors.append(error / completed)
            train_errors.append(train_error / completed)
        else:
            # Nothing succeeded for this point: keep the lists aligned with
            # `points` without inventing a value.
            errors.append(float("nan"))
            train_errors.append(float("nan"))

    # PLOT.
    plt.figure(2)

    plt.title("Error vs Features")
    plt.xlabel("Number of features")
    plt.ylabel("Error")
    # plt.xscale('log')
    plt.plot(points, errors, '-ro')
    plt.plot(points, train_errors, '-bo')
    plt.show()
Exemplo n.º 3
0
from sklearn import svm
from sklearn.cross_validation import cross_val_score
from clean_data import CleanData
import numpy as np
import csv

__author__ = "Yacine Sibous, Jana Pavlasek"

# Cleaner configured for the final submission run (tfidf enabled,
# max_features=2500000, n_grams=3).
cd = CleanData(tfidf=True, max_features=2500000, n_grams=3)

# Get features and output.
print('Getting Training data.')
X, y = cd.bag_of_words(in_file="data/clean_train_input.csv", sparse=True)
print('Done collecting data.')

# Fit a linear SVM on the complete training set.
print('Training the model.')
lin_clf = svm.LinearSVC()
lin_clf.fit(X, y)
print('Done training.')

# Cross validation scored by accuracy; the number of folds is the library
# default because `cv` is not passed.
# NOTE(review): the default is 3 folds in sklearn.cross_validation -- confirm
# for the installed version.
print('Cross Validation')
c_validation = cross_val_score(lin_clf, X, y, scoring='accuracy')
print(c_validation.mean())

# Collect the features of the final test data.
print('Collecting test data.')
test = cd.get_x_in(sparse=True)
print('Done collecting data.')