示例#1
0
import numpy
import load_data
import representation
from sklearn import svm

from scipy.spatial.distance import correlation
from scipy.spatial.distance import cosine
from scipy.spatial.distance import euclidean

#### input paths
# Paths are relative, so the script must be launched from a directory where
# '../Data/' resolves to the data folder.
trainFilePath = '../Data/train-data.txt'
devFilePath = '../Data/dev-data.txt'
testFilePath = '../Data/task_2_test_set_to_release.txt'

#### loading data
# Each loader returns a pair that is unpacked below. For train/dev the pair is
# named (class IDs, tweet texts); for test it is named (tweet IDs, tweet texts).
# NOTE(review): the exact element types are defined in load_data — confirm there.
trainClassID, trainTweetTxt = load_data.get_train(trainFilePath)
devClassID, devTweetTxt = load_data.get_dev(devFilePath)
# 'testTweedID' (sic) — the spelling is kept because later code may reference it.
testTweedID, testTweetTxt = load_data.get_test(testFilePath)

# Fold the dev labels into the training labels so train+dev is used as one
# training set.
# NOTE(review): this assumes both are Python lists (so `+` concatenates); if
# load_data returns numpy arrays, `+` would add element-wise instead — confirm.
trainClassID = trainClassID + devClassID
#### representing as a matrix
# NOTE(review): mini_df is not referenced anywhere in the visible part of this
# script — check whether later code uses it.
mini_df = 1
# range(1, 2) yields only k = 1, so the body runs exactly once.
for k in range (1, 2):
    # Build one matrix per split from the raw tweet texts.
    # NOTE(review): the meaning of the fourth argument k is defined by
    # representation.get_dtm and is not visible here — confirm before tuning.
    trainDTMatirix, devDTMatirix, testDTMatirix = representation.get_dtm(trainTweetTxt, devTweetTxt, testTweetTxt,  k)
    # Alternative representation, currently disabled:
##    trainDTMatirix, devDTMatirix, testDTMatirix = representation.get_tdidf(trainTweetTxt, devTweetTxt, testTweetTxt,  k)

    # Convert each matrix via .todense().
    # NOTE(review): presumably these are scipy sparse matrices; densifying can
    # be memory-hungry for a large vocabulary — confirm sizes.
    trainDTMatirix = trainDTMatirix.todense()
    devDTMatirix = devDTMatirix.todense()
    testDTMatirix = testDTMatirix.todense()

    # Stack the dev rows beneath the train rows (numpy.concatenate defaults to
    # axis 0), mirroring the label merge done on trainClassID above.
    trainDTMatirix = numpy.concatenate((trainDTMatirix, devDTMatirix))
We also apply a nonlinear transform (x1, x2) -> (x1, x2, x1^2, x2^2, x1*x2, |x1 - x2|, |x1 + x2|).
We compare our error with and without regularization.
'''


def nonlinear_transform(x):
    """Map each 2-D point to a 7-feature vector.

    For every point ``p`` in ``x`` the features are, in order:
    ``p[0]``, ``p[1]``, ``p[0]**2``, ``p[1]**2``, ``p[0]*p[1]``,
    ``|p[0] - p[1]|`` and ``|p[0] + p[1]|``.

    Args:
        x: iterable of indexable points; only indices 0 and 1 are read.

    Returns:
        A numpy array built from the list of per-point feature lists
        (one row per input point).
    """
    features = []
    for point in x:
        # Index rather than unpack so points with extra components still work.
        first = point[0]
        second = point[1]
        features.append([
            first,
            second,
            first ** 2,
            second ** 2,
            first * second,
            abs(first - second),
            abs(first + second),
        ])
    return np.array(features)


if __name__ == '__main__':
    x_train, y_train = get_train()
    x_test, y_test = get_test()
    x_train = nonlinear_transform(x_train)
    x_test = nonlinear_transform(x_test)

    lrc = h2.LinearRegressionClassifier(7, x_train, y_train)
    in_sample_err = h2.calc_error_rate(y_train, lrc.classify(x_train))
    out_sample_err = h2.calc_error_rate(y_test, lrc.classify(x_test))
    h2.boldprint("Without Regularization:")
    print "In sample err", in_sample_err
    print "Out of sample err", out_sample_err

    _lambda = .5
    lrc = h2.LinearRegressionClassifier(7,
                                        x_train,
                                        y_train,