Example #1
import numpy as np
import theano
import theano.tensor as T
from sklearn.cross_validation import train_test_split

# prep provides get_data(); the import path mirrors Example #2 below
from Python_code.classifiers.preprocessing import img_preprocess as prep

def load_data(num_records=1750):
    print('... loading data ...')
    data, labels = prep.get_data(num_records)
    labels += 1  # shift labels by one (they are used as indices later on)
    # 70/30 split into train/test, then a further 70/30 split of the
    # training portion into train/validation
    train_x, test_x, train_y, test_y = train_test_split(
        data, labels, test_size=0.3, random_state=28022016
    )
    train_x, valid_x, train_y, valid_y = train_test_split(
        train_x, train_y, test_size=0.3, random_state=10032016
    )

    def shared_dataset(data_xy, borrow=True):
        """ Function that loads the dataset into shared variables

        The reason we store our dataset in shared variables is to allow
        Theano to copy it into the GPU memory (when code is run on GPU).
        Since copying data into the GPU is slow, copying a minibatch every
        time it is needed (the default behaviour if the data is not in a
        shared variable) would lead to a large decrease in performance.
        """
        data_x, data_y = data_xy

        shared_x = theano.shared(np.asarray(data_x,
                                            dtype=theano.config.floatX),
                                 borrow=borrow)
        shared_y = theano.shared(np.asarray(data_y,
                                            dtype=theano.config.floatX),
                                 borrow=borrow)
        # When storing data on the GPU it has to be stored as floats,
        # therefore we store the labels as ``floatX`` as well
        # (``shared_y`` does exactly that). But during our computations
        # we need them as ints (we use the labels as indices, and as
        # floats that doesn't make sense), so instead of returning
        # ``shared_y`` we cast it to int. This little hack lets us
        # get around the issue.
        return shared_x, T.cast(shared_y, 'int32')

    test_x, test_y = shared_dataset((test_x, test_y))
    valid_x, valid_y = shared_dataset((valid_x, valid_y))
    train_x, train_y = shared_dataset((train_x, train_y))

    rval = [(train_x, train_y), (valid_x, valid_y), (test_x, test_y)]
    return rval
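
The shared variables returned above are typically consumed through Theano's ``givens`` mechanism, which slices each minibatch directly out of GPU memory; below is a minimal sketch of that pattern, assuming 2-D float features and the int32 label cast from load_data. The cost expression and batch size are placeholders for illustration, not part of the original.

index = T.lscalar('index')   # minibatch index
x = T.matrix('x')            # float feature matrix
y = T.ivector('y')           # int32 labels, matching the T.cast above
batch_size = 20              # assumed batch size

# Placeholder scalar expression standing in for a real classifier cost.
cost = T.mean(x) + T.mean(y)

(train_x, train_y), (valid_x, valid_y), (test_x, test_y) = load_data()

train_model = theano.function(
    inputs=[index],
    outputs=cost,
    givens={
        x: train_x[index * batch_size: (index + 1) * batch_size],
        y: train_y[index * batch_size: (index + 1) * batch_size],
    },
)

print(train_model(0))  # cost on the first minibatch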
Example #2
"""
Two tests: against Twitter data and against a polarity list.
"""

from Python_code.classifiers.preprocessing import img_preprocess as prep
from sklearn.decomposition import RandomizedPCA
from sklearn.metrics import accuracy_score, classification_report
from sklearn.cross_validation import train_test_split
import pandas as pd
import time
from sklearn.grid_search import GridSearchCV
from sklearn.svm import SVC

APPLY_PCA = False        # toggle the optional PCA step below
prep.SIZE = (250, 250)   # image size used by the preprocessing module
data, labels = prep.get_data(1000)

# split test & train sets
train_x, test_x, train_y, test_y = train_test_split(
    data, labels, test_size=0.3, random_state=28022016)

if APPLY_PCA:
    pca = RandomizedPCA(n_components=100, whiten=False)
    train_x = pca.fit_transform(train_x)
    test_x = pca.transform(test_x)

print('starting svm')

start_time = time.time()

parameters = {'kernel': ('linear', 'rbf'), 'C': [1, 10]}
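
# The original snippet ends here, right after the parameter grid is
# declared. What follows is a minimal sketch of how the search plausibly
# continues, using only the objects already imported above; the fit call
# and the test-set report are assumptions, not the original code.
svr = SVC()
clf = GridSearchCV(svr, parameters)
clf.fit(train_x, train_y)

predictions = clf.predict(test_x)
print('best parameters:', clf.best_params_)
print('accuracy:', accuracy_score(test_y, predictions))
print(classification_report(test_y, predictions))
print('svm took %.2f seconds' % (time.time() - start_time))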