def preprocess(filepath=TRAIN_DATA_PATH, test_frac=0.1):
    print 'loading data'
    #all_data, all_labels = load_train_data_csv()
    all_data, all_labels = get_training_data(filepath=filepath)[:2]
    print 'shuffling data'
    # seed = random() makes sure it's different each time
    all_data, all_labels = shuffle(all_data, all_labels, seed=random())[:2]
    print 'splitting data into train/test'
    train, train_labels, test, test_labels = split_data(all_data, all_labels, test_frac=test_frac)
    #print train.shape, test.shape, all_data.shape
    print 'splitting into classes'
    class_set, class_set_labels = class_split(train, train_labels)
    print 'removing outliers using mahalanobis percentage'
    class_set_1, class_set_labels_1, outlier_data_1, outlier_labels_1, smallest_size_1 = remove_outliers_mahalanobis(
        class_set, class_set_labels, offset=0.95)
    print 'removing outliers using mahalanobis threshold'
    class_set_2, class_set_labels_2, outlier_data_2, outlier_labels_2, smallest_size_2 = remove_outliers_mahalanobis(
        class_set, class_set_labels, offset=155)
    return (class_set_1, class_set_labels_1, outlier_data_1, outlier_labels_1, smallest_size_1,
        class_set_2, class_set_labels_2, outlier_data_2, outlier_labels_2, smallest_size_2,
        test, test_labels)
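# A minimal usage sketch (an assumption, not part of the original file): the
# 12-tuple returned by preprocess() unpacks in the same order it is built.
(class_set_1, class_set_labels_1, outlier_data_1, outlier_labels_1, smallest_size_1,
 class_set_2, class_set_labels_2, outlier_data_2, outlier_labels_2, smallest_size_2,
 test, test_labels) = preprocess(test_frac=0.1)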
def store_column_maxima():
    """
    Calculate and store the maxima of the combined train and test columns (for normalization).

    This loads all data so it's not fast.
    """
    train_data = get_training_data()[0]
    test_data = get_testing_data()[0]
    # elementwise maximum of the per-column maxima of train and test
    maxima = array([max(tr, te) for tr, te in zip(train_data.max(0), test_data.max(0))])
    save(COLUMN_MAX_PATH, maxima)
    if VERBOSITY >= 1:
        print 'updated the column maxima file'
    return maxima
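# A minimal sketch of consuming the stored maxima for normalization; numpy's
# load() paired with the save() above is an assumption, as is the
# divide-by-column-maximum scaling itself.
from numpy import load
maxima = load(COLUMN_MAX_PATH)
normalized_train = get_training_data()[0] / maxima.astype(float)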
def getSubClassifierData(subclasses=[2, 3], train_data=None, true_classes=None):
    """
    Gets training data for classification, from only the given classes.

    Either filter existing data, or load the default training data and filter that.
    If either train_data or true_classes is None, the data will be loaded using
    get_training_data() from utils.loading
    """
    if (train_data is None) or (true_classes is None):
        train_data, true_classes, _ = get_training_data()
    assert len(true_classes) == np.shape(train_data)[0]
    # boolean mask selecting only the rows whose class is in subclasses
    validsample = np.array([x in subclasses for x in true_classes])
    return train_data[validsample, :], true_classes[validsample]
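# A short usage sketch (an assumption): select only the default classes 2 and 3
# and verify that the mask kept matching rows and labels.
sub_data, sub_classes = getSubClassifierData(subclasses=[2, 3])
assert set(sub_classes) <= {2, 3}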
def predict_300_v1():
    data, labels = get_training_data()[:2]
    params = {
        'dense1_nonlinearity': 'rectify',
        'dense1_init': 'glorot_normal',
        'dense1_size': 300,
        'dense2_size': 0,
        'dense3_size': None,
        'dropout1_rate': 0.5,
        'dropout2_rate': None,
        'dropout3_rate': None,
        'extra_feature_count': 0,
    }
    probs = predict(params, 'results/pretrain/single_pretrain_300_0_0.net.npz', data)
    print 'logloss', calc_logloss(probs, labels)
def get_filtered_data(cut_outlier_frac=0.05, method='EE', filepath=TRAIN_DATA_PATH):
    """
    Load and filter data and classes to remove an approximate fraction of outliers.

    :param cut_outlier_frac: fraction of outliers to remove (approximate, especially for OCSVM)
    :param method: either 'EE' for Elliptic Envelope or 'OCSVM' for One Class Support Vector Machines
    :return: filtered train data, train classes and features
    """
    data, classes, features = get_training_data(filepath=filepath)  # filepath was previously ignored
    cache_key = '{0:s}_{1:04d}_all.npy'.format(method, int(cut_outlier_frac * 1000))
    data, classes = filter_data_cache(data, classes, hash=cache_key, method=method, cut_outlier_frac=cut_outlier_frac)
    return data, classes, features
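# A minimal sketch (an assumption about intended use): drop roughly 5% of
# outliers with the Elliptic Envelope method and inspect how many rows survive.
data, classes, features = get_filtered_data(cut_outlier_frac=0.05, method='EE')
print 'rows after filtering:', data.shape[0]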
from os.path import basename, splitext
from sys import modules

from nnet.oldstyle.train_test import train_test_NN
from utils.loading import get_training_data
from validation.crossvalidate import SampleCrossValidator
from validation.optimize_parallel import ParallelGridOptimizer

train_data, true_labels = get_training_data()[:2]
name = '{0:s}'.format(splitext(basename(getattr(modules['__main__'], '__file__', 'optimize.default')))[0])
pretrain = None  # join(PRETRAIN_DIR, 'pt_varx256x256x64.net.npz')
params = {
    'name': name,
    'dense1_nonlinearity': 'leaky20',  # tanh, sigmoid, rectify, leaky2, leaky20, softmax
    'dense1_init': 'glorot_uniform',  # orthogonal, sparse, glorot_normal, glorot_uniform, he_normal, he_uniform
    'dense1_size': 256,  # [30, 25, 80, 120, 180]
    'dense2_size': 128,
    'dense3_size': None,
    'learning_rate': 0.001,  # initial learning rate (learning rate is effectively higher for higher momentum)
    'learning_rate_scaling': 1000,  # progression over time; 0.1 scaled by 10 is 0.01
    'momentum': 0.9,  # initial momentum
    'momentum_scaling': 100,  # 0.9 scaled by 10 is 0.99
    'dropout1_rate': 0,  # [0, 0.5]
    'dropout2_rate': 0,
from sklearn.grid_search import RandomizedSearchCV
from sklearn.pipeline import Pipeline
from sklearn.preprocessing import MinMaxScaler

from nnet.oldstyle.base_optimize import name_from_file
from nnet.prepare import LogTransform
from nnet.scikit import NNet
from settings import LOGS_DIR, VERBOSITY, SUBMISSIONS_DIR
from utils.features import PositiveSparseFeatureGenerator, PositiveSparseRowFeatureGenerator, DistanceFeatureGenerator
from utils.ioutil import makeSubmission
from utils.loading import get_preproc_data, get_training_data, get_testing_data

#train, labels, test = get_preproc_data(Pipeline([
#    ('row', PositiveSparseRowFeatureGenerator()),
#    ('distp31', DistanceFeatureGenerator(n_neighbors = 3, distance_p = 1)),
#    ('distp52', DistanceFeatureGenerator(n_neighbors = 5, distance_p = 2)),
#]), expand_confidence = 0.9)
train, labels = get_training_data()[:2]
test = get_testing_data()[0]
#cpus = max(cpu_count() - 1, 1)
#random = RandomState()

opt = RandomizedSearchCV(
    estimator = Pipeline([
        #('gen23', PositiveSparseFeatureGenerator(difficult_classes = (2, 3), extra_features = 40)),
        #('gen234', PositiveSparseFeatureGenerator(difficult_classes = (2, 3, 4), extra_features = 40)),
        #('gen19', PositiveSparseFeatureGenerator(difficult_classes = (1, 9), extra_features = 63)),
        ('log', LogTransform()),
        ('scale03', MinMaxScaler(feature_range = (0, 3))),
        ('nn', NNet(**{
            'dense1_nonlinearity': 'rectify',
            'dense1_init': 'glorot_normal',
from warnings import filterwarnings

from lasagne.layers import get_all_params, InputLayer, DenseLayer
from lasagne.nonlinearities import tanh, softmax
from lasagne.updates import sgd
from theano import function, config
import theano.tensor as T
import numpy as np
from matplotlib.pyplot import subplots, show, cm

from utils.loading import get_training_data
from validation.crossvalidate import SampleCrossValidator

filterwarnings('ignore', '.*topo.*')
config.optimizer = 'None'  # todo: turn off, it's very slow

#todo: normalize data
train_data, true_classes, features = get_training_data()
validator = SampleCrossValidator(train_data, true_classes, rounds=5, test_frac=0.3)
for train, classes, test in validator.yield_cross_validation_sets():
    # create tensor objects
    trainT = train.astype(config.floatX)
    testT = test.astype(config.floatX)  # was mistakenly cast from train
    classT = classes.astype('int32')
    # First, construct an input layer.
    # The shape parameter defines the expected input shape, which is just the shape of our data matrix X.
    # (InputLayer has no weights, so the stray W=Constant() argument was dropped.)
    l_in = InputLayer(shape=trainT.shape)
    # A dense layer implements a linear mix (xW + b) followed by a nonlinearity.
    l_hidden = DenseLayer(
        labels[train_rows:], sample_weight=sample_weight[train_rows:])
    predictions = model.predict_proba(test)
    if rescale_pred:
        predictions = rescale_prior(predictions, bincount(labels))
    return predictions


if __name__ == '__main__':
    from utils.loading import get_training_data
    from validation.crossvalidate import SampleCrossValidator

    train_data, true_classes, _ = get_training_data()
    validator = SampleCrossValidator(train_data, true_classes, rounds=1, test_frac=0.1, use_data_frac=1.0)
    for train, classes, test in validator.yield_cross_validation_sets():
        prediction = randomForest(train, classes, test,
            n_estimators=200,
            max_depth=35,
            verbose=1,
            class_weight="auto",
            calibration=3,
            rescale_pred=True)
    parameters['dense2_nonlinearity'] = parameters['dense1_nonlinearity']  # hack1
    parameters['dense2_init'] = parameters['dense1_init']  # hack2
    net = make_net(**parameters)
    net.fit(train, classes - 1)
    return net.predict_proba(test)


# one big job for size1 + size2 + dropout
# one job for nonlinearity1 + initialization1
# one job for learning rate + learning rate scaling + momentum + momentum scaling (split if too many)
name = '{0:s}.log'.format(splitext(basename(getattr(modules['__main__'], '__file__', 'optimize.default')))[0])  # automatic based on filename
train_data, true_classes, features = get_training_data()  # load the train data
train_data, true_classes = equalize_class_sizes(train_data, true_classes)
train_data, true_classes = filter_data(train_data, true_classes, cut_outlier_frac = 0.06, method = 'OCSVM')  # remove outliers
train_data = normalize_data(train_data, use_log = True)[0]  # also converts to floats
validator = SampleCrossValidator(train_data, true_classes, rounds = 3, test_frac = 0.2, use_data_frac = 1)
optimizer = ParallelGridOptimizer(train_test_func = train_test_NN, validator = validator, use_caching = True, name = name,  # just choose something sensible
    dense1_size = 80,  # [30, 25, 80, 120, 180]
    dense1_nonlinearity = ['tanh', 'sigmoid', 'rectify', 'leaky2', 'leaky20', 'softmax'],  # a missing comma here silently concatenated 'leaky20softmax'
    dense1_init = ['orthogonal', 'sparse', 'glorot_normal', 'glorot_uniform', 'he_normal', 'he_uniform'],
    dense2_size = 80,  # [30, 25, 80, 120, 180]
    dense2_nonlinearity = 'leaky20',  # this is coupled to dense1_nonlinearity through hack#1
    dense2_init = 'orthogonal',  # idem hack2
    learning_rate = 0.001,  # [0.1, 0.01, 0.001, 0.0001]
    learning_rate_scaling = 100,  # [1, 10, 100]
    momentum = 0.9,  # [0, 0.9, 0.99]
def optimize_NN(name=name_from_file(),
        rounds=1,
        debug=False,
        use_caching=True,
        train_test_func=train_test_NN,
        test_only=False,
        **special_params):
    """
    Some default code for optimization, adding default parameters and debug, and using mostly other classes to do the rest of the work.
    """
    """ Default parameters. """
    for key in special_params.keys():
        assert key in DEFAULT_PARAMS.keys(), '"{0:s}" is not a known parameter'.format(key)
    params = copy(DEFAULT_PARAMS)
    params.update(special_params)

    """ Load data. """
    train_data, true_labels, features = get_training_data()

    """ Pre-training. """
    if params['pretrain'] or params['pretrain'] is None:
        layer_sizes = [
            params['extra_feature_count'] or 0,
            params['dense1_size'] or 0,
            params['dense2_size'] or 0,
            params['dense3_size'] or 0,
            params['dropout1_rate'] or 0,
            params['dropout2_rate'] or 0,
        ]
        if any(is_nonstr_iterable(nr) for nr in layer_sizes):
            """ Different layouts, so no pre-training. """
            if params['pretrain'] is None:
                print 'No pre-training since layer sizes are not constant.'
                params['pretrain'] = False
            else:
                raise AssertionError('Pre-training is not available when there are different network layouts (e.g. different numbers of neurons or features).')
        else:
            """ Constant layout, so can use pre-training. """
            if params['pretrain'] is None or params['pretrain'] is True:
                params['pretrain'] = join(PRETRAIN_DIR,
                    'pt{0:s}.net.npz'.format('x'.join(str(nr) for nr in layer_sizes if nr is not None)))
            make_pretrain(params['pretrain'], train_data, true_labels, **params)

    """ The actual optimization, optionally in debug mode (non-parallel for stacktrace and resource use). """
    validator = SampleCrossValidator(train_data, true_labels, rounds=rounds, test_frac=0.2, use_data_frac=1)
    if debug:
        optimizer = GridOptimizer(validator, use_caching=use_caching, **params)
        for subparams, train, labels, test in optimizer.yield_batches():
            optimizer.register_results(train_test_func(train, labels, test, **subparams))
        optimizer.print_plot_results()
    else:
        ParallelGridOptimizer(train_test_func=train_test_func,
            validator=validator,
            use_caching=use_caching,
            **params).readygo(save_fig_basename=name, log_name=name + '.log', only_show_top=True)
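# A minimal usage sketch (an assumption, not from the original file): run a
# small grid over learning rates in debug mode, which is serial so stack traces
# stay readable; parameter names are checked against DEFAULT_PARAMS above.
if __name__ == '__main__':
    optimize_NN(rounds=1, debug=True, learning_rate=[0.01, 0.001])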
        outlier_method='OCSVM',
        normalize_log=True,
        use_calibration=False,
        **parameters):
    net = make_net(**parameters)
    train, test = conormalize_data(train, test, use_log=normalize_log)
    load_knowledge(net, 'results/nnets/optimize_new.log_1000.net.npz')
    prediction = net.predict_proba(test)
    if use_rescale_priors:
        prediction = scale_to_priors(prediction, priors=bincount(labels)[1:] / float64(len(labels)))
    return prediction


train_data, true_labels, features = get_training_data()
name = 'play'
pretrain = None
params = {
    'name': name,
    'dense1_nonlinearity': 'tanh',  # tanh, sigmoid, rectify, leaky2, leaky20, softmax
    'dense1_init': 'glorot_uniform',  # orthogonal, sparse, glorot_normal, glorot_uniform, he_normal, he_uniform
    'dense1_size': 128,  # [30, 25, 80, 120, 180]
    'dense2_size': None,
    'dense3_size': None,
    'learning_rate': 0.0001,  # initial learning rate
    'learning_rate_scaling': 10,  # progression over time; 0.1 scaled by 10 is 0.01
    'momentum': 0.99,  # initial momentum
from utils.loading import get_testing_data, get_training_data

# If false, make a real prediction; if true, use cv data, named 'testmat.npy', 'trainmat.npy', 'trainclas.npy'
cv = False
doForest = True
doGradient = True
doBoostedTrees = True
doSVM = False

if cv:
    testmat = load('data/testmat.npy').astype('uint16')
    testmat = testmat[:, 1:]
    trainmat = load('data/trainmat.npy').astype('uint16')
    trainmat = trainmat[:, 1:]
    trainclas = load('data/trainclas.npy')
else:
    trainmat, trainclas, _ = get_training_data()
    testmat, _ = get_testing_data()

forestparams = {
    "n_estimators": 300,
    "criterion": 'gini',  # ['gini', 'entropy'], gini seems better (default)
    "max_features": 'sqrt',  # ['sqrt', 'log2', None], sqrt seems good (default), None is about as good but much slower
    "max_depth": 35,  # [None, 75, 100, 200, 300], at least higher than 20
    "min_samples_split": 2,  # [2, 4, 6], not much difference, 2 (default) seems slightly the best
    "min_samples_leaf": 1,  # [1, 2, 3], 1 seems clearly the best (default)
    "min_weight_fraction_leaf": 0,  # [0., 0.1, 0.2], 0 seems clearly the best (default)
    "max_leaf_nodes": None,  # [None, 5000, 10000, 50000], can vary, don't do simultaneously with max_depth
# -*- coding: utf-8 -*-
"""
Created on Sat May 09 22:14:06 2015

@author: Fenno
"""

from random_forest.randomforest import randomForest
from gradient_boosting.gradientboosting import gradientBoosting
from utils.loading import get_training_data
from validation.crossvalidate import SampleCrossValidator
import numpy as np

train, labels, features = get_training_data()

class1 = np.where(labels == 1)[0]
class2 = np.where(labels == 2)[0]
class3 = np.where(labels == 3)[0]
class4 = np.where(labels == 4)[0]
class9 = np.where(labels == 9)[0]

# keep only classes 2 and 3, then relabel them to 1 and 2
trainindexes = np.concatenate((class2, class3))
newtrain = train[trainindexes, :]
newlabels = labels[trainindexes].astype('int32')
x = np.where(newlabels == np.min(newlabels))[0]
y = np.where(newlabels == np.max(newlabels))[0]
newlabels[x] = 1  # np.zeros(len(x), dtype = 'int32')
newlabels[y] = 2  # np.ones(len(y), dtype = 'int32')
print len(x)
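# A hedged sketch of the cross-validation that the (otherwise unused) imports
# above suggest; the original continuation is not shown, so treat this as an
# assumption. It validates the binary 2-vs-3 subproblem with randomForest,
# using the same validator API as elsewhere in the repo.
validator = SampleCrossValidator(newtrain, newlabels, rounds=1, test_frac=0.1, use_data_frac=1.0)
for tr, cls, te in validator.yield_cross_validation_sets():
    prediction = randomForest(tr, cls, te, n_estimators=200, max_depth=35, verbose=1)
    print prediction.shape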
            dissimilarity=dissimilarity)
        super(MDFeatures, self).__init__(method_inst=inst, only_upto=only_upto, use_only=use_only)


class SEFeatures(ManifoldFeatures):
    """ SpectralEmbedding(n_components=17, n_neighbors=50) """

    def __init__(self, extra_features, n_neighbors=50, only_upto=RAW_NFEATS, use_only=5000):
        inst = SpectralEmbedding(n_components=extra_features, n_neighbors=n_neighbors)
        super(SEFeatures, self).__init__(method_inst=inst, only_upto=only_upto, use_only=use_only)


if __name__ == '__main__':
    data = get_training_data()[0].astype(float64)
    trans = ManifoldFeatures(LocallyLinearEmbedding(n_neighbors=50,
        n_components=10,
        eigen_solver='auto',
        method='standard'))
    print trans.fit_transform(data, use_only=1000).shape
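    # A hedged usage sketch for SEFeatures, following its docstring's suggested
    # settings; that fit_transform accepts use_only here is an assumption based
    # on the ManifoldFeatures call above.
    se = SEFeatures(extra_features=17, n_neighbors=50)
    print se.fit_transform(data, use_only=1000).shape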
""" From http://stats.stackexchange.com/questions/28593/mahalanobis-distance-distribution-of-multivariate-normally-distributed-points """ #imports and definitions import numpy as np import scipy.stats as stats import scipy.spatial.distance as distance import matplotlib.pyplot as plt from utils.loading import get_training_data chi2 = stats.chi2 np.random.seed(111) all_data, all_labels = get_training_data()[:2] #covariance matrix: X and Y are normally distributed with std of 1 #and are independent one of another covCircle = np.array([[1, 0.], [0., 1.]]) circle = np.random.multivariate_normal([0, 0], covCircle, 1000) #1000 points around [0, 0] print circle mahalanobis = lambda p: distance.mahalanobis(p, [0, 0], covCircle.T) d = np.array(map(mahalanobis, circle)) #Mahalanobis distance values for the 1000 points d2 = d**2 #MD squared degrees_of_freedom = 2 x = range(len(d2))