def preprocess(filepath=TRAIN_DATA_PATH, test_frac=0.1):
    print 'loading data'
    #all_data, all_labels = load_train_data_csv()
    all_data, all_labels = get_training_data(filepath=filepath)[:2]
    print 'shuffling data'
    # seed = random() makes sure it's different each time
    all_data, all_labels = shuffle(all_data, all_labels, seed=random())[:2]
    print 'splitting data into train/test'
    train, train_labels, test, test_labels = split_data(all_data, all_labels, test_frac=test_frac)
    #print train.shape, test.shape, all_data.shape
    print 'splitting into classes'
    class_set, class_set_labels = class_split(train, train_labels)
    print 'removing outliers using mahalanobis percentage'
    class_set_1, class_set_labels_1, outlier_data_1, outlier_labels_1, smallest_size_1 = remove_outliers_mahalanobis(
        class_set, class_set_labels, offset=0.95)
    print 'removing outliers using mahalanobis threshold'
    class_set_2, class_set_labels_2, outlier_data_2, outlier_labels_2, smallest_size_2 = remove_outliers_mahalanobis(
        class_set, class_set_labels, offset=155)
    return (class_set_1, class_set_labels_1, outlier_data_1, outlier_labels_1, smallest_size_1,
        class_set_2, class_set_labels_2, outlier_data_2, outlier_labels_2, smallest_size_2,
        test, test_labels)
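# A minimal usage sketch (an assumption, not part of the original file): the
# 12-tuple returned by preprocess() unpacks in the same order it is built.
(class_set_1, class_set_labels_1, outlier_data_1, outlier_labels_1, smallest_size_1,
 class_set_2, class_set_labels_2, outlier_data_2, outlier_labels_2, smallest_size_2,
 test, test_labels) = preprocess(test_frac=0.1)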
def store_column_maxima():
    """
    Calculate and store the maxima of the combined train and test columns (for normalization).

    This loads all data so it's not fast.
    """
    train_data = get_training_data()[0]
    test_data = get_testing_data()[0]
    # elementwise maximum of the per-column maxima of train and test
    maxima = array([max(tr, te) for tr, te in zip(train_data.max(0), test_data.max(0))])
    save(COLUMN_MAX_PATH, maxima)
    if VERBOSITY >= 1:
        print 'updated the column maxima file'
    return maxima
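# A minimal sketch of consuming the stored maxima for normalization; numpy's
# load() paired with the save() above is an assumption, as is the
# divide-by-column-maximum scaling itself.
from numpy import load
maxima = load(COLUMN_MAX_PATH)
normalized_train = get_training_data()[0] / maxima.astype(float)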
def getSubClassifierData(subclasses=[2, 3], train_data=None, true_classes=None):
    """
    Gets training data for classification, from only the given classes.

    Either filter existing data, or load the default training data and filter that.
    If either train_data or true_classes is None, the data will be loaded using
    get_training_data() from utils.loading
    """
    if (train_data is None) or (true_classes is None):
        train_data, true_classes, _ = get_training_data()
    assert len(true_classes) == np.shape(train_data)[0]
    # boolean mask selecting only the rows whose class is in subclasses
    validsample = np.array([x in subclasses for x in true_classes])
    return train_data[validsample, :], true_classes[validsample]
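# A short usage sketch (an assumption): select only the default classes 2 and 3
# and verify that the mask kept matching rows and labels.
sub_data, sub_classes = getSubClassifierData(subclasses=[2, 3])
assert set(sub_classes) <= {2, 3}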
def predict_300_v1():
    data, labels = get_training_data()[:2]
    params = {
        'dense1_nonlinearity': 'rectify',
        'dense1_init': 'glorot_normal',
        'dense1_size': 300,
        'dense2_size': 0,
        'dense3_size': None,
        'dropout1_rate': 0.5,
        'dropout2_rate': None,
        'dropout3_rate': None,
        'extra_feature_count': 0,
    }
    probs = predict(params, 'results/pretrain/single_pretrain_300_0_0.net.npz', data)
    print 'logloss', calc_logloss(probs, labels)
def get_filtered_data(cut_outlier_frac=0.05, method='EE', filepath=TRAIN_DATA_PATH):
    """
    Load and filter data and classes to remove an approximate fraction of outliers.

    :param cut_outlier_frac: fraction of outliers to remove (approximate, especially for OCSVM)
    :param method: either 'EE' for Elliptic Envelope or 'OCSVM' for One Class Support Vector Machines
    :return: filtered train data, train classes and features
    """
    data, classes, features = get_training_data(filepath=filepath)  # filepath was previously ignored
    cache_key = '{0:s}_{1:04d}_all.npy'.format(method, int(cut_outlier_frac * 1000))
    data, classes = filter_data_cache(data, classes, hash=cache_key, method=method, cut_outlier_frac=cut_outlier_frac)
    return data, classes, features
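# A minimal sketch (an assumption about intended use): drop roughly 5% of
# outliers with the Elliptic Envelope method and inspect how many rows survive.
data, classes, features = get_filtered_data(cut_outlier_frac=0.05, method='EE')
print 'rows after filtering:', data.shape[0]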
from os.path import basename, splitext
from sys import modules

from nnet.oldstyle.train_test import train_test_NN
from utils.loading import get_training_data
from validation.crossvalidate import SampleCrossValidator
from validation.optimize_parallel import ParallelGridOptimizer

train_data, true_labels = get_training_data()[:2]
name = '{0:s}'.format(splitext(basename(getattr(modules['__main__'], '__file__', 'optimize.default')))[0])
pretrain = None  # join(PRETRAIN_DIR, 'pt_varx256x256x64.net.npz')
params = {
    'name': name,
    'dense1_nonlinearity': 'leaky20',  # tanh, sigmoid, rectify, leaky2, leaky20, softmax
    'dense1_init': 'glorot_uniform',  # orthogonal, sparse, glorot_normal, glorot_uniform, he_normal, he_uniform
    'dense1_size': 256,  # [30, 25, 80, 120, 180]
    'dense2_size': 128,
    'dense3_size': None,
    'learning_rate': 0.001,  # initial learning rate (learning rate is effectively higher for higher momentum)
    'learning_rate_scaling': 1000,  # progression over time; 0.1 scaled by 10 is 0.01
    'momentum': 0.9,  # initial momentum
    'momentum_scaling': 100,  # 0.9 scaled by 10 is 0.99
    'dropout1_rate': 0,  # [0, 0.5]
    'dropout2_rate': 0,
from sklearn.grid_search import RandomizedSearchCV
from sklearn.pipeline import Pipeline
from sklearn.preprocessing import MinMaxScaler

from nnet.oldstyle.base_optimize import name_from_file
from nnet.prepare import LogTransform
from nnet.scikit import NNet
from settings import LOGS_DIR, VERBOSITY, SUBMISSIONS_DIR
from utils.features import PositiveSparseFeatureGenerator, PositiveSparseRowFeatureGenerator, DistanceFeatureGenerator
from utils.ioutil import makeSubmission
from utils.loading import get_preproc_data, get_training_data, get_testing_data

#train, labels, test = get_preproc_data(Pipeline([
#    ('row', PositiveSparseRowFeatureGenerator()),
#    ('distp31', DistanceFeatureGenerator(n_neighbors = 3, distance_p = 1)),
#    ('distp52', DistanceFeatureGenerator(n_neighbors = 5, distance_p = 2)),
#]), expand_confidence = 0.9)
train, labels = get_training_data()[:2]
test = get_testing_data()[0]
#cpus = max(cpu_count() - 1, 1)
#random = RandomState()

opt = RandomizedSearchCV(
    estimator = Pipeline([
        #('gen23', PositiveSparseFeatureGenerator(difficult_classes = (2, 3), extra_features = 40)),
        #('gen234', PositiveSparseFeatureGenerator(difficult_classes = (2, 3, 4), extra_features = 40)),
        #('gen19', PositiveSparseFeatureGenerator(difficult_classes = (1, 9), extra_features = 63)),
        ('log', LogTransform()),
        ('scale03', MinMaxScaler(feature_range = (0, 3))),
        ('nn', NNet(**{
            'dense1_nonlinearity': 'rectify',
            'dense1_init': 'glorot_normal',
from warnings import filterwarnings

from lasagne.layers import get_all_params, InputLayer, DenseLayer
from lasagne.nonlinearities import tanh, softmax
from lasagne.updates import sgd
from theano import function, config
import theano.tensor as T
import numpy as np
from matplotlib.pyplot import subplots, show, cm

from utils.loading import get_training_data
from validation.crossvalidate import SampleCrossValidator

filterwarnings('ignore', '.*topo.*')
config.optimizer = 'None'  # todo: turn off, it's very slow

#todo: normalize data
train_data, true_classes, features = get_training_data()
validator = SampleCrossValidator(train_data, true_classes, rounds=5, test_frac=0.3)
for train, classes, test in validator.yield_cross_validation_sets():
    # create tensor objects
    trainT = train.astype(config.floatX)
    testT = test.astype(config.floatX)  # was mistakenly cast from train
    classT = classes.astype('int32')
    # First, construct an input layer.
    # The shape parameter defines the expected input shape, which is just the shape of our data matrix X.
    # (InputLayer has no weights, so the stray W=Constant() argument was dropped.)
    l_in = InputLayer(shape=trainT.shape)
    # A dense layer implements a linear mix (xW + b) followed by a nonlinearity.
    l_hidden = DenseLayer(
        labels[train_rows:], sample_weight=sample_weight[train_rows:])
    predictions = model.predict_proba(test)
    if rescale_pred:
        predictions = rescale_prior(predictions, bincount(labels))
    return predictions


if __name__ == '__main__':
    from utils.loading import get_training_data
    from validation.crossvalidate import SampleCrossValidator

    train_data, true_classes, _ = get_training_data()
    validator = SampleCrossValidator(train_data, true_classes, rounds=1, test_frac=0.1, use_data_frac=1.0)
    for train, classes, test in validator.yield_cross_validation_sets():
        prediction = randomForest(train, classes, test,
            n_estimators=200,
            max_depth=35,
            verbose=1,
            class_weight="auto",
            calibration=3,
            rescale_pred=True)
    parameters['dense2_nonlinearity'] = parameters['dense1_nonlinearity']  # hack1
    parameters['dense2_init'] = parameters['dense1_init']  # hack2
    net = make_net(**parameters)
    net.fit(train, classes - 1)
    return net.predict_proba(test)


# one big job for size1 + size2 + dropout
# one job for nonlinearity1 + initialization1
# one job for learning rate + learning rate scaling + momentum + momentum scaling (split if too many)
name = '{0:s}.log'.format(splitext(basename(getattr(modules['__main__'], '__file__', 'optimize.default')))[0])  # automatic based on filename
train_data, true_classes, features = get_training_data()  # load the train data
train_data, true_classes = equalize_class_sizes(train_data, true_classes)
train_data, true_classes = filter_data(train_data, true_classes, cut_outlier_frac = 0.06, method = 'OCSVM')  # remove outliers
train_data = normalize_data(train_data, use_log = True)[0]  # also converts to floats
validator = SampleCrossValidator(train_data, true_classes, rounds = 3, test_frac = 0.2, use_data_frac = 1)
optimizer = ParallelGridOptimizer(train_test_func = train_test_NN, validator = validator, use_caching = True, name = name,  # just choose something sensible
    dense1_size = 80,  # [30, 25, 80, 120, 180]
    dense1_nonlinearity = ['tanh', 'sigmoid', 'rectify', 'leaky2', 'leaky20', 'softmax'],  # a missing comma here silently concatenated 'leaky20softmax'
    dense1_init = ['orthogonal', 'sparse', 'glorot_normal', 'glorot_uniform', 'he_normal', 'he_uniform'],
    dense2_size = 80,  # [30, 25, 80, 120, 180]
    dense2_nonlinearity = 'leaky20',  # this is coupled to dense1_nonlinearity through hack#1
    dense2_init = 'orthogonal',  # idem hack2
    learning_rate = 0.001,  # [0.1, 0.01, 0.001, 0.0001]
    learning_rate_scaling = 100,  # [1, 10, 100]
    momentum = 0.9,  # [0, 0.9, 0.99]
def optimize_NN(name=name_from_file(),
        rounds=1,
        debug=False,
        use_caching=True,
        train_test_func=train_test_NN,
        test_only=False,
        **special_params):
    """
    Some default code for optimization, adding default parameters and debug, and using mostly other classes to do the rest of the work.
    """
    """ Default parameters. """
    for key in special_params.keys():
        assert key in DEFAULT_PARAMS.keys(), '"{0:s}" is not a known parameter'.format(key)
    params = copy(DEFAULT_PARAMS)
    params.update(special_params)

    """ Load data. """
    train_data, true_labels, features = get_training_data()

    """ Pre-training. """
    if params['pretrain'] or params['pretrain'] is None:
        layer_sizes = [
            params['extra_feature_count'] or 0,
            params['dense1_size'] or 0,
            params['dense2_size'] or 0,
            params['dense3_size'] or 0,
            params['dropout1_rate'] or 0,
            params['dropout2_rate'] or 0,
        ]
        if any(is_nonstr_iterable(nr) for nr in layer_sizes):
            """ Different layouts, so no pre-training. """
            if params['pretrain'] is None:
                print 'No pre-training since layer sizes are not constant.'
                params['pretrain'] = False
            else:
                raise AssertionError('Pre-training is not available when there are different network layouts (e.g. different numbers of neurons or features).')
        else:
            """ Constant layout, so can use pre-training. """
            if params['pretrain'] is None or params['pretrain'] is True:
                params['pretrain'] = join(PRETRAIN_DIR,
                    'pt{0:s}.net.npz'.format('x'.join(str(nr) for nr in layer_sizes if nr is not None)))
            make_pretrain(params['pretrain'], train_data, true_labels, **params)

    """ The actual optimization, optionally in debug mode (non-parallel for stacktrace and resource use). """
    validator = SampleCrossValidator(train_data, true_labels, rounds=rounds, test_frac=0.2, use_data_frac=1)
    if debug:
        optimizer = GridOptimizer(validator, use_caching=use_caching, **params)
        for subparams, train, labels, test in optimizer.yield_batches():
            optimizer.register_results(train_test_func(train, labels, test, **subparams))
        optimizer.print_plot_results()
    else:
        ParallelGridOptimizer(train_test_func=train_test_func,
            validator=validator,
            use_caching=use_caching,
            **params).readygo(save_fig_basename=name, log_name=name + '.log', only_show_top=True)
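# A minimal usage sketch (an assumption, not from the original file): run a
# small grid over learning rates in debug mode, which is serial so stack traces
# stay readable; parameter names are checked against DEFAULT_PARAMS above.
if __name__ == '__main__':
    optimize_NN(rounds=1, debug=True, learning_rate=[0.01, 0.001])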
        outlier_method='OCSVM',
        normalize_log=True,
        use_calibration=False,
        **parameters):
    net = make_net(**parameters)
    train, test = conormalize_data(train, test, use_log=normalize_log)
    load_knowledge(net, 'results/nnets/optimize_new.log_1000.net.npz')
    prediction = net.predict_proba(test)
    if use_rescale_priors:
        prediction = scale_to_priors(prediction, priors=bincount(labels)[1:] / float64(len(labels)))
    return prediction


train_data, true_labels, features = get_training_data()
name = 'play'
pretrain = None
params = {
    'name': name,
    'dense1_nonlinearity': 'tanh',  # tanh, sigmoid, rectify, leaky2, leaky20, softmax
    'dense1_init': 'glorot_uniform',  # orthogonal, sparse, glorot_normal, glorot_uniform, he_normal, he_uniform
    'dense1_size': 128,  # [30, 25, 80, 120, 180]
    'dense2_size': None,
    'dense3_size': None,
    'learning_rate': 0.0001,  # initial learning rate
    'learning_rate_scaling': 10,  # progression over time; 0.1 scaled by 10 is 0.01
    'momentum': 0.99,  # initial momentum
from utils.loading import get_testing_data, get_training_data

# If false, make a real prediction; if true, use cv data, named 'testmat.npy', 'trainmat.npy', 'trainclas.npy'
cv = False
doForest = True
doGradient = True
doBoostedTrees = True
doSVM = False

if cv:
    testmat = load('data/testmat.npy').astype('uint16')
    testmat = testmat[:, 1:]
    trainmat = load('data/trainmat.npy').astype('uint16')
    trainmat = trainmat[:, 1:]
    trainclas = load('data/trainclas.npy')
else:
    trainmat, trainclas, _ = get_training_data()
    testmat, _ = get_testing_data()

forestparams = {
    "n_estimators": 300,
    "criterion": 'gini',  # ['gini', 'entropy'], gini seems better (default)
    "max_features": 'sqrt',  # ['sqrt', 'log2', None], sqrt seems good (default), None is about as good but much slower
    "max_depth": 35,  # [None, 75, 100, 200, 300], at least higher than 20
    "min_samples_split": 2,  # [2, 4, 6], not much difference, 2 (default) seems slightly the best
    "min_samples_leaf": 1,  # [1, 2, 3], 1 seems clearly the best (default)
    "min_weight_fraction_leaf": 0,  # [0., 0.1, 0.2], 0 seems clearly the best (default)
    "max_leaf_nodes": None,  # [None, 5000, 10000, 50000], can vary, don't do simultaneously with max_depth
# -*- coding: utf-8 -*-
"""
Created on Sat May 09 22:14:06 2015

@author: Fenno
"""

from random_forest.randomforest import randomForest
from gradient_boosting.gradientboosting import gradientBoosting
from utils.loading import get_training_data
from validation.crossvalidate import SampleCrossValidator
import numpy as np

train, labels, features = get_training_data()

class1 = np.where(labels == 1)[0]
class2 = np.where(labels == 2)[0]
class3 = np.where(labels == 3)[0]
class4 = np.where(labels == 4)[0]
class9 = np.where(labels == 9)[0]

# keep only classes 2 and 3, then relabel them to 1 and 2
trainindexes = np.concatenate((class2, class3))
newtrain = train[trainindexes, :]
newlabels = labels[trainindexes].astype('int32')
x = np.where(newlabels == np.min(newlabels))[0]
y = np.where(newlabels == np.max(newlabels))[0]
newlabels[x] = 1  # np.zeros(len(x), dtype = 'int32')
newlabels[y] = 2  # np.ones(len(y), dtype = 'int32')
print len(x)
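# A hedged sketch of the cross-validation that the (otherwise unused) imports
# above suggest; the original continuation is not shown, so treat this as an
# assumption. It validates the binary 2-vs-3 subproblem with randomForest,
# using the same validator API as elsewhere in the repo.
validator = SampleCrossValidator(newtrain, newlabels, rounds=1, test_frac=0.1, use_data_frac=1.0)
for tr, cls, te in validator.yield_cross_validation_sets():
    prediction = randomForest(tr, cls, te, n_estimators=200, max_depth=35, verbose=1)
    print prediction.shape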
            dissimilarity=dissimilarity)
        super(MDFeatures, self).__init__(method_inst=inst, only_upto=only_upto, use_only=use_only)


class SEFeatures(ManifoldFeatures):
    """ SpectralEmbedding(n_components=17, n_neighbors=50) """

    def __init__(self, extra_features, n_neighbors=50, only_upto=RAW_NFEATS, use_only=5000):
        inst = SpectralEmbedding(n_components=extra_features, n_neighbors=n_neighbors)
        super(SEFeatures, self).__init__(method_inst=inst, only_upto=only_upto, use_only=use_only)


if __name__ == '__main__':
    data = get_training_data()[0].astype(float64)
    trans = ManifoldFeatures(LocallyLinearEmbedding(n_neighbors=50,
        n_components=10,
        eigen_solver='auto',
        method='standard'))
    print trans.fit_transform(data, use_only=1000).shape
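    # A hedged usage sketch for SEFeatures, following its docstring's suggested
    # settings; that fit_transform accepts use_only here is an assumption based
    # on the ManifoldFeatures call above.
    se = SEFeatures(extra_features=17, n_neighbors=50)
    print se.fit_transform(data, use_only=1000).shape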
""" From http://stats.stackexchange.com/questions/28593/mahalanobis-distance-distribution-of-multivariate-normally-distributed-points """ #imports and definitions import numpy as np import scipy.stats as stats import scipy.spatial.distance as distance import matplotlib.pyplot as plt from utils.loading import get_training_data chi2 = stats.chi2 np.random.seed(111) all_data, all_labels = get_training_data()[:2] #covariance matrix: X and Y are normally distributed with std of 1 #and are independent one of another covCircle = np.array([[1, 0.], [0., 1.]]) circle = np.random.multivariate_normal([0, 0], covCircle, 1000) #1000 points around [0, 0] print circle mahalanobis = lambda p: distance.mahalanobis(p, [0, 0], covCircle.T) d = np.array(map(mahalanobis, circle)) #Mahalanobis distance values for the 1000 points d2 = d**2 #MD squared degrees_of_freedom = 2 x = range(len(d2))