def multiclass_grid_ensemble(predictionmatrix, trueclasses, probssofar=None, column=0, numinterval=None, printWeights=False, printtop=20, data_frac=1.0, rounds=1):
    """
    Grid-search the ensemble weights one class column at a time.

    Starting at `column`, every candidate weight vector produced by
    make_weights_list(Q, numinterval) is scored through a
    ParallelGridOptimizer, and the first entry reported by
    optimizer.print_top (taken to be the best one) is written into that
    column of the weight matrix. The columns are handled left to right.

    predictionmatrix: 3-D array; the first axis has length Q (models) and the
        last axis length C (classes). grid_ensemble documents it as QxNxC.
    trueclasses: handed to SampleCrossValidator unchanged.
    probssofar: optional starting weight matrix of shape (Q, C). It is copied,
        never modified. When None, every entry starts at 1.0 / C.
    column: index of the first class column to optimise. When it equals C
        there is nothing left to do and probssofar is returned as given.
    numinterval: second argument of make_weights_list; defaults to Q + 1.
    printWeights: print the key -> weight vector table once before searching.
    printtop: forwarded to optimizer.print_top.
    data_frac: used for both test_frac and use_data_frac of the validator.
    rounds: forwarded to the validator.

    Returns the (Q, C) weight matrix with the columns from `column` onwards
    replaced by the selected weight vectors.
    Raises IndexError when column is larger than C.
    """
    Q, _, C = np.shape(predictionmatrix)
    if column == C:
        return probssofar
    if column > C:
        # Fail before the expensive search instead of after it.
        raise IndexError("column {0} is out of range for {1} classes".format(column, C))
    if numinterval is None:
        numinterval = Q + 1

    # The candidate table and the unstacked predictions do not depend on the
    # column, so they are built once for all columns.
    weightDict = dict(enumerate(make_weights_list(Q, numinterval)))
    if printWeights:
        print("The key-value pairs for all possible combinations of weights:")
        for k, v in weightDict.items():
            print(str(k) + ": [" + ', '.join([str(e) for e in v]) + "]")

    if probssofar is None:
        # NOTE(review): the uniform start is 1/C, not 1/Q; confirm this is
        # what multiclass_mean_ensemble expects for not-yet-optimised columns.
        weightmatrix = np.ones((Q, C)) * (1.0 / C)
    else:
        weightmatrix = np.copy(probssofar)
    unstackpredict = unstack_predictions(predictionmatrix)

    for current in range(column, C):
        # A fresh validator and optimizer per column, as before.
        validator = SampleCrossValidator(unstackpredict, trueclasses, rounds=rounds, test_frac=data_frac, use_data_frac=data_frac)
        # NOTE(review): the weight matrix is not passed to the optimizer, so
        # how train_test sees the columns chosen so far is not visible here.
        optimizer = ParallelGridOptimizer(train_test_func=train_test, validator=validator, use_caching=False, process_count=1, Q=Q, column=current, weights=list(weightDict))
        optimizer.readygo()
        bestkey = optimizer.print_top(printtop, True)[0][0]
        weightmatrix[:, current] = weightDict[bestkey]
        print(weightmatrix)
    return weightmatrix
def grid_ensemble(predictionmatrix, trueclasses, numinterval=None, printWeights=True, printtop=20, data_frac=1.0, rounds=1):
    """
    Does a grid search to find good weights for the ensemble.

    predictionmatrix is a QxNxC matrix, where Q is the number of models,
    N is the number of samples and C is the number of classes.
    trueclasses is handed to the cross validator unchanged.
    The parameters for the cross validator are fixed (data_frac is used for
    both test_frac and use_data_frac) because taking a weighted average does
    not need training and is deterministic.
    numinterval is the number of possible weights to consider, evenly spaced
    between 0 and 1.
    Important: numinterval is a positive integer, not a float!
    By default, this is Q + 1, where Q is the number of models.
    printWeights prints the key -> weight vector table before searching.
    printtop is forwarded to optimizer.print_top.
    rounds is forwarded to the cross validator.

    Nothing is returned; the value of optimizer.print_top is printed.
    """
    Q, _, _ = np.shape(predictionmatrix)
    if numinterval is None:
        numinterval = Q + 1
    weightDict = dict(enumerate(make_weights_list(Q, numinterval)))
    if printWeights:
        print("The key-value pairs for all possible combinations of weights:")
        for k, v in weightDict.items():
            print(str(k) + ": [" + ', '.join([str(e) for e in v]) + "]")
    unstackpredict = unstack_predictions(predictionmatrix)
    validator = SampleCrossValidator(unstackpredict, trueclasses, rounds=rounds, test_frac=data_frac, use_data_frac=data_frac)
    # The grid is searched over the integer keys; a real list keeps this
    # working where dict.keys() returns a view.
    optimizer = GridOptimizer(validator=validator, use_caching=False, weights=list(weightDict))
    for count, (params, _, _, test) in enumerate(optimizer.yield_batches(False), 1):
        stackedPredict = stack_predictions(test, Q)
        prediction = mean_ensemble(stackedPredict, weights=weightDict[params['weights']])
        optimizer.register_results(prediction)
        # Progress indicator every 100 evaluated weight vectors.
        if count % 100 == 0:
            print(count)
    print(optimizer.print_top(printtop, True))
'max_epochs': 1000, # it terminates when overfitting or increasing, so just leave high 'auto_stopping': True, # stop training automatically if it seems to be failing 'pretrain': pretrain, # use pretraining? (True for automatic, filename for specific) 'outlier_method': 'OCSVM', # method for outlier removal ['OCSVM', 'EE'] 'outlier_frac': None, # which fraction of each class to remove as outliers 'normalize_log': True, # use logarithm for normalization 'use_calibration': False, # use calibration of probabilities 'use_rescale_priors': True, # rescale predictions to match priors 'extra_feature_count': [0, 30, 80, 163, 300], # how many new features to generate 'extra_feature_seed': 0, # a seed for the feature generation } # make_pretrain(pretrain, train_data, true_labels, **params) validator = SampleCrossValidator(train_data, true_labels, rounds=3, test_frac=0.2, use_data_frac=1) ParallelGridOptimizer(train_test_func=train_test_NN, validator=validator, use_caching=False, **params).readygo(topprint=20, save_fig_basename=name, log_name=name + '.log', only_show_top=True)
from lasagne.updates import sgd from theano import function, config import theano.tensor as T import numpy as np from matplotlib.pyplot import subplots, show, cm from utils.loading import get_training_data from validation.crossvalidate import SampleCrossValidator filterwarnings('ignore', '.*topo.*') config.optimizer = 'None' # todo: turn off, it's very slow #todo: normalize data train_data, true_classes, features = get_training_data() validator = SampleCrossValidator(train_data, true_classes, rounds=5, test_frac=0.3) for train, classes, test in validator.yield_cross_validation_sets(): # create tensor objects trainT = train.astype(config.floatX) testT = train.astype(config.floatX) classT = classes.astype('int32') # First, construct an input layer. # The shape parameter defines the expected input shape, which is just the shape of our data matrix X. l_in = InputLayer(shape=trainT.shape, W=Constant()) # A dense layer implements a linear mix (xW + b) followed by a nonlinearity. l_hidden = DenseLayer( l_in, # The first argument is the input to this layer num_units=25, # This defines the layer's output dimensionality nonlinearity=tanh,
"rescale_pred": rescale_pred } testparams = baseparams.copy() testparams.update({ "n_estimators": n_estimators, "learning_rate": learning_rate, "algorithm": algorithm, "calibration": calibration, "calibrationmethod": calibrationmethod }) train_data, true_classes, _ = get_training_data() validator = SampleCrossValidator(train_data, true_classes, rounds=1, test_frac=0.1, use_data_frac=1.0) #The parallelized code """ optimizer = ParallelGridOptimizer(adaBoost, validator, process_count = 4, **testparams) optimizer.readygo() """ """ optimizer = ParallelGridOptimizer(adaBoost, validator, process_count = 4, **testparams) """ #The non parallelized-code
def optimize_NN(name=name_from_file(), rounds=1, debug=False, use_caching=True, train_test_func=train_test_NN, test_only=False, **special_params):
    """
    Some default code for optimization, adding default parameters and debug,
    and using mostly other classes to do the rest of the work.

    name: base name for the saved figure and for the '<name>.log' file in the
        parallel run. The default is computed once, when the function is
        defined, by name_from_file().
    rounds: number of rounds for the SampleCrossValidator.
    debug: if True, run a plain GridOptimizer in this process (non-parallel,
        for stacktrace and resource use); otherwise use ParallelGridOptimizer.
    use_caching: forwarded to the optimizer.
    train_test_func: called as train_test_func(train, labels, test, **subparams).
    test_only: accepted but not used in this function.
    special_params: overrides for DEFAULT_PARAMS; every key must already exist
        in DEFAULT_PARAMS.

    Raises AssertionError for an unknown parameter name, or when pre-training
    is explicitly requested while the network layout is not constant.
    """
    # Default parameters. The check is raised explicitly rather than with
    # `assert`, so it is not stripped when Python runs with -O.
    for key in special_params:
        if key not in DEFAULT_PARAMS:
            raise AssertionError('"{0:s}" is not a known parameter'.format(key))
    params = copy(DEFAULT_PARAMS)
    params.update(special_params)

    # Load data.
    train_data, true_labels, _ = get_training_data()

    # Pre-training: None means "decide automatically", True means "use an
    # automatically named file", any other truthy value is used as the filename.
    if params['pretrain'] or params['pretrain'] is None:
        # Missing values count as 0, so the list never contains None.
        layer_sizes = [
            params['extra_feature_count'] or 0,
            params['dense1_size'] or 0,
            params['dense2_size'] or 0,
            params['dense3_size'] or 0,
            params['dropout1_rate'] or 0,
            params['dropout2_rate'] or 0,
        ]
        if any(is_nonstr_iterable(nr) for nr in layer_sizes):
            # Different layouts, so no pre-training.
            if params['pretrain'] is None:
                print('No pre-training since layer sizes are not constant.')
                params['pretrain'] = False
            else:
                raise AssertionError('Pre-training is not available when there are different network layouts (e.g. different numbers of neurons or features).')
        else:
            # Constant layout, so can use pre-training.
            if params['pretrain'] is None or params['pretrain'] is True:
                params['pretrain'] = join(PRETRAIN_DIR, 'pt{0:s}.net.npz'.format('x'.join(str(nr) for nr in layer_sizes)))
            make_pretrain(params['pretrain'], train_data, true_labels, **params)

    # The actual optimization, optionally in debug mode (non-parallel for
    # stacktrace and resource use).
    validator = SampleCrossValidator(train_data, true_labels, rounds=rounds, test_frac=0.2, use_data_frac=1)
    if debug:
        optimizer = GridOptimizer(validator, use_caching=use_caching, **params)
        for subparams, train, labels, test in optimizer.yield_batches():
            optimizer.register_results(train_test_func(train, labels, test, **subparams))
        optimizer.print_plot_results()
    else:
        ParallelGridOptimizer(train_test_func=train_test_func, validator=validator, use_caching=use_caching, **params).readygo(save_fig_basename=name, log_name=name + '.log', only_show_top=True)
# Indices of the samples holding the smallest and the largest label value.
# Both index sets are taken before any label is overwritten.
x = np.where(newlabels == np.min(newlabels))[0]
y = np.where(newlabels == np.max(newlabels))[0]
# Recode those two groups as labels 1 and 2.
newlabels[x] = 1 #np.zeros(len(x), dtype = 'int32')
newlabels[y] = 2 #np.ones(len(y), dtype = 'int32')
# Show the size of each group.
print len(x)
print len(y)
# The string literal below holds disabled debugging output; it is evaluated and discarded.
""" print newlabels[:10] print newlabels[-10:] print y[-10:] print np.min(newlabels), np.max(newlabels) print np.shape(newtrain) print np.shape(newlabels) """
# rounds=1, test_frac=0.1, use_data_frac=1.0: presumably one round with 10% held out
# and all data used; confirm against SampleCrossValidator.
validator = SampleCrossValidator(newtrain, newlabels, rounds=1, test_frac=0.1, use_data_frac=1.0)
for train, classes, test in validator.yield_cross_validation_sets():
    # NOTE(review): the meaning of the positional 5 is not visible here; check randomForest's signature.
    prediction = randomForest(train, classes, test, 5, n_estimators=200)
    #prediction = gradientBoosting(train, classes, test, n_estimators = 1000)
    # Hand the prediction for this split back to the validator.
    validator.add_prediction(prediction)
validator.print_results()