Example #1
def multiclass_grid_ensemble(predictionmatrix,
                             trueclasses,
                             probssofar=None,
                             column=0,
                             numinterval=None,
                             printWeights=False,
                             printtop=20,
                             data_frac=1.0,
                             rounds=1):
    Q, _, C = np.shape(predictionmatrix)

    if column == C:
        return probssofar

    if numinterval is None:
        numinterval = Q + 1
    weightDict = dict(enumerate(make_weights_list(Q, numinterval)))

    if printWeights:
        print "The key-value pairs for all possible combinations of weights:"
        for k, v in weightDict.iteritems():
            print str(k) + ": [" + ', '.join([str(e) for e in v]) + "]"
    if probssofar is None:
        probssofar = np.ones((Q, C)) * (1.0 / C)
    probsclone = np.copy(probssofar)

    unstackpredict = unstack_predictions(predictionmatrix)
    validator = SampleCrossValidator(unstackpredict,
                                     trueclasses,
                                     rounds=rounds,
                                     test_frac=data_frac,
                                     use_data_frac=data_frac)

    optimizer = ParallelGridOptimizer(train_test_func=train_test,
                                      validator=validator,
                                      use_caching=False,
                                      process_count=1,
                                      Q=Q,
                                      column=column,
                                      weights=weightDict.keys())
    optimizer.readygo()
    """
    optimizer = GridOptimizer(validator = validator, use_caching = False, weights = weightDict.keys())
    count = 0
    for weights, _, _, test in optimizer.yield_batches(False):
        stackedPredict = stack_predictions(test, Q)
        probsclone[:,column] = weightDict[weights['weights']]
        prediction = multiclass_mean_ensemble(stackedPredict, probsclone)
        optimizer.register_results(prediction)
        count += 1
        if (count % 100) == 0:
            print count 
    """
    bestweight = weightDict[optimizer.print_top(printtop, True)[0][0]]
    probsclone[:, column] = bestweight
    print probsclone
    return multiclass_grid_ensemble(predictionmatrix, trueclasses, probsclone,
                                    column + 1, numinterval, False, printtop,
                                    data_frac, rounds)
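# A minimal usage sketch for the function above. Assumptions (not from the original code):
# predictionmatrix is shaped (models, samples, classes), trueclasses holds one integer
# label per sample, and numpy plus the project helpers used inside the function are
# importable as elsewhere in these examples.
import numpy as np

Q, N, C = 3, 500, 9                                    # e.g. 3 models, 500 samples, 9 classes
predictionmatrix = np.random.rand(Q, N, C)
predictionmatrix /= predictionmatrix.sum(axis=2, keepdims=True)  # normalize rows to probabilities
trueclasses = np.random.randint(1, C + 1, size=N)      # assumed 1-based class labels (illustrative)
weights = multiclass_grid_ensemble(predictionmatrix, trueclasses, numinterval=4, printtop=10)
print weights                                          # a (Q, C) matrix: one weight per model per class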
Example #2
    'max_epochs': 1000,  # it terminates when overfitting or increasing, so just leave high
    'auto_stopping': True,  # stop training automatically if it seems to be failing
    'pretrain': pretrain,  # use pretraining? (True for automatic, filename for specific)
    'outlier_method': 'OCSVM',  # method for outlier removal ['OCSVM', 'EE']
    'outlier_frac': None,  # which fraction of each class to remove as outliers
    'normalize_log': True,  # use logarithm for normalization
    'use_calibration': False,  # use calibration of probabilities
    'use_rescale_priors': True,  # rescale predictions to match priors
    'extra_feature_count': [0, 30, 80, 163, 300],  # how many new features to generate
    'extra_feature_seed': 0,  # a seed for the feature generation
}

# make_pretrain(pretrain, train_data, true_labels, **params)

validator = SampleCrossValidator(train_data,
                                 true_labels,
                                 rounds=3,
                                 test_frac=0.2,
                                 use_data_frac=1)
ParallelGridOptimizer(train_test_func=train_test_NN,
                      validator=validator,
                      use_caching=False,
                      **params).readygo(topprint=20,
                                        save_fig_basename=name,
                                        log_name=name + '.log',
                                        only_show_top=True)
Example #3
from demo.fake_testing_probabilities import get_random_probabilities
from utils.loading import get_training_data
from validation.crossvalidate import SampleCrossValidator
from validation.optimize_parallel import ParallelGridOptimizer


def train_test(train, classes, test, **parameters):
	prediction = get_random_probabilities(sample_count = test.shape[0])
	return prediction

train_data, true_labels = get_training_data()[:2]
validator = SampleCrossValidator(train_data, true_labels, rounds = 6, test_frac = 0.1, use_data_frac = 1)
optimizer = ParallelGridOptimizer(train_test_func = train_test, validator = validator,  use_caching = True, process_count = 3,
	learning_rate = [10, 1, 0.1, 0.01, 0.001],
	hidden_layer_size = [60, 30, 50, 40, 20],
	weight_decay = 0.1,
	momentum = 0.9
).readygo()
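# The same search can also be run without multiprocessing, following the GridOptimizer
# loops shown in the later examples (the import path for GridOptimizer is not shown in
# these snippets and is assumed to sit next to ParallelGridOptimizer); a sketch:
optimizer = GridOptimizer(validator = validator, use_caching = True,
	learning_rate = [10, 1, 0.1, 0.01, 0.001],
	hidden_layer_size = [60, 30, 50, 40, 20],
	weight_decay = 0.1,
	momentum = 0.9,
)
for params, train, classes, test in optimizer.yield_batches():
	prediction = train_test(train, classes, test, **params)
	optimizer.register_results(prediction)
optimizer.print_plot_results()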


Example #4

name = '{0:s}.log'.format(splitext(basename(getattr(modules['__main__'], '__file__', 'optimize.default')))[0])  # automatic based on filename

train_data, true_classes, features = get_training_data()  # load the train data
train_data, true_classes = equalize_class_sizes(train_data, true_classes)
train_data, true_classes = filter_data(train_data, true_classes, cut_outlier_frac = 0.06, method = 'OCSVM')  # remove outliers
train_data = normalize_data(train_data, use_log = True)[0]  # also converts to floats
validator = SampleCrossValidator(train_data, true_classes, rounds = 3, test_frac = 0.2, use_data_frac = 1)
optimizer = ParallelGridOptimizer(train_test_func = train_test_NN, validator = validator, use_caching = True,
	name = name,                      # just choose something sensible
	dense1_size = 80,                 # [30, 25, 80, 120, 180]
	dense1_nonlinearity = ['tanh', 'sigmoid', 'rectify', 'leaky2', 'leaky20', 'softmax'],
	dense1_init = ['orthogonal', 'sparse', 'glorot_normal', 'glorot_uniform', 'he_normal', 'he_uniform'],
	dense2_size = 80,                 # [30, 25, 80, 120, 180]
	dense2_nonlinearity = 'leaky20',  # this is coupled to dense1_nonlinearity through hack#1
	dense2_init = 'orthogonal',       # idem, hack#2
	learning_rate = 0.001,            # [0.1, 0.01, 0.001, 0.0001]
	learning_rate_scaling = 100,      # [1, 10, 100]
	momentum = 0.9,                   # [0, 0.9, 0.99]
	momentum_scaling = 100,           # [1, 10, 100]
	dropout1_rate = 0.5,              # [0, 0.5]
	dropout2_rate = None,
	weight_decay = 0,                 # doesn't work
	max_epochs = 3000,                # it terminates when overfitting or increasing, so just leave high
	output_nonlinearity = 'softmax',  # just keep softmax
	auto_stopping = True,
).readygo(topprint = 100, save_fig_basename = name, log_name = name, only_show_top = True)


Example #5
dropout1_rate = 0                # [0, 0.5]
dropout2_rate = None
weight_decay = 0                 # doesn't work
max_epochs = 3000                # it terminates when overfitting or increasing, so just leave high
output_nonlinearity = 'softmax'  # just keep softmax
verbosity = 1                    # prints a lot of output, but at least you get some idea of progress
auto_stopping = True

baseparams = {"name": name, "verbosity": verbosity}  # , 'dense1_nonlinearity': dense1_nonlinearity, 'dense1_init': dense1_init}
testparams = baseparams.copy()
testparams.update({"dense1_size": dense1_size, "dense2_size": dense2_size, "dropout1_rate": dropout1_rate})
testparams.update({'dense1_init': dense1_init, 'dense1_nonlinearity': dense1_nonlinearity})
testparams.update({'momentum': momentum, 'momentum_scaling': momentum_scaling})
testparams.update({'learning_rate': learning_rate, 'learning_rate_scaling': learning_rate_scaling})

train_data, true_classes, features = get_training_data()  # load the train data

from numpy import shape
print shape(train_data)

validator = SampleCrossValidator(train_data, true_classes, rounds = 1, test_frac = 0.1, use_data_frac = 1)
optimizer = ParallelGridOptimizer(train_test_func = train_test_NN, validator = validator, use_caching = False, process_count = 36, **testparams)
print("Declared optimizer complete")
optimizer.readygo(topprint = 100, save_fig_basename = name, log_name = name, only_show_top = True)



#train_test_NN(train_data, true_classes, train_data, **baseparams)


Example #6
        trainFeatures, classLabels, 0.3)
    rf = RandomForestClassifier(n_estimators=n_estimators,
                                criterion=criterion,
                                max_features=max_features,
                                n_jobs=n_jobs)
    clf = rf.fit(trainSet1, labelSet1)
    calibrated_clf_sigmoid = CalibratedClassifierCV(clf,
                                                    method='sigmoid',
                                                    cv='prefit')
    calibrated_clf_sigmoid.fit(trainSet2, labelSet2)
    prob = calibrated_clf_sigmoid.predict_proba(testFeatures)
    return prob


train_data, true_labels = get_training_data()[:2]
validator = SampleCrossValidator(train_data,
                                 true_labels,
                                 rounds=1,
                                 test_frac=0.1,
                                 use_data_frac=1)
optimizer = ParallelGridOptimizer(
    calibrated_random_forest,
    validator=validator,
    use_caching=True,
    criterion='gini',
    n_estimators=[100, 150, 200],  # comparing these values
    max_features='auto',
    n_jobs=1)
optimizer.readygo()
Example #7
    estimator.fit(train, labels)
    prediction = estimator.predict_proba(test)
    return prediction


train_data, true_labels = get_training_data()[:2]
validator = SampleCrossValidator(train_data,
                                 true_labels,
                                 rounds=1,
                                 test_frac=0.2,
                                 use_data_frac=1)  # 0.3!!
optimizer = ParallelGridOptimizer(train_test_func=train_test,
                                  validator=validator,
                                  use_caching=False,
                                  process_count=max(cpu_count() - 1, 1),
                                  **{
                                      'weight_decay': logspace(-1, -7, base=10, num=30),
                                  }).readygo(
                                      save_fig_basename=name_from_file(),
                                      log_name=name_from_file() + '_stats.txt')
"""
 1    0.5254       4.89390091848e-05
 2    0.5257       3.03919538231e-05
 3    0.5284       1.88739182214e-05
 4    0.5308       7.88046281567e-05
 5    0.5313       1.17210229753e-05
 6    0.5315       7.27895384398e-06
 7    0.5317       4.52035365636e-06
 8    0.5320       2.80721620394e-06
 9    0.5336       1.08263673387e-06
10    0.5338       1e-07           
Example #8
def optimize_NN(name=name_from_file(),
                rounds=1,
                debug=False,
                use_caching=True,
                train_test_func=train_test_NN,
                test_only=False,
                **special_params):
    """
		Some default code for optimization, adding default parameters and debug, and using mostly other classes to do the rest of the work.
	"""
    """
		Default parameters.
	"""
    for key in special_params.keys():
        assert key in DEFAULT_PARAMS, '"{0:s}" is not a known parameter'.format(key)
    params = copy(DEFAULT_PARAMS)
    params.update(special_params)
    """
		Load data.
	"""
    train_data, true_labels, features = get_training_data()
    """
		Pre-training.
	"""
    if params['pretrain'] or params['pretrain'] is None:
        layer_sizes = [
            params['extra_feature_count'] or 0, params['dense1_size'] or 0,
            params['dense2_size'] or 0, params['dense3_size'] or 0,
            params['dropout1_rate'] or 0, params['dropout2_rate'] or 0
        ]
        if any(is_nonstr_iterable(nr) for nr in layer_sizes):
            """ Different layouts, so no pre-training. """
            if params['pretrain'] is None:
                print 'No pre-training since layer sizes are not constant.'
                params['pretrain'] = False
            else:
                raise AssertionError(
                    'Pre-training is not available when there are different network layouts (e.g. different numbers of neurons or features).'
                )
        else:
            """ Constant layout, so can use pre-training. """
            if params['pretrain'] is None or params['pretrain'] is True:
                params['pretrain'] = join(
                    PRETRAIN_DIR, 'pt{0:s}.net.npz'.format('x'.join(
                        str(nr) for nr in layer_sizes if nr is not None)))
            make_pretrain(params['pretrain'], train_data, true_labels,
                          **params)
    """
		The actual optimization, optionally in debug mode (non-parallel for stacktrace and resource use).
	"""
    validator = SampleCrossValidator(train_data,
                                     true_labels,
                                     rounds=rounds,
                                     test_frac=0.2,
                                     use_data_frac=1)
    if debug:
        optimizer = GridOptimizer(validator, use_caching=use_caching, **params)
        for subparams, train, labels, test in optimizer.yield_batches():
            optimizer.register_results(
                train_test_func(train, labels, test, **subparams))
        optimizer.print_plot_results()
    else:
        ParallelGridOptimizer(train_test_func=train_test_func,
                              validator=validator,
                              use_caching=use_caching,
                              **params).readygo(save_fig_basename=name,
                                                log_name=name + '.log',
                                                only_show_top=True)
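# A hypothetical invocation of optimize_NN (an illustration, not from the original file;
# it assumes 'learning_rate' is a key of DEFAULT_PARAMS, which is not shown here).
# List-valued parameters span the grid; debug=True runs the non-parallel GridOptimizer path.
if __name__ == '__main__':
    optimize_NN(rounds=1, debug=True, use_caching=False,
                learning_rate=[0.1, 0.01, 0.001])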
Example #9
})
testparams.update({
    "outlier_frac": outlier_frac,
    "outlier_method": outlier_method,
    "rescale_pred": rescale_pred,
    "sample_weight": sample_weight
})

train_data, true_classes, _ = get_training_data()
validator = SampleCrossValidator(train_data,
                                 true_classes,
                                 rounds=1,
                                 test_frac=0.1,
                                 use_data_frac=1.0)

# The parallelized code
optimizer = ParallelGridOptimizer(randomForest,
                                  validator,
                                  use_caching=False,
                                  process_count=24,
                                  **testparams)
optimizer.readygo(only_show_top=False)
"""
optimizer = GridOptimizer(validator, **testparams)
for params, train, classes, test in optimizer.yield_batches():
    prediction = randomForest(train, classes, test, **params)
    optimizer.register_results(prediction)
optimizer.print_top(12)

"""
Example #10
max_features = ['sqrt', 'log2', None]
verbose = 1
calibration = [0, 0.1]
calibrationmethod = ['sigmoid', 'isotonic']

baseparams = {"verbose": verbose, "n_estimators": n_estimators}
testparams = baseparams.copy()
testparams.update({"learning_rate": learning_rate, "max_depth": max_depth})

train_data, true_classes, _ = get_training_data()
validator = SampleCrossValidator(train_data,
                                 true_classes,
                                 rounds=1,
                                 test_frac=0.1,
                                 use_data_frac=1.0)

# The parallelized code
optimizer = ParallelGridOptimizer(gradientBoosting,
                                  validator,
                                  process_count=4,
                                  **testparams)
optimizer.readygo()

# The non-parallelized code
"""
optimizer = GridOptimizer(validator, **testparams)
for params, train, classes, test in optimizer.yield_batches():
    prediction = gradientBoosting(train, classes, test, **params)
    optimizer.register_results(prediction)
optimizer.print_plot_results()
"""
Example #11
	clf = DistanceClassifier(n_neighbors = n_neighbors, distance_p = distance_p)
	if use_calibration:
		clf = CalibratedClassifierCV(clf, cv = 3)
	clf.fit(train, labels)
	probs = clf.predict_proba(test)
	probs = scale_to_priors(probs)
	return probs

#probs = train_test(train_data[::10, :], true_labels[::10], validation_data[::10], n_neighbors = 1, distance_p = 1, use_log = False)
#print probs.shape

validator = SampleCrossValidator(train_data, true_labels, rounds = 5, test_frac = 0.1, use_data_frac = 0.4)
optimizer = ParallelGridOptimizer(train_test_func = train_test, validator = validator, use_caching = False, process_count = max(cpu_count() - 1, 1),
	n_neighbors = [4, 8, 12, 20, 32],
	distance_p = 2,
	use_log = True,
	use_autoscale = False,
	use_calibration = [True, False],
).readygo()


"""
pos     loss      n neighbors       use autoscale     use log           distance p
 1    0.7255       20                False             True              2
 2    0.7260       8                 False             True              2
 3    0.7265       12                False             True              2
 4    0.7279       20                True              True              2
 5    0.7291       4                 False             True              2
 6    0.7300       8                 True              True              2
 7    0.7301       12                True              True              2
 8    0.7318       32                False             True              2