Example #1
def multiclass_grid_ensemble(predictionmatrix,
                             trueclasses,
                             probssofar=None,
                             column=0,
                             numinterval=None,
                             printWeights=False,
                             printtop=20,
                             data_frac=1.0,
                             rounds=1):
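    """
    Recursively grid-search ensemble weights one class column at a time:
    the weights for `column` are optimized and fixed, then the function
    recurses on column + 1 until all C class columns are done.
    See grid_ensemble (Example #2) for the meaning of the shared parameters.
    """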
    Q, _, C = np.shape(predictionmatrix)

    if column == C:
        return probssofar

    if numinterval is None:
        numinterval = Q + 1
    weightDict = dict(enumerate(make_weights_list(Q, numinterval)))

    if printWeights:
        print "The key-value pairs for all possible combinations of weights:"
        for k, v in weightDict.iteritems():
            print str(k) + ": [" + ', '.join([str(e) for e in v]) + "]"
    if probssofar is None:
        probssofar = np.ones((Q, C)) * (1.0 / C)
    probsclone = np.copy(probssofar)

    unstackpredict = unstack_predictions(predictionmatrix)
    validator = SampleCrossValidator(unstackpredict,
                                     trueclasses,
                                     rounds=rounds,
                                     test_frac=data_frac,
                                     use_data_frac=data_frac)

    optimizer = ParallelGridOptimizer(train_test_func=train_test,
                                      validator=validator,
                                      use_caching=False,
                                      process_count=1,
                                      Q=Q,
                                      column=column,
                                      weights=weightDict.keys())
    optimizer.readygo()
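    # Non-parallel alternative using GridOptimizer, kept for reference: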
    """
    optimizer = GridOptimizer(validator = validator, use_caching = False, weights = weightDict.keys())
    count = 0
    for weights, _, _, test in optimizer.yield_batches(False):
        stackedPredict = stack_predictions(test, Q)
        probsclone[:,column] = weightDict[weights['weights']]
        prediction = multiclass_mean_ensemble(stackedPredict, probsclone)
        optimizer.register_results(prediction)
        count += 1
        if (count % 100) == 0:
            print count 
    """
    bestweight = weightDict[optimizer.print_top(printtop, True)[0][0]]
    probsclone[:, column] = bestweight
    print probsclone
    return multiclass_grid_ensemble(predictionmatrix, trueclasses, probsclone,
                                    column + 1, numinterval, False, printtop,
                                    data_frac, rounds)
Example #2
def grid_ensemble(predictionmatrix,
                  trueclasses,
                  numinterval=None,
                  printWeights=True,
                  printtop=20,
                  data_frac=1.0,
                  rounds=1):
    """
    Does a grid search to find good weights for the ensemble
    predictionmatrix is a QxNxC matrix, where Q is the number of models, N is the number of samples, C is the number of classes
    The parameters for the crossvalidator are fixed because taking a weighted average does not need training and is deterministic
    numinterval is the number of possible weights to consider, evenly spaced between 0 and 1.
    important: numinterval is a postive integer, not a float!
    By default, this is Q + 1, where Q is the number of models
    """
    Q, _, _ = np.shape(predictionmatrix)
    if numinterval is None:
        numinterval = Q + 1
    weightDict = dict(enumerate(make_weights_list(Q, numinterval)))

    if printWeights:
        print "The key-value pairs for all possible combinations of weights:"
        for k, v in weightDict.iteritems():
            print str(k) + ": [" + ', '.join([str(e) for e in v]) + "]"

    unstackpredict = unstack_predictions(predictionmatrix)
    validator = SampleCrossValidator(unstackpredict,
                                     trueclasses,
                                     rounds=rounds,
                                     test_frac=data_frac,
                                     use_data_frac=data_frac)
    optimizer = GridOptimizer(validator=validator,
                              use_caching=False,
                              weights=weightDict.keys())
    count = 0
    for weights, _, _, test in optimizer.yield_batches(False):
        stackedPredict = stack_predictions(test, Q)
        prediction = mean_ensemble(stackedPredict,
                                   weights=weightDict[weights['weights']])
        optimizer.register_results(prediction)
        count += 1
        if (count % 100) == 0:
            print count
    print optimizer.print_top(printtop, True)
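
Per the docstring, grid_ensemble needs only the stacked QxNxC predictions and the true classes. A minimal usage sketch, where the three NxC probability matrices and trueclasses are hypothetical placeholders, not names from the repository:

import numpy as np

# Q=3 hypothetical models, each contributing an NxC probability matrix.
predictionmatrix = np.array([probs_forest, probs_boost, probs_nn])

# Evaluate 5 weight levels per model on 30% of the data.
grid_ensemble(predictionmatrix, trueclasses,
              numinterval=5, printWeights=False, data_frac=0.3)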
Example #3
    'max_epochs': 1000,  # it terminates when overfitting or increasing, so just leave it high
    'auto_stopping': True,  # stop training automatically if it seems to be failing
    'pretrain': pretrain,  # use pretraining? (True for automatic, filename for specific)
    'outlier_method': 'OCSVM',  # method for outlier removal ['OCSVM', 'EE']
    'outlier_frac': None,  # which fraction of each class to remove as outliers
    'normalize_log': True,  # use logarithm for normalization
    'use_calibration': False,  # use calibration of probabilities
    'use_rescale_priors': True,  # rescale predictions to match priors
    'extra_feature_count': [0, 30, 80, 163, 300],  # how many new features to generate
    'extra_feature_seed': 0,  # a seed for the feature generation
}

# make_pretrain(pretrain, train_data, true_labels, **params)

validator = SampleCrossValidator(train_data,
                                 true_labels,
                                 rounds=3,
                                 test_frac=0.2,
                                 use_data_frac=1)
ParallelGridOptimizer(train_test_func=train_test_NN,
                      validator=validator,
                      use_caching=False,
                      **params).readygo(topprint=20,
                                        save_fig_basename=name,
                                        log_name=name + '.log',
                                        only_show_top=True)
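
Only 'extra_feature_count' is list-valued here; as with the weights parameter in Example #1, a list passed to the grid optimizer spans one axis of the search grid while scalars stay fixed. A rough sketch of that expansion idea (illustrative only, not the optimizer's actual code):

from itertools import product

# Split params into grid axes (list-valued) and fixed scalars.
axes = {k: v for k, v in params.items() if isinstance(v, list)}
fixed = {k: v for k, v in params.items() if not isinstance(v, list)}

# Every combination of axis values yields one candidate configuration.
for combo in product(*axes.values()):
    candidate = dict(fixed, **dict(zip(axes.keys(), combo)))
    # each candidate would then be scored by train_test_NN via the validator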
Example #4
from warnings import filterwarnings

from lasagne.layers import InputLayer, DenseLayer
from lasagne.nonlinearities import tanh
from lasagne.updates import sgd
from theano import function, config
import theano.tensor as T
import numpy as np
from matplotlib.pyplot import subplots, show, cm
from utils.loading import get_training_data
from validation.crossvalidate import SampleCrossValidator

filterwarnings('ignore', '.*topo.*')
config.optimizer = 'None'  # todo: turn off, it's very slow

# todo: normalize data

train_data, true_classes, features = get_training_data()
validator = SampleCrossValidator(train_data,
                                 true_classes,
                                 rounds=5,
                                 test_frac=0.3)
for train, classes, test in validator.yield_cross_validation_sets():
    # create tensor objects
    trainT = train.astype(config.floatX)
    testT = test.astype(config.floatX)
    classT = classes.astype('int32')

    # First, construct an input layer.
    # The shape parameter defines the expected input shape, which is just the shape of our data matrix X.
    l_in = InputLayer(shape=trainT.shape)
    # A dense layer implements a linear mix (xW + b) followed by a nonlinearity.
    l_hidden = DenseLayer(
        l_in,  # The first argument is the input to this layer
        num_units=25,  # This defines the layer's output dimensionality
        nonlinearity=tanh)
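
The example breaks off while the network is being built. A minimal sketch of one way to finish and train it inside the same loop, reusing the already-imported sgd and function; it assumes `from lasagne.layers import get_all_params, get_output`, `from lasagne.nonlinearities import softmax` and `from lasagne.objectives import categorical_crossentropy` at the top of the file, and the 9-class output size and hyperparameters are placeholders:

    # assumed continuation: a softmax output layer over 9 classes
    l_out = DenseLayer(l_hidden, num_units=9, nonlinearity=softmax)

    # symbolic loss and plain SGD updates on all trainable parameters
    target = T.ivector('target')
    loss = categorical_crossentropy(get_output(l_out), target).mean()
    updates = sgd(loss, get_all_params(l_out, trainable=True), learning_rate=0.01)
    train_fn = function([l_in.input_var, target], loss, updates=updates)

    for epoch in range(10):
        print train_fn(trainT, classT)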
Example #5
    "rescale_pred": rescale_pred
}

testparams = baseparams.copy()
testparams.update({
    "n_estimators": n_estimators,
    "learning_rate": learning_rate,
    "algorithm": algorithm,
    "calibration": calibration,
    "calibrationmethod": calibrationmethod
})

train_data, true_classes, _ = get_training_data()
validator = SampleCrossValidator(train_data,
                                 true_classes,
                                 rounds=1,
                                 test_frac=0.1,
                                 use_data_frac=1.0)

# The parallelized code
"""
optimizer = ParallelGridOptimizer(adaBoost, validator, process_count = 4, **testparams)
optimizer.readygo()
"""
"""
optimizer = ParallelGridOptimizer(adaBoost, validator, process_count = 4, **testparams)

"""

# The non-parallelized code
Example #6
def optimize_NN(name=name_from_file(),
                rounds=1,
                debug=False,
                use_caching=True,
                train_test_func=train_test_NN,
                test_only=False,
                **special_params):
    """
		Some default code for optimization, adding default parameters and debug, and using mostly other classes to do the rest of the work.
	"""
    """
		Default parameters.
	"""
    for key in special_params.keys():
        assert key in DEFAULT_PARAMS, '"{0:s}" is not a known parameter'.format(key)
    params = copy(DEFAULT_PARAMS)
    params.update(special_params)
    """
		Load data.
	"""
    train_data, true_labels, features = get_training_data()
    """
		Pre-training.
	"""
    if params['pretrain'] or params['pretrain'] is None:
        layer_sizes = [
            params['extra_feature_count'] or 0, params['dense1_size'] or 0,
            params['dense2_size'] or 0, params['dense3_size'] or 0,
            params['dropout1_rate'] or 0, params['dropout2_rate'] or 0
        ]
        if any(is_nonstr_iterable(nr) for nr in layer_sizes):
            """ Different layouts, so no pre-training. """
            if params['pretrain'] is None:
                print 'No pre-training since layer sizes are not constant.'
                params['pretrain'] = False
            else:
                raise AssertionError(
                    'Pre-training is not available when there are different network layouts (e.g. different numbers of neurons or features).'
                )
        else:
            """ Constant layout, so can use pre-training. """
            if params['pretrain'] is None or params['pretrain'] is True:
                params['pretrain'] = join(
                    PRETRAIN_DIR, 'pt{0:s}.net.npz'.format('x'.join(
                        str(nr) for nr in layer_sizes if nr is not None)))
            make_pretrain(params['pretrain'], train_data, true_labels,
                          **params)
    """
		The actual optimization, optionally in debug mode (non-parallel for stacktrace and resource use).
	"""
    validator = SampleCrossValidator(train_data,
                                     true_labels,
                                     rounds=rounds,
                                     test_frac=0.2,
                                     use_data_frac=1)
    if debug:
        optimizer = GridOptimizer(validator, use_caching=use_caching, **params)
        for subparams, train, labels, test in optimizer.yield_batches():
            optimizer.register_results(
                train_test_func(train, labels, test, **subparams))
        optimizer.print_plot_results()
    else:
        ParallelGridOptimizer(train_test_func=train_test_func,
                              validator=validator,
                              use_caching=use_caching,
                              **params).readygo(save_fig_basename=name,
                                                log_name=name + '.log',
                                                only_show_top=True)
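
A brief usage sketch for this entry point: every keyword in special_params must match a DEFAULT_PARAMS key, and (per the layer-size check above) list values request several network layouts, which disables pre-training. The parameter values here are illustrative only:

# Debug run: one round, non-parallel, sweeping a single grid axis.
optimize_NN(name='dense1_sweep', rounds=1, debug=True,
            dense1_size=[64, 128, 256], max_epochs=500)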
Example #7
x = np.where(newlabels == np.min(newlabels))[0]
y = np.where(newlabels == np.max(newlabels))[0]
newlabels[x] = 1  # relabel the smallest original class id as 1
newlabels[y] = 2  # relabel the largest original class id as 2

print len(x)
print len(y)
"""
print newlabels[:10]
print newlabels[-10:]
print y[-10:]
print np.min(newlabels), np.max(newlabels)

print np.shape(newtrain)
print np.shape(newlabels)
"""
validator = SampleCrossValidator(newtrain,
                                 newlabels,
                                 rounds=1,
                                 test_frac=0.1,
                                 use_data_frac=1.0)

for train, classes, test in validator.yield_cross_validation_sets():

    prediction = randomForest(train, classes, test, 5, n_estimators=200)
    #prediction = gradientBoosting(train, classes, test, n_estimators = 1000)

    validator.add_prediction(prediction)
validator.print_results()