def train_test_NN(train, classes, test, **parameters):
    train, classes = equalize_class_sizes(train, classes)
    train, classes = filter_data(train, classes, cut_outlier_frac = 0.06, method = 'OCSVM')  # remove outliers
    train = normalize_data(train, use_log = True)[0]  # also converts to floats
    test = normalize_data(test, use_log = True)[0]
    parameters['dense2_nonlinearity'] = parameters['dense1_nonlinearity']  # hack 1: couple layer 2 to layer 1
    parameters['dense2_init'] = parameters['dense1_init']  # hack 2: idem for initialization
    net = make_net(**parameters)
    net.fit(train, classes - 1)  # labels are 1-based; the net expects 0-based
    return net.predict_proba(test)
def train_NN(train, labels, test, outlier_frac=0, outlier_method='OCSVM', use_calibration=False,
             normalize_log=True, use_rescale_priors=False, extra_feature_count=0, extra_feature_seed=0,
             test_data_confidence=None, test_only=False, **parameters):
    """
    Train a neural network, for internal use by other functions in this file.
    """
    train, labels = expand_from_test(train, labels, get_testing_data()[0], confidence=test_data_confidence)
    train, test = chain_feature_generators(train, labels, test, extra_features=extra_feature_count, seed=extra_feature_seed)
    train, test = conormalize_data(train, test, use_log=normalize_log)
    if outlier_frac:
        train, labels = filter_data(train, labels, cut_outlier_frac=outlier_frac, method=outlier_method)
    net = make_net(NFEATS=train.shape[1], **parameters)
    if use_calibration:
        net = CalibratedClassifierCV(net, method='sigmoid', cv=ShuffleSplit(train.shape[0], n_iter=1, test_size=0.2))
    if not test_only:
        net.fit(train, labels - 1)
    return net, train, test
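# Hedged usage sketch for train_NN. The loaders get_training_data() / get_testing_data()
# and the make_net parameter used here (dense1_size) are assumptions taken from the rest
# of this file, not guaranteed signatures.
train, labels = get_training_data()[:2]
test = get_testing_data()[0]
net, train, test = train_NN(train, labels, test, outlier_frac=0.03, use_calibration=True, dense1_size=80)
probs = net.predict_proba(test)  # Nx9 matrix of class probabilities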
def randomForest(train, labels, test, calibration=0.0, calibrationmethod='sigmoid', sample_weight=None,
                 n_estimators=100, criterion='gini', max_features="auto", max_depth=None, min_samples_split=2,
                 min_samples_leaf=1, min_weight_fraction_leaf=0., max_leaf_nodes=None, n_jobs=1, verbose=0,
                 outlier_frac=0.0, outlier_method='EE', rescale_pred=False, class_weight=None):
    """
    Trains a random forest on the feature matrix and labels (the ground truth), then predicts
    the given test samples; the output is 9 probabilities, one for each class.

    :param train: the training data, to train the model
    :param labels: the labels of the training data, an array
    :param calibration: how much data to use for calibration. If calibration is 0, no calibration is done.
        If 0 < calibration < 1, that fraction of the data is held out for calibration; the split is not
        shuffled, so if the data is ordered, shuffle it first! If calibration is n > 1, cross-validation
        with n folds is used instead.
    :param verbose: see sklearn documentation
    """
    if outlier_frac > 0:
        train, labels = filter_data(train, labels, cut_outlier_frac=outlier_frac, method=outlier_method)  # remove outliers
    if isinstance(sample_weight, str):
        sample_weight = obtain_class_weights(labels, sample_weight)
    model = RandomForestClassifier(n_estimators=n_estimators, criterion=criterion, max_features=max_features,
        max_depth=max_depth, min_samples_split=min_samples_split, min_samples_leaf=min_samples_leaf,
        min_weight_fraction_leaf=min_weight_fraction_leaf, max_leaf_nodes=max_leaf_nodes, n_jobs=n_jobs,
        verbose=verbose, class_weight=class_weight)
    if calibration == 0.0:
        model.fit(train, labels, sample_weight)
    elif calibration > 1:
        model = CalibratedClassifierCV(model, calibrationmethod, calibration)
        model.fit(train, labels, sample_weight)
    else:
        N = len(labels)
        if sample_weight is None:
            sample_weight = ones(N)
        train_rows = int(floor((1.0 - calibration) * N))  # floor returns a float; slicing needs an int
        model.fit(train[:train_rows, :], labels[:train_rows], sample_weight[:train_rows])
        model = CalibratedClassifierCV(model, calibrationmethod, "prefit")
        model.fit(train[train_rows:, :], labels[train_rows:], sample_weight=sample_weight[train_rows:])
    predictions = model.predict_proba(test)
    if rescale_pred:
        predictions = rescale_prior(predictions, bincount(labels))
    return predictions
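# Illustration of the three calibration modes documented above; `train`, `labels`
# and `test` are placeholders for data loaded elsewhere.
from numpy.random import permutation

probs = randomForest(train, labels, test, calibration=0.0)    # no calibration
order = permutation(len(labels))                              # shuffle before a plain split
probs = randomForest(train[order], labels[order], test, calibration=0.3)  # 30% held out for calibration
probs = randomForest(train, labels, test, calibration=5)      # 5-fold cross-validated calibration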
# one big job for size1 + size2 + dropout
# one job for nonlinearity1 + initialization1
# one job for learning rate + learning rate scaling + momentum + momentum scaling (split if too many)
name = '{0:s}.log'.format(splitext(basename(getattr(modules['__main__'], '__file__', 'optimize.default')))[0])  # automatic, based on filename
train_data, true_classes, features = get_training_data()  # load the train data
train_data, true_classes = equalize_class_sizes(train_data, true_classes)
train_data, true_classes = filter_data(train_data, true_classes, cut_outlier_frac = 0.06, method = 'OCSVM')  # remove outliers
train_data = normalize_data(train_data, use_log = True)[0]  # also converts to floats
validator = SampleCrossValidator(train_data, true_classes, rounds = 3, test_frac = 0.2, use_data_frac = 1)
optimizer = ParallelGridOptimizer(train_test_func = train_test_NN, validator = validator, use_caching = True, name = name,  # just choose something sensible
    dense1_size = 80,             # [30, 25, 80, 120, 180]
    dense1_nonlinearity = ['tanh', 'sigmoid', 'rectify', 'leaky2', 'leaky20', 'softmax'],
    dense1_init = ['orthogonal', 'sparse', 'glorot_normal', 'glorot_uniform', 'he_normal', 'he_uniform'],
    dense2_size = 80,             # [30, 25, 80, 120, 180]
    dense2_nonlinearity = 'leaky20',  # coupled to dense1_nonlinearity through hack 1
    dense2_init = 'orthogonal',   # idem, hack 2
    learning_rate = 0.001,        # [0.1, 0.01, 0.001, 0.0001]
    learning_rate_scaling = 100,  # [1, 10, 100]
    momentum = 0.9,               # [0, 0.9, 0.99]
    momentum_scaling = 100,       # [1, 10, 100]
    dropout1_rate = 0.5,          # [0, 0.5]
def svm(train, labels, test, C=10, kernel='rbf', degree=3, gamma=0.5, calibration=0.0, calibrationmethod='sigmoid',
        coef0=0.0, probability=True, shrinking=True, tol=1e-3, verbose=0, outlier_frac=0.0, outlier_method='EE',
        rescale_pred=False, class_weight=None, sample_weight=None, rescale=True):
    """
    Trains an SVM on the feature matrix and labels (the ground truth), then predicts
    the given test samples; the output is 9 probabilities, one for each class.

    :param train: the training data, to train the model
    :param labels: the labels of the training data, an array
    :param C: trades off misclassification of training examples against simplicity of the decision surface;
        a low C makes the decision surface smooth, while a high C aims at classifying all training examples correctly
    :param gamma: defines how far the influence of a single training example reaches,
        with low values meaning 'far' and high values meaning 'close'
    :param verbose: see sklearn documentation
    :param rescale: if True, the square root of both the training and testing data is taken,
        rescaled to unit variance, and mapped to the interval [0, 1]
    """
    if outlier_frac > 0:
        train, labels = filter_data(train, labels, cut_outlier_frac=outlier_frac, method=outlier_method)  # remove outliers
    if isinstance(sample_weight, str):
        sample_weight = obtain_class_weights(labels, sample_weight)
    if rescale:
        # take the square root, rescale to unit variance, then rescale to [0, 1];
        # with_mean=False avoids centering, which preserves sparsity of the matrix
        train = sqrt(train)
        test = sqrt(test)
        train = StandardScaler(with_mean=False, with_std=True, copy=True).fit_transform(train)
        test = StandardScaler(with_mean=False, with_std=True, copy=True).fit_transform(test)
        train = MinMaxScaler().fit_transform(train)
        test = MinMaxScaler().fit_transform(test)
    model = SVC(C=C, kernel=kernel, degree=degree, gamma=gamma, coef0=coef0, probability=probability,
        shrinking=shrinking, tol=tol, verbose=verbose, class_weight=class_weight)
    if calibration == 0.0:
        model.fit(train, labels, sample_weight)
    elif calibration > 1:
        model = CalibratedClassifierCV(model, calibrationmethod, calibration)
        model.fit(train, labels, sample_weight)
    else:
        N = len(labels)
        if sample_weight is None:
            sample_weight = ones(N)
        train_rows = int(floor((1.0 - calibration) * N))  # floor returns a float; slicing needs an int
        model.fit(train[:train_rows, :], labels[:train_rows], sample_weight[:train_rows])
        model = CalibratedClassifierCV(model, calibrationmethod, "prefit")
        model.fit(train[train_rows:, :], labels[train_rows:], sample_weight=sample_weight[train_rows:])
    predictions = model.predict_proba(test)
    if rescale_pred:
        predictions = rescale_prior(predictions, bincount(labels))
    return predictions
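# Minimal standalone sketch of the `rescale` preprocessing chain above, on a toy dense
# matrix, to make the three steps explicit; with_mean=False skips centering, which is
# what keeps sparse matrices sparse.
from numpy import array, sqrt
from sklearn.preprocessing import StandardScaler, MinMaxScaler

X = array([[0., 4., 1.], [9., 0., 16.], [1., 1., 4.]])
X = sqrt(X)                                           # compress large feature counts
X = StandardScaler(with_mean=False).fit_transform(X)  # unit variance per feature
X = MinMaxScaler().fit_transform(X)                   # map each feature to [0, 1]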
def AdaBoost(train, labels, test, calibration=0.0, calibrationmethod='sigmoid', base_estimator=None,
             n_estimators=50, learning_rate=1.0, algorithm='SAMME', random_state=None, sample_weight=None,
             outlier_frac=0.0, outlier_method='EE', undersample=False, rescale_pred=False, verbose=0,
             class_weight=None):
    """
    Trains an AdaBoost model on the feature matrix and labels (the ground truth), then predicts
    the given test samples; the output is 9 probabilities, one for each class.

    :param train: the training data, to train the model
    :param labels: the labels of the training data, an array
    :param test: the data to predict
    :param base_estimator: see sklearn documentation; None defaults to a DecisionTreeClassifier
    :param n_estimators: see sklearn documentation
    :param calibration: how much data to use for calibration. If calibration is falsy (including 0.0),
        no calibration is done. If 0 < calibration < 1, that fraction of the data is held out for
        calibration; the split is not shuffled, so if the data is ordered, shuffle it first!
        If calibration is n > 1, cross-validation with n folds is used instead.
    :param verbose: see sklearn documentation
    """
    if outlier_frac:
        train, labels = filter_data(train, labels, cut_outlier_frac=outlier_frac, method=outlier_method)  # remove outliers
    if undersample:
        train, labels = equalize_class_sizes(train, labels)
    if isinstance(sample_weight, str):
        sample_weight = obtain_class_weights(labels, sample_weight)
    N = len(labels)
    trainrows = int((1.0 - calibration) * N)
    model = AdaBoostClassifier(base_estimator=base_estimator, n_estimators=n_estimators,
        learning_rate=learning_rate, algorithm=algorithm, random_state=random_state)
    if not calibration:
        model.fit(train, labels, sample_weight)
        predictions = model.predict_proba(test)
    elif calibration > 1:
        calibratedmodel = CalibratedClassifierCV(model, calibrationmethod, calibration)
        calibratedmodel.fit(train, labels, sample_weight)
        predictions = calibratedmodel.predict_proba(test)
    else:
        if sample_weight is None:
            sample_weight = ones(len(labels))
        print('trainrows', trainrows)
        model.fit(train[:trainrows, :], labels[:trainrows], sample_weight[:trainrows])
        calibratedmodel = CalibratedClassifierCV(model, calibrationmethod, "prefit")
        calibratedmodel.fit(train[trainrows:, :], labels[trainrows:], sample_weight=sample_weight[trainrows:])
        predictions = calibratedmodel.predict_proba(test)
    if rescale_pred:
        predictions = rescale_prior(predictions, bincount(labels))
    return predictions
def boostedTrees(train, labels, test, column_names=None, target='target', max_iterations=200, min_child_weight=5,
                 step_size=0.2, max_depth=10, class_weights=None, min_loss_reduction=0.5, verbose=0,
                 outlier_frac=0.0, outlier_method='EE', rescale_pred=False):
    """
    train, labels and test are numpy matrices containing the data;
    column_names is a list of column names of the test/train data;
    target is the column name of the labels column.
    Because this uses GraphLab rather than sklearn, calibration is not implemented
    (it is possible, but harder). Setting sample weights also seems to be unsupported by GraphLab.
    """
    if outlier_frac > 0:
        train, labels = filter_data(train, labels, cut_outlier_frac=outlier_frac, method=outlier_method, use_caching=False)  # remove outliers
    if column_names is None:
        column_names = list(range(np.shape(train)[1]))
        target = 'target'
    newTrain = np.vstack((train.T, labels)).T
    pdTrain = pd.DataFrame(newTrain, columns=np.append(column_names, target))
    trainFrame = gl.SFrame(pdTrain)
    del newTrain, pdTrain
    pdTest = pd.DataFrame(test, columns=column_names)
    testFrame = gl.SFrame(pdTest)
    del pdTest
    model = gl.boosted_trees_classifier.create(trainFrame, target=target, max_iterations=max_iterations,
        min_child_weight=min_child_weight, step_size=step_size, max_depth=max_depth,
        class_weights=class_weights, min_loss_reduction=min_loss_reduction, verbose=verbose)
    preds = model.predict_topk(testFrame, output_type='probability', k=9)
    preds['id'] = preds['id'].astype(int)
    # some hacky dataframe magic: creates an Nx10 matrix (id in the first column)
    preds = preds.unstack(['class', 'probability'], 'probs').unpack('probs', '').sort('id')
    newPreds = preds.to_dataframe().values
    newPreds = newPreds[:, 1:]  # remove the id column
    del preds, model
    assert np.shape(newPreds)[0] == np.shape(test)[0], "conversion failed somewhere, size doesn't match"
    if rescale_pred:
        newPreds = rescale_prior(newPreds, np.bincount(labels))
    return newPreds
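# The predict_topk output above is long-format (id, class, probability); the
# unstack/unpack calls pivot it to one row per id. The same reshape in plain
# pandas, as a reference for what the SFrame code produces (toy data, 2 classes):
long_preds = pd.DataFrame({'id': [0, 0, 1, 1], 'class': [1, 2, 1, 2], 'probability': [0.9, 0.1, 0.3, 0.7]})
wide = long_preds.pivot(index='id', columns='class', values='probability').values  # one row per id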
def adaBoost(train, labels, test, base_estimator=None, n_estimators=50, learning_rate=1.0, algorithm='SAMME.R',
             random_state=None, calibration=0.0, calibrationmethod='isotonic', sample_weight=None, verbose=1,
             outlier_frac=0.0, outlier_method='EE', rescale_pred=False):
    """
    Trains an AdaBoost model on the feature matrix and labels (the ground truth), then predicts
    the given test samples; the output is 9 probabilities, one for each class.

    :param train: the training data, to train the model
    :param labels: the labels of the training data, an array
    :param calibration: how much data to use for calibration. If calibration is 0, no calibration is done.
        If 0 < calibration < 1, that fraction of the data is held out for calibration; the split is not
        shuffled, so if the data is ordered, shuffle it first! If calibration is n > 1, cross-validation
        with n folds is used instead.
    :param verbose: see sklearn documentation
    """
    if outlier_frac > 0:
        train, labels = filter_data(train, labels, cut_outlier_frac=outlier_frac, method=outlier_method)  # remove outliers
    if isinstance(sample_weight, str):
        sample_weight = obtain_class_weights(labels, sample_weight)
    model = AdaBoostClassifier(base_estimator=base_estimator, n_estimators=n_estimators,
        learning_rate=learning_rate, algorithm=algorithm, random_state=random_state)
    if calibration == 0.0:
        model.fit(train, labels, sample_weight)
    elif calibration > 1:
        model = CalibratedClassifierCV(model, calibrationmethod, calibration)
        model.fit(train, labels, sample_weight)
    else:
        N = len(labels)
        if sample_weight is None:
            sample_weight = ones(N)
        train_rows = int(floor((1.0 - calibration) * N))  # floor returns a float; slicing needs an int
        model.fit(train[:train_rows, :], labels[:train_rows], sample_weight[:train_rows])
        model = CalibratedClassifierCV(model, calibrationmethod, "prefit")
        model.fit(train[train_rows:, :], labels[train_rows:], sample_weight=sample_weight[train_rows:])
    predictions = model.predict_proba(test)
    if rescale_pred:
        predictions = rescale_prior(predictions, bincount(labels))
    return predictions
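# What the 0 < calibration < 1 branch boils down to, as a standalone sketch:
# fit on the head of the data, calibrate on the held-out tail. Values are illustrative.
from numpy import floor
from sklearn.calibration import CalibratedClassifierCV
from sklearn.ensemble import AdaBoostClassifier

n_fit = int(floor(0.8 * len(labels)))  # calibration=0.2 keeps the last 20% for calibration
base = AdaBoostClassifier(n_estimators=50).fit(train[:n_fit], labels[:n_fit])
calibrated = CalibratedClassifierCV(base, method='isotonic', cv='prefit')
calibrated.fit(train[n_fit:], labels[n_fit:])
probs = calibrated.predict_proba(test)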