Example #1
def train_test_NN(train, classes, test, **parameters):

	train, classes = equalize_class_sizes(train, classes)
	train, classes = filter_data(train, classes, cut_outlier_frac = 0.06, method = 'OCSVM')  # remove outliers
	train = normalize_data(train, use_log = True)[0]  # also converts to floats
	test = normalize_data(test, use_log = True)[0]

	parameters['dense2_nonlinearity'] = parameters['dense1_nonlinearity']  # hack 1: couple layer 2 nonlinearity to layer 1
	parameters['dense2_init'] = parameters['dense1_init']  # hack 2: couple layer 2 initialization to layer 1
	net = make_net(**parameters)
	net.fit(train, classes - 1)
	return net.predict_proba(test)
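
A minimal driver for this helper might look as follows. This is a sketch, not part of the original file: it assumes the module-level loaders get_training_data and get_testing_data from the other examples, and reuses the parameter names that make_net is given in Example #4.

train_data, true_classes, feature_names = get_training_data()
test_data = get_testing_data()[0]
probabilities = train_test_NN(train_data, true_classes, test_data,
	dense1_size = 80, dense2_size = 80,
	dense1_nonlinearity = 'leaky20', dense1_init = 'orthogonal',
	dense2_nonlinearity = 'leaky20', dense2_init = 'orthogonal',  # overwritten by hack 1 / hack 2 anyway
	learning_rate = 0.001, learning_rate_scaling = 100,
	momentum = 0.9, momentum_scaling = 100,
	dropout1_rate = 0.5)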
Example #2
def train_NN(train,
             labels,
             test,
             outlier_frac=0,
             outlier_method='OCSVM',
             use_calibration=False,
             normalize_log=True,
             use_rescale_priors=False,
             extra_feature_count=0,
             extra_feature_seed=0,
             test_data_confidence=None,
             test_only=False,
             **parameters):
    """
		Train a neural network, for internal use by other functions in this file.
	"""
    train, labels = expand_from_test(train,
                                     labels,
                                     get_testing_data()[0],
                                     confidence=test_data_confidence)
    train, test = chain_feature_generators(train,
                                           labels,
                                           test,
                                           extra_features=extra_feature_count,
                                           seed=extra_feature_seed)
    train, test = conormalize_data(train, test, use_log=normalize_log)
    if outlier_frac:
        train, labels = filter_data(train,
                                    labels,
                                    cut_outlier_frac=outlier_frac,
                                    method=outlier_method)
    net = make_net(NFEATS=train.shape[1], **parameters)
    if use_calibration:
        net = CalibratedClassifierCV(net,
                                     method='sigmoid',
                                     cv=ShuffleSplit(train.shape[0],
                                                     n_iter=1,
                                                     test_size=0.2))
    if not test_only:
        net.fit(train, labels - 1)
    return net, train, test
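
train_NN returns the net (fitted, unless test_only is set) together with the preprocessed train and test matrices, so the caller runs the prediction step itself. A sketch under the same assumptions as the snippet after Example #1:

net, train_prepped, test_prepped = train_NN(train_data, true_classes, test_data,
                                            outlier_frac=0.06, outlier_method='OCSVM',
                                            use_calibration=True, normalize_log=True,
                                            dense1_size=80, dense1_nonlinearity='leaky20',
                                            dense1_init='orthogonal')
probabilities = net.predict_proba(test_prepped)  # predict on the returned, conormalized test matrix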
Example #3
def randomForest(train,
                 labels,
                 test,
                 calibration=0.0,
                 calibrationmethod='sigmoid',
                 sample_weight=None,
                 n_estimators=100,
                 criterion='gini',
                 max_features="auto",
                 max_depth=None,
                 min_samples_split=2,
                 min_samples_leaf=1,
                 min_weight_fraction_leaf=0.,
                 max_leaf_nodes=None,
                 n_jobs=1,
                 verbose=0,
                 outlier_frac=0.0,
                 outlier_method='EE',
                 rescale_pred=False,
                 class_weight=None):
    """
    Trains a model by giving it a feature matrix, as well as the labels (the ground truth)
    then using that model, predicts the given test samples
    output is 9 probabilities, one for each class
    :param train: The training data, to train the model
    :param labels: The labels of the training data, an array
    :param calibration: How much data to use for calibration. If calibration is 0, no calibration is done.
        The data is simply split, no shuffling is done, so if the data is ordered, shuffle it first!
        If calibration is n > 1, then crossvalidation will be done, using n folds.
    :param verbose: See sklearn documentation
    """
    if outlier_frac > 0:
        train, labels = filter_data(train,
                                    labels,
                                    cut_outlier_frac=outlier_frac,
                                    method=outlier_method)  # remove outliers
    if isinstance(sample_weight, str):
        sample_weight = obtain_class_weights(labels, sample_weight)

    model = RandomForestClassifier(
        n_estimators=n_estimators,
        criterion=criterion,
        max_features=max_features,
        max_depth=max_depth,
        min_samples_split=min_samples_split,
        min_samples_leaf=min_samples_leaf,
        min_weight_fraction_leaf=min_weight_fraction_leaf,
        max_leaf_nodes=max_leaf_nodes,
        n_jobs=n_jobs,
        verbose=verbose,
        class_weight=class_weight)

    if calibration == 0.0:
        model.fit(train, labels, sample_weight)
    elif calibration > 1:
        model = CalibratedClassifierCV(model, calibrationmethod, calibration)
        model.fit(train, labels, sample_weight)
    else:
        N = len(labels)
        if sample_weight is None:
            sample_weight = ones(N)
        train_rows = int(floor((1.0 - calibration) * N))  # int() because floor() returns a float and slices need integer indices
        model.fit(train[:train_rows, :], labels[:train_rows],
                  sample_weight[:train_rows])
        model = CalibratedClassifierCV(model, calibrationmethod, "prefit")
        model.fit(train[train_rows:, :],
                  labels[train_rows:],
                  sample_weight=sample_weight[train_rows:])

    predictions = model.predict_proba(test)

    if rescale_pred:
        predictions = rescale_prior(predictions, bincount(labels))
    return predictions
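
The calibration argument selects one of three code paths; the calls differ only in that value. A sketch (data from the loaders above; labels assumed to be shuffled already when a fractional split is used):

preds_plain = randomForest(train_data, true_classes, test_data, calibration=0.0)    # no calibration
preds_folds = randomForest(train_data, true_classes, test_data, calibration=5)      # 5-fold CalibratedClassifierCV
preds_split = randomForest(train_data, true_classes, test_data, calibration=0.2,    # last 20% held out and used
                           calibrationmethod='isotonic')                            # to calibrate a prefit model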
Example #4
	net = make_net(**parameters)
	net.fit(train, classes - 1)
	return net.predict_proba(test)



# one big job for size1 + size2 + dropout
# one job for nonlinearity1 + initialization1
# one job for learning rate + learning rate scaling + momentum + momentum scaling (split if too many)


name = '{0:s}.log'.format(splitext(basename(getattr(modules['__main__'], '__file__', 'optimize.default')))[0])  # automatic based on filename

train_data, true_classes, features = get_training_data()  # load the train data
train_data, true_classes = equalize_class_sizes(train_data, true_classes)
train_data, true_classes = filter_data(train_data, true_classes, cut_outlier_frac = 0.06, method = 'OCSVM')  # remove outliers
train_data = normalize_data(train_data, use_log = True)[0]  # also converts to floats
validator = SampleCrossValidator(train_data, true_classes, rounds = 3, test_frac = 0.2, use_data_frac = 1)
optimizer = ParallelGridOptimizer(train_test_func = train_test_NN, validator = validator, use_caching = True,
	name = name,                      # just choose something sensible
	dense1_size = 80,                 # [30, 25, 80, 120, 180]
	dense1_nonlinearity = ['tanh', 'sigmoid', 'rectify', 'leaky2', 'leaky20', 'softmax'],
	dense1_init = ['orthogonal', 'sparse', 'glorot_normal', 'glorot_uniform', 'he_normal', 'he_uniform'],
	dense2_size = 80,                 # [30, 25, 80, 120, 180]
	dense2_nonlinearity = 'leaky20',  # this is coupled to dense1_nonlinearity through hack#1
	dense2_init = 'orthogonal',       # idem hack2
	learning_rate = 0.001,            # [0.1, 0.01, 0.001, 0.0001]
	learning_rate_scaling = 100,      # [1, 10, 100]
	momentum = 0.9,                   # [0, 0.9, 0.99]
	momentum_scaling = 100,           # [1, 10, 100]
	dropout1_rate = 0.5,              # [0, 0.5]
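
The listing above is truncated mid-call. The coupling comments refer to hack 1 and hack 2 in Example #1: dense2_nonlinearity and dense2_init are copied from their dense1 counterparts inside train_test_NN, so only the dense1 settings are actually varied. A quick count of the resulting grid (plain Python, for illustration):

dense1_nonlinearities = ['tanh', 'sigmoid', 'rectify', 'leaky2', 'leaky20', 'softmax']
dense1_inits = ['orthogonal', 'sparse', 'glorot_normal', 'glorot_uniform', 'he_normal', 'he_uniform']
print(len(dense1_nonlinearities) * len(dense1_inits))  # 36 combinations per round, instead of 36 * 36 without the coupling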
Example #5
def svm(train,
        labels,
        test,
        C=10,
        kernel='rbf',
        degree=3,
        gamma=0.5,
        calibration=0.0,
        calibrationmethod='sigmoid',
        coef0=0.0,
        probability=True,
        shrinking=True,
        tol=1e-3,
        verbose=0,
        outlier_frac=0.0,
        outlier_method='EE',
        rescale_pred=False,
        class_weight=None,
        sample_weight=None,
        rescale=True):
    """
    Trains a model by giving it a feature matrix, as well as the labels (the ground truth)
    then using that model, predicts the given test samples
    output is 9 probabilities, one for each class
    :param train: The training data, to train the model
    :param labels: The labels of the training data, an array
    :param C: trades off misclassification of training examples against simplicity of the decision surface
              low C makes the decision surface smooth, while a high C aims at classifying all training examples correctly 
    :param gamma: parameter defines how far the influence of a single training example reaches
                  low values meaning ‘far’ and high values meaning ‘close’. 
    :param verbose: See sklearn documentation
    :param rescale: both the training and testing data are taken square root of, rescaled to unit variance, and moved to interval [0,1]
    """
    if outlier_frac > 0:
        train, labels = filter_data(train,
                                    labels,
                                    cut_outlier_frac=outlier_frac,
                                    method=outlier_method)  # remove outliers
    if isinstance(sample_weight, str):
        sample_weight = obtain_class_weights(labels, sample_weight)

    if rescale:  # take square root, rescale variance to unit, rescale to [0, 1]
        # this should preserve the sparsity of the matrix
        train = sqrt(train)
        test = sqrt(test)
        scaler = StandardScaler(with_mean=False, with_std=True, copy=True)
        train = scaler.fit_transform(train)
        scaler = StandardScaler(with_mean=False, with_std=True, copy=True)
        test = scaler.fit_transform(test)
        scaler = MinMaxScaler()
        train = scaler.fit_transform(train)
        scaler = MinMaxScaler()
        test = scaler.fit_transform(test)

    model = SVC(C=C,
                kernel=kernel,
                degree=degree,
                gamma=gamma,
                coef0=coef0,
                probability=probability,
                shrinking=shrinking,
                tol=tol,
                verbose=verbose,
                class_weight=class_weight)

    if calibration == 0.0:
        model.fit(train, labels, sample_weight)
    elif calibration > 1:
        model = CalibratedClassifierCV(model, calibrationmethod, calibration)
        model.fit(train, labels, sample_weight)
    else:
        N = len(labels)
        if sample_weight is None:
            sample_weight = ones(N)
        train_rows = int(floor((1.0 - calibration) * N))  # int() because floor() returns a float and slices need integer indices
        model.fit(train[:train_rows, :], labels[:train_rows],
                  sample_weight[:train_rows])
        model = CalibratedClassifierCV(model, calibrationmethod, "prefit")
        model.fit(train[train_rows:, :],
                  labels[train_rows:],
                  sample_weight=sample_weight[train_rows:])

    predictions = model.predict_proba(test)

    if rescale_pred:
        predictions = rescale_prior(predictions, bincount(labels))
    return predictions
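
A minimal call, again assuming the loaders from the earlier examples; probability=True has to stay enabled because the function returns predict_proba output:

probabilities = svm(train_data, true_classes, test_data,
                    C=10, kernel='rbf', gamma=0.5,
                    rescale=True,                      # sqrt + unit variance + [0, 1], as documented above
                    outlier_frac=0.06, outlier_method='EE')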
Example #6
def AdaBoost(train,
			labels,
			test,
			calibration=0.0,
			calibrationmethod='sigmoid',
			base_estimator=None,  # None uses sklearn's default depth-1 DecisionTreeClassifier; passing the name as a string would fail at fit time
			n_estimators=50,
			learning_rate=1.0,
			algorithm='SAMME',
			random_state=None,
			sample_weight=None,
			outlier_frac=False,
			outlier_method='EE',
			undersample=False,
			rescale_pred=False,
			verbose=0,
			class_weight=None):
	"""
	Trains a model on a feature matrix and its labels (the ground truth),
	then uses that model to predict the given test samples.
	The output is 9 probabilities per sample, one for each class.
	:param train: The training data, to train the model
	:param labels: The labels of the training data, an array
	:param test: The data to predict
	:param n_estimators: See sklearn documentation
	:param calibration: How much data to use for calibration. If calibration is False (including 0.0), no calibration is done.
		The data is simply split, no shuffling is done, so if the data is ordered, shuffle it first!
		If calibration is n > 1, then cross-validation will be done, using n folds.
	:param verbose: See sklearn documentation
	"""

	if outlier_frac:
		train, labels = filter_data(train, labels, cut_outlier_frac = outlier_frac, method = outlier_method) #remove outliers
	if undersample:
		train, labels = equalize_class_sizes(train, labels)
	if isinstance(sample_weight, str):
		sample_weight = obtain_class_weights(labels, sample_weight)

	N = len(labels)
	trainrows = int((1.0 - calibration) * N)

	model = AdaBoostClassifier(base_estimator = base_estimator,
							   n_estimators=n_estimators,
							   learning_rate=learning_rate,
							   algorithm=algorithm,
							   random_state=random_state)
	if not calibration:
		model.fit(train, labels, sample_weight)
		predictions = model.predict_proba(test)
	elif calibration > 1:
		calibratedmodel = CalibratedClassifierCV(model, calibrationmethod, calibration)
		calibratedmodel.fit(train, labels, sample_weight)
		predictions = calibratedmodel.predict_proba(test)
	else:
		if sample_weight is None:
			sample_weight = ones((len(labels)))
		print('trainrows', trainrows)
		model.fit(train[:trainrows, :], labels[:trainrows],sample_weight[:trainrows])
		calibratedmodel = CalibratedClassifierCV(model, calibrationmethod, "prefit")
		calibratedmodel.fit(train[trainrows:,:], labels[trainrows:], sample_weight = sample_weight[trainrows:])
		predictions = calibratedmodel.predict_proba(test)

	if rescale_pred:
		predictions = rescale_prior(predictions, bincount(labels))
	return predictions
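
Because the fractional-calibration path splits the data without shuffling, the caller is expected to shuffle first, as the docstring warns. A sketch with numpy (assuming the loaders return numpy arrays):

from numpy.random import permutation

order = permutation(len(true_classes))
shuffled_train, shuffled_labels = train_data[order, :], true_classes[order]
probabilities = AdaBoost(shuffled_train, shuffled_labels, test_data,
                         calibration=0.25, calibrationmethod='sigmoid',
                         n_estimators=50, algorithm='SAMME')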
Example #7
def boostedTrees(train,
                 labels,
                 test,
                 column_names=None,
                 target='target',
                 max_iterations=200,
                 min_child_weight=5,
                 step_size=0.2,
                 max_depth=10,
                 class_weights=None,
                 min_loss_reduction=0.5,
                 verbose=0,
                 outlier_frac=0.0,
                 outlier_method='EE',
                 rescale_pred=False):
    """
    train, labels, test are numpy matrices containing tha data 
    column_names is a list of column names of the test/train data
    target is the column name of the labels column
    Because it's graphlab and not sklearn, the calibration is not implemented (it's possible, but harder)
    Also, seemingly, setting sample weights is also not supported by graphlab
    """
    if outlier_frac > 0:
        train, labels = filter_data(train,
                                    labels,
                                    cut_outlier_frac=outlier_frac,
                                    method=outlier_method,
                                    use_caching=False)  # remove outliers
    if column_names is None:
        column_names = list(range(np.shape(train)[1]))
    newTrain = np.vstack((train.T, labels)).T
    pdTrain = pd.DataFrame(newTrain, columns=np.append(column_names, target))
    trainFrame = gl.SFrame(pdTrain)
    del newTrain, pdTrain
    pdTest = pd.DataFrame(test, columns=column_names)
    testFrame = gl.SFrame(pdTest)
    del pdTest
    model = gl.boosted_trees_classifier.create(
        trainFrame,
        target=target,
        max_iterations=max_iterations,
        min_child_weight=min_child_weight,
        step_size=step_size,
        max_depth=max_depth,
        class_weights=class_weights,
        min_loss_reduction=min_loss_reduction,
        verbose=verbose)
    preds = model.predict_topk(testFrame, output_type='probability', k=9)
    preds['id'] = preds['id'].astype(int)
    # some hacky dataframe magic; creates an Nx10 matrix (id in the first column)
    preds = preds.unstack(['class', 'probability'], 'probs').unpack('probs', '').sort('id')

    newPreds = preds.to_dataframe().values
    newPreds = newPreds[:, 1:]  # remove the id column
    del preds, model

    assert np.shape(newPreds)[0] == np.shape(test)[0], \
        "conversion failed somewhere, size doesn't match"

    if rescale_pred:
        newPreds = rescale_prior(newPreds, np.bincount(labels))
    return newPreds
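
boostedTrees takes plain numpy matrices plus optional column names. A sketch of a call that reuses the feature names returned as the third value of get_training_data (as in Example #4):

train_data, true_classes, feature_names = get_training_data()
probabilities = boostedTrees(train_data, true_classes, test_data,
                             column_names=list(feature_names),
                             max_iterations=200, max_depth=10,
                             rescale_pred=True)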
Example #8
def adaBoost(train,
             labels,
             test,
             base_estimator=None,
             n_estimators=50,
             learning_rate=1.0,
             algorithm='SAMME.R',
             random_state=None,
             calibration=0.0,
             calibrationmethod='isotonic',
             sample_weight=None,
             verbose=1,
             outlier_frac=0.0,
             outlier_method='EE',
             rescale_pred=False):
    """
    Trains a model by giving it a feature matrix, as well as the labels (the ground truth)
    then using that model, predicts the given test samples
    output is 9 probabilities, one for each class
    :param train: The training data, to train the model
    :param labels: The labels of the training data, an array
    :param calibration: How much data to use for calibration. If calibration is 0, no calibration is done.
        The data is simply split, no shuffling is done, so if the data is ordered, shuffle it first!
        If calibration is n > 1, then crossvalidation will be done, using n folds.
    :param verbose: See sklearn documentation
    """
    if outlier_frac > 0:
        train, labels = filter_data(train,
                                    labels,
                                    cut_outlier_frac=outlier_frac,
                                    method=outlier_method)  # remove outliers
    if isinstance(sample_weight, str):
        sample_weight = obtain_class_weights(labels, sample_weight)

    model = AdaBoostClassifier(base_estimator=base_estimator,
                               n_estimators=n_estimators,
                               learning_rate=learning_rate,
                               algorithm=algorithm,
                               random_state=random_state)

    if calibration == 0.0:
        model.fit(train, labels, sample_weight)
    elif calibration > 1:
        model = CalibratedClassifierCV(model, calibrationmethod, calibration)
        model.fit(train, labels, sample_weight)
    else:
        N = len(labels)
        if sample_weight is None:
            sample_weight = ones(N)
        train_rows = int(floor((1.0 - calibration) * N))  # int() because floor() returns a float and slices need integer indices
        model.fit(train[:train_rows, :], labels[:train_rows],
                  sample_weight[:train_rows])
        model = CalibratedClassifierCV(model, calibrationmethod, "prefit")
        model.fit(train[train_rows:, :],
                  labels[train_rows:],
                  sample_weight=sample_weight[train_rows:])

    predictions = model.predict_proba(test)

    if rescale_pred:
        predictions = rescale_prior(predictions, bincount(labels))
    return predictions
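
The else branch above is the hold-out calibration pattern; written out on its own it amounts to sklearn's CalibratedClassifierCV with cv='prefit'. A sketch with calibration=0.2, i.e. the last 20% of the rows reserved for calibration:

from sklearn.calibration import CalibratedClassifierCV
from sklearn.ensemble import AdaBoostClassifier

split = int(0.8 * len(true_classes))                       # first 80% trains the base model
base = AdaBoostClassifier(n_estimators=50).fit(train_data[:split], true_classes[:split])
calibrated = CalibratedClassifierCV(base, method='isotonic', cv='prefit')
calibrated.fit(train_data[split:], true_classes[split:])   # remaining 20% only calibrates the probabilities
probabilities = calibrated.predict_proba(test_data)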