def objective(parameters):
    i.append(0)  # the enclosing list doubles as an iteration counter: len(i) is the iteration number
    set_weights(model, parameters)
    p = model.predict(X, batch_size=256, verbose=0)[:, 1]
    auc = roc_auc_truncated(y, p)

    pa = model.predict(Xa, batch_size=256, verbose=0)[:, 1]
    ks = compute_ks(pa[ya == 0], pa[ya == 1], wa[ya == 0], wa[ya == 1])

    pc = model.predict(Xc, batch_size=256, verbose=0)[:, 1]
    cvm = compute_cvm(pc, mc)

    ks_importance = 1  # relative KS importance
    ks_target = ks_threshold
    cvm_importance = 1  # relative CVM importance
    cvm_target = cvm_threshold

    alpha = 0.001  # leaky-ReLU-style slope: keep a small gradient even when a metric is under its target
    ks_loss = (1 if ks > ks_target else alpha) * (ks - ks_target)
    cvm_loss = (1 if cvm > cvm_target else alpha) * (cvm - cvm_target)
    loss = -auc + ks_importance * ks_loss + cvm_importance * cvm_loss

    if ks < ks_threshold and cvm < cvm_threshold and auc > auc_log[0]:
        d.append(0)  # d counts saved models; len(d) numbers the dumped file
        dump_transductor_model(model, transductor_model_file.format(len(d)))
        auc_log.pop()  # auc_log is a one-element list holding the best AUC so far
        auc_log.append(auc)
        message = "iteration {:7}: Best AUC={:7.5f} achieved, KS={:7.5f}, CVM={:7.5f}".format(len(i), auc, ks, cvm)
        logger.info(message)

    if verbose:
        print("iteration {:7}: AUC: {:7.5f}, KS: {:7.5f}, CVM: {:7.5f}, loss: {:8.5f}".format(
            len(i), auc, ks, cvm, loss))
    return loss
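# Hedged sketch (not the author's code): objective() treats the network weights as one
# flat black-box vector, so any derivative-free optimizer can minimize it. A
# get_weights(model) helper returning a vector compatible with set_weights is assumed.
from scipy.optimize import minimize

x0 = get_weights(model)  # start from the pretrained weights
result = minimize(objective, x0, method='Powell', options={'maxiter': 5000})
set_weights(model, result.x)  # keep the best weights found
print('Best loss found:', result.fun)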
Example no. 2
def cv_model(model_list):
	print "generating cv csv files...."
	train, test = gen_data()
	label = train['signal']
	train_id = train.id
	test_id = test.id

	train_del, test_del = delete_features(train), delete_features(test)

	check_agreement = pd.read_csv('../data/check_agreement.csv')
	check_correlation = pd.read_csv('../data/check_correlation.csv')
	check_agreement = add_features(check_agreement)
	check_correlation = add_features(check_correlation)

	X, X_test = train_del.as_matrix(), test_del.as_matrix()
	print X.shape, X_test.shape

	kf = KFold(label, n_folds=4)
	for j, (clf, clf_name) in enumerate(model_list):
		
		print "modelling model %i ...."%j
		cv_train = np.zeros(len(label))
		for i, (train_fold, validate) in enumerate(kf):
			X_train, X_validate, label_train, label_validate = X[train_fold,:], X[validate,:], label[train_fold], label[validate]
			clf.fit(X_train,label_train)
			cv_train[validate] = clf.predict_proba(X_validate)[:,1]
		auc_score = evaluation.roc_auc_truncated(label[train['min_ANNmuon'] > 0.4], 
			pd.Series(cv_train)[train['min_ANNmuon'] > 0.4])
		print "the true roc_auc_truncated is %.6f"%auc_score

		clf.fit(X, label)
		test_probs = clf.predict_proba(X_test)[:,1]
		# check if it passes the tests
		print "check if it passes the tests"
		agreement_probs = clf.predict_proba(delete_features(check_agreement).as_matrix())[:,1]
		ks = evaluation.compute_ks(
			agreement_probs[check_agreement['signal'].values == 0],
			agreement_probs[check_agreement['signal'].values == 1],
			check_agreement[check_agreement['signal'] == 0]['weight'].values,
			check_agreement[check_agreement['signal'] == 1]['weight'].values)
		print ('KS metric', ks, ks <= 0.09)

		correlation_probs = clf.predict_proba(delete_features(check_correlation).as_matrix())[:,1]
		print ('Checking correlation...')
		cvm = evaluation.compute_cvm(correlation_probs, check_correlation['mass'])
		print ('CvM metric', cvm, cvm <= 0.002)
		#if ks <= 0.09 and cvm <= 0.002 and auc_score > 0.975: # no need to check here
		if auc_score > 0.965: # the minimum threshold
			# save the cv
			cv_sub = pd.DataFrame({"id": train_id, "prediction": cv_train, "label": label})
			cv_sub.to_csv("../data/cv_folder/xgb%i.csv"%j, index=False)
			# save the prediction
			submission = pd.DataFrame({"id": test_id, "prediction": test_probs})
			submission.to_csv("../data/pred_folder/xgb%i.csv"%j, index=False)
			# save agreement
			submission = pd.DataFrame({"id": check_agreement['id'], "prediction": agreement_probs})
			submission.to_csv("../data/agree_folder/xgb%i.csv"%j, index=False)
			# save correlation
			submission = pd.DataFrame({"id": check_correlation['id'], "prediction": correlation_probs})
			submission.to_csv("../data/correlation_folder/xgb%i.csv"%j, index=False)
    def auc_func(weights):
        final_prediction = 0
        for weight, prediction in zip(weights, predictions):
            final_prediction += weight*prediction

        # final_prediction = map(lambda x: 1 if x > 0.5 else 0, final_prediction)
        # return -1.0 * accuracy_score(test_y, final_prediction)
        return -1.0 * evaluation.roc_auc_truncated(test_y, final_prediction)
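    # Hedged sketch of the usual pattern around such an objective: search for blending
    # weights that maximize the truncated AUC. SLSQP, the sum-to-one constraint and the
    # equal starting weights are assumptions, not the author's code.
    from scipy.optimize import minimize
    import numpy as np

    start = np.ones(len(predictions)) / len(predictions)  # equal weights to begin with
    cons = ({'type': 'eq', 'fun': lambda w: 1.0 - np.sum(w)},)
    bounds = [(0.0, 1.0)] * len(predictions)
    res = minimize(auc_func, start, method='SLSQP', bounds=bounds, constraints=cons)
    print('Best weights:', res.x, 'best truncated AUC:', -res.fun)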
def cv_loop(X, y, model, N):
    mean_auc_truncated = 0.
    for i in range(N):
        X_train, X_cv, y_train, y_cv = cross_validation.train_test_split(
                                       X, y, test_size=.02, 
                                       random_state = i*SEED)
        model.fit(X_train, y_train)
        preds = model.predict_proba(X_cv)[:,1]
        auc = evaluation.roc_auc_truncated(y_cv, preds)
        print "AUC (fold %d/%d): %f" % (i + 1, N, auc)
        mean_auc_truncated += auc
    return mean_auc_truncated/N
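# Hedged usage sketch: SEED is assumed to be defined at module level in the original script.
from sklearn.linear_model import LogisticRegression

SEED = 42  # assumption
mean_auc = cv_loop(X, y, LogisticRegression(C=1.0), N=10)
print("Mean truncated AUC over 10 splits: %f" % mean_auc)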
Example no. 6
            "colsample_bytree": 0.7,
            "seed": 1,
            "eval_metric": "auc",
            "nthread": 2,
        }
        n_rounds = 4000  # Just a big number to trigger early stopping and best iteration

        # Train
        xgb_model = xgb.train(
            params, xg_train, n_rounds, [(xg_train, "train"), (xg_test, "test")], early_stopping_rounds=40
        )
        # Predict
        predictions = xgb_model.predict(xg_test)

        # Compute weighted AUC
        AUC = evaluation.roc_auc_truncated(test_y, predictions)
        errors.append(AUC)
        print "AUC", AUC

        # Save best iteration
        best_iterations.append(xgb_model.best_iteration)

    # Append new grid error
    grid_errors.append(np.mean(errors))
    grid_best_iterations.append(list(best_iterations))

# Show results
for i in xrange(len(params_space)):
    print "Params: %s, weighted AUC: %f, best iterations: %s, mean: %f" % (
        str(params_space[i]),
        grid_errors[i],
Example no. 8
	def search(self):
		'''
		main function to perform searching...
		'''
		# initial status
		current_cv = self.init_cv
		current_pred = self.init_pred
		current_agree = self.init_agree
		current_corr = self.init_correlation
		train = pd.read_csv("../data/training.csv") # for cross validation purposes
		label = train['signal']

		current_ks = self.check_agreement_func(self.init_agree)
		current_cvm = self.check_corr_func(self.init_correlation)
		current_auc = evaluation.roc_auc_truncated(label[train['min_ANNmuon'] > 0.4],
					pd.Series(current_cv)[train['min_ANNmuon'] > 0.4])
		print "the initial test results..."
		print ('KS metric',current_ks, current_ks <= 0.09)
		print ('Cvm metric',current_cvm, current_cvm <= 0.002)
		
		# start searching
		num_round = 0
		while current_ks > 0.09 or current_cvm > 0.002:
			num_round += 1
			print "doing round %i..."%num_round
			if num_round > 10:
				print "finished all the rounds and can't find a solution..."
				break
			random_files = random.sample(self.files, len(self.files)) # add some randomness
			for f in random_files:
				# read all the files
				tmp_cv = pd.read_csv(self.cv_folder + f)
				tmp_pred = pd.read_csv(self.pred_folder + f)
				tmp_agree = pd.read_csv(self.agree_folder + f)
				tmp_corr = pd.read_csv(self.correlation_folder + f)

				tmp_agree_average = (tmp_agree['prediction'].values + current_agree) * 0.5
				tmp_corr_average = (tmp_corr['prediction'].values + current_corr) * 0.5
				tmp_cv_average = (tmp_cv['prediction'].values + current_cv) * 0.5
				tmp_auc = evaluation.roc_auc_truncated(label[train['min_ANNmuon'] > 0.4],
					pd.Series(tmp_cv_average)[train['min_ANNmuon'] > 0.4])  # score the candidate average, not the current blend
				if self.check_agreement_func(tmp_agree_average) < current_ks and self.check_corr_func(tmp_corr_average) <= 0.002:
					# update them
					current_ks = self.check_agreement_func(tmp_agree_average)
					current_cvm = self.check_corr_func(tmp_corr_average)
					current_cv = tmp_cv_average
					current_pred = (tmp_pred['prediction'].values + current_pred) * 0.5
					current_agree,current_corr = tmp_agree_average, tmp_corr_average
					print "find a reduced ks score %.3f..."%current_ks
					current_auc = tmp_auc
					print "the corresponding AUC score is %.5f"%current_auc
					if current_ks <= 0.09:
						print "found one that passes the test, and now start to optimize the AUC"
						print "doing 2 rounds of AUC optimization..."
						n_r = 0
						while n_r < 2:
							n_r += 1
							print n_r
							for e2, f2 in enumerate(random_files):
								# read all the files
								tmp_cv2 = pd.read_csv(self.cv_folder + f2)
								tmp_pred2 = pd.read_csv(self.pred_folder + f2)
								tmp_agree2 = pd.read_csv(self.agree_folder + f2)
								tmp_corr2 = pd.read_csv(self.correlation_folder + f2)

								tmp_agree_average2 = (tmp_agree2['prediction'].values + current_agree) * 0.5
								tmp_corr_average2 = (tmp_corr2['prediction'].values + current_corr) * 0.5
								tmp_cv_average2 = (tmp_cv2['prediction'].values + current_cv) * 0.5
								tmp_auc2 = evaluation.roc_auc_truncated(label[train['min_ANNmuon'] > 0.4], 
									pd.Series(tmp_cv_average2)[train['min_ANNmuon'] > 0.4])
								if self.check_agreement_func(tmp_agree_average2) <= 0.09 and self.check_corr_func(tmp_corr_average2) <= 0.002 and \
								tmp_auc2 > current_auc:
									# update them
									current_ks = self.check_agreement_func(tmp_agree_average2)
									current_cvm = self.check_corr_func(tmp_corr_average2)
									current_cv = tmp_cv_average2
									current_pred = (tmp_pred2['prediction'].values + current_pred) * 0.5
									current_agree,current_corr = tmp_agree_average2, tmp_corr_average2
									print "current ks score %.3f..."%current_ks
									current_auc = tmp_auc2
									print "the corresponding AUC score is %.5f"%current_auc
						print "Yeah! We've found one good prediction!"
						submission = pd.DataFrame({"id": tmp_pred['id'], "prediction": current_pred})
						submission.to_csv("../submissions/xgb_search_%.4f.csv"%current_auc, index=False)
						break
Example no. 10
def stacked_models(train, features, test, in_sample=True):
    """
    Build stacked generalization models, set in_sample to False
    to predict on test set.
    """

    if in_sample:

        np.random.seed(1)
        new_indices = np.asarray(train.index.copy())
        np.random.shuffle(new_indices)

        train = train.iloc[new_indices].reset_index(drop=True).copy()

        # not used in CV testing..
        del test

        cutoff = int(new_indices.shape[0] * 0.75)

        X_dev = train[:cutoff].reset_index(drop=True).copy()
        Y_dev = train[:cutoff]['signal'].reset_index(drop=True).copy()

        X_test = train[cutoff:][
            train[cutoff:]['min_ANNmuon'] > 0.4].reset_index(drop=True).copy()
        Y_test = train[cutoff:][
            train[cutoff:]['min_ANNmuon'] > 0.4]['signal'].reset_index(
                drop=True).copy()

    else:
        np.random.seed(1)
        new_indices = np.asarray(train.index.copy())
        np.random.shuffle(new_indices)

        train = train.iloc[new_indices].reset_index(drop=True).copy()

        X_dev = train.reset_index(drop=True).copy()
        Y_dev = train['signal'].reset_index(drop=True).copy()

        X_test = test.reset_index(drop=True).copy()
        Y_test = None

    n_folds = 5

    # put your parameter-tuned classifiers in this list.

    clfs = [
        RandomForestClassifier(n_estimators=200,
                               criterion='entropy',
                               random_state=20,
                               n_jobs=-1),
        RandomForestClassifier(n_estimators=200,
                               criterion='entropy',
                               random_state=20,
                               n_jobs=-1,
                               max_depth=6),
        ExtraTreesClassifier(n_estimators=200,
                             criterion='entropy',
                             random_state=50,
                             n_jobs=-1),
        ExtraTreesClassifier(n_estimators=200,
                             criterion='entropy',
                             random_state=50,
                             n_jobs=-1,
                             max_depth=6),
        Pipeline([('scaler', StandardScaler()), ('lr', LogisticRegression())]),
        UGradientBoostingClassifier(loss=BinFlatnessLossFunction(
            ['mass'], n_bins=15, uniform_label=0),
                                    n_estimators=150,
                                    subsample=0.1,
                                    max_depth=6,
                                    min_samples_leaf=10,
                                    learning_rate=0.1,
                                    train_features=features,
                                    random_state=11),
        UGradientBoostingClassifier(loss=KnnFlatnessLossFunction(
            ['mass'], n_neighbours=30, uniform_label=0),
                                    n_estimators=150,
                                    subsample=0.1,
                                    max_depth=6,
                                    min_samples_leaf=10,
                                    learning_rate=0.1,
                                    train_features=features,
                                    random_state=11),
        UGradientBoostingClassifier(loss=BinFlatnessLossFunction(
            ['mass'], n_bins=15, uniform_label=0),
                                    n_estimators=100,
                                    subsample=0.8,
                                    max_depth=6,
                                    min_samples_leaf=10,
                                    learning_rate=0.1,
                                    train_features=features,
                                    random_state=11),
        UGradientBoostingClassifier(loss=KnnFlatnessLossFunction(
            ['mass'], n_neighbours=30, uniform_label=0),
                                    n_estimators=100,
                                    subsample=0.8,
                                    max_depth=6,
                                    min_samples_leaf=10,
                                    learning_rate=0.1,
                                    train_features=features,
                                    random_state=11),
        XGBoostClassifier(eval_metric='auc',
                          objective='binary:logistic',
                          num_class=2,
                          nthread=4,
                          silent=1,
                          colsample_bytree=0.6,
                          eta=0.005,
                          max_depth=6,
                          min_child_weight=13,
                          seed=1337,
                          subsample=0.7),
        NN1(len(features)),
        NN2(len(features)),
        NN3(len(features)),
        NN4(len(features))
    ]

    skf = list(StratifiedKFold(Y_dev, n_folds))

    # Number of training data x Number of classifiers
    blend_train = np.zeros((X_dev.shape[0], len(clfs)))
    # Number of testing data x Number of classifiers
    blend_test = np.zeros((X_test.shape[0], len(clfs)))

    print 'X_test.shape = %s' % (str(X_test.shape))
    print 'blend_train.shape = %s' % (str(blend_train.shape))
    print 'blend_test.shape = %s' % (str(blend_test.shape))

    # For each classifier, we train the number of fold times (=len(skf))
    for j, clf in enumerate(clfs):
        print 'Training classifier [%s]' % (j)
        # Number of testing data x Number of folds , we will take the mean of
        # the predictions later
        blend_test_j = np.zeros((X_test.shape[0], len(skf)))
        for i, (train_index, cv_index) in enumerate(skf):
            print 'Fold [%s]' % (i)

            # This is the training and validation set
            X_train = X_dev.iloc[train_index].copy()
            Y_train = Y_dev.iloc[train_index].copy()
            X_cv = X_dev.iloc[cv_index].copy()
            Y_cv = Y_dev.iloc[cv_index].copy()

            # handle the case of hep.ml stuff
            if isinstance(clf, UGradientBoostingClassifier):
                clf.fit(X_train[features + ['mass']],
                        Y_train.values.astype(np.int32))
            else:
                clf.fit(X_train[features], Y_train.values.astype(np.int32))

            # This output will be the basis for our blended classifier to train against,
            # which is also the output of our classifiers
            blend_train[cv_index, j] = clf.predict_proba(X_cv[features])[:, 1]
            blend_test_j[:, i] = clf.predict_proba(X_test[features])[:, 1]
        # Take the mean of the predictions of the cross validation set
        blend_test[:, j] = blend_test_j.mean(1)

    print 'Y_dev.shape = %s' % (str(Y_dev.shape))

    # blend with LR...
    bclf = LogisticRegression()
    bclf.fit(blend_train, Y_dev)

    bclf2 = GradientBoostingClassifier(n_estimators=150,
                                       learning_rate=0.02,
                                       max_depth=4,
                                       subsample=0.9,
                                       verbose=3,
                                       random_state=1337)
    bclf2.fit(blend_train, Y_dev)

    bclf3 = NeuralNet(
        layers=[('input', layers.InputLayer), ('hidden', layers.DenseLayer),
                ('output', layers.DenseLayer)],

        # layer parameters:
        input_shape=(None, blend_train.shape[1]),
        hidden_num_units=blend_train.shape[1],
        output_nonlinearity=nonlinearities.softmax,  # output layer uses softmax
        output_num_units=2,  # 2 target values

        # optimization method:
        update=nesterov_momentum,
        update_learning_rate=0.01,
        update_momentum=0.9,
        regression=False,  # classification, not regression
        max_epochs=53,  # TRY 50 and 46 epochs!
        verbose=1,
        eval_size=0.10)

    bclf3.fit(blend_train.astype(np.float32), Y_dev.astype(np.int32))

    bclf4 = AdaBoostClassifier(n_estimators=400, random_state=88)
    bclf4.fit(blend_train, Y_dev)

    # Predict now
    Y_test_predict = bclf.predict_proba(blend_test)[:, 1]
    Y_test_predict2 = bclf2.predict_proba(blend_test)[:, 1]
    Y_test_predict3 = bclf3.predict_proba(blend_test.astype(np.float32))[:, 1]
    Y_test_predict4 = bclf4.predict_proba(blend_test)[:, 1]

    print 'Logit Coefs:', bclf.coef_
    if in_sample:
        score = evaluation.roc_auc_truncated(Y_test, Y_test_predict)
        score2 = evaluation.roc_auc_truncated(Y_test, Y_test_predict2)
        score3 = evaluation.roc_auc_truncated(Y_test, blend_test.mean(1))
        score4 = evaluation.roc_auc_truncated(
            Y_test, scipy_opt(blend_train, Y_dev, blend_test))
        score5 = evaluation.roc_auc_truncated(
            Y_test, (Y_test_predict + Y_test_predict2) / 2.0)
        score6 = evaluation.roc_auc_truncated(Y_test, Y_test_predict3)
        score7 = evaluation.roc_auc_truncated(
            Y_test, (Y_test_predict + Y_test_predict2 + Y_test_predict3) / 3.0)
        score8 = evaluation.roc_auc_truncated(Y_test, Y_test_predict4)
        score9 = evaluation.roc_auc_truncated(
            Y_test,
            (Y_test_predict2 + Y_test_predict3 + Y_test_predict4) / 3.0)
        score10 = evaluation.roc_auc_truncated(
            Y_test, (Y_test_predict + Y_test_predict2 + Y_test_predict3 +
                     Y_test_predict4) / 4.0)

        print 'LR Score = %s' % (score)
        print 'GB Score = %s' % (score2)
        print 'MEAN Score = %s' % (score3)
        print 'Scipy Score = %s' % (score4)
        print 'LR + GB score = %s' % (score5)
        print 'ANN Score= %s' % (score6)
        print 'LR + GB + ANN Score = %s' % (score7)
        print 'ADA Score = %s' % (score8)
        print 'GB + ANN + ADA Score = %s' % (score9)
        print 'LR + GB + ANN + ADA Score = %s' % (score10)
        return blend_train, Y_dev, blend_test, Y_test

    # average of ADA, ANN and GBM.
    return (Y_test_predict + Y_test_predict2 + Y_test_predict3 +
            Y_test_predict4) / 4.0
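# Hedged usage sketch: evaluate the stack in-sample first, then refit on everything
# for a test-set prediction; "id" as the test id column is an assumption.
blend_train, Y_dev, blend_test, Y_test = stacked_models(train, features, test, in_sample=True)
final_pred = stacked_models(train, features, test, in_sample=False)
submission = pd.DataFrame({"id": test["id"], "prediction": final_pred})
submission.to_csv("stacked_submission.csv", index=False)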
Example no. 11

    #clf2 = SVC(kernel='linear', C=1)


    #X_train, X_test, y_train, y_test = train_test_split(train[variables], train['signal'], test_size=0.4, random_state=0)

    clf1.fit(X_train, y_train)

    #X_test_eval = X_test[X_test['min_ANNmuon'] > 0.4]
    #y_test_eval = y_test[X_test['min_ANNmuon'] > 0.4]


    
    train_probs = clf1.predict_proba(X_test)[:, 1]
    AUC = evaluation.roc_auc_truncated(y_test, train_probs)
    print('AUC', AUC)
    results.append(AUC)


#print(train_probs)


#cross_val_score(clf1, train, train['signal'], cv=10)



# Check agreement test
evaluation.check_agreement(baseline)

# Check correlation test
Example no. 12
    def score(self, X, y):
        Y = self.predict_proba(X)
        return evaluation.roc_auc_truncated(y, Y[:, 1])
def AUC(model, dataX, dataY, AUCindex):
    # dataX/dataY look like theano shared variables; model.predict() returns a compiled function
    X_eval = np.asarray(dataX.eval())[AUCindex]
    y_eval = np.asarray(dataY.eval())[AUCindex]
    preds = np.asarray(model.predict()(X_eval)).T[0]
    return evaluation.roc_auc_truncated(y_eval, preds)
        return loss
    return objective
    
    
Xt, yt, _, _ = load(pt.training_file, pt.train_prediction_file)    # shuffled
Xa, ya, wa, _ = load(pt.check_agreement_file, pt.check_agreement_prediction_file,
                     tail=len(yt), weight=True)
Xc, yc, _, mc = load(pt.check_correlation_file, pt.check_correlation_prediction_file,
                     mass=True)
Xt, scaler = preprocess_data(Xt)
Xa = preprocess_data(Xa, scaler)[0]
Xc = preprocess_data(Xc, scaler)[0]
with open(pt.transductor_scaler_file, 'wb') as fid:
    cPickle.dump(scaler, fid)   

AUC = roc_auc_truncated(yt, Xt[:, -1])
print('AUC before transductor', AUC)

model = create_model(Xt.shape[1])

pretrain = True
if pretrain:
    # pretrain model
    print("Pretrain model")
    yt_categorical = np_utils.to_categorical(yt, nb_classes=2)
    model.fit(Xt, yt_categorical, batch_size=64, nb_epoch=1,
              validation_data=None, verbose=2, show_accuracy=True)
    print("Save pretrained model")
    with open(pt.transductor_pretrained_model_file, 'wb') as fid:
        cPickle.dump(model, fid)
else:
def evalerror(preds, dtrain):
    labels = dtrain.get_label()
    # return a pair metric_name, result
    # since preds are margin(before logistic transformation, cutoff at 0)
    return 'truncated AUC', -evaluation.roc_auc_truncated(labels, preds)
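# Hedged usage sketch: evalerror matches xgboost's feval signature
# (preds, dtrain) -> (name, value); negating the AUC makes lower-is-better,
# which early stopping minimizes by default. params, dtrain and dval are assumed defined.
import xgboost as xgb

bst = xgb.train(params, dtrain, num_boost_round=1000,
                evals=[(dval, 'val')], feval=evalerror,
                early_stopping_rounds=50)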
Example no. 17
def _score_func(estimator, X, y):
    pred_probs = estimator.predict_proba(X)[:, 1]
    return evaluation.roc_auc_truncated(y, pred_probs)
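# Hedged usage sketch: _score_func has sklearn's scorer signature
# (estimator, X, y) -> float, so it can be passed to GridSearchCV directly.
# The classifier and grid below are illustrative.
from sklearn.grid_search import GridSearchCV  # sklearn.model_selection in newer versions
from sklearn.ensemble import RandomForestClassifier

grid = GridSearchCV(RandomForestClassifier(n_estimators=200),
                    param_grid={'max_depth': [4, 6, 8]},
                    scoring=_score_func, cv=3)
grid.fit(X, y)  # X, y assumed in scope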
Example no. 18
import pandas as pd
from sklearn.ensemble import RandomForestClassifier
from blue.featurelist import FeatureList
from blue.pandas_utils import get_columns_in_df
from blue.estimators import HyperoptEstimator

from evaluation import roc_auc_truncated

train_file = './data/training.csv'
test_file = './data/test.csv'
flist = FeatureList(train_file, spec='features.yml', derived_list=None)

df_train = pd.read_csv(train_file, index_col='id')
df_train = get_columns_in_df(df_train, flist.universe)

df_test = pd.read_csv(test_file)
df_test = get_columns_in_df(df_test, flist.predictors)

hpest = HyperoptEstimator(RandomForestClassifier, max_evals=5, n_jobs=3,
                          metric=lambda x, y: -roc_auc_truncated(x, y))
hpest.fit(df_train[flist.predictors].values, df_train[flist.target].values)
Example no. 21
          'gamma': 0.01, # 0.005
          "min_child_weight": 5,
          "silent": 1,
          "subsample": 0.7,
          "colsample_bytree": 0.7,
          'nthread': 4,
          "seed": 1}

num_trees=600

#gbm = xgb.train(params, xgb.DMatrix(train[features], train["signal"]), num_trees)

agreement_probs = rf.predict_proba(check_agreement[features])[:, 1]
print('Checking agreement...')
ks = evaluation.compute_ks(
    agreement_probs[check_agreement['signal'].values == 0],
    agreement_probs[check_agreement['signal'].values == 1],
    check_agreement[check_agreement['signal'] == 0]['weight'].values,
    check_agreement[check_agreement['signal'] == 1]['weight'].values)
print ('KS metric UB =', ks, ks < 0.09)

train_eval_probs1 = rf.predict_proba(train_eval[features])[:,1]
AUC1 = evaluation.roc_auc_truncated(train_eval['signal'], train_eval_probs1)
print ('AUC UB ', AUC1)

print("Make predictions on the test set")
rfpred = rf.predict_proba(test[features])[:,1]
test_probs = rfpred
submission = pd.DataFrame({"id": test["id"], "prediction": test_probs})
submission.to_csv("ub_only_submission.csv", index=False)
def train_test_predict(classifier, classifier_name, features, features_name,
        data_directory, training_data):
    """
    Fit the classifier, run the agreement and correlation checks, compute the
    truncated AUC on training rows with min_ANNmuon > 0.4, and write the
    submission csv plus its 7z archive.
    """
    # Fit the classifier with the training data.
    start = time.time()
    classifier.fit(training_data[features], training_data['signal'])
    end = time.time()
    print("time to fit the classifier: {} seconds".format(end - start))
    print()

    # Check the agreement test.
    start = time.time()
    check_agreement = pandas.read_csv(data_directory + 'check_agreement.csv', index_col='id')
    agreement_probs = classifier.predict_proba(check_agreement[features])[:, 1]
    ks = evaluation.compute_ks(
        agreement_probs[check_agreement['signal'].values == 0],
        agreement_probs[check_agreement['signal'].values == 1],
        check_agreement[check_agreement['signal'] == 0]['weight'].values,
        check_agreement[check_agreement['signal'] == 1]['weight'].values)
    print('KS metric', ks, ks < 0.09)
    end = time.time()
    print("time to check the agreement test: {} seconds".format(end - start))
    print()

    # Check the correlation test.
    start = time.time()
    check_correlation = pandas.read_csv(data_directory + 'check_correlation.csv', index_col='id')
    correlation_probs = classifier.predict_proba(check_correlation[features])[:, 1]
    cvm = evaluation.compute_cvm(correlation_probs, check_correlation['mass'])
    print('CvM metric', cvm, cvm < 0.002)
    end = time.time()
    print("time to check the correlation test: {} seconds".format(end - start))
    print()

    # Compute weighted AUC on the training data with min_ANNmuon > 0.4.
    start = time.time()
    train_eval = training_data[training_data['min_ANNmuon'] > 0.4]
    train_probs = classifier.predict_proba(train_eval[features])[:, 1]
    AUC = evaluation.roc_auc_truncated(train_eval['signal'], train_probs)
    print('AUC', AUC)
    end = time.time()
    print("time to compute the weighted AUC: {} seconds".format(end - start))
    print()

    # Make predictions on the test data.
    start = time.time()
    testing_data = pandas.read_csv(data_directory + 'test.csv', index_col='id')
    result = pandas.DataFrame({'id': testing_data.index})
    result['prediction'] = classifier.predict_proba(testing_data[features])[:, 1]
    end = time.time()
    print("time to make predictions: {} seconds".format(end - start))
    print()

    predictions_name = classifier_name + '-' + features_name

    # Generate the csv file for Kaggle.
    result.to_csv(predictions_name + '.csv', index=False, sep=',')

    # Run the shell commands to generate the final archive through
    # the subprocess module calls.
    print(subprocess.check_output(['rm', '-f', predictions_name+'.7z']))

    print(subprocess.check_output(['7z', 'a', predictions_name+'.7z', predictions_name+'.csv']))

    print(subprocess.check_output(['ls', '-Ahl', predictions_name+'.csv']))
    print(subprocess.check_output(['ls', '-Ahl', predictions_name+'.7z']))
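# Hedged usage sketch; the classifier choice and excluded columns are illustrative.
import pandas
from sklearn.ensemble import GradientBoostingClassifier

training_data = pandas.read_csv('../data/training.csv', index_col='id')
features = [c for c in training_data.columns
            if c not in ('signal', 'mass', 'production', 'min_ANNmuon')]
train_test_predict(GradientBoostingClassifier(n_estimators=200), 'gb200',
                   features, 'all-features', '../data/', training_data)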
Example no. 23
    agreement_probs[check_agreement['signal'].values == 0],
    agreement_probs[check_agreement['signal'].values == 1],
    check_agreement[check_agreement['signal'] == 0]['weight'].values,
    check_agreement[check_agreement['signal'] == 1]['weight'].values)
print 'KS metric', ks, ks < 0.09

# Check correlation test
check_correlation = pandas.read_csv(folder + 'check_correlation.csv', index_col='id')
xg_check_correlation = xgb.DMatrix(check_correlation.values)
correlation_probs = xgb_model.predict(xg_check_correlation)
cvm = evaluation.compute_cvm(correlation_probs, check_correlation['mass'])
print 'CvM metric', cvm, cvm < 0.002

# Compute weighted AUC on the training data with min_ANNmuon > 0.4
train_eval = train[train['min_ANNmuon'] > 0.4]
train_eval_X = train_eval.drop(variables_to_drop, axis=1).values
xg_train_eval = xgb.DMatrix(train_eval_X)
train_probs = xgb_model.predict(xg_train_eval)
AUC = evaluation.roc_auc_truncated(train_eval['signal'], train_probs)
print 'AUC', AUC

# Predict test, create file for kaggle
test = pandas.read_csv(folder + 'test.csv', index_col='id')
test_X = test.values
xg_test = xgb.DMatrix(test_X)
result = pandas.DataFrame({'id': test.index})

result['prediction'] = xgb_model.predict(xg_test)

result.to_csv('../submissions/xgb.csv', index=False, sep=',')
Example no. 24
    # Grid search to compute best score
    def multichoose(n,k):
        if k < 0 or n < 0: return "Error"
        if not k: return [[0]*n]
        if not n: return []
        if n == 1: return [[k]]
        return [[0]+val for val in multichoose(n-1,k)] + \
            [[val[0]+1]+val[1:] for val in multichoose(n,k-1)]
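    # e.g. multichoose(2, 2) -> [[0, 2], [1, 1], [2, 0]]: every n-vector of
    # non-negative integers summing to k, used below as an integer weight grid.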

    n = 2
    k = 1000
    for xs in multichoose(n,k):
        #print xs
        preds = (xs[0]*keras_pred + xs[1]*gbm_pred)/float(k)
        score = evaluation.roc_auc_truncated(train_eval_table[i]['signal'], preds)
        if score >= train_eval_score[i]:
            train_eval_score[i] = score
            print score
            best_xs = xs

    print train_eval_score[i]
    print best_xs
    test["prediction_%i" %i] = (best_xs[0]*keras_test + best_xs[1]*gbm_test)/float(k)

    with open('./output/semi_strong_submission_%i.csv' %i, 'w') as f:
        f.write('id,prediction\n')
        for ID, p in zip(test['id'], test["prediction_%i" %i]):
            f.write('%s,%.8f\n' % (ID, p))

    # Save best combination weight
Example no. 26
    def fit(self, data, data_to_predict=test, pred_cv=False):
        # pred_cv enables predicting on data_to_predict with each fold's model;
        # it is easiest to do inside fit, where the fold models are available

        print('Fitting ' + self.model_name + ' model with ' + self.var_name +
              ' variables using ' + str(self.cv.n_splits) +
              '-fold Cross Validation\n')
        X = data[self.variables].values
        y = data['signal'].values

        trained = np.zeros(len(y))
        for i, (train_ind, test_ind) in enumerate(self.cv.split(X, y)):

            mod = self.create_model()
            scaler = StandardScaler()
            # fit the scaler per fold and keep the results local, so later
            # folds do not see data that was already scaled in place
            X_train = scaler.fit_transform(X[train_ind])
            X_val = scaler.transform(X[test_ind])

            # `is not {}` always evaluated True (identity, not equality);
            # a plain truthiness check expresses the intended "params given" test
            if self.train_params and self.fig_name == 'xgb':
                self.train_params['eval_set'] = [(X_val, y[test_ind])]  # for xgb models
            if self.train_params and self.fig_name == 'nn':
                self.train_params['validation_data'] = (X_val, y[test_ind])  # for nn models

            hist = mod.fit(X_train, y[train_ind], **self.train_params)

            if pred_cv:
                X_pred = scaler.transform(data_to_predict[self.variables])

            if self.nn:
                y_pred_val = mod.predict(X_val)
                y_pred_train = mod.predict(X_train)
                trained[test_ind] = y_pred_val.reshape((test_ind.shape[0], ))
                self.val_history_nn.append(hist.history['val_loss'])
                self.train_history_nn.append(hist.history['loss'])
                if pred_cv:
                    self.predicted_cv.append(mod.predict(X_pred))

            else:
                y_pred_val = mod.predict_proba(X_val)[:, 1]
                y_pred_train = mod.predict_proba(X_train)[:, 1]
                trained[test_ind] = y_pred_val
                if pred_cv:
                    self.predicted_cv.append(mod.predict_proba(X_pred)[:, 1])

            result_val = evaluation.roc_auc_truncated(y[test_ind], y_pred_val)
            result_train = evaluation.roc_auc_truncated(
                y[train_ind], y_pred_train)

            self.scores_val.append(result_val)
            self.scores_train.append(result_train)

            print(
                'Iteration {} gave ROC AUC score of {} for validation set and {} for training set \n'
                .format(i + 1, np.round(result_val, 4),
                        np.round(result_train, 4)))
        print(
            'Mean ROC AUC score for {}-fold CV is:\n{} for validation set \n{} for training set\n'
            .format(self.cv.n_splits, np.round(np.mean(self.scores_val), 4),
                    np.round(np.mean(self.scores_train), 4)))

        self.trained = trained