def train(self):
    self.output().makedirs()
    preproc = pipeline.Pipeline([
        ('norm', preprocessing.Normalizer()),
        ('poly', preprocessing.PolynomialFeatures(self.npoly.get())),
    ])
    X = abhishek_feats.AbhishekFeatures().load('train', self.fold, as_df=True)
    X = preproc.fit_transform(X)
    y = xval_dataset.BaseDataset().load('train', self.fold).squeeze()
    cls = linear_model.LogisticRegression(
        C=self.C.get(), solver='sag', class_weight=core.dictweights)
    cls.fit(X, y)

    print('Validating')
    validX = abhishek_feats.AbhishekFeatures().load('valid', self.fold)
    validX = preproc.transform(validX)
    y = xval_dataset.BaseDataset().load('valid', self.fold).squeeze()
    y_pred = cls.predict_proba(validX)[:, 1]
    score = core.score_data(y, y_pred)
    np.save(
        'cache/abhishek/logit/{:f}/{:d}/valid.npy'.format(self.C.get(), self.fold),
        y_pred)
    return score, cls, preproc
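# For reference, a minimal sketch of what `core.score_data` is assumed to do
# throughout this file: competition-style log loss, optionally reweighted so
# the class balance matches the test set. The actual weights live in `core`;
# the defaults below are purely illustrative.
def score_data_sketch(y_true, y_pred, weighted=True, weights=(1.309, 0.472)):
    y_true = np.asarray(y_true, dtype=float).ravel()
    y_pred = np.clip(np.asarray(y_pred, dtype=float).ravel(), 1e-15, 1 - 1e-15)
    loss = -(y_true * np.log(y_pred) + (1 - y_true) * np.log(1 - y_pred))
    if weighted:
        # weights[0] applies to negatives, weights[1] to positives.
        w = np.where(y_true == 1, weights[1], weights[0])
        return float((loss * w).sum() / w.sum())
    return float(loss.mean())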
def run(self):
    self.output().makedirs()
    data = self.xdataset()
    X = data.load('train', self.fold)
    y = rf_dataset.Dataset().load('train', self.fold, as_df=True).is_duplicate
    cls = self.make_cls()
    print('Training classifier {:s} on data of size: {}'.format(repr(cls), X.shape))
    cls.fit(X, y)
    self.post_fit(cls)

    X_val = data.load('valid', self.fold)
    y_val = rf_dataset.Dataset().load('valid', self.fold, as_df=True).is_duplicate
    y_pred = cls.predict_proba(X_val)[:, 1]
    np.savez_compressed(self.make_path('valid.npz'), data=y_pred)
    score = core.score_data(y_val, y_pred)
    del X, y, X_val, y_val

    X_test = data.load('test', None)
    y_test_pred = cls.predict_proba(X_test)[:, 1]
    np.savez_compressed(self.make_path('test.npz'), data=y_test_pred)

    print(colors.green | 'Score: {:s}: {:f}'.format(repr(self), score))
    with self.output().open('w') as f:
        f.write('Score: {:s}: {:f}'.format(repr(self), score))
    return score
def run(self):
    self.output().makedirs()
    wc_data = rf_word_count_features.WordCountMatrix()
    X = wc_data.load('train', self.fold).astype(np.float32)
    y = rf_dataset.Dataset().load('train', self.fold, as_df=True).is_duplicate
    cls = self.make_cls()
    cls.fit(X, y)

    X_val = wc_data.load('valid', self.fold).astype(np.float32)
    y_val = rf_dataset.Dataset().load('valid', self.fold, as_df=True).is_duplicate
    y_pred = cls.predict_proba(X_val)[:, 1]
    np.savez_compressed(self.make_path('valid.npz'), data=y_pred)
    score = core.score_data(y_val, y_pred)
    del X, y, X_val, y_val

    X_test = wc_data.load('test', None).astype(np.float32)
    y_test_pred = cls.predict_proba(X_test)[:, 1]
    np.savez_compressed(self.make_path('test.npz'), data=y_test_pred)

    print(colors.green | 'Score: {:s}: {:f}'.format(repr(self), score))
    with self.output().open('w') as f:
        f.write('Score: {:s}: {:f}'.format(repr(self), score))
    return score
def run(self):
    self.output().makedirs()
    X_train = RF_LeakyXGB_Dataset().load('train', self.fold, as_df=True)
    y_train = rf_dataset.Dataset().load('train', self.fold, as_df=True).is_duplicate
    X_valid = RF_LeakyXGB_Dataset().load('valid', self.fold, as_df=True)
    y_valid = rf_dataset.Dataset().load('valid', self.fold, as_df=True).is_duplicate

    # Rebalance the training set: count every negative twice and keep only 80%
    # of the positives, lowering the positive rate towards the (much lower)
    # rate expected in the test set.
    pos_train = X_train[y_train == 1]
    neg_train = X_train[y_train == 0]
    X_train = pd.concat(
        (neg_train, pos_train.iloc[:int(0.8 * len(pos_train))], neg_train))
    y_train = np.array(
        [0] * neg_train.shape[0] +
        [1] * pos_train.iloc[:int(0.8 * len(pos_train))].shape[0] +
        [0] * neg_train.shape[0])
    del pos_train, neg_train
    # The validation fold deliberately keeps its natural class balance:
    #pos_valid = X_valid[y_valid == 1]
    #neg_valid = X_valid[y_valid == 0]
    #X_valid = pd.concat((neg_valid, pos_valid.iloc[:int(0.8 * len(pos_valid))], neg_valid))
    #y_valid = np.array(
    #    [0] * neg_valid.shape[0] + [1] * pos_valid.iloc[:int(0.8 * len(pos_valid))].shape[0] + [0] * neg_valid.shape[0])
    #del pos_valid, neg_valid

    # Hold out 5% of the rebalanced training data for early stopping.
    X_tr_tr, X_tr_es, y_tr_tr, y_tr_es = model_selection.train_test_split(
        X_train, y_train, test_size=0.05)
    d_train = xgb.DMatrix(X_tr_tr, label=y_tr_tr)
    d_es = xgb.DMatrix(X_tr_es, label=y_tr_es)
    d_valid = xgb.DMatrix(X_valid, label=y_valid)
    watchlist = [(d_train, 'train'), (d_es, 'd_es')]

    params = {
        'objective': 'binary:logistic',
        'eval_metric': 'logloss',
        'eta': 0.02,
        'max_depth': 7,
        'subsample': 0.6,
        'base_score': 0.2,
    }
    #bst = xgb.train(params, d_train, 2500, watchlist, early_stopping_rounds=50, verbose_eval=50)
    bst = xgb.train(params, d_train, 1000, watchlist,
                    early_stopping_rounds=50, verbose_eval=50)

    p_valid = bst.predict(d_valid)
    print(score_data(y_valid, p_valid, weighted=False))

    X_test = RF_LeakyXGB_Dataset().load('test', None, as_df=True)
    d_test = xgb.DMatrix(X_test)
    p_test = bst.predict(d_test)
    np.savez_compressed(self.make_path('done_tmp.npz'), valid=p_valid, test=p_test)
    os.rename(self.make_path('done_tmp.npz'), self.output().path)
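# Why the duplication above helps, in rough numbers. This is a sketch: the
# real positive rate comes from the data (~0.37 in the training set), and the
# test-set positive rate was widely estimated at ~0.165 during the competition.
def rebalanced_positive_rate(p=0.37, keep_pos=0.8, neg_copies=2):
    # Positive rate after keeping `keep_pos` of the positives and repeating
    # every negative `neg_copies` times:
    #   0.8 * 0.37 / (0.8 * 0.37 + 2 * 0.63) ~= 0.19
    return keep_pos * p / (keep_pos * p + neg_copies * (1 - p))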
def run(self):
    self.output().makedirs()
    batch_size = 128
    normalizer = preprocessing.StandardScaler()
    train_q1, train_q2, train_other = rf_seq_data.RFWordSequenceDataset().load(
        'train', fold=self.fold)
    train_other = normalizer.fit_transform(train_other)
    train_labels = rf_dataset.Dataset().load(
        'train', fold=self.fold, as_df=True).is_duplicate
    print(train_q1.shape, train_q2.shape, train_other.shape)
    embedding = rf_seq_data.RFWordSequenceDataset().load_embedding_mat()

    np.random.seed(self.fold)
    model = self.model(embedding, train_q2.shape[1], train_other.shape[1])
    early_stopping = keras.callbacks.EarlyStopping(monitor='val_loss', patience=6)
    slow_plateau = keras.callbacks.ReduceLROnPlateau(monitor='val_loss', patience=3)
    model_path = self.make_path('model.h5')
    model_checkpointer = keras.callbacks.ModelCheckpoint(
        model_path, save_best_only=True, save_weights_only=True)

    if self.include_distances():
        train_data = [train_q1, train_q2, train_other]
    else:
        train_data = [train_q1, train_q2]
    model.fit(
        train_data, train_labels,
        validation_split=0.05,
        epochs=20,
        batch_size=batch_size,
        shuffle=True,
        class_weight=dictweights,
        callbacks=[early_stopping, slow_plateau, model_checkpointer])
    model.load_weights(model_path)

    valid_q1, valid_q2, valid_other = rf_seq_data.RFWordSequenceDataset().load(
        'valid', fold=self.fold)
    valid_other = normalizer.transform(valid_other)
    valid_labels = rf_dataset.Dataset().load(
        'valid', fold=self.fold, as_df=True).is_duplicate
    if self.include_distances():
        valid_data = [valid_q1, valid_q2, valid_other]
    else:
        valid_data = [valid_q1, valid_q2]
    valid_preds = model.predict(valid_data, verbose=1, batch_size=batch_size)
    valid_preds = np.clip(valid_preds, 1e-7, 1 - 1e-7)
    score = score_data(valid_labels.values, valid_preds)
    print(colors.green | "Score for {:s}: {:f}".format(repr(self), score))

    test_q1, test_q2, test_other = rf_seq_data.RFWordSequenceDataset().load('test', None)
    test_other = normalizer.transform(test_other)
    if self.include_distances():
        test_data = [test_q1, test_q2, test_other]
    else:
        test_data = [test_q1, test_q2]
    test_preds = model.predict(test_data, verbose=1, batch_size=batch_size)

    np.savez_compressed(self.make_path('done_tmp.npz'), valid=valid_preds, test=test_preds)
    os.rename(self.make_path('done_tmp.npz'), self.output().path)
    return score
def valid(self):
    pred = self.predict('valid')
    print(colors.green | "prediction sample...")
    print(colors.green | str(pred.head()))
    y = dataset.Dataset().load()[2]
    loss = core.score_data(y.is_duplicate, pred)
    print(colors.green | "Performance: " + str(loss))
    return pred
def run(self):
    batch_size = 128
    self.output().makedirs()
    train_data, train_labels = self.load_dataset('train')
    valid_data, valid_labels = self.load_dataset('valid')
    valid_weights = core.weights[valid_labels]
    class_weights = dict(enumerate(core.weights))
    embedding = keras_kaggle_data.KaggleDataset().load_embedding()
    model = self.model(
        embedding,
        keras_kaggle_data.KaggleDataset().MAX_SEQUENCE_LENGTH,
        train_data[2].shape[1])
    model.summary()

    early_stopping = keras.callbacks.EarlyStopping(monitor='val_loss', patience=6)
    slow_plateau = keras.callbacks.ReduceLROnPlateau(monitor='val_loss', patience=3)
    model_path = 'cache/%s/model.h5' % self.base_name
    model_checkpointer = keras.callbacks.ModelCheckpoint(
        model_path, save_best_only=True, save_weights_only=True)

    # Symmetrize the training data: every (q1, q2) pair is also presented
    # as (q2, q1), with the distance features and label repeated.
    train_data = [
        np.vstack([train_data[0], train_data[1]]),
        np.vstack([train_data[1], train_data[0]]),
        np.vstack([train_data[2], train_data[2]]),
    ]
    train_labels = np.concatenate([train_labels, train_labels])

    model.fit(train_data, train_labels,
              validation_data=(valid_data, valid_labels, valid_weights),
              epochs=20, batch_size=batch_size, shuffle=True,
              class_weight=class_weights,
              callbacks=[early_stopping, slow_plateau, model_checkpointer])
    model.load_weights(model_path)

    valid_preds = model.predict(valid_data, batch_size=batch_size)
    print(colors.green | ('Valid loss: %f ' % core.score_data(valid_labels, valid_preds)))
    del valid_labels, valid_data
    del train_labels, train_data

    merge_data, merge_labels = self.load_dataset('merge')
    merge_preds = model.predict(merge_data, batch_size=batch_size)
    np.save('cache/%s/merge.npy' % self.base_name, merge_preds)

    test_data, _ = self.load_dataset('test')
    test_preds = model.predict(test_data, batch_size=batch_size, verbose=1)
    np.save('cache/%s/classifications.npy' % self.base_name, test_preds)
def run(self):
    self.output().makedirs()
    X_train = RF_LeakyXGB_Dataset().load('train', self.fold, as_df=True)
    y_train = rf_dataset.Dataset().load('train', self.fold, as_df=True).is_duplicate
    X_valid = RF_LeakyXGB_Dataset().load('valid', self.fold, as_df=True)
    y_valid = rf_dataset.Dataset().load('valid', self.fold, as_df=True).is_duplicate

    # Same rebalancing trick as the XGBoost task: double the negatives,
    # keep 80% of the positives.
    pos_train = X_train[y_train == 1]
    neg_train = X_train[y_train == 0]
    X_train = pd.concat(
        (neg_train, pos_train.iloc[:int(0.8 * len(pos_train))], neg_train))
    y_train = np.array(
        [0] * neg_train.shape[0] +
        [1] * pos_train.iloc[:int(0.8 * len(pos_train))].shape[0] +
        [0] * neg_train.shape[0])
    del pos_train, neg_train
    #pos_valid = X_valid[y_valid == 1]
    #neg_valid = X_valid[y_valid == 0]
    #X_valid = pd.concat((neg_valid, pos_valid.iloc[:int(0.8 * len(pos_valid))], neg_valid))
    #y_valid = np.array(
    #    [0] * neg_valid.shape[0] + [1] * pos_valid.iloc[:int(0.8 * len(pos_valid))].shape[0] + [0] * neg_valid.shape[0])
    #del pos_valid, neg_valid

    cls = lightgbm.sklearn.LGBMClassifier(
        n_estimators=2048, num_leaves=1024, learning_rate=0.03, subsample=0.75)
    X_tr_tr, X_tr_es, y_tr_tr, y_tr_es = model_selection.train_test_split(
        X_train, y_train, test_size=0.05)
    cls.fit(X_tr_tr, y_tr_tr,
            eval_set=[(X_tr_es, y_tr_es)],
            early_stopping_rounds=50)

    valid_pred = cls.predict_proba(X_valid)[:, 1]
    print(colors.green | '{:s} == {:f}'.format(
        repr(self), score_data(y_valid, valid_pred, weighted=False)))
    print(colors.yellow | str(
        pandas.Series(cls.feature_importances_, index=X_train.columns).sort_values()))

    X_test = RF_LeakyXGB_Dataset().load('test', None, as_df=True).fillna(-999).clip(-1000, 1000)
    test_pred = cls.predict_proba(X_test)[:, 1]
    np.savez_compressed(self.make_path('done_tmp.npz'), valid=valid_pred, test=test_pred)
    os.rename(self.make_path('done_tmp.npz'), self.output().path)
def score(self):
    self.output().makedirs()
    train_Xs = []
    train_ys = []
    for fold in range(1, fold_max):
        y = rf_dataset.Dataset().load('valid', fold, as_df=True).is_duplicate.values.squeeze()
        x = self.fold_x(fold, 'valid')
        nose.tools.assert_equal(x.shape[0], y.shape[0])
        train_Xs.append(x)
        train_ys.append(y)

    sns.clustermap(pandas.concat(train_Xs, 0).corr())
    plt.yticks(rotation=90)
    plt.savefig('./corr.png')
    plt.close()

    train_X = pandas.concat(train_Xs, 0).values
    train_y = np.concatenate(train_ys, 0).squeeze()
    cls = AutoExitingGBMLike(XGBClassifier(
        n_estimators=1024,
        learning_rate=0.05,
        max_depth=8,
        gamma=1,
        subsample=0.5,
    ), additional_fit_args={'verbose': False})
    #cls = AutoExitingGBMLike(lightgbm.sklearn.LGBMClassifier(
    #    n_estimators=1024,
    #    learning_rate=0.01,
    #    subsample=0.5,
    #    num_leaves=2048
    #), additional_fit_args={'verbose': False})
    #cls = pipeline.Pipeline([
    #    ('poly', preprocessing.PolynomialFeatures(2)),
    #    ('anova', feature_selection.SelectPercentile(feature_selection.f_classif)),
    #    ('lin', linear_model.LogisticRegression(C=1, class_weight=core.dictweights))
    #])
    #cls = keras.wrappers.scikit_learn.KerasClassifier(build_fn=self.simple_nn)
    cls.fit(train_X, train_y)
    if hasattr(cls, 'feature_importances_'):
        ds_names = [repr(d) for d in self.classifiers(0)]
        print(colors.yellow | str(
            pandas.Series(cls.feature_importances_, index=ds_names).sort_values()))

    test_x = self.fold_x(0, 'valid').values
    test_y = rf_dataset.Dataset().load('valid', 0, as_df=True).is_duplicate.values.squeeze()
    score = core.score_data(test_y, cls.predict_proba(test_x)[:, 1])
    return score, cls
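# `AutoExitingGBMLike` is defined elsewhere in the repo. A plausible minimal
# sketch of the wrapper, assuming it carves off a small early-stopping split
# and forwards everything else to the wrapped sklearn-style booster (the
# split fraction and stopping rounds below are guesses, not the real values):
class AutoExitingGBMLikeSketch:
    def __init__(self, cls, additional_fit_args=None, es_frac=0.05):
        self.cls = cls
        self.additional_fit_args = additional_fit_args or {}
        self.es_frac = es_frac

    def fit(self, X, y):
        # Internal hold-out so the booster stops itself instead of running
        # to its full n_estimators.
        X_tr, X_es, y_tr, y_es = model_selection.train_test_split(
            X, y, test_size=self.es_frac)
        self.cls.fit(X_tr, y_tr, eval_set=[(X_es, y_es)],
                     early_stopping_rounds=50, **self.additional_fit_args)
        return self

    def predict_proba(self, X):
        return self.cls.predict_proba(X)

    @property
    def feature_importances_(self):
        return self.cls.feature_importances_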
def run(self):
    self.output().makedirs()
    X_train = RF_LeakyXGB_Dataset().load('train', self.fold, as_df=True).fillna(-999).clip(-1000, 1000)
    y_train = rf_dataset.Dataset().load('train', self.fold, as_df=True).is_duplicate
    X_valid = RF_LeakyXGB_Dataset().load('valid', self.fold, as_df=True).fillna(-999).clip(-1000, 1000)
    y_valid = rf_dataset.Dataset().load('valid', self.fold, as_df=True).is_duplicate

    # Same rebalancing trick as the boosted-tree tasks above.
    pos_train = X_train[y_train == 1]
    neg_train = X_train[y_train == 0]
    X_train = pd.concat(
        (neg_train, pos_train.iloc[:int(0.8 * len(pos_train))], neg_train))
    y_train = np.array(
        [0] * neg_train.shape[0] +
        [1] * pos_train.iloc[:int(0.8 * len(pos_train))].shape[0] +
        [0] * neg_train.shape[0])
    del pos_train, neg_train
    #pos_valid = X_valid[y_valid == 1]
    #neg_valid = X_valid[y_valid == 0]
    #X_valid = pd.concat((neg_valid, pos_valid.iloc[:int(0.8 * len(pos_valid))], neg_valid))
    #y_valid = np.array(
    #    [0] * neg_valid.shape[0] + [1] * pos_valid.iloc[:int(0.8 * len(pos_valid))].shape[0] + [0] * neg_valid.shape[0])
    #del pos_valid, neg_valid

    cls = ensemble.ExtraTreesClassifier(n_jobs=-1, n_estimators=1024)
    # y_train is already an ndarray after the rebalancing above, so it has
    # no .values attribute.
    cls.fit(X_train.values, y_train)

    valid_pred = cls.predict_proba(X_valid)[:, 1]
    print(colors.green | '{:s} == {:f}'.format(repr(self), score_data(y_valid, valid_pred)))
    print(colors.yellow | str(
        pandas.Series(cls.feature_importances_, index=X_train.columns).sort_values()))

    X_test = RF_LeakyXGB_Dataset().load('test', None, as_df=True).fillna(-999).clip(-1000, 1000)
    test_pred = cls.predict_proba(X_test.values)[:, 1]
    np.savez_compressed(self.make_path('done_tmp.npz'), valid=valid_pred, test=test_pred)
    os.rename(self.make_path('done_tmp.npz'), self.output().path)
def testicles(self):
    X = self._load_named('train')
    y = dataset.Dataset().load_named('train').is_duplicate.values
    cls = lightgbm.LGBMClassifier(num_leaves=512, n_estimators=500)
    cls.fit(X.values, y)

    X_test = self._load_named('valid').values
    y_test = dataset.Dataset().load_named('valid').is_duplicate.values
    y_pred = cls.predict_proba(X_test)[:, 1]
    scoring = core.score_data(y_test, y_pred)
    importances = pandas.Series(cls.feature_importances_, index=X.columns)
    print(scoring)
    print(importances)
    with self.output().open('w') as f:
        f.write("Score: {:f}\n".format(scoring))
        f.write(str(importances))
def run(self):
    self.output().makedirs()
    X = abhishek_feats.AbhishekFeatures().load('train', self.fold)
    y = xval_dataset.BaseDataset().load('train', self.fold).squeeze()
    cls = xgbsk.XGBClassifier(
        max_depth=self.max_depth.get(),
        learning_rate=self.eta.get(),
        n_estimators=self.n_est.get())
    X_tr, X_va, y_tr, y_va = model_selection.train_test_split(X, y, test_size=0.05)
    cls.fit(X_tr, y_tr,
            sample_weight=core.weight_from(y_tr),
            eval_set=[(X_va, y_va)],
            early_stopping_rounds=10)

    validX = abhishek_feats.AbhishekFeatures().load('valid', self.fold)
    y = xval_dataset.BaseDataset().load('valid', self.fold).squeeze()
    y_pred = cls.predict_proba(validX)[:, 1]
    score = core.score_data(y, y_pred)
    scorestr = "{:s} = {:f}".format(repr(self), score)
    print(colors.green | colors.bold | scorestr)

    valid_fn = 'cache/abhishek/xgb/maxdepth_{:d}_eta_{:f}_nest_{:d}/{:d}/valid.npy'.format(
        self.max_depth.get(), self.eta.get(), self.n_est.get(), self.fold)
    np.save(valid_fn, y_pred)

    testX = abhishek_feats.AbhishekFeatures().load('test', None)
    pred = cls.predict_proba(testX)[:, 1]
    test_fn = 'cache/abhishek/xgb/maxdepth_{:d}_eta_{:f}_nest_{:d}/{:d}/test.npy'.format(
        self.max_depth.get(), self.eta.get(), self.n_est.get(), self.fold)
    np.save(test_fn, pred)

    with self.output().open('w') as f:
        cols = abhishek_feats.AbhishekFeatures().load('valid', self.fold, as_df=True).columns
        v = pandas.Series(cls.feature_importances_, index=cols).sort_values()
        v.to_csv(f)
        f.write("\n\n")
        f.write(scorestr)
        f.write("\n")
    return score
def score(self):
    self.output().makedirs()
    poly = preprocessing.PolynomialFeatures(self.npoly.get())
    train_Xs = []
    train_ys = []
    for fold in range(1, 9):
        y = xval_dataset.BaseDataset().load('valid', fold).squeeze()
        x = self.fold_x(fold, 'valid')
        nose.tools.assert_equal(x.shape[0], y.shape[0])
        train_Xs.append(x)
        train_ys.append(y)
    train_X = poly.fit_transform(np.concatenate(train_Xs, 0))
    train_y = np.concatenate(train_ys, 0).squeeze()

    cls = linear_model.LogisticRegression(class_weight=core.dictweights)
    cls.fit(train_X, train_y)

    test_x = poly.transform(self.fold_x(0, 'valid'))
    test_y = xval_dataset.BaseDataset().load('valid', 0).squeeze()
    # Score on the positive-class column, as everywhere else; passing the
    # full (n, 2) probability matrix to score_data would be a shape bug.
    score = core.score_data(test_y, cls.predict_proba(test_x)[:, 1])
    return score, poly, cls
def run(self):
    self.output().makedirs()
    X = abhishek_feats.AbhishekFeatures().load('train', self.fold)
    y = xval_dataset.BaseDataset().load('train', self.fold).squeeze()
    cls = lgbsklearn.LGBMClassifier(
        num_leaves=1024, n_estimators=1024, is_unbalance=True)
    X_tr, X_va, y_tr, y_va = model_selection.train_test_split(X, y, test_size=0.05)
    # eval_set is a list of (X, y) tuples, matching the XGBoost task above.
    cls.fit(X_tr, y_tr,
            sample_weight=core.weight_from(y_tr),
            eval_set=[(X_va, y_va)],
            early_stopping_rounds=10)

    validX = abhishek_feats.AbhishekFeatures().load('valid', self.fold)
    y = xval_dataset.BaseDataset().load('valid', self.fold).squeeze()
    y_pred = cls.predict_proba(validX)[:, 1]
    scorestr = "{:s} = {:f}".format(repr(self), core.score_data(y, y_pred))
    print(colors.green | colors.bold | scorestr)
    np.save('cache/abhishek/lgbm/{:d}/valid.npy'.format(self.fold), y_pred)

    testX = abhishek_feats.AbhishekFeatures().load('test', None)
    pred = cls.predict_proba(testX)[:, 1]
    np.save('cache/abhishek/lgbm/{:d}/test.npy'.format(self.fold), pred)

    with self.output().open('w') as f:
        cols = abhishek_feats.AbhishekFeatures().load('valid', self.fold, as_df=True).columns
        v = pandas.Series(cls.feature_importances_, index=cols).sort_values()
        v.to_csv(f)
        f.write("\n\n")
        f.write(scorestr)
        f.write("\n")
def run(self):
    self.output().makedirs()
    m1, m2 = rf_word_count_features.WordCountMatrix().load_raw_vectors('train')
    # Binarize the two term-count matrices; their elementwise product is then
    # an indicator of the vocabulary shared by both questions in a pair.
    m1 = m1 > 0
    m2 = m2 > 0
    X = m1.multiply(m2)
    folds = (rf_dataset.Dataset().load_dataset_folds() + self.fold) % fold_max

    train_X = X[folds != 0]
    train_y = rf_dataset.Dataset().load('train', fold=self.fold, as_df=True).is_duplicate.values
    cls = naive_bayes.BernoulliNB()
    cls.fit(train_X, train_y)

    valid_X = X[folds == 0]
    valid_y = rf_dataset.Dataset().load('valid', fold=self.fold, as_df=True).is_duplicate.values
    valid_pred = cls.predict_proba(valid_X)[:, 1]
    score = score_data(valid_y, valid_pred)
    print(colors.green | "Score for {:s}: {:f}".format(repr(self), score))

    t1, t2 = rf_word_count_features.WordCountMatrix().load_raw_vectors('test')
    t1 = t1 > 0
    t2 = t2 > 0
    test_X = t1.multiply(t2)
    test_pred = cls.predict_proba(test_X)[:, 1]
    np.savez_compressed(self.make_path('done_tmp.npz'), valid=valid_pred, test=test_pred)
    os.rename(self.make_path('done_tmp.npz'), self.make_path('done.npz'))
    return score
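# A tiny illustration of the binarized-product trick above, with hypothetical
# counts rather than competition data: `multiply` on boolean sparse matrices
# is an elementwise AND, marking words present in both questions.
from scipy import sparse
a = sparse.csr_matrix([[2, 0, 1]]) > 0   # words used in question 1
b = sparse.csr_matrix([[1, 1, 0]]) > 0   # words used in question 2
shared = a.multiply(b)                   # word appears in both questions
# shared.toarray() -> [[ True, False, False]]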
def run(self):
    self.output().makedirs()
    X = abhishek_feats.AbhishekFeatures().load('train', self.fold)
    y = xval_dataset.BaseDataset().load('train', self.fold).squeeze()
    cls = ensemble.ExtraTreesClassifier(
        n_estimators=500, n_jobs=-1, class_weight=core.dictweights)
    cls.fit(X, y)

    validX = abhishek_feats.AbhishekFeatures().load('valid', self.fold)
    y = xval_dataset.BaseDataset().load('valid', self.fold).squeeze()
    y_pred = cls.predict_proba(validX)[:, 1]
    score = core.score_data(y, y_pred)
    scorestr = "{:s} = {:f}".format(repr(self), score)
    print(colors.green | colors.bold | scorestr)
    np.save('cache/abhishek/xtc/{:d}/valid.npy'.format(self.fold), y_pred)

    testX = abhishek_feats.AbhishekFeatures().load('test', None)
    pred = cls.predict_proba(testX)[:, 1]
    np.save('cache/abhishek/xtc/{:d}/test.npy'.format(self.fold), pred)

    with self.output().open('w') as f:
        cols = abhishek_feats.AbhishekFeatures().load('valid', self.fold, as_df=True).columns
        v = pandas.Series(cls.feature_importances_, index=cols).sort_values()
        v.to_csv(f)
        f.write("\n\n")
        f.write(scorestr)
        f.write("\n")
    return score
def run(self):
    self.output().makedirs()
    fold_ord = np.random.permutation(fold_max)
    merge_fold = fold_ord[0]
    test_fold = fold_ord[1]
    stack_folds = fold_ord[2:]
    print(colors.red | 'Fold order: {}/{}/{}'.format(merge_fold, test_fold, stack_folds))

    stack_Xs = [self.fold_x(f, 'valid') for f in stack_folds]
    stack_ys = [self.fold_y(f, 'valid') for f in stack_folds]
    stack_X = pandas.concat(stack_Xs, 0)
    stack_y = np.concatenate(stack_ys, 0)
    merge_X = self.fold_x(merge_fold, 'valid')
    merge_y = self.fold_y(merge_fold, 'valid')
    test_X = self.fold_x(test_fold, 'valid')
    test_y = self.fold_y(test_fold, 'valid')

    classifiers = list(self.classifiers())
    merge_preds = []
    test_preds = []
    ds_names = [repr(d) for d in self.datasets(0)]
    for cls in classifiers:
        print(colors.blue | colors.bold | "Training {:s}".format(repr(cls)))
        cls.fit(stack_X, stack_y)
        if hasattr(cls, 'feature_importances_'):
            print(colors.yellow | str(
                pandas.Series(cls.feature_importances_, index=ds_names).sort_values()))
        test_pred = cls.predict_proba(test_X)[:, 1]
        merge_pred = cls.predict_proba(merge_X)[:, 1]
        test_score = score_data(test_y, test_pred)
        print(colors.green | 'Score: {:f}'.format(test_score))
        merge_preds.append(merge_pred)
        test_preds.append(test_pred)

    merge_pred = np.vstack(merge_preds).T
    test_pred = np.vstack(test_preds).T
    #merge_cls = AutoExitingGBMLike(XGBClassifier(
    #    n_estimators=1024,
    #    learning_rate=0.05,
    #    max_depth=4,
    #    subsample=0.5
    #), additional_fit_args={'verbose': False})
    merge_cls = FeatureMean()
    merge_cls.fit(merge_pred, merge_y)
    test_pred = merge_cls.predict_proba(test_pred)[:, 1]
    test_score = score_data(test_y, test_pred)
    print(colors.green | 'Final score: {:f}'.format(test_score))

    fold_preds = []
    for fold in range(fold_max):
        fold_X = self.fold_x(fold, 'test')
        fold_merge_X = np.zeros([fold_X.shape[0], len(classifiers)])
        for ix, cls in enumerate(classifiers):
            fold_merge_X[:, ix] = cls.predict_proba(fold_X)[:, 1]
        fold_preds.append(merge_cls.predict_proba(fold_merge_X)[:, 1])
    predmat = np.vstack(fold_preds).mean(0)
    index = pandas.Index(np.arange(fold_X.shape[0]), name='test_id')
    print(predmat.shape)
    print(index.shape)

    pred = pandas.Series(predmat, index=index, name='is_duplicate').to_frame()
    with gzip.open(self.make_path('stacked_pred.csv.gz.tmp'), 'wt') as f:
        pred.to_csv(f)
    os.rename(self.make_path('stacked_pred.csv.gz.tmp'),
              self.make_path('stacked_pred.csv.gz'))
    return test_score
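# `FeatureMean` is defined elsewhere in the repo; a minimal sketch of the
# assumed behaviour: a "classifier" that simply averages the stacked model
# probabilities column-wise, ignoring the labels it is fitted on.
class FeatureMeanSketch:
    def fit(self, X, y=None):
        # Nothing to learn; kept for sklearn-style interface compatibility.
        return self

    def predict_proba(self, X):
        p1 = np.asarray(X).mean(axis=1)
        return np.stack([1 - p1, p1], axis=1)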