def train_model(modelBuilder):
    """Train the model returned by ``modelBuilder`` and write a submission.

    The 'train' and 'test' dataframes are loaded and preprocessed, 75% of the
    training rows are used for fitting and the remaining rows for validation.
    After training, the weights stored at ``MODEL_PATH`` are reloaded, the
    hold-out loss/accuracy are printed, test predictions are written to
    'sub.csv' and the training history is saved.

    Args:
        modelBuilder: zero-argument callable returning an uncompiled model.
    """
    raw_train = load_dataframe('train')
    raw_test = load_dataframe('test')

    X_train = process(transform_dataset(raw_train), isolate)
    X_test = process(transform_dataset(raw_test), isolate)
    labels = raw_train['is_iceberg']

    fit_x, holdout_x, fit_y, holdout_y = train_test_split(
        X_train, labels, random_state=1, train_size=0.75)

    model = modelBuilder()
    adam = Adam(
        lr=LEARNING_RATE,
        beta_1=BETA_1,
        beta_2=BETA_2,
        epsilon=EPSILON,
        decay=DECAY,
    )
    model.compile(loss='binary_crossentropy', optimizer=adam, metrics=['accuracy'])
    model.summary()

    save_callbacks = build_save_callbacks(filepath=MODEL_PATH, patience=5)

    # Augmentation is currently switched off; the options that were tried are:
    #   featurewise_center=True, featurewise_std_normalization=True,
    #   rotation_range=20, width_shift_range=0.2, height_shift_range=0.2,
    #   horizontal_flip=True
    train_gen = ImageDataGenerator()
    train_gen.fit(X_train)
    holdout_gen = ImageDataGenerator()
    holdout_gen.fit(holdout_x)

    history = model.fit_generator(
        train_gen.flow(fit_x, fit_y, batch_size=BATCH_SIZE),
        epochs=EPOCHS,
        verbose=VERBOSE,
        validation_data=holdout_gen.flow(holdout_x, holdout_y),
        steps_per_epoch=len(fit_x) // BATCH_SIZE,
        callbacks=save_callbacks,
    )

    # Evaluate with the weights stored at MODEL_PATH rather than the final-epoch ones.
    model.load_weights(filepath=MODEL_PATH)
    holdout_score = model.evaluate(holdout_x, holdout_y, verbose=1)
    print('Test loss:', holdout_score[0])
    print('Test accuracy:', holdout_score[1])

    test_probabilities = model.predict_proba(X_test)
    save_submission(raw_test, test_probabilities, filename='sub.csv')
    save_history(history.history, model_name=MODEL_NAME)
def _fold_save_loop(valid_oof_submission, test_submission, i, pipeline_name):
    """Save the out-of-fold and test predictions of fold ``i``.

    Both outputs go to ``params.experiment_dir``; the file names are built
    from ``pipeline_name`` and the fold index.
    """
    jobs = (
        ('Saving fold {} oof predictions',
         valid_oof_submission,
         '{}_predictions_valid_fold{}.csv'),
        ('Saving fold {} test predictions',
         test_submission,
         '{}_predictions_test_fold{}.csv'),
    )
    for message, submission, name_template in jobs:
        logger.info(message.format(i))
        save_submission(submission,
                        params.experiment_dir,
                        name_template.format(pipeline_name, i),
                        logger)
def fix_submission(submission_path_meta, submission_path_ss, submission_path_mix, threshold=0.03):
    """Combine a metamodel, a slice-model and a mixture-ensemble submission.

    For every patient the metamodel CDFs are the starting point, then:

    * a missing (``None``) metamodel CDF is replaced by the slice-model CDF;
    * a CDF whose CRPS against the slice-model CDF exceeds ``threshold`` is
      replaced by the mixture-ensemble CDF (the patient is printed, tagged
      'sys' for the first CDF and 'dst' for the second).

    The result is handed to ``utils.save_submission`` under
    ``SUBMISSION_PATH + 'ira_<meta>-<ss>-<mix>.csv'``, where each id is the
    text after the last '-' of the corresponding path with '.csv' removed.

    Args:
        submission_path_meta: path of the metamodel submission.
        submission_path_ss: path of the slice-model submission.
        submission_path_mix: path of the mixture-ensemble submission.
        threshold: CRPS above which metamodel and slice model are considered
            to disagree.

    Raises:
        AssertionError: if the three submissions do not cover the same
            patient ids.
    """
    pid2cdf0_meta, pid2cdf1_meta = read_submission(submission_path_meta)
    pid2cdf0_ss, pid2cdf1_ss = read_submission(submission_path_ss)
    pid2cdf0_mix, pid2cdf1_mix = read_submission(submission_path_mix)

    patient_ids = list(pid2cdf0_meta)
    # Compare as sets: on Python 2 ``keys()`` returns lists, so the original
    # comparison could fail purely because of dict iteration order.
    assert set(patient_ids) == set(pid2cdf0_ss)
    assert set(patient_ids) == set(pid2cdf0_mix)

    # no prediction from metamodel -> take prediction from slice model
    pid2cdf0_fixed, pid2cdf1_fixed = {}, {}
    for pid in patient_ids:
        meta0, meta1 = pid2cdf0_meta[pid], pid2cdf1_meta[pid]
        pid2cdf0_fixed[pid] = pid2cdf0_ss[pid] if meta0 is None else meta0
        pid2cdf1_fixed[pid] = pid2cdf1_ss[pid] if meta1 is None else meta1

    # metamodel disagrees with slice model -> take prediction from mixture ensemble
    fixed_predictions = {}
    for pid in patient_ids:
        crps0 = utils_heart.crps(pid2cdf0_fixed[pid], pid2cdf0_ss[pid])
        if crps0 > threshold:
            # single pre-formatted argument prints identically on Python 2 and 3
            print('sys %s %s' % (pid, crps0))
            pid2cdf0_fixed[pid] = pid2cdf0_mix[pid]

        crps1 = utils_heart.crps(pid2cdf1_fixed[pid], pid2cdf1_ss[pid])
        if crps1 > threshold:
            print('dst %s %s' % (pid, crps1))
            pid2cdf1_fixed[pid] = pid2cdf1_mix[pid]

        fixed_predictions[pid] = [pid2cdf0_fixed[pid], pid2cdf1_fixed[pid]]

    def _expid(path):
        # experiment id = text after the last '-' with the '.csv' suffix dropped
        return path.split('-')[-1].replace('.csv', '')

    fixed_submission_path = SUBMISSION_PATH + 'ira_%s-%s-%s.csv' % (
        _expid(submission_path_meta),
        _expid(submission_path_ss),
        _expid(submission_path_mix))
    utils.save_submission(fixed_predictions, fixed_submission_path)
    print('Submission save to %s' % fixed_submission_path)
def predict(self, x_rnn, x_fc, verbosity=0, write_to_file=False, overwrite=True, path=None):
    """Run the wrapped model and optionally write the predictions to disk.

    Args:
        x_rnn: value fed to the model's 'tracks_input'.
        x_fc: value fed to the model's 'session_input'; when writing to file
            its first column is also used to derive the session lengths, so
            it must support ``x_fc[:, 0]`` indexing in that case.
        verbosity: forwarded to ``self.model.predict`` as ``verbose``.
        write_to_file: when true, pass the predictions to
            ``utils.save_submission``.
        overwrite: forwarded to ``utils.save_submission``.
        path: output directory; defaults to '../../data/submissions'. The
            file name '<model_name>_<now>.txt' is appended to it.

    Returns:
        The predictions produced by ``self.model.predict``.
    """
    y_pred = self.model.predict(
        {'tracks_input': x_rnn, 'session_input': x_fc},
        verbose=verbosity)

    if write_to_file:
        # NOTE(review): presumably maps a scaled feature back to the session
        # length (10..20) -- confirm against the feature pipeline.
        session_length = x_fc[:, 0] * 10 + 10
        if path is None:
            path = '../../data/submissions'
        path = path + '/' + self.model_name + '_' + self.now + '.txt'
        utils.save_submission(y_pred, session_length, path, overwrite=overwrite)

    # Previously the predictions were computed and then discarded.
    return y_pred
def fix_submission(submission_path_meta, submission_path_ss, submission_path_mix, threshold=0.03):
    """Combine a metamodel, a slice-model and a mixture-ensemble submission.

    For every patient the metamodel CDFs are the starting point, then:

    * a missing (``None``) metamodel CDF is replaced by the slice-model CDF;
    * a CDF whose CRPS against the slice-model CDF exceeds ``threshold`` is
      replaced by the mixture-ensemble CDF (the patient is printed, tagged
      'sys' for the first CDF and 'dst' for the second).

    The result is handed to ``utils.save_submission`` under
    ``SUBMISSION_PATH + 'ira_<meta>-<ss>-<mix>.csv'``, where each id is the
    text after the last '-' of the corresponding path with '.csv' removed.

    Args:
        submission_path_meta: path of the metamodel submission.
        submission_path_ss: path of the slice-model submission.
        submission_path_mix: path of the mixture-ensemble submission.
        threshold: CRPS above which metamodel and slice model are considered
            to disagree.

    Raises:
        AssertionError: if the three submissions do not cover the same
            patient ids.
    """
    pid2cdf0_meta, pid2cdf1_meta = read_submission(submission_path_meta)
    pid2cdf0_ss, pid2cdf1_ss = read_submission(submission_path_ss)
    pid2cdf0_mix, pid2cdf1_mix = read_submission(submission_path_mix)

    patient_ids = list(pid2cdf0_meta)
    # Compare as sets: on Python 2 ``keys()`` returns lists, so the original
    # comparison could fail purely because of dict iteration order.
    assert set(patient_ids) == set(pid2cdf0_ss)
    assert set(patient_ids) == set(pid2cdf0_mix)

    # no prediction from metamodel -> take prediction from slice model
    pid2cdf0_fixed, pid2cdf1_fixed = {}, {}
    for pid in patient_ids:
        meta0, meta1 = pid2cdf0_meta[pid], pid2cdf1_meta[pid]
        pid2cdf0_fixed[pid] = pid2cdf0_ss[pid] if meta0 is None else meta0
        pid2cdf1_fixed[pid] = pid2cdf1_ss[pid] if meta1 is None else meta1

    # metamodel disagrees with slice model -> take prediction from mixture ensemble
    fixed_predictions = {}
    for pid in patient_ids:
        crps0 = utils_heart.crps(pid2cdf0_fixed[pid], pid2cdf0_ss[pid])
        if crps0 > threshold:
            # single pre-formatted argument prints identically on Python 2 and 3
            print('sys %s %s' % (pid, crps0))
            pid2cdf0_fixed[pid] = pid2cdf0_mix[pid]

        crps1 = utils_heart.crps(pid2cdf1_fixed[pid], pid2cdf1_ss[pid])
        if crps1 > threshold:
            print('dst %s %s' % (pid, crps1))
            pid2cdf1_fixed[pid] = pid2cdf1_mix[pid]

        fixed_predictions[pid] = [pid2cdf0_fixed[pid], pid2cdf1_fixed[pid]]

    def _expid(path):
        # experiment id = text after the last '-' with the '.csv' suffix dropped
        return path.split('-')[-1].replace('.csv', '')

    fixed_submission_path = SUBMISSION_PATH + 'ira_%s-%s-%s.csv' % (
        _expid(submission_path_meta),
        _expid(submission_path_ss),
        _expid(submission_path_mix))
    utils.save_submission(fixed_predictions, fixed_submission_path)
    print('Submission save to %s' % fixed_submission_path)
def _save_aggregate_fold_outputs(combined_oof_predictions, combined_test_predictions, mean_test_prediction, pipeline_name):
    """Save the predictions aggregated over all folds.

    Three outputs are written to ``params.experiment_dir``: the combined
    out-of-fold predictions, the combined per-fold test predictions and the
    averaged test prediction. File names are prefixed with ``pipeline_name``.
    """
    jobs = (
        ('Saving out of fold valid predictions',
         combined_oof_predictions,
         '{}_predictions_train_oof.csv'),
        ('Saving out of fold test predictions',
         combined_test_predictions,
         '{}_predictions_test_oof.csv'),
        ('Saving averaged out of fold test predictions',
         mean_test_prediction,
         '{}_predictions_test_am.csv'),
    )
    for message, predictions, name_template in jobs:
        logger.info(message)
        save_submission(predictions,
                        params.experiment_dir,
                        name_template.format(pipeline_name),
                        logger)
weight += -eta * (grad_W + (self.lambda_ * weight)) bias += -eta * grad_b[0] smallOpt = { 'eta': [2.5, 2.0, 1.5], # initial learning rate 'maxiter': [10000], # max number of iterations (updates) of SGD 'batch_size': [1., 2, 3], 'etadrop': [ 0.75, .5, 0.25 ], # when dropping eta, multiply it by this number (e.g., .5 means halve it) 'eta_frac': [0.3, 0.2, 0.1], # drop eta every eta_frac fraction of the max iterations 'lambda_': [ 0.0025, 0.005, 0.0075 ] # so if eta_frac is .2, and maxiter is 10000, drop eta every 2000 iterations } print(smallOpt) gs.GridSearchCV(softmax(), smallOpt, cv=5) gs.fit(Xsmall, Ysmall) print("Grid scores on development set:") print("Best parameters set found on development set:\n") print(gs.best_params_, "\n") y_true, y_pred = Yval.argmax(-1), gs.predict(Xval) print(classification_report(y_true, y_pred)) # Save results save_submission('submission-small.csv', gs.predict(kaggleX))
def train_rf():
    """Fit a random forest, save its test probabilities and print CV scores.

    Reads the module-level ``train_features``, ``train_labels``,
    ``test_features``, ``ids`` and ``outfile``. The second column of
    ``predict_proba`` is saved under ``outfile + "_rf"``.
    """
    rf = RandomForestClassifier(
        n_estimators=100,
        max_depth=10,
        min_samples_split=4,
        min_samples_leaf=2,
        criterion="entropy",
    )
    rf.fit(train_features, train_labels)
    probs = rf.predict_proba(test_features)[:, 1]
    save_submission(outfile + "_rf", ids, probs)
    # print() with a single argument behaves the same on Python 2 and 3; the
    # former print statement was a syntax error on Python 3.
    # NOTE(review): newer scikit-learn releases name this scorer
    # "neg_log_loss" -- confirm against the installed version.
    print(cross_val_score(rf, train_features, train_labels, scoring="log_loss"))
'etadrop': [ 0.4, 0.3, 0.2 ], # when dropping eta, multiply it by this number (e.g., .5 means halve it) 'eta_frac': [0.8, 0.7], # drop eta every eta_frac fraction of the max iterations 'lambda_': [ 0.1, 0.05, .025 ] # so if eta_frac is .2, and maxiter is 10000, drop eta every 2000 iterations } pprint.pprint(smallOpt, width=1) gs = GridSearchCV(softmaxModel(), smallOpt, cv=5, n_jobs=-1, verbose=1) gs.fit(Xsmall, Ysmall) print("Best parameters set found on development set:\n") pprint.pprint(gs.best_params_, width=1) # Test on validation y_true, y_pred = Yval.argmax(-1), gs.predict(Xval) print("\nAccuracy_Score") print(accuracy_score(y_true, y_pred)) print(classification_report(y_true, y_pred)) print("\n Confusion Matrix") print(confusion_matrix(y_true, y_pred)) # Test on custom softmax # Kaggle kagglePrediction = gs.predict(kaggleX) # Save results save_submission('submission-small.csv', kagglePrediction)
dtest = xgb.DMatrix(test[cols_].values) dtrain = \ xgb.DMatrix(train[cols_].values, label=train.loss) watchlist = [(dtrain, 'train'), (dtrain, 'eval')] gbdt = xgb.train(xgb_params, dtrain, best_nrounds, watchlist, obj=logregobj, feval=xg_eval_mae, maximize=False, verbose_eval=50, early_stopping_rounds=25) allpredictions['p1'] = \ gbdt.predict(dtest, ntree_limit=gbdt.best_ntree_limit) del dtrain del dtest del gbdt gc.collect() print(allpredictions.head()) utils.save_submission("xgb_other_train.csv", ids=train['id'], loss=train_sub) submission = pd.read_csv(directory + 'sample_submission.csv') if (kfolds > 1): submission.iloc[:, 1] = \ np.exp(allpredictions.mean(axis=1).values) - shift submission.to_csv('xgbmeansubmission.csv', index=None) submission.iloc[:, 1] = \ np.exp(allpredictions.median(axis=1).values) - shift submission.to_csv('xgbmediansubmission.csv', index=None) else: submission.iloc[:, 1] = np.exp(allpredictions.p1.values) - shift submission.to_csv('xgbsubmission.csv', index=None) print('Finished')
if set == 'test':
    # Predict the test set, optionally repeating the pass for test-time
    # augmentation (TTA), then store the per-patient averages.
    test_data_iterator = config().test_data_iterator

    if n_tta_iterations != 1:
        # several passes: keep the iterator's own parameters but fix the zoom
        test_data_iterator.transformation_params['zoom_range'] = (1., 1.)
    else:
        # single pass: use the validation transformation parameters
        test_data_iterator.transformation_params = config().valid_transformation_params

    print('n test: %d' % test_data_iterator.nsamples)
    print('tta iteration:', end=' ')

    batch_predictions, batch_ids = [], []
    for i in range(n_tta_iterations):
        print(i, end=' ')
        sys.stdout.flush()
        for xs_batch_test, _, ids_batch in buffering.buffered_gen_threaded(
                test_data_iterator.generate()):
            # load the batch into the shared variables read by iter_test_det
            for x_shared, x in zip(xs_shared, xs_batch_test):
                x_shared.set_value(x)
            batch_predictions.append(iter_test_det())
            batch_ids.append(ids_batch)

    avg_patient_predictions = config().get_avg_patient_predictions(
        batch_predictions, batch_ids, mean=mean)

    utils.save_pkl(avg_patient_predictions, prediction_path)
    print(' predictions saved to %s' % prediction_path)

    utils.save_submission(avg_patient_predictions, submission_path)
    print(' submission saved to %s' % submission_path)
# Bagged xgboost run: predictions come back in the shifted log space and are
# mapped back with exp(.) - shift before being saved.
params = {
    'min_child_weight': 1,
    'eta': 0.01,
    'colsample_bytree': 0.5,
    'max_depth': 12,
    'subsample': 0.8,
    'alpha': 1,
    'gamma': 1,
    'silent': 1,
    'verbose_eval': True,
    'seed': RANDOM_STATE,
    'num_rounds': int(4000 / 0.9),
    'feval': evalerror,
}

# xgtrain = xgb.DMatrix(trainf, label=y)
# xgtest = xgb.DMatrix(testf)
# res = xgb.cv(params, xgtrain, num_boost_round=num_rounds, nfold=5, stratified=False,
#              early_stopping_rounds=50, verbose_eval=1, show_stdv=True, feval=evalerror, maximize=False)

pred, trainpred = utils.cv_xgboost(params, trainf, y, testf, nbags=10)
pred = np.exp(pred) - shift
train_pred = np.exp(trainpred) - shift

# This copy of the test predictions is always written.
utils.save_submission("data/blended1.csv", ids=ids, loss=pred)

if not is_submission:
    # Store the predictions as a one-column feature set named 'xgb'.
    pred = pred.reshape((len(pred), 1))
    train_pred = train_pred.reshape((len(train_pred), 1))
    save_dataset(args.outfile,
                 train_features=train_pred,
                 train_labels=trainl,
                 test_features=pred,
                 ids=ids,
                 feature_names=['xgb'])
else:
    utils.save_submission(args.outfile, ids=ids, loss=pred)
def train_ada():
    """Fit AdaBoost on the module-level training data and save test probabilities.

    The second column of ``predict_proba`` is saved under ``outfile + "_ada"``.
    """
    classifier = AdaBoostClassifier(n_estimators=100)
    classifier.fit(train_features, train_labels)
    class_probabilities = classifier.predict_proba(test_features)
    save_submission(outfile + "_ada", ids, class_probabilities[:, 1])
print ' predictions saved to %s' % prediction_path print if set == 'test': test_data_iterator = config().test_data_iterator if n_tta_iterations == 1: test_data_iterator.transformation_params = config().valid_transformation_params else: test_data_iterator.transformation_params['zoom_range'] = (1., 1.) print 'n test: %d' % test_data_iterator.nsamples print 'tta iteration:', batch_predictions, batch_ids = [], [] for i in xrange(n_tta_iterations): print i, sys.stdout.flush() for xs_batch_test, _, ids_batch in buffering.buffered_gen_threaded(test_data_iterator.generate()): for x_shared, x in zip(xs_shared, xs_batch_test): x_shared.set_value(x) batch_predictions.append(iter_test_det()) batch_ids.append(ids_batch) avg_patient_predictions = config().get_avg_patient_predictions(batch_predictions, batch_ids, mean=mean) utils.save_pkl(avg_patient_predictions, prediction_path) print ' predictions saved to %s' % prediction_path utils.save_submission(avg_patient_predictions, submission_path) print ' submission saved to %s' % submission_path
def train_et():
    """Fit extra-trees on the module-level training data and save test probabilities.

    The second column of ``predict_proba`` is saved under ``outfile + "_et"``.
    """
    settings = dict(
        n_estimators=500,
        max_depth=35,
        min_samples_split=4,
        min_samples_leaf=2,
        criterion="entropy",
    )
    classifier = ExtraTreesClassifier(**settings)
    classifier.fit(train_features, train_labels)
    class_probabilities = classifier.predict_proba(test_features)
    save_submission(outfile + "_et", ids, class_probabilities[:, 1])
# Rows flagged 'valid' in the 'split' column serve as the early-stopping
# evaluation set for the wrapped xgboost model.
valid = train_and_valid[train_and_valid['split'] == 'valid']
X_valid = valid[feats]
eval_set = [(X_valid, valid[target])]
model = XGBWrapper(xgb_params, early_stopping_rounds=30, eval_set=eval_set, verbose=10)
# Score on the validation rows; the score is passed on to save_submission below.
val_score = custom_valid_scheme(model, train, valid, feats, target, agg_function=AGG_FUNCTION)
# Preprocess the test features, then predict using the combined train+valid rows.
test = convert_to_float_or_factorize_objects(test, feats)
test = predict_one_by_one(train=train_and_valid, test=test, feats=feats, model=model, agg_function=AGG_FUNCTION)
# Guard against the prediction step having reordered the test rows.
assert test.index.is_monotonic_increasing
save_submission(test, '%s.csv' % SUBMISSION_NAME, val_score=val_score)
model.plot_importance()
print('done')
# Percentiles 1..99 of the test predictions (quant[k] is the (k + 1)-th percentile)
quant = [np.percentile(test_preds, q) for q in range(1, 100)]

# Initial offsets, based on the discrepancies between the label distribution
# of the train data and the distribution of the test predictions
offsets = -1 * np.array([
    quant[9] - 1.5,
    quant[20] - 2.5,
    quant[22] - 3.5,
    quant[24] - 4.5,
    0,
    quant[34] - 5.5,
    quant[53] - 6.5,
    quant[67] - 7.5,
])

# Train offsets: row 0 holds the raw predictions, row 1 the shifted ones,
# row 2 the target.
data = np.vstack((train_preds, train_preds, target))
for j in range(num_classes):
    in_bucket = data[0].astype(int) == j
    data[1, in_bucket] = data[0, in_bucket] + offsets[j]
for j in range(num_classes):
    def train_offset(x):
        # fmin_powell minimises, so the value of apply_offset is negated
        return -apply_offset(data, x, j)
    offsets[j] = fmin_powell(train_offset, offsets[j])

print('Apply offsets to test')
data = np.vstack((test_preds, test_preds))
for j in range(num_classes):
    in_bucket = data[0].astype(int) == j
    data[1, in_bucket] = data[0, in_bucket] + offsets[j]

# Clip to the label range 1..8, then round to integer labels
preds_subm = np.round(np.clip(data[1], 1, 8)).astype(int)

# Save submission
print('Save submission file')
utils.save_submission(preds_subm)
# Both inputs go through the same embedding and LSTM layers, so the two
# branches share their weights.
embeddings_A = shared_embedding_layer(input_A)
sentence_representation_A = shared_lstm_layer(embeddings_A)
normalized_A = BatchNormalization()(sentence_representation_A)
input_B = Input(shape=(current_config.MAX_SENTENCE_LENGTH, ))
embeddings_B = shared_embedding_layer(input_B)
sentence_representation_B = shared_lstm_layer(embeddings_B)
normalized_B = BatchNormalization()(sentence_representation_B)
# The two normalized representations are reduced by euclidean_distance and a
# single sigmoid unit turns the result into the prediction.
distance = Lambda(euclidean_distance)([normalized_A, normalized_B])
predictions = Dense(1, activation='sigmoid')(distance)
model = Model(inputs=[input_A, input_B], outputs=predictions)
model.compile(optimizer='adam', loss='binary_crossentropy', metrics=['accuracy'])
model.summary()
# Weights are loaded from a fixed file; nothing is trained here.
model.load_weights('20170604_1251_1.h5')
test_dataframe = pd.read_csv('test.csv')
# The third return value is discarded in prediction mode.
test_questions_A, test_questions_B, _ = extract_questions_from_dataframe(
    test_dataframe, config=current_config, word2idx=word2idx, prediction_mode=True)
# NOTE: from here on `predictions` holds the model output, not the output tensor.
predictions = model.predict(x=[test_questions_A, test_questions_B], batch_size=8192, verbose=1)
save_submission(predictions, current_config)
# Attach the training features to the test pairs; the inner join keeps only
# (shop_id, item_id) pairs that also occur in train_X
test_X = pd.merge(test, train_X, on=['shop_id', 'item_id'], how='inner')

# Set month_num to November
test_X['month_num'] = 11

# Predict values
print('Predicting...')
rf_val_predictions = rf_model.predict(test_X[features])
output = pd.DataFrame({
    'ID': test_X['ID'],
    'item_cnt_month': rf_val_predictions,
})

# Pairs without a prediction (unknown pairs) get 0
submission_cols = ['ID', 'item_cnt_month']
output = pd.merge(test, output, on='ID', how='left')[submission_cols]
output = output.fillna(0.0)
print(output.shape)

# The output has to be aggregated per month. For now this is very crude:
# multiply each predicted value by 30 and take the mean per ID.
output['item_cnt_month'] = output['item_cnt_month'] * 30
output = output.groupby('ID').agg({'ID': 'first', 'item_cnt_month': 'mean'})

print('Saving file...')
save_submission('random_forest_submission', output)
# Cross-validate over the predefined folds; scores are keyed by the first
# group id found in each validation fold.
cv_scores = {}
for train_id, val_id in folds:
    X_train = X.iloc[train_id, :]
    y_train = y.iloc[train_id]
    X_val = X.iloc[val_id, :]
    y_val = y.iloc[val_id]

    # computed once per fold instead of twice
    val_groups = np.unique(groups[val_id])
    print("Current validation set: ", val_groups)

    model, encoder = fit_model(X_train, y_train, X_val, y_val, **best_params)
    if args.model == "xgboost":
        X_val = encoder.transform(X_val)
    y_pred = clip_target(model.predict(X_val))
    # squared=False -> root mean squared error
    cv_scores[val_groups[0]] = mean_squared_error(y_val, y_pred, squared=False)

print(f"Average CV error: {np.array(list(cv_scores.values())).mean()}")

# NOTE: `model` and `encoder` below are the ones fitted on the last fold.
if args.save_model:
    # Build the path once so that the printed location always matches the
    # file that was written (two separate now() calls can straddle a minute).
    model_path = f"../models/model_{dt.datetime.now().strftime('%Y%m%d_%H%M')}.{args.model}"
    model.save_model(model_path)
    print(f"Model saved in '{model_path}'")

if args.save_submission:
    if args.model == "xgboost":
        X_test = encoder.transform(X_test)
    id_features = id_features.loc[id_features["date_block_num"] == args.test_month_id]
    save_submission(model, X_test, id_features, adjust_with_probing=False)
    # NOTE(review): the file name is chosen inside save_submission; this
    # message only assumes it uses the same timestamp format -- confirm.
    print(f"Submission saved in ../submissions/submission_{dt.datetime.now().strftime('%Y%m%d_%H%M')}.csv")
def fit_predict(self, iteration_name, predict_test=True, save_preds=True, produce_sub=False, save_imps=True, save_aux_visu=False):
    """Cross-validate an LGBM regressor and store its outputs.

    A shuffled 5-fold ``KFold`` split is run over the training rows. Each
    fold fits an ``LGBMRegressor`` with the 'mae' objective and early
    stopping on the fold's evaluation rows, fills the out-of-fold
    predictions and, if requested, adds its share to the fold-averaged test
    prediction.

    Args:
        iteration_name: label embedded in all output file names.
        predict_test: also predict ``self.test``.
        save_preds: write out-of-fold (and test, if computed) predictions
            to HDF files in ``self.output_dir``.
        produce_sub: write a submission CSV; implies ``predict_test``.
        save_imps: collect and save per-fold feature importances.
        save_aux_visu: accepted for interface compatibility; currently has
            no effect.
    """
    # A submission cannot be produced without test predictions.
    if produce_sub:
        predict_test = True

    # --- CV setup -----------------------------------------------------------
    num_folds = 5
    splitter = KFold(n_splits=num_folds, shuffle=True, random_state=self.cv_random_seed)

    approved_feats = [col for col in self.train.columns if col not in self.feat_blacklist]
    x_all = self.train[approved_feats].values

    y_oof = np.zeros(self.y_tgt.size)
    if predict_test:
        y_test = np.zeros(self.test.shape[0])
        x_test = self.test[approved_feats].values

    eval_metrics = []
    imps = pd.DataFrame()

    # Hyper-parameters taken verbatim from self.fit_params.
    tuned_keys = (
        'num_leaves', 'learning_rate', 'n_estimators', 'reg_lambda',
        'min_child_samples', 'silent', 'bagging_fraction', 'bagging_freq',
        'bagging_seed', 'verbose',
    )

    # --- CV loop ------------------------------------------------------------
    for i, (_train, _eval) in enumerate(splitter.split(x_all)):
        print(f'> lgbm : Computing fold number {i} . . .')

        x_eval, y_eval = x_all[_eval], self.y_tgt[_eval]

        bst = lgb.LGBMRegressor(
            boosting_type='gbdt',
            objective='mae',
            **{key: self.fit_params[key] for key in tuned_keys}
        )
        bst.fit(
            X=x_all[_train],
            y=self.y_tgt[_train],
            sample_weight=self.sample_weight[_train],
            eval_set=[(x_eval, y_eval)],
            eval_names=['\neval_set'],
            early_stopping_rounds=10,
            verbose=self.fit_params['verbose'],
        )

        # Out-of-fold predictions and the fold's MAE.
        y_oof[_eval] = bst.predict(x_eval)
        metric = mean_absolute_error(y_eval, y_oof[_eval])
        eval_metrics.append(metric)
        print(f'> lgbm : Fold MAE : {metric:.4f}')

        # Each fold contributes 1/num_folds of the final test prediction.
        if predict_test:
            y_test += bst.predict(x_test) / num_folds

        if save_imps:
            imp_df = pd.DataFrame({
                'feat': approved_feats,
                'gain': bst.feature_importances_,
                'fold': i,
            })
            imps = pd.concat([imps, imp_df], axis=0, sort=False)

    print('> lgbm : CV results : ')
    print(pd.Series(eval_metrics).describe())

    np.save('../other/y_oof_.npy', y_oof)
    np.save('../other/y_tgt_.npy', self.y_tgt)
    if predict_test:
        np.save('../other/y_pred_.npy', y_test)

    # --- Output wrap-up -----------------------------------------------------
    final_metric = np.mean(eval_metrics)
    final_name = f'lgbm_{iteration_name}_{final_metric:.4f}'
    if self.postprocess_sub:
        final_name += '_pp'

    if predict_test:
        test_preds_df = pd.DataFrame(data=y_test[:, None], columns=[final_name], index=self.test.index)

    if save_imps:
        save_importances(imps, filename_='../importances/imps_' + final_name)

    if save_preds:
        train_preds_df = pd.DataFrame(data=y_oof[:, None], columns=[final_name])
        train_preds_df.to_hdf(self.output_dir + f'{final_name}_oof.h5', key='w')
        # Test predictions are only stored together with the train ones.
        if predict_test:
            test_preds_df.to_hdf(self.output_dir + f'{final_name}_test.h5', key='w')

    if produce_sub:
        save_submission(
            test_preds_df,
            sub_name=f'../submissions/{final_name}.csv',
            postprocess=self.postprocess_sub,
        )
# Pick the small-train model with the lowest validation error. The running
# best_small_* values are expected to be initialised before this point.
for lambda_,results in small_trained_models.items():
    if results['val_err'] < best_small_trained_val_err:
        best_small_trained_val_err = results['val_err']
        best_small_trained_model = results['model']
        best_small_trained_lambda = lambda_
# Same selection for the large-train models. With the 100. starting value, a
# model is only picked if its validation error is below 100; otherwise
# best_large_trained_model stays None.
best_large_trained_lambda = 0.
best_large_trained_model = None
best_large_trained_val_err = 100.
for lambda_,results in large_trained_models.items():
    if results['val_err'] < best_large_trained_val_err:
        best_large_trained_val_err = results['val_err']
        best_large_trained_model = results['model']
        best_large_trained_lambda = lambda_
print("Best small train model val err:", best_small_trained_val_err)
print("Best small train model lambda:", best_small_trained_lambda)
print("Best large train model val err:", best_large_trained_val_err)
print("Best large train model lambda:", best_large_trained_lambda)
# Generate one Kaggle submission file per selected model
# for model trained on small_train
kaggleX = load_data(data_fn, 'kaggle')
# argmax over the last axis turns the prediction rows into class indices
kaggleYhat_small = predict(kaggleX, best_small_trained_model).argmax(-1)
save_submission('submission-small.csv', kaggleYhat_small)
# for model trained on large_train
kaggleYhat_large = predict(kaggleX, best_large_trained_model).argmax(-1)
save_submission('submission-large.csv', kaggleYhat_large)
'maxiter': [10000], # max number of iterations (updates) of SGD 'batch_size': [70, 60, 50], 'etadrop': [0.95], # when dropping eta, multiply it by this number (e.g., .5 means halve it) 'eta_frac': [0.18, .2, 0.22], # drop eta every eta_frac fraction of the max iterations 'lambda_' : [0.015, 0.01, 0.05] # so if eta_frac is .2, and maxiter is 10000, drop eta every 2000 iterations } pprint.pprint(bigOpt, width=1) gs = RandomizedSearchCV(softmaxModel(), bigOpt, cv=5, n_jobs=-1, verbose=1, n_iter=144) gs.fit(Xlarge, Ylarge) print("Best parameters set found on development set:\n") pprint.pprint(gs.best_params_, width=1) # Test on validation y_true, y_pred = Yval.argmax(-1), gs.predict(Xval) print("\nAccuracy_Score") print(accuracy_score(y_true, y_pred)) print(classification_report(y_true, y_pred)) print("\n Confusion Matrix\n") print(confusion_matrix(y_true, y_pred)) # Test on custom softmax # Kaggle kagglePrediction = gs.predict(kaggleX) # Save results save_submission('submission-large.csv', kagglePrediction)
def train_gb():
    """Fit gradient boosting, save its test probabilities and print CV scores.

    Reads the module-level ``train_features``, ``train_labels``,
    ``test_features``, ``ids`` and ``outfile``. The second column of
    ``predict_proba`` is saved under ``outfile + "_gb"``.
    """
    gb = GradientBoostingClassifier(n_estimators=100)
    gb.fit(train_features, train_labels)
    probs = gb.predict_proba(test_features)[:, 1]
    save_submission(outfile + "_gb", ids, probs)
    # print() with a single argument behaves the same on Python 2 and 3; the
    # former print statements were syntax errors on Python 3.
    print("created submission for gb")
    # NOTE(review): newer scikit-learn releases name this scorer
    # "neg_log_loss" -- confirm against the installed version.
    print(cross_val_score(gb, train_features, train_labels, scoring="log_loss"))