Example #1
def train_model(modelBuilder):
    train_df = load_dataframe('train')
    test_df = load_dataframe('test')

    X_train = process(transform_dataset(train_df), isolate)
    X_test = process(transform_dataset(test_df), isolate)

    target_train = train_df['is_iceberg']
    X_train_cv, X_valid, y_train_cv, y_valid = train_test_split(
        X_train, target_train, random_state=1, train_size=0.75)

    model = modelBuilder()
    optimizer = Adam(lr=LEARNING_RATE,
                     beta_1=BETA_1,
                     beta_2=BETA_2,
                     epsilon=EPSILON,
                     decay=DECAY)
    model.compile(loss='binary_crossentropy',
                  optimizer=optimizer,
                  metrics=['accuracy'])
    model.summary()

    callbacks = build_save_callbacks(filepath=MODEL_PATH, patience=5)

    datagen = ImageDataGenerator(
        #         featurewise_center=True,
        #         featurewise_std_normalization=True,
        #         rotation_range=20,
        #         width_shift_range=0.2,
        #         height_shift_range=0.2,
        #         horizontal_flip=True
    )
    datagen.fit(X_train)

    empty = ImageDataGenerator()
    empty.fit(X_valid)

    steps_per_epoch = len(X_train_cv) // BATCH_SIZE
    hist = model.fit_generator(datagen.flow(X_train_cv,
                                            y_train_cv,
                                            batch_size=BATCH_SIZE),
                               epochs=EPOCHS,
                               verbose=VERBOSE,
                               validation_data=empty.flow(X_valid, y_valid),
                               steps_per_epoch=steps_per_epoch,
                               callbacks=callbacks)

    model.load_weights(filepath=MODEL_PATH)
    score = model.evaluate(X_valid, y_valid, verbose=1)
    print('Valid loss:', score[0])
    print('Valid accuracy:', score[1])

    predicted_test = model.predict_proba(X_test)

    save_submission(test_df, predicted_test, filename='sub.csv')
    save_history(hist.history, model_name=MODEL_NAME)
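
# build_save_callbacks is not defined in this snippet; a minimal sketch
# consistent with how it is used above (best weights checkpointed to
# MODEL_PATH so model.load_weights() restores them, plus early stopping
# with a patience window) could be:
from keras.callbacks import ModelCheckpoint, EarlyStopping

def build_save_callbacks(filepath, patience):
    # Save only the best weights seen so far and stop once val_loss stalls.
    checkpoint = ModelCheckpoint(filepath=filepath, monitor='val_loss',
                                 save_best_only=True, save_weights_only=True)
    early_stopping = EarlyStopping(monitor='val_loss', patience=patience)
    return [checkpoint, early_stopping]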
Example #2
def _fold_save_loop(valid_oof_submission, test_submission, i, pipeline_name):
    logger.info('Saving fold {} oof predictions'.format(i))
    save_submission(valid_oof_submission, params.experiment_dir,
                    '{}_predictions_valid_fold{}.csv'.format(pipeline_name,
                                                             i), logger)

    logger.info('Saving fold {} test predictions'.format(i))
    save_submission(test_submission, params.experiment_dir,
                    '{}_predictions_test_fold{}.csv'.format(pipeline_name,
                                                            i), logger)
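
# The save_submission variant used here takes (dataframe, experiment_dir,
# filename, logger); it is not defined in the snippet. A minimal sketch
# matching that call pattern (the actual helper may differ):
import os

def save_submission(submission, experiment_dir, filename, logger):
    submission_filepath = os.path.join(experiment_dir, filename)
    submission.to_csv(submission_filepath, index=False)
    logger.info('submission saved to {}'.format(submission_filepath))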
Example #3
def fix_submission(submission_path_meta,
                   submission_path_ss,
                   submission_path_mix,
                   threshold=0.03):
    pid2cdf0_meta, pid2cdf1_meta = read_submission(submission_path_meta)
    pid2cdf0_ss, pid2cdf1_ss = read_submission(submission_path_ss)
    pid2cdf0_mix, pid2cdf1_mix = read_submission(submission_path_mix)
    pid2cdf0_fixed, pid2cdf1_fixed = {}, {}

    patient_ids = pid2cdf0_meta.keys()
    assert patient_ids == pid2cdf0_ss.keys()
    assert patient_ids == pid2cdf0_mix.keys()

    # no prediction from metamodel -> take prediction from slice model
    for pid in patient_ids:
        pid2cdf0_fixed[pid] = pid2cdf0_ss[
            pid] if pid2cdf0_meta[pid] is None else pid2cdf0_meta[pid]
        pid2cdf1_fixed[pid] = pid2cdf1_ss[
            pid] if pid2cdf1_meta[pid] is None else pid2cdf1_meta[pid]

    # metamodel disagrees with slice model -> take prediction from mixture ensemble
    for pid in patient_ids:
        crps0 = utils_heart.crps(pid2cdf0_fixed[pid], pid2cdf0_ss[pid])
        if crps0 > threshold:
            print('sys', pid, crps0)
        pid2cdf0_fixed[pid] = pid2cdf0_mix[
            pid] if crps0 > threshold else pid2cdf0_fixed[pid]

        crps1 = utils_heart.crps(pid2cdf1_fixed[pid], pid2cdf1_ss[pid])
        if crps1 > threshold:
            print('dst', pid, crps1)
        pid2cdf1_fixed[pid] = pid2cdf1_mix[
            pid] if crps1 > threshold else pid2cdf1_fixed[pid]

    fixed_predictions = {}
    for pid in patient_ids:
        fixed_predictions[pid] = [pid2cdf0_fixed[pid], pid2cdf1_fixed[pid]]

    meta_expid = submission_path_meta.split('-')[-1].replace('.csv', '')
    ss_expid = submission_path_ss.split('-')[-1].replace('.csv', '')
    mix_expid = submission_path_mix.split('-')[-1].replace('.csv', '')
    fixed_submission_path = SUBMISSION_PATH + 'ira_%s-%s-%s.csv' % (
        meta_expid, ss_expid, mix_expid)
    utils.save_submission(fixed_predictions, fixed_submission_path)
    print('Submission saved to', fixed_submission_path)
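
# utils_heart.crps is not shown; for predicted-CDF submissions like these it
# is conventionally the mean squared difference between two discretized CDFs
# evaluated on the same grid of thresholds. A sketch under that assumption:
import numpy as np

def crps(cdf_a, cdf_b):
    # Continuous Ranked Probability Score between two step CDFs.
    return np.mean((np.asarray(cdf_a) - np.asarray(cdf_b)) ** 2)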
Example #4
    def predict(self,
                x_rnn,
                x_fc,
                verbosity=0,
                write_to_file=False,
                overwrite=True,
                path=None):
        y_pred = self.model.predict(
            {
                'tracks_input': x_rnn,
                'session_input': x_fc
            }, verbose=verbosity)

        if write_to_file:
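            # Recover each session's length from the first fc feature
            # (inverts an assumed (length - 10) / 10 scaling).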
            session_length = x_fc[:, 0] * 10 + 10

            if path is None:
                path = '../../data/submissions'
            path = path + '/' + self.model_name + '_' + self.now + '.txt'
            utils.save_submission(y_pred,
                                  session_length,
                                  path,
                                  overwrite=overwrite)
Example #6
def _save_aggregate_fold_outputs(combined_oof_predictions,
                                 combined_test_predictions,
                                 mean_test_prediction, pipeline_name):
    logger.info('Saving out of fold valid predictions')
    save_submission(combined_oof_predictions, params.experiment_dir,
                    '{}_predictions_train_oof.csv'.format(pipeline_name),
                    logger)

    logger.info('Saving out of fold test predictions')
    save_submission(combined_test_predictions, params.experiment_dir,
                    '{}_predictions_test_oof.csv'.format(pipeline_name),
                    logger)

    logger.info('Saving averaged out of fold test predictions')
    save_submission(mean_test_prediction, params.experiment_dir,
                    '{}_predictions_test_am.csv'.format(pipeline_name), logger)
Example #7
        weight += -eta * (grad_W + (self.lambda_ * weight))
        bias += -eta * grad_b[0]


smallOpt = {
    'eta': [2.5, 2.0, 1.5],  # initial learning rate
    'maxiter': [10000],  # max number of iterations (updates) of SGD
    'batch_size': [1., 2, 3],
    'etadrop': [0.75, .5, 0.25],  # when dropping eta, multiply it by this number (e.g., .5 means halve it)
    'eta_frac': [0.3, 0.2, 0.1],  # drop eta every eta_frac fraction of the max iterations
    'lambda_': [0.0025, 0.005, 0.0075]  # so if eta_frac is .2, and maxiter is 10000, drop eta every 2000 iterations
}

print(smallOpt)

gs = GridSearchCV(softmax(), smallOpt, cv=5)
gs.fit(Xsmall, Ysmall)
print("Grid scores on development set:")
print("Best parameters set found on development set:\n")
print(gs.best_params_, "\n")
y_true, y_pred = Yval.argmax(-1), gs.predict(Xval)
print(classification_report(y_true, y_pred))

# Save results
save_submission('submission-small.csv', gs.predict(kaggleX))
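
# The etadrop/eta_frac comments describe a step-decay schedule. A sketch of
# the learning rate such an SGD loop would use at update t (hypothetical
# helper, not part of the original code):
def current_eta(eta, etadrop, eta_frac, maxiter, t):
    # eta is multiplied by etadrop once per completed interval of
    # eta_frac * maxiter updates.
    drops = int(t // (eta_frac * maxiter))
    return eta * (etadrop ** drops)

# e.g. with eta=2.0, etadrop=0.5, eta_frac=0.2, maxiter=10000 the rate
# halves at t = 2000, 4000, 6000 and 8000.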
Example #8
def train_rf():
    rf = RandomForestClassifier(n_estimators=100, max_depth=10, min_samples_split=4, min_samples_leaf=2, criterion="entropy")
    rf.fit(train_features, train_labels)
    probs = rf.predict_proba(test_features)[:, 1]
    save_submission(outfile + "_rf", ids, probs)
    print(cross_val_score(rf, train_features, train_labels, scoring="log_loss"))
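
# The save_submission used by these train_* helpers is not shown; given the
# (filename, ids, probs) signature, a minimal CSV writer sketch (column
# names are assumptions) could be:
import pandas as pd

def save_submission(filename, ids, probs):
    # One row per test id with the predicted positive-class probability.
    pd.DataFrame({'id': ids, 'prediction': probs}).to_csv(filename + '.csv',
                                                          index=False)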
Example #9
    'etadrop': [0.4, 0.3, 0.2],  # when dropping eta, multiply it by this number (e.g., .5 means halve it)
    'eta_frac': [0.8, 0.7],  # drop eta every eta_frac fraction of the max iterations
    'lambda_': [0.1, 0.05, .025]  # so if eta_frac is .2, and maxiter is 10000, drop eta every 2000 iterations
}

pprint.pprint(smallOpt, width=1)

gs = GridSearchCV(softmaxModel(), smallOpt, cv=5, n_jobs=-1, verbose=1)
gs.fit(Xsmall, Ysmall)
print("Best parameters set found on development set:\n")
pprint.pprint(gs.best_params_, width=1)

# Test on validation
y_true, y_pred = Yval.argmax(-1), gs.predict(Xval)
print("\nAccuracy_Score")
print(accuracy_score(y_true, y_pred))
print(classification_report(y_true, y_pred))

print("\n Confusion Matrix")
print(confusion_matrix(y_true, y_pred))
# Test on custom softmax
# Kaggle
kagglePrediction = gs.predict(kaggleX)
# Save results
save_submission('submission-small.csv', kagglePrediction)
Example #10
        dtest = xgb.DMatrix(test[cols_].values)
        dtrain = \
            xgb.DMatrix(train[cols_].values,
                        label=train.loss)
        watchlist = [(dtrain, 'train'), (dtrain, 'eval')]
        gbdt = xgb.train(xgb_params, dtrain, best_nrounds, watchlist,
                         obj=logregobj,
                         feval=xg_eval_mae, maximize=False,
                         verbose_eval=50, early_stopping_rounds=25)
        allpredictions['p1'] = \
            gbdt.predict(dtest, ntree_limit=gbdt.best_ntree_limit)
        del dtrain
        del dtest
        del gbdt
        gc.collect()

    print(allpredictions.head())
    utils.save_submission("xgb_other_train.csv", ids=train['id'], loss=train_sub)
    submission = pd.read_csv(directory + 'sample_submission.csv')
    if (kfolds > 1):
        submission.iloc[:, 1] = \
            np.exp(allpredictions.mean(axis=1).values) - shift
        submission.to_csv('xgbmeansubmission.csv', index=None)
        submission.iloc[:, 1] = \
            np.exp(allpredictions.median(axis=1).values) - shift
        submission.to_csv('xgbmediansubmission.csv', index=None)
    else:
        submission.iloc[:, 1] = np.exp(allpredictions.p1.values) - shift
        submission.to_csv('xgbsubmission.csv', index=None)
    print('Finished')
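
# The np.exp(...) - shift steps above invert the usual log-shift target
# transform, i.e. the model was presumably trained on y = log(loss + shift):
#     train['loss'] = np.log(train['loss'] + shift)   # forward (assumed)
#     loss = np.exp(prediction) - shift               # inverse, as used here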
Example #11
if set == 'test':
    test_data_iterator = config().test_data_iterator

    if n_tta_iterations == 1:
        test_data_iterator.transformation_params = config(
        ).valid_transformation_params
    else:
        test_data_iterator.transformation_params['zoom_range'] = (1., 1.)

    print('n test: %d' % test_data_iterator.nsamples)
    print('tta iteration:', end=' ')

    batch_predictions, batch_ids = [], []
    for i in range(n_tta_iterations):
        print(i, end=' ')
        sys.stdout.flush()
        for xs_batch_test, _, ids_batch in buffering.buffered_gen_threaded(
                test_data_iterator.generate()):
            for x_shared, x in zip(xs_shared, xs_batch_test):
                x_shared.set_value(x)
            batch_predictions.append(iter_test_det())
            batch_ids.append(ids_batch)

    avg_patient_predictions = config().get_avg_patient_predictions(
        batch_predictions, batch_ids, mean=mean)
    utils.save_pkl(avg_patient_predictions, prediction_path)
    print(' predictions saved to %s' % prediction_path)

    utils.save_submission(avg_patient_predictions, submission_path)
    print(' submission saved to %s' % submission_path)
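
# config().get_avg_patient_predictions is competition-specific; conceptually
# it groups the collected batch predictions by patient id and averages them
# across TTA iterations. A simplified sketch (shapes are assumptions):
from collections import defaultdict
import numpy as np

def avg_patient_predictions(batch_predictions, batch_ids):
    grouped = defaultdict(list)
    for preds, ids in zip(batch_predictions, batch_ids):
        for pred, pid in zip(preds, ids):
            grouped[pid].append(pred)
    return {pid: np.mean(preds, axis=0) for pid, preds in grouped.items()}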
Example #12
    params = {
        'min_child_weight': 1,
        'eta': 0.01,
        'colsample_bytree': 0.5,
        'max_depth': 12,
        'subsample': 0.8,
        'alpha': 1,
        'gamma': 1,
        'silent': 1,
        'verbose_eval': True,
        'seed': RANDOM_STATE
    }

#    xgtrain = xgb.DMatrix(trainf, label=y)
#    xgtest = xgb.DMatrix(testf)

#    res = xgb.cv(params, xgtrain, num_boost_round=num_rounds, nfold=5, stratified=False,
#         early_stopping_rounds=50, verbose_eval=1, show_stdv=True, feval=evalerror, maximize=False)
    params["num_rounds"] = int(4000 / 0.9)
    params["feval"] = evalerror
    pred, trainpred = utils.cv_xgboost(params, trainf, y, testf, nbags=10)
    pred = np.exp(pred) - shift
    train_pred = np.exp(trainpred) - shift
    utils.save_submission("data/blended1.csv", ids=ids, loss=pred)
    if is_submission:
        utils.save_submission(args.outfile, ids=ids, loss=pred)
    else:
        pred = pred.reshape((len(pred),1))
        train_pred = train_pred.reshape((len(train_pred),1))
        save_dataset(args.outfile, train_features=train_pred, train_labels=trainl, test_features=pred, ids=ids, feature_names=['xgb'])
    
Example #13
def train_ada():
    ada = AdaBoostClassifier(n_estimators=100)
    ada.fit(train_features, train_labels)
    probs = ada.predict_proba(test_features)[:,1]
    save_submission(outfile+"_ada", ids, probs)
Example #15
def train_et():
    et = ExtraTreesClassifier(n_estimators=500, max_depth=35, min_samples_split=4, min_samples_leaf=2, criterion="entropy")
    et.fit(train_features, train_labels)
    probs = et.predict_proba(test_features)[:, 1]
    save_submission(outfile + "_et", ids, probs)
Example #16
valid = train_and_valid[train_and_valid['split'] == 'valid']
X_valid = valid[feats]
eval_set = [(X_valid, valid[target])]
model = XGBWrapper(xgb_params,
                   early_stopping_rounds=30,
                   eval_set=eval_set,
                   verbose=10)

val_score = custom_valid_scheme(model,
                                train,
                                valid,
                                feats,
                                target,
                                agg_function=AGG_FUNCTION)

test = convert_to_float_or_factorize_objects(test, feats)

test = predict_one_by_one(train=train_and_valid,
                          test=test,
                          feats=feats,
                          model=model,
                          agg_function=AGG_FUNCTION)
assert test.index.is_monotonic_increasing

save_submission(test, '%s.csv' % SUBMISSION_NAME, val_score=val_score)

model.plot_importance()

print('done')
Example #17
# Compute quantiles of test predictions
quant = []
for q in range(1, 100):
    p = np.percentile(test_preds, q)
    quant.append(p)

# Compute initial offset values based on the discrepancies between the label
# distribution of the train data and the distribution of the test predictions
offsets = -1 * np.array(
    [quant[9] - 1.5, quant[20] - 2.5, quant[22] - 3.5, quant[24] - 4.5, 0, quant[34] - 5.5, quant[53] - 6.5,
     quant[67] - 7.5])

# train offsets 
data = np.vstack((train_preds, train_preds, target))
for j in range(num_classes):
    data[1, data[0].astype(int) == j] = data[0, data[0].astype(int) == j] + offsets[j]
for j in range(num_classes):
    train_offset = lambda x: -apply_offset(data, x, j)
    offsets[j] = fmin_powell(train_offset, offsets[j])

print('Apply offsets to test')
data = np.vstack((test_preds, test_preds))
for j in range(num_classes):
    data[1, data[0].astype(int) == j] = data[0, data[0].astype(int) == j] + offsets[j]

preds_subm = np.round(np.clip(data[1], 1, 8)).astype(int)

# Save submission
print('Save submission file')
utils.save_submission(preds_subm)
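
# apply_offset is not shown. In kernels that use this offset trick it shifts
# the predictions whose integer part equals class j by a candidate offset and
# returns the score against the target stored in data[2] (typically quadratic
# weighted kappa, hence the minimization of its negative above). A sketch
# under those assumptions:
import numpy as np
from ml_metrics import quadratic_weighted_kappa

def apply_offset(data, bin_offset, j):
    # data[0]: raw predictions, data[1]: offset predictions, data[2]: target
    mask = data[0].astype(int) == j
    data[1, mask] = data[0, mask] + bin_offset
    return quadratic_weighted_kappa(
        np.clip(np.round(data[1]), 1, 8).astype(int), data[2].astype(int))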
Example #18
embeddings_A = shared_embedding_layer(input_A)
sentence_representation_A = shared_lstm_layer(embeddings_A)
normalized_A = BatchNormalization()(sentence_representation_A)

input_B = Input(shape=(current_config.MAX_SENTENCE_LENGTH, ))
embeddings_B = shared_embedding_layer(input_B)
sentence_representation_B = shared_lstm_layer(embeddings_B)
normalized_B = BatchNormalization()(sentence_representation_B)

distance = Lambda(euclidean_distance)([normalized_A, normalized_B])

predictions = Dense(1, activation='sigmoid')(distance)

model = Model(inputs=[input_A, input_B], outputs=predictions)
model.compile(optimizer='adam',
              loss='binary_crossentropy',
              metrics=['accuracy'])
model.summary()

model.load_weights('20170604_1251_1.h5')
test_dataframe = pd.read_csv('test.csv')
test_questions_A, test_questions_B, _ = extract_questions_from_dataframe(
    test_dataframe,
    config=current_config,
    word2idx=word2idx,
    prediction_mode=True)
predictions = model.predict(x=[test_questions_A, test_questions_B],
                            batch_size=8192,
                            verbose=1)
save_submission(predictions, current_config)
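
# euclidean_distance is not defined in the snippet; the usual helper for a
# Keras Lambda layer like the one above (a sketch, assuming the Keras
# backend API):
from keras import backend as K

def euclidean_distance(vects):
    x, y = vects
    # Floor the sum before sqrt so the gradient stays finite at zero.
    return K.sqrt(K.maximum(K.sum(K.square(x - y), axis=1, keepdims=True),
                            K.epsilon()))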
Example #19
# Featurize known data
test_X = pd.merge(test, train_X, on=['shop_id', 'item_id'], how='inner')

# Set month_num to November
test_X['month_num'] = 11

# Predict values
print('Predicting...')
rf_val_predictions = rf_model.predict(test_X[features])

output = pd.DataFrame({
    'ID': test_X['ID'],
    'item_cnt_month': rf_val_predictions
})

# Set prediction of unknown pairs
submission_cols = ['ID', 'item_cnt_month']

output = pd.merge(test, output, on='ID', how='left')[submission_cols]
output.fillna(0.0, inplace=True)
print(output.shape)

# Have to output aggregated data... For now just do something very stupid:
# multiply each daily sale by the number of days in November and take the mean for the pair...
output['item_cnt_month'] = output['item_cnt_month'].apply(lambda x: x * 30)
output = output.groupby('ID').agg({'ID': 'first', 'item_cnt_month': 'mean'})

print('Saving file...')

save_submission('random_forest_submission', output)
Example #20
    cv_scores = {}
    for train_id, val_id in folds:

        X_train = X.iloc[train_id, :]
        y_train = y.iloc[train_id]
        X_val = X.iloc[val_id, :]
        y_val = y.iloc[val_id]

        print("Current validation set: ", np.unique(groups[val_id]))

        model, encoder = fit_model(X_train, y_train, X_val, y_val, **best_params)
        if args.model == "xgboost":
            X_val = encoder.transform(X_val)
        y_pred = clip_target(model.predict(X_val))
        cv_scores[np.unique(groups[val_id])[0]] = mean_squared_error(y_val, y_pred, squared=False)
        print(f"Average CV error: {np.array([cv_scores[i] for i in cv_scores]).mean()}")

    if args.save_model:
        model.save_model(f"../models/model_{dt.datetime.now().strftime('%Y%m%d_%H%M')}.{args.model}")
        print(f"Model saved in '../models/model_{dt.datetime.now().strftime('%Y%m%d_%H%M')}.{args.model}'")

    if args.save_submission:

        if args.model == "xgboost":
            X_test = encoder.transform(X_test)

        id_features = id_features.loc[id_features["date_block_num"] == args.test_month_id]
        save_submission(model, X_test, id_features, adjust_with_probing=False)
        print(f"Submission saved in ../submissions/submission_{dt.datetime.now().strftime('%Y%m%d_%H%M')}.csv")

Example #21
    def fit_predict(self,
                    iteration_name,
                    predict_test=True,
                    save_preds=True,
                    produce_sub=False,
                    save_imps=True,
                    save_aux_visu=False):

        if produce_sub:
            predict_test = True
        '''
        Setup CV
        '''

        # CV cycle collectors
        y_oof = np.zeros(self.y_tgt.size)
        if predict_test:
            y_test = np.zeros(self.test.shape[0])
        eval_metrics = []
        imps = pd.DataFrame()

        # Set up CV (plain KFold, not actually stratified)
        num_folds = 5
        folds = KFold(n_splits=num_folds,
                      shuffle=True,
                      random_state=self.cv_random_seed)

        # Extract numpy arrays for use in lgbm fit method
        approved_feats = [
            feat for feat in list(self.train.columns)
            if feat not in self.feat_blacklist
        ]

        x_all = self.train[approved_feats].values
        if predict_test:
            x_test = self.test[approved_feats].values

        for i, (_train, _eval) in enumerate(folds.split(x_all)):

            print(f'> lgbm : Computing fold number {i} . . .')

            # Setup fold data
            x_train, y_train = x_all[_train], self.y_tgt[_train]
            sample_weight = self.sample_weight[_train]
            x_eval, y_eval = x_all[_eval], self.y_tgt[_eval]

            # Set up LGBM regressor (MAE objective)
            bst = lgb.LGBMRegressor(
                boosting_type='gbdt',
                num_leaves=self.fit_params['num_leaves'],
                learning_rate=self.fit_params['learning_rate'],
                n_estimators=self.fit_params['n_estimators'],
                objective='mae',
                # alpha=0.5,
                reg_lambda=self.fit_params['reg_lambda'],
                min_child_samples=self.fit_params['min_child_samples'],
                silent=self.fit_params['silent'],
                bagging_fraction=self.fit_params['bagging_fraction'],
                bagging_freq=self.fit_params['bagging_freq'],
                bagging_seed=self.fit_params['bagging_seed'],
                verbose=self.fit_params['verbose'],
            )

            # Train bst
            bst.fit(
                X=x_train,
                y=y_train,
                sample_weight=sample_weight,
                eval_set=[(x_eval, y_eval)],
                eval_names=['\neval_set'],
                early_stopping_rounds=10,
                verbose=self.fit_params['verbose'],
            )

            # Compute and store oof predictions and metric, performing custom thresholding
            y_oof[_eval] = bst.predict(x_eval)
            metric = mean_absolute_error(y_eval, y_oof[_eval])
            eval_metrics.append(metric)
            print(f'> lgbm : Fold MAE : {metric:.4f}')

            # Build test predictions
            if predict_test:
                y_test += bst.predict(x_test) / num_folds

            # Store importances
            if save_imps:
                imp_df = pd.DataFrame()
                imp_df['feat'] = approved_feats
                imp_df['gain'] = bst.feature_importances_
                imp_df['fold'] = i
                imps = pd.concat([imps, imp_df], axis=0, sort=False)

        print('> lgbm : CV results : ')
        print(pd.Series(eval_metrics).describe())

        np.save('../other/y_oof_.npy', y_oof)
        np.save('../other/y_tgt_.npy', self.y_tgt)

        if predict_test:
            np.save('../other/y_pred_.npy', y_test)
        '''
        Output wrap-up : save importances, predictions (oof and test), submission and others
        '''

        # Insert here additional metrics
        final_metric = np.mean(eval_metrics)

        if self.postprocess_sub:
            final_name = f'lgbm_{iteration_name}_{final_metric:.4f}_pp'
        else:
            final_name = f'lgbm_{iteration_name}_{final_metric:.4f}'

        if predict_test:
            test_preds_df = pd.DataFrame(data=y_test[:, None],
                                         columns=[final_name],
                                         index=self.test.index)

        if save_imps:
            save_importances(imps,
                             filename_='../importances/imps_' + final_name)

        if save_preds:
            train_preds_df = pd.DataFrame(data=y_oof[:, None],
                                          columns=[final_name])
            train_preds_df.to_hdf(self.output_dir + f'{final_name}_oof.h5',
                                  key='w')

            # No sense in saving test without train hence indent
            if predict_test:
                test_preds_df.to_hdf(self.output_dir + f'{final_name}_test.h5',
                                     key='w')

        if produce_sub:
            save_submission(
                test_preds_df,
                sub_name=f'../submissions/{final_name}.csv',
                postprocess=self.postprocess_sub,
            )

        if save_aux_visu:
            if False:
                plot_aux_visu()
            pass
Example #22
for lambda_,results in small_trained_models.items():
    if results['val_err'] < best_small_trained_val_err:
        best_small_trained_val_err = results['val_err']
        best_small_trained_model = results['model']
        best_small_trained_lambda = lambda_
        
best_large_trained_lambda = 0.
best_large_trained_model = None
best_large_trained_val_err = 100.
for lambda_,results in large_trained_models.items():
    if results['val_err'] < best_large_trained_val_err:
        best_large_trained_val_err = results['val_err']
        best_large_trained_model = results['model']
        best_large_trained_lambda = lambda_

print("Best small train model val err:", best_small_trained_val_err)
print("Best small train model lambda:", best_small_trained_lambda)
print("Best large train model val err:", best_large_trained_val_err)
print("Best large train model lambda:", best_large_trained_lambda)

# Generate a Kaggle submission file using `model`
# for model trained on small_train
kaggleX = load_data(data_fn, 'kaggle')
kaggleYhat_small = predict(kaggleX, best_small_trained_model).argmax(-1)
save_submission('submission-small.csv', kaggleYhat_small)

#for model trained on large_train
kaggleYhat_large = predict(kaggleX, best_large_trained_model).argmax(-1)
save_submission('submission-large.csv', kaggleYhat_large)
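
# The two selection loops above can be written more compactly with min()
# over the dict items; an equivalent sketch (same dict layout assumed):
best_small_trained_lambda, best_small = min(
    small_trained_models.items(), key=lambda kv: kv[1]['val_err'])
best_small_trained_model = best_small['model']
best_small_trained_val_err = best_small['val_err']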

Example #23
    'maxiter': [10000],  # max number of iterations (updates) of SGD
    'batch_size': [70, 60, 50], 
    'etadrop': [0.95], # when dropping eta, multiply it by this number (e.g., .5 means halve it)
    'eta_frac': [0.18, .2, 0.22],  # drop eta every eta_frac fraction of the max iterations
    'lambda_': [0.015, 0.01, 0.05]  # so if eta_frac is .2, and maxiter is 10000, drop eta every 2000 iterations
}

pprint.pprint(bigOpt, width=1)

gs = RandomizedSearchCV(softmaxModel(), bigOpt, cv=5, n_jobs=-1, 
    verbose=1, n_iter=144)
gs.fit(Xlarge, Ylarge)
print("Best parameters set found on development set:\n")
pprint.pprint(gs.best_params_, width=1)
# Test on validation
y_true, y_pred = Yval.argmax(-1), gs.predict(Xval)
print("\nAccuracy_Score")
print(accuracy_score(y_true, y_pred))
print(classification_report(y_true, y_pred))

print("\n Confusion Matrix\n")
print(confusion_matrix(y_true, y_pred))

# Test on custom softmax
# Kaggle 
kagglePrediction = gs.predict(kaggleX)
# Save results
save_submission('submission-large.csv',  kagglePrediction)


Example #24
def train_gb():
    gb = GradientBoostingClassifier(n_estimators=100)
    gb.fit(train_features, train_labels)
    probs = gb.predict_proba(test_features)[:,1]
    save_submission(outfile+"_gb", ids, probs)
    print "created submission for gb"
    print cross_val_score(gb, train_features, train_labels, scoring="log_loss")