# Imports assumed by the snippets below; project constants (RAW_DATA_DIRPATH,
# NROWS, SEED, TEST_SIZE, the *_PATH / *_DIRPATH values, LGBM_PARAMS, ...)
# and project-specific code such as CreditDefaultClassifier are defined
# elsewhere in the repository.
import os

import joblib
import matplotlib.pyplot as plt
import neptune
import pandas as pd
import scikitplot.metrics as sk_metrics  # assumed source of the sk_metrics plots
import skopt
from neptunecontrib.monitoring.lightgbm import neptune_monitor  # assumed: neptune-contrib's LightGBM callback
from sklearn.model_selection import train_test_split


def main():
    neptune.init(api_token=os.getenv('NEPTUNE_API_TOKEN'),
                 project_qualified_name=os.getenv('NEPTUNE_PROJECT'))

    application_table_path = os.path.join(RAW_DATA_DIRPATH,
                                          'application_train.csv.zip')
    application_table = pd.read_csv(application_table_path, nrows=NROWS)

    index_table = application_table[['SK_ID_CURR', 'TARGET']]

    with neptune.create_experiment(name='validation schema',
                                   tags=['processed', 'validation'],
                                   upload_source_files=get_filepaths()):

        train_idx, valid_idx = train_test_split(index_table,
                                                test_size=TEST_SIZE,
                                                random_state=SEED)
        train_idx_path = os.path.join(INTERIM_FEATURES_DIRPATH,
                                      'train_idx.csv')
        train_idx.to_csv(train_idx_path, index=False)
        neptune.send_artifact(train_idx_path)
        neptune.set_property('train_split_version', md5_hash(train_idx_path))

        valid_idx_path = os.path.join(INTERIM_FEATURES_DIRPATH,
                                      'valid_idx.csv')
        valid_idx.to_csv(valid_idx_path, index=False)
        neptune.send_artifact(valid_idx_path)
        neptune.set_property('valid_split_version', md5_hash(valid_idx_path))
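
# The snippets above and below version every artifact through an md5_hash
# helper that is never shown. A minimal sketch of such a helper, assuming it
# simply fingerprints the file contents so the digest can be stored as a
# Neptune property for data versioning:
import hashlib

def md5_hash(filepath, chunk_size=2 ** 20):
    """Return the hex MD5 digest of a file, read in chunks."""
    md5 = hashlib.md5()
    with open(filepath, 'rb') as handle:
        for chunk in iter(lambda: handle.read(chunk_size), b''):
            md5.update(chunk)
    return md5.hexdigest()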
def main():
    neptune.init(api_token=os.getenv('NEPTUNE_API_TOKEN'),
                 project_qualified_name=os.getenv('NEPTUNE_PROJECT'))

    bureau_raw_path = os.path.join(RAW_DATA_DIRPATH, 'bureau.csv.zip')
    bureau_raw = pd.read_csv(bureau_raw_path, nrows=NROWS)

    with neptune.create_experiment(name='feature_extraction',
                                   tags=['interim',
                                         'bureau',
                                         'feature_extraction'],
                                   upload_source_files=get_filepaths()):

        bureau_features, numeric_cols = extract(bureau_raw)
        bureau_features.to_csv(INTERIM_FEATURES_FILEPATH, index=False)

        neptune.set_property('numeric_features', str(numeric_cols))
        neptune.set_property('features_version', md5_hash(INTERIM_FEATURES_FILEPATH))
        neptune.set_property('features_path', INTERIM_FEATURES_FILEPATH)
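
# extract() is project code not included in the snippet above; it returns an
# aggregated feature frame plus the list of numeric feature names. A hedged
# sketch of one possible implementation, with assumed bureau.csv column names
# and illustrative aggregations:
def extract(bureau):
    grouped = bureau.groupby('SK_ID_CURR')
    features = grouped.agg({'AMT_CREDIT_SUM': ['mean', 'max', 'sum'],
                            'DAYS_CREDIT': ['min', 'mean']})
    # flatten the MultiIndex produced by the dict-of-lists aggregation
    features.columns = ['bureau_' + '_'.join(col) for col in features.columns]
    features = features.reset_index()
    numeric_cols = [col for col in features.columns if col != 'SK_ID_CURR']
    return features, numeric_cols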
def main():
    neptune.init(api_token=os.getenv('NEPTUNE_API_TOKEN'),
                 project_qualified_name=os.getenv('NEPTUNE_PROJECT'))

    interim_feature_paths = [APPLICATION_FEATURES_PATH, BUREAU_FEATURES_PATH]

    with neptune.create_experiment(
            name='feature_extraction',
            tags=['processed', 'feature_extraction', 'joined_features'],
            upload_source_files=get_filepaths()):

        # seed the join with the ID column only, then merge every interim
        # feature table onto it (the first table is intentionally read twice)
        features = pd.read_csv(interim_feature_paths[0],
                               usecols=['SK_ID_CURR'],
                               nrows=NROWS)
        for path in interim_feature_paths:
            df = pd.read_csv(path, nrows=NROWS)
            features = features.merge(df, on='SK_ID_CURR')

        features.to_csv(PROCESSED_FEATURES_FILEPATH, index=False)
        neptune.set_property('features_version',
                             md5_hash(PROCESSED_FEATURES_FILEPATH))
        neptune.set_property('features_path', PROCESSED_FEATURES_FILEPATH)
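
# Every experiment in these snippets snapshots its source files via
# get_filepaths(), which is also defined outside the listing. A minimal
# sketch, assuming it just collects the repository's Python and config files:
from pathlib import Path

def get_filepaths(extensions=('.py', '.yaml', '.yml')):
    return [str(path) for path in Path('.').rglob('*')
            if path.suffix in extensions]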
def main():
    neptune.init(api_token=os.getenv('NEPTUNE_API_TOKEN'),
                 project_qualified_name=os.getenv('NEPTUNE_PROJECT'))

    # Assumed setup, mirroring the training example below: load the split
    # indices and the feature table, then join them on SK_ID_CURR.
    train_idx = pd.read_csv(TRAIN_IDX_PATH, nrows=NROWS)
    valid_idx = pd.read_csv(VALID_IDX_PATH, nrows=NROWS)
    features = pd.read_csv(FEATURES_PATH, nrows=NROWS)

    train = pd.merge(train_idx, features, on='SK_ID_CURR')
    valid = pd.merge(valid_idx, features, on='SK_ID_CURR')

    experiment_params = {**STATIC_PARAMS, **HPO_PARAMS}  # assumed

    def monitor(res):
        # assumed skopt callback: track the best score found so far
        neptune.send_metric('run_score', -1.0 * res.fun)

    @skopt.utils.use_named_args(SPACE)
    def objective(**params):
        all_params = {**params, **STATIC_PARAMS}
        results = train_evaluate(train, valid, all_params)
        return -1.0 * results['valid_score']

    with neptune.create_experiment(name='model training',
                                   params=experiment_params,
                                   tags=['hpo', 'lgbm'],
                                   upload_source_files=get_filepaths(),
                                   properties={
                                       'features_path': FEATURES_PATH,
                                       'features_version': md5_hash(FEATURES_PATH),
                                       'train_split_version': md5_hash(TRAIN_IDX_PATH),
                                       'valid_split_version': md5_hash(VALID_IDX_PATH)
                                   }):

        results = skopt.forest_minimize(objective,
                                        SPACE,
                                        callback=[monitor],
                                        **HPO_PARAMS)

        best_auc = -1.0 * results.fun
        best_params = results.x

        neptune.send_metric('valid_auc', best_auc)
        # record the winning hyper-parameters alongside the score
        neptune.set_property('best_params', str(best_params))
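
# The HPO snippet above relies on SPACE, STATIC_PARAMS and HPO_PARAMS being
# defined elsewhere. Illustrative values only, to make the example
# self-contained (assumptions, not the original configuration):
SPACE = [
    skopt.space.Real(0.01, 0.3, name='learning_rate', prior='log-uniform'),
    skopt.space.Integer(16, 128, name='num_leaves'),
    skopt.space.Real(0.5, 1.0, name='feature_fraction'),
]
STATIC_PARAMS = {'objective': 'binary', 'metric': 'auc'}
HPO_PARAMS = {'n_calls': 100, 'n_random_starts': 10, 'random_state': 1234}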
def main():
    neptune.init(api_token=os.getenv('NEPTUNE_API_TOKEN'),
                 project_qualified_name=os.getenv('NEPTUNE_PROJECT'))

    train_idx = pd.read_csv(TRAIN_IDX_PATH, nrows=NROWS)
    valid_idx = pd.read_csv(VALID_IDX_PATH, nrows=NROWS)
    features = pd.read_csv(FEATURES_PATH, nrows=NROWS)

    train = pd.merge(train_idx, features, on='SK_ID_CURR')
    valid = pd.merge(valid_idx, features, on='SK_ID_CURR')

    all_params = {
        'num_boost_round': NUM_BOOST_ROUND,
        'early_stopping_rounds': EARLY_STOPPING_ROUNDS,
        **LGBM_PARAMS
    }

    with neptune.create_experiment(name='model training',
                                   params=all_params,
                                   tags=['lgbm'],
                                   upload_source_files=get_filepaths(),
                                   properties={
                                       'features_path': FEATURES_PATH,
                                       'features_version': md5_hash(FEATURES_PATH),
                                       'train_split_version': md5_hash(TRAIN_IDX_PATH),
                                       'valid_split_version': md5_hash(VALID_IDX_PATH),
                                   }):
        # train with all_params (which adds the boosting-round settings),
        # not the bare LGBM_PARAMS, so the logged params match the run
        results = train_evaluate(train,
                                 valid,
                                 all_params,
                                 callbacks=[neptune_monitor()])
        train_score = results['train_score']
        valid_score = results['valid_score']
        train_preds = results['train_preds']
        valid_preds = results['valid_preds']

        neptune.send_metric('train_auc', train_score)
        neptune.send_metric('valid_auc', valid_score)

        train_pred_path = os.path.join(PREDICTION_DIRPATH, 'train_preds.csv')
        train_preds.to_csv(train_pred_path, index=False)
        neptune.send_artifact(train_pred_path)

        valid_pred_path = os.path.join(PREDICTION_DIRPATH, 'valid_preds.csv')
        valid_preds.to_csv(valid_pred_path, index=False)
        neptune.send_artifact(valid_pred_path)

        model_path = os.path.join(MODEL_DIRPATH, 'model.pkl')
        joblib.dump(results['model'], model_path)
        neptune.set_property('model_path', model_path)
        neptune.set_property('model_version', md5_hash(model_path))
        neptune.send_artifact(model_path)

        if PACKAGE_TO_PROD:
            saved_path = CreditDefaultClassifier.pack(
                model=results['model']).save(PRODUCTION_DIRPATH)
            neptune.set_property('production_model_path', saved_path)

        fig, ax = plt.subplots(figsize=(16, 12))
        sk_metrics.plot_confusion_matrix(valid_preds['TARGET'],
                                         valid_preds['preds_pos'] > 0.5,
                                         ax=ax)
        plot_path = os.path.join(REPORTS_DIRPATH, 'conf_matrix.png')
        fig.savefig(plot_path)
        neptune.send_image('diagnostics', plot_path)

        fig, ax = plt.subplots(figsize=(16, 12))
        sk_metrics.plot_roc(valid_preds['TARGET'],
                            valid_preds[['preds_neg', 'preds_pos']],
                            ax=ax)
        plot_path = os.path.join(REPORTS_DIRPATH, 'roc_auc.png')
        fig.savefig(plot_path)
        neptune.send_image('diagnostics', plot_path)

        fig, ax = plt.subplots(figsize=(16, 12))
        sk_metrics.plot_precision_recall(
            valid_preds['TARGET'],
            valid_preds[['preds_neg', 'preds_pos']],
            ax=ax)
        plot_path = os.path.join(REPORTS_DIRPATH, 'prec_recall.png')
        fig.savefig(plot_path)
        neptune.send_image('diagnostics', plot_path)

        fig, ax = plt.subplots(figsize=(16, 12))
        plot_prediction_distribution(valid_preds['TARGET'],
                                     valid_preds['preds_pos'],
                                     ax=ax)
        plot_path = os.path.join(REPORTS_DIRPATH, 'preds_dist.png')
        fig.savefig(plot_path)
        neptune.send_image('diagnostics', plot_path)
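
# train_evaluate() and plot_prediction_distribution() are project helpers
# that the snippets above call but never define. Hedged sketches of both,
# assuming a LightGBM model and the TARGET / preds_pos / preds_neg column
# layout used above:
import lightgbm as lgb
import seaborn as sns
from sklearn.metrics import roc_auc_score


def train_evaluate(train, valid, params, callbacks=None):
    feature_cols = [col for col in train.columns
                    if col not in ('SK_ID_CURR', 'TARGET')]
    train_set = lgb.Dataset(train[feature_cols], label=train['TARGET'])
    valid_set = lgb.Dataset(valid[feature_cols], label=valid['TARGET'],
                            reference=train_set)
    # num_boost_round / early_stopping_rounds are LightGBM parameter
    # aliases, so they can travel inside the params dict
    model = lgb.train(params, train_set,
                      valid_sets=[valid_set],
                      callbacks=callbacks)

    def preds_frame(df):
        preds_pos = model.predict(df[feature_cols])
        return pd.DataFrame({'SK_ID_CURR': df['SK_ID_CURR'],
                             'TARGET': df['TARGET'],
                             'preds_pos': preds_pos,
                             'preds_neg': 1.0 - preds_pos})

    train_preds, valid_preds = preds_frame(train), preds_frame(valid)
    return {'model': model,
            'train_score': roc_auc_score(train_preds['TARGET'],
                                         train_preds['preds_pos']),
            'valid_score': roc_auc_score(valid_preds['TARGET'],
                                         valid_preds['preds_pos']),
            'train_preds': train_preds,
            'valid_preds': valid_preds}


def plot_prediction_distribution(y_true, y_pred, ax):
    # overlay the predicted-score distributions of the two classes
    sns.kdeplot(y_pred[y_true == 0], label='negative class', ax=ax)
    sns.kdeplot(y_pred[y_true == 1], label='positive class', ax=ax)
    ax.legend()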