def main():
    neptune.init(api_token=os.getenv('NEPTUNE_API_TOKEN'),
                 project_qualified_name=os.getenv('NEPTUNE_PROJECT'))

    application_table_path = os.path.join(RAW_DATA_DIRPATH, 'application_train.csv.zip')
    application_table = pd.read_csv(application_table_path, nrows=NROWS)
    index_table = application_table[['SK_ID_CURR', 'TARGET']]

    with neptune.create_experiment(name='validation schema',
                                   tags=['processed', 'validation'],
                                   upload_source_files=get_filepaths()):
        train_idx, valid_idx = train_test_split(index_table,
                                                test_size=TEST_SIZE,
                                                random_state=SEED)

        # Persist the train split, upload it as an artifact, and fingerprint
        # it so the exact split is traceable from the experiment properties.
        train_idx_path = os.path.join(INTERIM_FEATURES_DIRPATH, 'train_idx.csv')
        train_idx.to_csv(train_idx_path, index=False)
        neptune.send_artifact(train_idx_path)
        neptune.set_property('train_split_version', md5_hash(train_idx_path))

        # Same for the validation split.
        valid_idx_path = os.path.join(INTERIM_FEATURES_DIRPATH, 'valid_idx.csv')
        valid_idx.to_csv(valid_idx_path, index=False)
        neptune.send_artifact(valid_idx_path)
        neptune.set_property('valid_split_version', md5_hash(valid_idx_path))
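# `md5_hash` is a project helper not shown here. A minimal sketch of what it
# is assumed to do: fingerprint a file by hashing its bytes in chunks, so
# every artifact version can be identified from the experiment properties.
import hashlib

def md5_hash(filepath, chunk_size=65536):
    md5 = hashlib.md5()
    with open(filepath, 'rb') as f:
        # Read in fixed-size chunks so large artifacts do not load into memory.
        for chunk in iter(lambda: f.read(chunk_size), b''):
            md5.update(chunk)
    return md5.hexdigest()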
def main():
    neptune.init(api_token=os.getenv('NEPTUNE_API_TOKEN'),
                 project_qualified_name=os.getenv('NEPTUNE_PROJECT'))

    bureau_raw_path = os.path.join(RAW_DATA_DIRPATH, 'bureau.csv.zip')
    bureau_raw = pd.read_csv(bureau_raw_path, nrows=NROWS)

    with neptune.create_experiment(name='feature_extraction',
                                   tags=['interim', 'bureau', 'feature_extraction'],
                                   upload_source_files=get_filepaths()):
        bureau_features, numeric_cols = extract(bureau_raw)
        bureau_features.to_csv(INTERIM_FEATURES_FILEPATH, index=False)

        # Record which columns were produced, plus the hash and location of
        # the feature file, so downstream steps can verify their inputs.
        neptune.set_property('numeric_features', str(numeric_cols))
        neptune.set_property('features_version', md5_hash(INTERIM_FEATURES_FILEPATH))
        neptune.set_property('features_path', INTERIM_FEATURES_FILEPATH)
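# `extract` is defined elsewhere in the project. A plausible sketch, assuming
# it aggregates the numeric bureau columns per applicant; the aggregation
# choice (mean) and the `bureau_` prefix are assumptions, not the original
# implementation.
def extract(bureau_raw):
    numeric_cols = (bureau_raw.select_dtypes(include='number')
                    .columns.drop('SK_ID_CURR').tolist())
    # One row per applicant: average each numeric bureau column over all of
    # that applicant's credit-bureau records.
    features = (bureau_raw.groupby('SK_ID_CURR')[numeric_cols]
                .mean()
                .add_prefix('bureau_')
                .reset_index())
    return features, numeric_cols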
def main():
    neptune.init(api_token=os.getenv('NEPTUNE_API_TOKEN'),
                 project_qualified_name=os.getenv('NEPTUNE_PROJECT'))

    interim_feature_paths = [APPLICATION_FEATURES_PATH, BUREAU_FEATURES_PATH]

    with neptune.create_experiment(name='feature_extraction',
                                   tags=['processed', 'feature_extraction', 'joined_features'],
                                   upload_source_files=get_filepaths()):
        # Start from the ID column only, then join every interim feature
        # table onto it on SK_ID_CURR.
        features = pd.read_csv(interim_feature_paths[0],
                               usecols=['SK_ID_CURR'], nrows=NROWS)
        for path in interim_feature_paths:
            df = pd.read_csv(path, nrows=NROWS)
            features = features.merge(df, on='SK_ID_CURR')

        features.to_csv(PROCESSED_FEATURES_FILEPATH, index=False)
        neptune.set_property('features_version', md5_hash(PROCESSED_FEATURES_FILEPATH))
        neptune.set_property('features_path', PROCESSED_FEATURES_FILEPATH)
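# `get_filepaths` is used by every step to snapshot the source code with the
# experiment. A minimal sketch, assuming the pipeline scripts live under
# `src/` (the directory layout is an assumption):
from glob import glob

def get_filepaths(directory='src', pattern='*.py'):
    # Collect every Python file under `directory`, recursively, so
    # `upload_source_files` logs the exact code that produced the run.
    return glob(os.path.join(directory, '**', pattern), recursive=True)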
@skopt.utils.use_named_args(SPACE)
def objective(**params):
    all_params = {**params, **STATIC_PARAMS}
    results = train_evaluate(train, valid, all_params)
    # skopt minimizes, so return the negated validation score.
    return -1.0 * results['valid_score']


with neptune.create_experiment(name='model training',
                               params=experiment_params,
                               tags=['hpo', 'lgbm'],
                               upload_source_files=get_filepaths(),
                               properties={'features_path': FEATURES_PATH,
                                           'features_version': md5_hash(FEATURES_PATH),
                                           'train_split_version': md5_hash(TRAIN_IDX_PATH),
                                           'valid_split_version': md5_hash(VALID_IDX_PATH)}):
    results = skopt.forest_minimize(objective, SPACE,
                                    callback=[monitor], **HPO_PARAMS)

    # Undo the negation to recover the best AUC, and log the winning
    # hyperparameters alongside it.
    best_auc = -1.0 * results.fun
    best_params = results.x
    neptune.send_metric('valid_auc', best_auc)
    neptune.set_property('best_params', str(best_params))
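# `monitor` is a skopt callback defined elsewhere. A minimal sketch of the
# assumed behavior: after each evaluation, log the latest (un-negated) score
# to Neptune so the search can be followed live. The metric name `run_score`
# is an assumption.
def monitor(res):
    # skopt calls each callback with the intermediate OptimizeResult;
    # `func_vals` holds the negated validation scores of all runs so far.
    neptune.send_metric('run_score', -1.0 * res.func_vals[-1])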
def main():
    neptune.init(api_token=os.getenv('NEPTUNE_API_TOKEN'),
                 project_qualified_name=os.getenv('NEPTUNE_PROJECT'))

    train_idx = pd.read_csv(TRAIN_IDX_PATH, nrows=NROWS)
    valid_idx = pd.read_csv(VALID_IDX_PATH, nrows=NROWS)
    features = pd.read_csv(FEATURES_PATH, nrows=NROWS)

    train = pd.merge(train_idx, features, on='SK_ID_CURR')
    valid = pd.merge(valid_idx, features, on='SK_ID_CURR')

    all_params = {'num_boost_round': NUM_BOOST_ROUND,
                  'early_stopping_rounds': EARLY_STOPPING_ROUNDS,
                  **LGBM_PARAMS}

    with neptune.create_experiment(name='model training',
                                   params=all_params,
                                   tags=['lgbm'],
                                   upload_source_files=get_filepaths(),
                                   properties={'features_path': FEATURES_PATH,
                                               'features_version': md5_hash(FEATURES_PATH),
                                               'train_split_version': md5_hash(TRAIN_IDX_PATH),
                                               'valid_split_version': md5_hash(VALID_IDX_PATH)}):
        results = train_evaluate(train, valid, all_params,
                                 callbacks=[neptune_monitor()])
        train_score, valid_score = results['train_score'], results['valid_score']
        train_preds, valid_preds = results['train_preds'], results['valid_preds']

        neptune.send_metric('train_auc', train_score)
        neptune.send_metric('valid_auc', valid_score)

        # Log predictions as artifacts.
        train_pred_path = os.path.join(PREDICTION_DIRPATH, 'train_preds.csv')
        train_preds.to_csv(train_pred_path, index=False)
        neptune.send_artifact(train_pred_path)

        valid_pred_path = os.path.join(PREDICTION_DIRPATH, 'valid_preds.csv')
        valid_preds.to_csv(valid_pred_path, index=False)
        neptune.send_artifact(valid_pred_path)

        # Persist and version the trained model.
        model_path = os.path.join(MODEL_DIRPATH, 'model.pkl')
        joblib.dump(results['model'], model_path)
        neptune.set_property('model_path', model_path)
        neptune.set_property('model_version', md5_hash(model_path))
        neptune.send_artifact(model_path)

        if PACKAGE_TO_PROD:
            saved_path = CreditDefaultClassifier.pack(
                model=results['model']).save(PRODUCTION_DIRPATH)
            neptune.set_property('production_model_path', saved_path)

        # Log diagnostic charts for the validation predictions.
        fig, ax = plt.subplots(figsize=(16, 12))
        sk_metrics.plot_confusion_matrix(valid_preds['TARGET'],
                                         valid_preds['preds_pos'] > 0.5,
                                         ax=ax)
        plot_path = os.path.join(REPORTS_DIRPATH, 'conf_matrix.png')
        fig.savefig(plot_path)
        neptune.send_image('diagnostics', plot_path)

        fig, ax = plt.subplots(figsize=(16, 12))
        sk_metrics.plot_roc(valid_preds['TARGET'],
                            valid_preds[['preds_neg', 'preds_pos']],
                            ax=ax)
        plot_path = os.path.join(REPORTS_DIRPATH, 'roc_auc.png')
        fig.savefig(plot_path)
        neptune.send_image('diagnostics', plot_path)

        fig, ax = plt.subplots(figsize=(16, 12))
        sk_metrics.plot_precision_recall(valid_preds['TARGET'],
                                         valid_preds[['preds_neg', 'preds_pos']],
                                         ax=ax)
        plot_path = os.path.join(REPORTS_DIRPATH, 'prec_recall.png')
        fig.savefig(plot_path)
        neptune.send_image('diagnostics', plot_path)

        fig, ax = plt.subplots(figsize=(16, 12))
        plot_prediction_distribution(valid_preds['TARGET'],
                                     valid_preds['preds_pos'], ax=ax)
        plot_path = os.path.join(REPORTS_DIRPATH, 'preds_dist.png')
        fig.savefig(plot_path)
        neptune.send_image('diagnostics', plot_path)
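# `train_evaluate` is shared by the HPO and training scripts. A sketch of its
# assumed contract: train a LightGBM model, score both splits with AUC, and
# return the model plus per-row predictions in the `preds_neg`/`preds_pos`
# layout the diagnostics above expect. The real implementation may differ.
import lightgbm as lgb
from sklearn.metrics import roc_auc_score

def train_evaluate(train, valid, params, callbacks=None):
    feature_cols = [c for c in train.columns if c not in ('SK_ID_CURR', 'TARGET')]
    train_set = lgb.Dataset(train[feature_cols], label=train['TARGET'])
    valid_set = lgb.Dataset(valid[feature_cols], label=valid['TARGET'])

    # `num_boost_round` and `early_stopping_rounds` are passed inside
    # `params`; LightGBM accepts both as parameter aliases.
    model = lgb.train(params, train_set,
                      valid_sets=[train_set, valid_set],
                      callbacks=callbacks)

    preds = {}
    for name, df in [('train', train), ('valid', valid)]:
        pos = model.predict(df[feature_cols])
        preds[name] = df[['SK_ID_CURR', 'TARGET']].assign(preds_neg=1.0 - pos,
                                                          preds_pos=pos)

    return {'model': model,
            'train_score': roc_auc_score(train['TARGET'], preds['train']['preds_pos']),
            'valid_score': roc_auc_score(valid['TARGET'], preds['valid']['preds_pos']),
            'train_preds': preds['train'],
            'valid_preds': preds['valid']}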