def main():
    """Run the NN training pipeline from raw CSVs to a submission file.

    Each stage runs inside its own ``Timer`` section: read the train/test
    CSVs, build the folds, optionally drop rows, train ``NNTrainer`` and
    predict the test set, then write the submission and optionally submit
    it through the ``Kaggle`` helper. The log directory is renamed so that
    its name carries the CV score.
    """
    timer = Timer()
    seed_everything(cfg.common.seed)

    # Logging and a copy of the config both live in the run's log directory.
    logger_path.mkdir(exist_ok=True)
    log_file = logger_path / 'train.log'
    logging.basicConfig(filename=log_file, level=logging.DEBUG)
    dh.save(logger_path / 'config.yml', cfg)

    with timer.timer('load data'):
        train_df = pd.read_csv(const.TRAIN_PATH)
        test_df = pd.read_csv(const.TEST_PATH)

    with timer.timer('make folds'):
        fold_df = factory.get_fold(cfg.validation, train_df)
        if cfg.validation.single:
            # Keep only the first fold column and scale it by its maximum.
            fold_df = fold_df[['fold_0']]
            fold_max = fold_df['fold_0'].max()
            fold_df /= fold_max

    with timer.timer('drop index'):
        if cfg.common.drop is not None:
            drop_idx = factory.get_drop_idx(cfg.common.drop)
            # Drop the same rows from data and folds so they stay aligned.
            train_df = train_df.drop(drop_idx, axis=0)
            train_df = train_df.reset_index(drop=True)
            fold_df = fold_df.drop(drop_idx, axis=0)
            fold_df = fold_df.reset_index(drop=True)

    with timer.timer('train model'):
        nn_trainer = NNTrainer(run_name, fold_df, cfg)
        targets = train_df[const.TARGET_COL]
        cv = nn_trainer.train(train_df=train_df, target_df=targets)
        preds = nn_trainer.predict(test_df)
        nn_trainer.save()

        # Embed the CV score in the run name and move the log directory.
        run_name_cv = f'{run_name}_{cv:.3f}'
        logger_path.rename(f'../logs/{run_name_cv}')
        logging.disable(logging.FATAL)

    with timer.timer('make submission'):
        make_submission(
            run_name=run_name_cv,
            y_pred=preds,
            target_name='Label',
            comp=False,
        )
        if cfg.common.kaggle.submit:
            client = Kaggle(cfg.compe.name, run_name_cv)
            client.submit(comment)
def _load_drop_indices(drop_names):
    """Load the saved index array for each drop name and concatenate them.

    Each name is read from ``../pickle/{name}.npy`` via ``dh.load``. The
    arrays are appended to an initially empty ``np.array([])``, so the
    result is a flat array in the order of ``drop_names``.
    """
    drop_idxs = np.array([])
    for drop_name in drop_names:
        drop_idxs = np.append(drop_idxs, dh.load(f'../pickle/{drop_name}.npy'))
    return drop_idxs


def main():
    """Blend the OOF/test predictions of several models with tuned weights.

    Stages (each in its own ``Timer`` section):

    1. Load train/test data and, for every model in ``cfg.models``, its
       saved ``oof.npy`` and ``raw_preds.npy``. If a model was trained with
       dropped rows, its OOF is passed through ``factory.fill_dropped``
       with all of that model's drop indices.
    2. Drop the rows listed in ``cfg.common.drop`` from ``train_df`` and
       from the stacked OOF matrix, so both stay row-aligned.
    3. Search blend weights with Optuna (``direction='minimize'``,
       10-second timeout). The last weight is ``1 - sum(others)``.
    4. Build and save the blended OOF/test predictions, rename the log
       directory to include the CV score, and print a summary.
    5. Write the submission, optionally submit to Kaggle, and send the
       LINE / Notion notifications.
    """
    t = Timer()
    seed_everything(cfg.common.seed)

    logger_path.mkdir(exist_ok=True)
    dh.save(logger_path / 'config.yml', cfg)

    with t.timer('load data'):
        train_df = dh.load('../data/input/train_data.csv')
        test_df = dh.load('../data/input/test_data.csv')

        # One column per model.
        oof = np.zeros((len(train_df), len(cfg.models)))
        preds = np.zeros((len(test_df), len(cfg.models)))

        for i, m in enumerate(cfg.models):
            name = getattr(cfg.models, m).name
            log_dir = Path(f'../logs/{name}')
            model_oof = dh.load(log_dir / 'oof.npy')
            model_cfg = dh.load(log_dir / 'config.yml')

            if model_cfg.common.drop:
                # Fix: pass every accumulated index, not just the array
                # loaded for the last drop name.
                drop_idxs = _load_drop_indices(model_cfg.common.drop)
                model_oof = factory.fill_dropped(model_oof, drop_idxs)

            model_preds = dh.load(f'../logs/{name}/raw_preds.npy')
            oof[:, i] = model_oof[:len(train_df)]
            preds[:, i] = model_preds

    with t.timer('drop index'):
        if cfg.common.drop is not None:
            # Fix: use this run's drop list; the original iterated the
            # config of whichever model happened to be loaded last.
            drop_idxs = _load_drop_indices(cfg.common.drop)
            kept_df = train_df.drop(drop_idxs, axis=0)
            # Fix: remove the same rows from the OOF matrix, otherwise its
            # length no longer matches train_df / y_true below.
            keep_mask = train_df.index.isin(kept_df.index)
            oof = oof[keep_mask]
            train_df = kept_df.reset_index(drop=True)

    with t.timer('optimize model weight'):
        metric = factory.get_metrics(cfg.common.metrics.name)
        y_true = train_df[cfg.common.target]

        def objective(trial):
            """Score one candidate weight vector on the OOF predictions."""
            p_list = [0] * len(cfg.models)
            # Each weight is sampled from what is left of the unit budget;
            # the final weight takes the remainder.
            for i in range(len(cfg.models) - 1):
                p_list[i] = trial.suggest_discrete_uniform(
                    f'p{i}', 0.0, 1.0 - sum(p_list), 0.01)
            p_list[-1] = round(1 - sum(p_list[:-1]), 2)

            y_pred = np.zeros(len(train_df))
            for i, weight in enumerate(p_list):
                y_pred += oof[:, i] * weight

            return metric(y_true, y_pred)

        study = optuna.create_study(direction='minimize')
        study.optimize(objective, timeout=10)
        best_params = list(study.best_params.values())
        best_weight = best_params + [round(1 - sum(best_params), 2)]

    with t.timer('ensemble'):
        ensemble_oof = np.zeros(len(train_df))
        ensemble_preds = np.zeros(len(test_df))
        for i, weight in enumerate(best_weight):
            ensemble_oof += oof[:, i] * weight
            ensemble_preds += preds[:, i] * weight

        dh.save(f'../logs/{run_name}/oof.npy', ensemble_oof)
        dh.save(f'../logs/{run_name}/raw_preds.npy', ensemble_preds)

        cv = metric(y_true, ensemble_oof)
        run_name_cv = f'{run_name}_{cv:.3f}'
        logger_path.rename(f'../logs/{run_name_cv}')

        print('\n\n===================================\n')
        print(f'CV: {cv:.4f}')
        print(f'BEST WEIGHT: {best_weight}')
        print('\n===================================\n\n')

    with t.timer('make submission'):
        sample_path = '../data/input/sample_submission.feather'
        output_path = f'../data/output/{run_name_cv}.csv'
        make_submission(y_pred=ensemble_preds,
                        target_name=cfg.common.target,
                        sample_path=sample_path,
                        output_path=output_path,
                        comp=False)
        if cfg.common.kaggle.submit:
            kaggle = Kaggle(cfg.compe.compe_name, run_name_cv)
            kaggle.submit(comment)

    with t.timer('notify'):
        process_minutes = t.get_processing_time()
        message = f'''{options.model}\ncv: {cv:.3f}\ntime: {process_minutes}[min]'''
        send_line(notify_params.line.token, message)

        notion = Notion(token=notify_params.notion.token_v2)
        notion.set_url(url=notify_params.notion.url)
        notion.insert_rows({
            'name': run_name_cv,
            'created': now,
            'model': options.model,
            'local_cv': round(cv, 4),
            'time': process_minutes,
            'comment': comment
        })
def main():
    """Train a model on precomputed features and produce a submission.

    Stages, each timed by ``Timer``: load the raw frames and the feature
    matrices, optionally attach an ``oof`` feature, build folds (padded
    with zero rows for the concatenated training data), optionally drop
    rows, optionally swap in adversarial-validation data, train and
    predict, write the submission, then send notifications.
    """
    timer = Timer()
    seed_everything(cfg.common.seed)

    # Log file plus copies of both configs go into the run's log directory.
    logger_path.mkdir(exist_ok=True)
    log_file = logger_path / 'train.log'
    logging.basicConfig(filename=log_file, level=logging.DEBUG)
    dh.save(logger_path / 'config.yml', cfg)
    dh.save(logger_path / 'features.yml', features_params)

    with timer.timer('load data'):
        train_df = dh.load('../data/input/train.csv')
        train2019_df = dh.load('../data/input/train_concated.csv')
        train_x = factory.get_features(features, cfg.data.loader.train)
        test_x = factory.get_features(features, cfg.data.loader.test)
        train_y = factory.get_target(cfg.data.target)

    with timer.timer('add oof'):
        if cfg.data.features.oof.name is not None:
            oof_train, oof_test = factory.get_oof(cfg.data)
            train_x['oof'] = oof_train
            test_x['oof'] = oof_test
            # The shared feature list is extended in place.
            features.append('oof')

    with timer.timer('make folds'):
        fold_df = factory.get_fold(cfg.validation, train_df, train_df[['target']])
        # Append one all-zero fold row per row of the concatenated frame.
        # NOTE(review): presumably a zero marks "never used for validation"
        # -- confirm against factory.get_fold.
        zero_block = np.zeros((len(train2019_df), len(fold_df.columns)))
        padding_df = pd.DataFrame(zero_block, columns=fold_df.columns)
        fold_df = pd.concat(
            [fold_df, padding_df],
            axis=0,
            sort=False,
            ignore_index=True,
        )
        if cfg.validation.single:
            # Keep only the first fold column and scale it by its maximum.
            fold_df = fold_df[['fold_0']]
            fold_max = fold_df['fold_0'].max()
            fold_df /= fold_max

    with timer.timer('drop index'):
        if cfg.common.drop is not None:
            drop_idx = factory.get_drop_idx(cfg.common.drop)
            # Features, target and folds lose the same rows.
            train_x = train_x.drop(drop_idx, axis=0)
            train_x = train_x.reset_index(drop=True)
            train_y = train_y.drop(drop_idx, axis=0)
            train_y = train_y.reset_index(drop=True)
            fold_df = fold_df.drop(drop_idx, axis=0)
            fold_df = fold_df.reset_index(drop=True)

    with timer.timer('prepare for ad'):
        if cfg.data.adversarial_validation:
            train_x, train_y = factory.get_ad(cfg, train_x, test_x)

    with timer.timer('train and predict'):
        model_trainer = Trainer(cfg)
        cv = model_trainer.train(train_df=train_x,
                                 target_df=train_y,
                                 fold_df=fold_df)
        preds = model_trainer.predict(test_x)
        model_trainer.save(run_name)

        # Embed the CV score in the run name and move the log directory.
        run_name_cv = f'{run_name}_{cv:.3f}'
        logger_path.rename(f'../logs/{run_name_cv}')
        logging.disable(logging.FATAL)

    with timer.timer('make submission'):
        sample_path = '../data/input/sample_submission.csv'
        output_path = f'../data/output/{run_name_cv}.csv'
        make_submission(
            y_pred=preds,
            target_name=cfg.data.target.name,
            sample_path=sample_path,
            output_path=output_path,
            comp=False,
        )
        if cfg.common.kaggle.submit:
            client = Kaggle(cfg.compe.name, run_name_cv)
            client.submit(comment)

    with timer.timer('notify'):
        process_minutes = timer.get_processing_time()
        message = f'{cfg.model.name}\ncv: {cv:.3f}\ntime: {process_minutes}[min]'
        send_line(notify_params.line.token, message)

        notion_client = Notion(token=notify_params.notion.token_v2)
        notion_client.set_url(url=notify_params.notion.url)
        row = {
            'name': run_name_cv,
            'created': now,
            'model': options.model,
            'local_cv': round(cv, 4),
            'time': process_minutes,
            'comment': comment,
        }
        notion_client.insert_rows(row)
def main():
    """Train on the concatenated training set and produce a submission.

    Stages, each timed by ``Timer``: load the four input tables, build
    folds separately for the original and 2019 data and stack them, attach
    the features listed in ``../configs/feature/all.yml``, optionally drop
    rows, train, predict, overwrite predictions for a fixed set of test
    images, write the submission, optionally submit, and notify.
    """
    timer = Timer()
    seed_everything(cfg.common.seed)

    # Log file and a copy of the config go into the run's log directory.
    logger_path.mkdir(exist_ok=True)
    log_file = logger_path / 'train.log'
    logging.basicConfig(filename=log_file, level=logging.DEBUG)
    dh.save(logger_path / 'config.yml', cfg)

    with timer.timer('load data'):
        train_x = dh.load('../data/input/train_concated.csv')
        train_org_x = dh.load('../data/input/train.csv')
        train_2019_x = dh.load('../data/input/train_2019.csv')
        test_x = dh.load('../data/input/test.csv')

    with timer.timer('make folds'):
        target_col = cfg.common.target
        fold_org_df = factory.get_fold(cfg.validation.val1,
                                       train_org_x,
                                       train_org_x[[target_col]])
        fold2019_df = factory.get_fold(cfg.validation.val2,
                                       train_2019_x,
                                       train_2019_x[[target_col]])
        # Stack the two fold tables: original rows first, then 2019 rows.
        fold_df = pd.concat(
            [fold_org_df, fold2019_df],
            axis=0,
            sort=False,
            ignore_index=True,
        )
        if cfg.validation.val1.single:
            # Keep only the first fold column and scale it by its maximum.
            fold_df = fold_df[['fold_0']]
            fold_max = fold_df['fold_0'].max()
            fold_df /= fold_max

    with timer.timer('load features'):
        feature_names = dh.load('../configs/feature/all.yml')['features']
        for feature_name in feature_names:
            # Missing feature values are replaced with -1 on both splits.
            train_feature = dh.load(f'../features/{feature_name}_train.feather')
            train_x[feature_name] = train_feature[feature_name].fillna(-1)
            test_feature = dh.load(f'../features/{feature_name}_test.feather')
            test_x[feature_name] = test_feature[feature_name].fillna(-1)

    with timer.timer('drop several rows'):
        if cfg.common.drop is not None:
            drop_idx = factory.get_drop_idx(cfg.common.drop)
            # Data and folds lose the same rows.
            train_x = train_x.drop(drop_idx, axis=0)
            train_x = train_x.reset_index(drop=True)
            fold_df = fold_df.drop(drop_idx, axis=0)
            fold_df = fold_df.reset_index(drop=True)

    with timer.timer('train model'):
        result = train_model(run_name, train_x, fold_df, cfg)
        logging.disable(logging.FATAL)

        # Embed the CV score in the run name and move the log directory.
        cv_score = result['cv']
        run_name_cv = f'{run_name}_{cv_score:.3f}'
        logger_path.rename(f'../logs/{run_name_cv}')

    with timer.timer('predict'):
        preds = predict_test(run_name_cv, test_x, fold_df, cfg)

    with timer.timer('post process'):
        # Hard-coded labels that override the model output for these images.
        # NOTE(review): the name suggests these duplicate labelled training
        # images -- confirm the source of the labels.
        duplicates = {
            'ISIC_5224960': 1,
            'ISIC_9207777': 1,
            'ISIC_6457527': 1,
            'ISIC_8347588': 0,
            'ISIC_8372206': 1,
            'ISIC_9353360': 1,
            'ISIC_3689290': 0,
            'ISIC_3584949': 0,
        }
        for image_name, target in duplicates.items():
            is_match = test_x['image_name'] == image_name
            # Index label of the first matching test row.
            idx = test_x.loc[is_match].index[0]
            preds[idx] = target

    with timer.timer('make submission'):
        sample_path = '../data/input/sample_submission.csv'
        output_path = f'../data/output/{run_name_cv}.csv'
        make_submission(
            y_pred=preds,
            target_name=cfg.common.target,
            sample_path=sample_path,
            output_path=output_path,
            comp=False,
        )

    with timer.timer('kaggle api'):
        # The client is created even when no submission is made.
        client = Kaggle(cfg.compe.compe_name, run_name_cv)
        if cfg.common.kaggle.submit:
            client.submit(comment)

    with timer.timer('notify'):
        process_minutes = timer.get_processing_time()
        # NOTE(review): the message labels this value "[h]" although the
        # variable is named process_minutes -- confirm the unit.
        message = f'{model_name}\ncv: {cv_score:.3f}\ntime: {process_minutes:.2f}[h]'
        send_line(notify_params.line.token, message)

        notion_client = Notion(token=notify_params.notion.token_v2)
        notion_client.set_url(url=notify_params.notion.url)
        row = {
            'name': run_name_cv,
            'created': now,
            'model': cfg.model.name,
            'local_cv': round(cv_score, 4),
            'time': process_minutes,
            'comment': comment,
        }
        notion_client.insert_rows(row)