def get_features(df):
    """Build target-encoding features from each user's prior/next content id.

    NOTE(review): this mutates ``df`` in place by adding the
    'content_id_prior' and 'content_id_next' columns — the original did the
    same, so the side effect is kept.

    Returns:
        pd.DataFrame: one 'dropped___te_*' column per shifted content column.
    """
    features_df = pd.DataFrame()

    # Vectorized per-user shift. Equivalent to iterating df.groupby('user_id')
    # and shifting each user's rows by hand (the original approach), but runs
    # in a single C-level pass and aligns by index instead of assuming a
    # 0..n-1 RangeIndex. Group boundaries still yield NaN.
    grouped_content = df.groupby('user_id')['content_id']
    df['content_id_prior'] = grouped_content.shift(1).values
    df['content_id_next'] = grouped_content.shift(-1).values

    fold_df = get_fold(cfg, df)
    for col in ['content_id_prior', 'content_id_next']:
        # Fresh encoder per column; the fitted encoder is persisted so the
        # same mapping can be reused at inference time.
        te = TargetEncoding(fold_df)
        features_df[f'te_{col}_by_answered_correctly'] = te.fit_transform(
            df[col], df['answered_correctly'])
        dh.save(
            f'../data/processed/dropped___te_{col}_by_answered_correctly.pkl',
            te.encoder)

    # Prefix marks these features as candidates for dropping downstream.
    features_df.columns = [f'dropped___{col}' for col in features_df.columns]

    return features_df
def get_features(df):
    """Target-encode several content-related columns against the answer label.

    Side effects: adds the combined 'content_id_and_attempt_c' column to
    ``df`` and persists each fitted encoder under ../data/processed/.

    Returns:
        pd.DataFrame: one 'te_*_by_answered_correctly' column per source column.
    """
    features_df = pd.DataFrame()

    # Cross content id with attempt count to form a combined categorical key.
    df['content_id_and_attempt_c'] = df['content_id'].astype(
        str) + '_' + df['attempt_c'].astype(str)

    fold_df = get_fold(cfg, df)

    target = df['answered_correctly']
    for column in ['content_id', 'tag', 'part', 'content_id_and_attempt_c']:
        encoder = TargetEncoding(fold_df)
        feature_name = f'te_{column}_by_answered_correctly'
        features_df[feature_name] = encoder.fit_transform(df[column], target)
        # Persist the fitted mapping for reuse at inference time.
        dh.save(f'../data/processed/{feature_name}.pkl', encoder.encoder)

    return features_df
def main():
    """End-to-end NN training pipeline: load, fold, drop, train, submit."""
    t = Timer()
    seed_everything(cfg.common.seed)

    # All artifacts for this run are logged under the per-run directory.
    logger_path.mkdir(exist_ok=True)
    logging.basicConfig(filename=logger_path / 'train.log',
                        level=logging.DEBUG)

    dh.save(logger_path / 'config.yml', cfg)

    with t.timer('load data'):
        train_df = pd.read_csv(const.TRAIN_PATH)
        test_df = pd.read_csv(const.TEST_PATH)

    with t.timer('make folds'):
        fold_df = factory.get_fold(cfg.validation, train_df)
        if cfg.validation.single:
            # Single-fold mode: keep only fold_0 and rescale its weights
            # so the maximum weight is 1.0.
            fold_df = fold_df[['fold_0']]
            fold_df /= fold_df['fold_0'].max()

    with t.timer('drop index'):
        # Optionally drop configured rows, keeping train/fold frames aligned.
        if cfg.common.drop is not None:
            drop_idx = factory.get_drop_idx(cfg.common.drop)
            train_df = train_df.drop(drop_idx, axis=0).reset_index(drop=True)
            fold_df = fold_df.drop(drop_idx, axis=0).reset_index(drop=True)

    with t.timer('train model'):
        trainer = NNTrainer(run_name, fold_df, cfg)
        cv = trainer.train(train_df=train_df,
                           target_df=train_df[const.TARGET_COL])
        preds = trainer.predict(test_df)
        trainer.save()

        # Embed the CV score in the run name / log dir, then silence
        # further logging for this process.
        run_name_cv = f'{run_name}_{cv:.3f}'
        logger_path.rename(f'../logs/{run_name_cv}')
        logging.disable(logging.FATAL)

    with t.timer('make submission'):
        make_submission(run_name=run_name_cv,
                        y_pred=preds,
                        target_name='Label',
                        comp=False)
        if cfg.common.kaggle.submit:
            kaggle = Kaggle(cfg.compe.name, run_name_cv)
            kaggle.submit(comment)
def main():
    """Training pipeline with optional OOF stacking, adversarial validation,
    submission, and LINE/Notion notifications."""
    t = Timer()
    seed_everything(cfg.common.seed)

    logger_path.mkdir(exist_ok=True)
    logging.basicConfig(filename=logger_path / 'train.log',
                        level=logging.DEBUG)

    dh.save(logger_path / 'config.yml', cfg)
    dh.save(logger_path / 'features.yml', features_params)

    with t.timer('load data'):
        train_df = dh.load('../data/input/train.csv')
        train2019_df = dh.load('../data/input/train_concated.csv')
        train_x = factory.get_features(features, cfg.data.loader.train)
        test_x = factory.get_features(features, cfg.data.loader.test)
        train_y = factory.get_target(cfg.data.target)

    with t.timer('add oof'):
        # Optionally stack another model's out-of-fold predictions as an
        # extra feature column on both splits.
        if cfg.data.features.oof.name is not None:
            oof, preds = factory.get_oof(cfg.data)
            train_x['oof'] = oof
            test_x['oof'] = preds
            features.append('oof')

    with t.timer('make folds'):
        fold_df = factory.get_fold(cfg.validation, train_df,
                                   train_df[['target']])
        # Appended 2019 rows get all-zero fold weights: they contribute to
        # training only, never to validation.
        fold_df = pd.concat([
            fold_df,
            pd.DataFrame(np.zeros((len(train2019_df), len(fold_df.columns))),
                         columns=fold_df.columns)
        ], axis=0, sort=False, ignore_index=True)
        if cfg.validation.single:
            # Single-fold mode: keep fold_0 only, rescaled to max 1.0.
            fold_df = fold_df[['fold_0']]
            fold_df /= fold_df['fold_0'].max()

    with t.timer('drop index'):
        # Drop configured rows from features, target, and folds in lockstep.
        if cfg.common.drop is not None:
            drop_idx = factory.get_drop_idx(cfg.common.drop)
            train_x = train_x.drop(drop_idx, axis=0).reset_index(drop=True)
            train_y = train_y.drop(drop_idx, axis=0).reset_index(drop=True)
            fold_df = fold_df.drop(drop_idx, axis=0).reset_index(drop=True)

    with t.timer('prepare for ad'):
        # Adversarial validation: replace the task with train-vs-test
        # membership classification.
        if cfg.data.adversarial_validation:
            train_x, train_y = factory.get_ad(cfg, train_x, test_x)

    with t.timer('train and predict'):
        trainer = Trainer(cfg)
        cv = trainer.train(train_df=train_x,
                           target_df=train_y,
                           fold_df=fold_df)
        preds = trainer.predict(test_x)
        trainer.save(run_name)

        # Embed the CV score in the run name / log dir, then silence
        # further logging for this process.
        run_name_cv = f'{run_name}_{cv:.3f}'
        logger_path.rename(f'../logs/{run_name_cv}')
        logging.disable(logging.FATAL)

    with t.timer('make submission'):
        sample_path = f'../data/input/sample_submission.csv'
        output_path = f'../data/output/{run_name_cv}.csv'
        make_submission(y_pred=preds,
                        target_name=cfg.data.target.name,
                        sample_path=sample_path,
                        output_path=output_path,
                        comp=False)
        if cfg.common.kaggle.submit:
            kaggle = Kaggle(cfg.compe.name, run_name_cv)
            kaggle.submit(comment)

    with t.timer('notify'):
        # Push the result to LINE and append a row to the Notion run table.
        process_minutes = t.get_processing_time()
        message = f'''{cfg.model.name}\ncv: {cv:.3f}\ntime: {process_minutes}[min]'''
        send_line(notify_params.line.token, message)

        notion = Notion(token=notify_params.notion.token_v2)
        notion.set_url(url=notify_params.notion.url)
        notion.insert_rows({
            'name': run_name_cv,
            'created': now,
            'model': options.model,
            'local_cv': round(cv, 4),
            'time': process_minutes,
            'comment': comment
        })
def main():
    """Image-model training pipeline: two fold schemes (original + 2019 data),
    feather feature loading, duplicate-image post-processing, submission,
    and notifications."""
    t = Timer()
    seed_everything(cfg.common.seed)

    logger_path.mkdir(exist_ok=True)
    logging.basicConfig(filename=logger_path / 'train.log',
                        level=logging.DEBUG)

    dh.save(logger_path / 'config.yml', cfg)

    with t.timer('load data'):
        train_x = dh.load('../data/input/train_concated.csv')
        train_org_x = dh.load('../data/input/train.csv')
        train_2019_x = dh.load('../data/input/train_2019.csv')
        test_x = dh.load('../data/input/test.csv')

    with t.timer('make folds'):
        # Separate fold schemes for the original and 2019 data, concatenated
        # in the same row order as train_x (the concatenated frame).
        fold_org_df = factory.get_fold(cfg.validation.val1, train_org_x,
                                       train_org_x[[cfg.common.target]])
        fold2019_df = factory.get_fold(cfg.validation.val2, train_2019_x,
                                       train_2019_x[[cfg.common.target]])
        fold_df = pd.concat([fold_org_df, fold2019_df],
                            axis=0, sort=False, ignore_index=True)
        if cfg.validation.val1.single:
            # Single-fold mode: keep fold_0 only, rescaled to max 1.0.
            fold_df = fold_df[['fold_0']]
            fold_df /= fold_df['fold_0'].max()

    with t.timer('load features'):
        features = dh.load('../configs/feature/all.yml')['features']
        for f in features:
            # Each feature lives in its own feather file; missing values
            # are imputed with -1 on both splits.
            train_x[f] = dh.load(f'../features/{f}_train.feather')[f].fillna(-1)
            test_x[f] = dh.load(f'../features/{f}_test.feather')[f].fillna(-1)

    with t.timer('drop several rows'):
        if cfg.common.drop is not None:
            drop_idx = factory.get_drop_idx(cfg.common.drop)
            train_x = train_x.drop(drop_idx, axis=0).reset_index(drop=True)
            fold_df = fold_df.drop(drop_idx, axis=0).reset_index(drop=True)

    with t.timer('train model'):
        result = train_model(run_name, train_x, fold_df, cfg)

        # Embed the CV score in the run name / log dir, then silence
        # further logging for this process.
        logging.disable(logging.FATAL)
        run_name_cv = f'{run_name}_{result["cv"]:.3f}'
        logger_path.rename(f'../logs/{run_name_cv}')

    with t.timer('predict'):
        preds = predict_test(run_name_cv, test_x, fold_df, cfg)

    with t.timer('post process'):
        # Known train/test duplicate images with leaked labels: overwrite the
        # model's prediction for these specific test rows.
        duplicates = {
            'ISIC_5224960': 1,
            'ISIC_9207777': 1,
            'ISIC_6457527': 1,
            'ISIC_8347588': 0,
            'ISIC_8372206': 1,
            'ISIC_9353360': 1,
            'ISIC_3689290': 0,
            'ISIC_3584949': 0,
        }
        for image_name, target in duplicates.items():
            idx = test_x[test_x['image_name'] == image_name].index[0]
            preds[idx] = target

    with t.timer('make submission'):
        sample_path = f'../data/input/sample_submission.csv'
        output_path = f'../data/output/{run_name_cv}.csv'
        make_submission(y_pred=preds,
                        target_name=cfg.common.target,
                        sample_path=sample_path,
                        output_path=output_path,
                        comp=False)

    with t.timer('kaggle api'):
        kaggle = Kaggle(cfg.compe.compe_name, run_name_cv)
        if cfg.common.kaggle.submit:
            kaggle.submit(comment)

    with t.timer('notify'):
        # Push the result to LINE and append a row to the Notion run table.
        process_minutes = t.get_processing_time()
        # NOTE(review): the message labels the duration '[h]' but the sibling
        # pipeline in this file labels the same value '[min]' — confirm the
        # unit returned by Timer.get_processing_time.
        message = f'''{model_name}\ncv: {result["cv"]:.3f}\ntime: {process_minutes:.2f}[h]'''
        send_line(notify_params.line.token, message)

        notion = Notion(token=notify_params.notion.token_v2)
        notion.set_url(url=notify_params.notion.url)
        notion.insert_rows({
            'name': run_name_cv,
            'created': now,
            'model': cfg.model.name,
            'local_cv': round(result['cv'], 4),
            'time': process_minutes,
            'comment': comment
        })
def get_features(train, test):
    """Build out-of-fold target-encoding features for the categorical
    columns of a (Japanese) real-estate dataset.

    For every configured column a fresh TargetEncoding is fitted on the
    train split using a 5-fold KFold and then applied to the test split.

    Args:
        train: training DataFrame; must contain the target column 'y' and
            every source column listed below.
        test: test DataFrame; must contain every source column listed below.

    Returns:
        (train_features_df, test_features_df): DataFrames with one 'te_*'
        column per encoded source column, in a fixed order.
    """
    train_features_df = pd.DataFrame()
    test_features_df = pd.DataFrame()

    # Fold assignment shared by every target encoder below.
    cfg = edict({
        'name': 'KFold',
        'params': {
            'n_splits': 5,
            'shuffle': True,
            'random_state': 0,
        },
        'split': {
            'y': 'y',
            'groups': None
        },
        'weight': [1.0]
    })
    fold_df = get_fold(cfg, train, train[['y']])

    # Output feature name -> source column. Replaces 13 hand-unrolled,
    # near-identical TargetEncoding stanzas; dict order preserves the
    # original output column order.
    te_columns = {
        'te_land_type': '種類',
        'te_region': '地域',
        'te_region_code': '市区町村コード',
        'te_city_name': '市区町村名',
        'te_town_name': '地区名',
        'te_station_name': '最寄駅:名称',
        'te_land_shape': '土地の形状',
        'te_structure': '建物の構造',
        'te_feature_purpose': '今後の利用目的',
        'te_renovation': '改装',
        'te_city_planning': '都市計画',
        'te_direction': '前面道路:方位',
        'te_load_type': '前面道路:種類',
    }
    for feature_name, column in te_columns.items():
        # Fresh encoder per column, exactly as the unrolled original did.
        te = TargetEncoding(fold_df)
        train_features_df[feature_name] = te.fit_transform(train[column],
                                                           train['y'])
        test_features_df[feature_name] = te.transform(test[column])

    return train_features_df, test_features_df