def get_features(df):
    features_df = pd.DataFrame()

    content_id_prior_array = np.zeros(len(df))
    content_id_next_array = np.zeros(len(df))

    # For each user, look up the previous / next content_id in their history.
    user_gp = df.groupby('user_id')

    for user_id, user_df in tqdm(user_gp, total=len(user_gp)):
        user_idx = user_df.index.values
        content_id_prior_array[user_idx] = user_df['content_id'].shift(
            1).values
        content_id_next_array[user_idx] = user_df['content_id'].shift(
            -1).values

    df['content_id_prior'] = content_id_prior_array
    df['content_id_next'] = content_id_next_array

    fold_df = get_fold(cfg, df)

    for col in ['content_id_prior', 'content_id_next']:
        te = TargetEncoding(fold_df)
        features_df[f'te_{col}_by_answered_correctly'] = te.fit_transform(
            df[col], df['answered_correctly'])
        dh.save(
            f'../data/processed/dropped___te_{col}_by_answered_correctly.pkl',
            te.encoder)

    features_df.columns = [f'dropped___{col}' for col in features_df.columns]

    return features_df
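TargetEncoding, get_fold, cfg and dh are repository internals that are not shown in these snippets. The following is a minimal sketch (not the repository's implementation) of an out-of-fold target encoder compatible with the calls above, assuming fold_df holds one column per fold with non-zero values marking that fold's validation rows:

import numpy as np
import pandas as pd


class TargetEncoding:
    """Hypothetical sketch; the repository's implementation may differ."""

    def __init__(self, fold_df):
        # fold_df: one column per fold, non-zero where the row belongs to
        # that fold's validation split (an assumption about its layout).
        self.fold_df = fold_df
        self.encoder = None

    def fit_transform(self, X, y):
        X = pd.Series(X).reset_index(drop=True)
        y = pd.Series(y).reset_index(drop=True)
        oof = np.full(len(X), np.nan)
        for col in self.fold_df.columns:
            val_mask = (self.fold_df[col] > 0).reset_index(drop=True)
            # Category means computed on the out-of-fold (training) rows only.
            trn_means = y[~val_mask].groupby(X[~val_mask]).mean()
            oof[val_mask.values] = X[val_mask].map(trn_means).values
        # The encoder saved to disk is fit on the full training data and is
        # what transform() uses at test time.
        self.encoder = y.groupby(X).mean()
        return oof

    def transform(self, X):
        return pd.Series(X).map(self.encoder).values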
Example No. 2
def get_features(df):
    features_df = pd.DataFrame()

    # Interaction of content_id and attempt_c so that each (content, attempt)
    # pair is target-encoded separately.
    df['content_id_and_attempt_c'] = df['content_id'].astype(
        str) + '_' + df['attempt_c'].astype(str)

    fold_df = get_fold(cfg, df)

    for col in ['content_id', 'tag', 'part', 'content_id_and_attempt_c']:
        te = TargetEncoding(fold_df)
        features_df[f'te_{col}_by_answered_correctly'] = te.fit_transform(
            df[col], df['answered_correctly'])
        dh.save(f'../data/processed/te_{col}_by_answered_correctly.pkl',
                te.encoder)

    return features_df
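get_fold is likewise repo-internal. A minimal sketch, assuming cfg is a dict-like config such as the edict shown in the last example and that the function returns one indicator column per fold (fold weighting is ignored here):

import numpy as np
import pandas as pd
from sklearn.model_selection import KFold


def get_fold(cfg, df, target_df=None):
    # Hypothetical sketch: only plain KFold is handled; the real factory
    # presumably dispatches on cfg['name'] (KFold, StratifiedKFold, ...)
    # and may apply cfg['weight'].
    n_splits = cfg['params']['n_splits']
    splitter = KFold(**cfg['params'])
    fold_df = pd.DataFrame(np.zeros((len(df), n_splits)),
                           columns=[f'fold_{i}' for i in range(n_splits)])
    for i, (_, val_idx) in enumerate(splitter.split(df)):
        fold_df.iloc[val_idx, i] = 1.0
    return fold_df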
Example No. 3
def main():
    t = Timer()
    seed_everything(cfg.common.seed)

    logger_path.mkdir(exist_ok=True)
    logging.basicConfig(filename=logger_path / 'train.log',
                        level=logging.DEBUG)

    dh.save(logger_path / 'config.yml', cfg)

    with t.timer('load data'):
        train_df = pd.read_csv(const.TRAIN_PATH)
        test_df = pd.read_csv(const.TEST_PATH)

    with t.timer('make folds'):
        fold_df = factory.get_fold(cfg.validation, train_df)
        if cfg.validation.single:
            fold_df = fold_df[['fold_0']]
            fold_df /= fold_df['fold_0'].max()

    with t.timer('drop index'):
        if cfg.common.drop is not None:
            drop_idx = factory.get_drop_idx(cfg.common.drop)
            train_df = train_df.drop(drop_idx, axis=0).reset_index(drop=True)
            fold_df = fold_df.drop(drop_idx, axis=0).reset_index(drop=True)

    with t.timer('train model'):
        trainer = NNTrainer(run_name, fold_df, cfg)
        cv = trainer.train(train_df=train_df,
                           target_df=train_df[const.TARGET_COL])
        preds = trainer.predict(test_df)
        trainer.save()

        run_name_cv = f'{run_name}_{cv:.3f}'
        logger_path.rename(f'../logs/{run_name_cv}')
        logging.disable(logging.FATAL)

    with t.timer('make submission'):
        make_submission(run_name=run_name_cv,
                        y_pred=preds,
                        target_name='Label',
                        comp=False)
        if cfg.common.kaggle.submit:
            kaggle = Kaggle(cfg.compe.name, run_name_cv)
            kaggle.submit(comment)
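Timer is another helper that is not shown. A minimal sketch, assuming t.timer(name) is a context manager that reports per-stage wall time and get_processing_time() returns the total elapsed minutes:

import time
from contextlib import contextmanager


class Timer:
    # Hypothetical sketch of the Timer helper used by the training scripts.
    def __init__(self):
        self.start = time.time()

    @contextmanager
    def timer(self, name):
        t0 = time.time()
        yield
        print(f'[{name}] done in {time.time() - t0:.1f} s')

    def get_processing_time(self):
        # Total elapsed time in minutes.
        return round((time.time() - self.start) / 60, 2)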
Example No. 4
def main():
    t = Timer()
    seed_everything(cfg.common.seed)

    logger_path.mkdir(exist_ok=True)
    logging.basicConfig(filename=logger_path / 'train.log',
                        level=logging.DEBUG)

    dh.save(logger_path / 'config.yml', cfg)
    dh.save(logger_path / 'features.yml', features_params)

    with t.timer('load data'):
        train_df = dh.load('../data/input/train.csv')
        train2019_df = dh.load('../data/input/train_concated.csv')
        train_x = factory.get_features(features, cfg.data.loader.train)
        test_x = factory.get_features(features, cfg.data.loader.test)
        train_y = factory.get_target(cfg.data.target)

    with t.timer('add oof'):
        if cfg.data.features.oof.name is not None:
            oof, preds = factory.get_oof(cfg.data)
            train_x['oof'] = oof
            test_x['oof'] = preds
            features.append('oof')

    with t.timer('make folds'):
        fold_df = factory.get_fold(cfg.validation, train_df,
                                   train_df[['target']])
        # Pad fold_df with zero rows for the appended 2019 data.
        fold_df = pd.concat([
            fold_df,
            pd.DataFrame(np.zeros((len(train2019_df), len(fold_df.columns))),
                         columns=fold_df.columns)
        ], axis=0, sort=False, ignore_index=True)
        if cfg.validation.single:
            fold_df = fold_df[['fold_0']]
            fold_df /= fold_df['fold_0'].max()

    with t.timer('drop index'):
        if cfg.common.drop is not None:
            drop_idx = factory.get_drop_idx(cfg.common.drop)
            train_x = train_x.drop(drop_idx, axis=0).reset_index(drop=True)
            train_y = train_y.drop(drop_idx, axis=0).reset_index(drop=True)
            fold_df = fold_df.drop(drop_idx, axis=0).reset_index(drop=True)

    with t.timer('prepare for ad'):
        if cfg.data.adversarial_validation:
            train_x, train_y = factory.get_ad(cfg, train_x, test_x)

    with t.timer('train and predict'):
        trainer = Trainer(cfg)
        cv = trainer.train(train_df=train_x,
                           target_df=train_y,
                           fold_df=fold_df)
        preds = trainer.predict(test_x)
        trainer.save(run_name)

        run_name_cv = f'{run_name}_{cv:.3f}'
        logger_path.rename(f'../logs/{run_name_cv}')
        logging.disable(logging.FATAL)

    with t.timer('make submission'):
        sample_path = '../data/input/sample_submission.csv'
        output_path = f'../data/output/{run_name_cv}.csv'
        make_submission(y_pred=preds,
                        target_name=cfg.data.target.name,
                        sample_path=sample_path,
                        output_path=output_path,
                        comp=False)
        if cfg.common.kaggle.submit:
            kaggle = Kaggle(cfg.compe.name, run_name_cv)
            kaggle.submit(comment)

    with t.timer('notify'):
        process_minutes = t.get_processing_time()
        message = f'''{cfg.model.name}\ncv: {cv:.3f}\ntime: {process_minutes}[min]'''
        send_line(notify_params.line.token, message)

        notion = Notion(token=notify_params.notion.token_v2)
        notion.set_url(url=notify_params.notion.url)
        notion.insert_rows({
            'name': run_name_cv,
            'created': now,
            'model': options.model,
            'local_cv': round(cv, 4),
            'time': process_minutes,
            'comment': comment
        })
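The Kaggle wrapper is also repo-internal. A rough sketch, assuming the official kaggle CLI is installed and authenticated, and that the submission file was written to ../data/output/{run_name}.csv as in the scripts above:

import subprocess


class Kaggle:
    # Hypothetical wrapper around the official kaggle CLI; the file path
    # convention below is an assumption based on make_submission above.
    def __init__(self, compe_name, run_name):
        self.compe_name = compe_name
        self.file_path = f'../data/output/{run_name}.csv'

    def submit(self, comment):
        subprocess.run(
            ['kaggle', 'competitions', 'submit',
             '-c', self.compe_name,
             '-f', self.file_path,
             '-m', comment],
            check=True)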
Example No. 5
def main():
    t = Timer()
    seed_everything(cfg.common.seed)

    logger_path.mkdir(exist_ok=True)
    logging.basicConfig(filename=logger_path / 'train.log', level=logging.DEBUG)

    dh.save(logger_path / 'config.yml', cfg)

    with t.timer('load data'):
        train_x = dh.load('../data/input/train_concated.csv')
        train_org_x = dh.load('../data/input/train.csv')
        train_2019_x = dh.load('../data/input/train_2019.csv')
        test_x = dh.load('../data/input/test.csv')

    with t.timer('make folds'):
        fold_org_df = factory.get_fold(cfg.validation.val1, train_org_x, train_org_x[[cfg.common.target]])
        fold2019_df = factory.get_fold(cfg.validation.val2, train_2019_x, train_2019_x[[cfg.common.target]])
        fold_df = pd.concat([fold_org_df, fold2019_df], axis=0, sort=False, ignore_index=True)
        if cfg.validation.val1.single:
            fold_df = fold_df[['fold_0']]
            fold_df /= fold_df['fold_0'].max()

    with t.timer('load features'):
        features = dh.load('../configs/feature/all.yml')['features']
        for f in features:
            train_x[f] = dh.load(f'../features/{f}_train.feather')[f].fillna(-1)
            test_x[f] = dh.load(f'../features/{f}_test.feather')[f].fillna(-1)

    with t.timer('drop several rows'):
        if cfg.common.drop is not None:
            drop_idx = factory.get_drop_idx(cfg.common.drop)
            train_x = train_x.drop(drop_idx, axis=0).reset_index(drop=True)
            fold_df = fold_df.drop(drop_idx, axis=0).reset_index(drop=True)

    with t.timer('train model'):
        result = train_model(run_name, train_x, fold_df, cfg)
    
    logging.disable(logging.FATAL)
    run_name_cv = f'{run_name}_{result["cv"]:.3f}'
    logger_path.rename(f'../logs/{run_name_cv}')

    with t.timer('predict'):
        preds = predict_test(run_name_cv, test_x, fold_df, cfg)

    with t.timer('post process'):
        # Hard-coded labels for test images treated as duplicates of training
        # images; their model predictions are overridden below.
        duplicates = {
            'ISIC_5224960': 1,
            'ISIC_9207777': 1,
            'ISIC_6457527': 1,
            'ISIC_8347588': 0,
            'ISIC_8372206': 1,
            'ISIC_9353360': 1,
            'ISIC_3689290': 0,
            'ISIC_3584949': 0,
        }
        for image_name, target in duplicates.items():
            idx = test_x[test_x['image_name'] == image_name].index[0]
            preds[idx] = target

    with t.timer('make submission'):
        sample_path = '../data/input/sample_submission.csv'
        output_path = f'../data/output/{run_name_cv}.csv'
        make_submission(y_pred=preds,
                        target_name=cfg.common.target,
                        sample_path=sample_path,
                        output_path=output_path,
                        comp=False)

    with t.timer('kaggle api'):
        kaggle = Kaggle(cfg.compe.compe_name, run_name_cv)
        if cfg.common.kaggle.submit:
            kaggle.submit(comment)

    with t.timer('notify'):
        process_minutes = t.get_processing_time()
        message = f'''{model_name}\ncv: {result["cv"]:.3f}\ntime: {process_minutes:.2f}[min]'''
        send_line(notify_params.line.token, message)

        notion = Notion(token=notify_params.notion.token_v2)
        notion.set_url(url=notify_params.notion.url)
        notion.insert_rows({
            'name': run_name_cv,
            'created': now,
            'model': cfg.model.name,
            'local_cv': round(result['cv'], 4),
            'time': process_minutes,
            'comment': comment
        })
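dh handles all file I/O in these scripts. A minimal sketch of such a module (imported, say, as `import data_handler as dh`), assuming it dispatches on the file extension; the real module may support more formats:

# Hypothetical sketch of the dh data-handling module.
import pickle

import pandas as pd
import yaml


def load(path):
    path = str(path)
    if path.endswith('.csv'):
        return pd.read_csv(path)
    if path.endswith('.feather'):
        return pd.read_feather(path)
    if path.endswith(('.yml', '.yaml')):
        with open(path) as f:
            return yaml.safe_load(f)
    with open(path, 'rb') as f:
        return pickle.load(f)


def save(path, obj):
    path = str(path)
    if path.endswith(('.yml', '.yaml')):
        with open(path, 'w') as f:
            yaml.safe_dump(dict(obj), f)
    else:
        with open(path, 'wb') as f:
            pickle.dump(obj, f)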
def get_features(train, test):
    # Out-of-fold target encoding of the categorical columns (the raw column
    # names are Japanese real-estate attributes).
    train_features_df = pd.DataFrame()
    test_features_df = pd.DataFrame()

    # TargetEncoding
    cfg = edict({
        'name': 'KFold',
        'params': {
            'n_splits': 5,
            'shuffle': True,
            'random_state': 0,
        },
        'split': {
            'y': 'y',
            'groups': None
        },
        'weight': [1.0]
    })
    fold_df = get_fold(cfg, train, train[['y']])

    # Map each raw (Japanese) column to its target-encoded feature name and
    # encode train (out-of-fold) and test with a fresh encoder per column.
    te_columns = {
        '種類': 'te_land_type',
        '地域': 'te_region',
        '市区町村コード': 'te_region_code',
        '市区町村名': 'te_city_name',
        '地区名': 'te_town_name',
        '最寄駅:名称': 'te_station_name',
        '土地の形状': 'te_land_shape',
        '建物の構造': 'te_structure',
        '今後の利用目的': 'te_feature_purpose',
        '改装': 'te_renovation',
        '都市計画': 'te_city_planning',
        '前面道路:方位': 'te_direction',
        '前面道路:種類': 'te_load_type',
    }
    for col, feature_name in te_columns.items():
        te = TargetEncoding(fold_df)
        train_features_df[feature_name] = te.fit_transform(train[col], train['y'])
        test_features_df[feature_name] = te.transform(test[col])

    return train_features_df, test_features_df
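Hypothetical usage of the function above; the input paths, the presence of a 'y' target column in the train table, and the feather output paths are assumptions rather than details taken from the repository:

import pandas as pd

# Hypothetical input paths; the real pipeline may load the data differently.
train = pd.read_csv('../data/input/train.csv')
test = pd.read_csv('../data/input/test.csv')

train_features_df, test_features_df = get_features(train, test)

# Persist the encoded features (output paths are assumptions as well).
train_features_df.to_feather('../features/te_train.feather')
test_features_df.to_feather('../features/te_test.feather')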