Example No. 1
import pandas as pd
from pathlib import Path

from src.train_v1.util.get_environment import get_datadir  # project helper, tested in Examples 3-6


def main():
    DATA_DIR = get_datadir()
    IN_DIR = f'{DATA_DIR}/raw'
    OUT_DIR = f'{DATA_DIR}/processed/raw_pickle_v1'
    Path(OUT_DIR).mkdir(exist_ok=True, parents=True)

    df_train = pd.read_csv(f'{IN_DIR}/train.csv')
    df_test = pd.read_csv(f'{IN_DIR}/test.csv')

    df_train.to_pickle(f'{OUT_DIR}/train.pkl')
    df_test.to_pickle(f'{OUT_DIR}/test.pkl')
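Downstream steps (see Example 7) reload these files with pd.read_pickle, which avoids re-parsing CSV and preserves dtypes. A minimal read-back sketch, assuming the local data directory implied by Example 5:

# Reloading the pickled frames produced above; the directory is the OUT_DIR
# from Example 1 under a local checkout (path assumed).
import pandas as pd

out_dir = 'data/train_v1/processed/raw_pickle_v1'
df_train = pd.read_pickle(f'{out_dir}/train.pkl')
df_test = pd.read_pickle(f'{out_dir}/test.pkl')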
Example No. 2
import glob
import os
import shutil
from pathlib import Path

from src.train_v1.util.get_environment import get_datadir, get_exec_env  # project helpers


def main():
    '''
    Download raw data from Kaggle & unzip.
    '''
    DATA_DIR = get_datadir()
    ENV = get_exec_env()
    OUT_DIR = f'{DATA_DIR}/raw'
    Path(OUT_DIR).mkdir(exist_ok=True, parents=True)

    if ENV in ['kaggle-Interactive', 'kaggle-Batch']:
        # On Kaggle the competition input is already mounted; just copy it
        for f in glob.glob(
                f'{DATA_DIR}/../../input/jane-street-market-prediction/*.csv'):
            shutil.copy2(f, OUT_DIR)
    elif ENV in ['colab', 'local']:
        # Otherwise fetch the data through the Kaggle CLI
        cmd = [
            f'kaggle competitions download -c titanic -p {OUT_DIR}',
            f'unzip {OUT_DIR}/titanic.zip -d {OUT_DIR}',
            f'rm {OUT_DIR}/titanic.zip'
        ]
        for c in cmd:
            os.system(c)
    else:
        raise ValueError(f'Unknown environment: {ENV}')
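Both pipeline examples branch on get_exec_env(), which is not shown on this page. A minimal sketch consistent with the four values the code checks for; the detection logic itself is an assumption:

# Hypothetical sketch of get_exec_env(); the real helper lives in
# src.train_v1.util.get_environment and its detection logic may differ.
import os
import sys


def get_exec_env() -> str:
    run_type = os.environ.get('KAGGLE_KERNEL_RUN_TYPE')  # set on Kaggle kernels
    if run_type in ['Interactive', 'Batch']:
        return f'kaggle-{run_type}'
    elif 'google.colab' in sys.modules:  # common Colab check
        return 'colab'
    else:
        return 'local'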
Example No. 3
def test_kaggleInteractive(self, mocker):
    # method of a pytest test class (class definition not shown); mocker is
    # the pytest-mock fixture
    mocker.patch('src.train_v1.util.get_environment.get_exec_env',
                 return_value='kaggle-Interactive')
    assert get_datadir() == '/kaggle/working/data/train_v1'
Example No. 4
def test_other(self, mocker):
    # an unrecognized environment should make get_datadir() raise
    mocker.patch('src.train_v1.util.get_environment.get_exec_env',
                 return_value='')
    with pytest.raises(ValueError):
        get_datadir()
Example No. 5
def test_local(self, mocker):
    mocker.patch('src.train_v1.util.get_environment.get_exec_env',
                 return_value='local')
    assert get_datadir() == str(pathlib.Path('./data/train_v1').resolve())
Example No. 6
def test_colab(self, mocker):
    mocker.patch('src.train_v1.util.get_environment.get_exec_env',
                 return_value='colab')
    assert get_datadir() == ''
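Examples 3-6 together pin down the contract of get_datadir(). A sketch that satisfies all four tests, written as if it lived in src.train_v1.util.get_environment (which is why the tests patch get_exec_env there); the 'kaggle-Batch' branch is an assumption, since only 'kaggle-Interactive' is tested:

# Sketch of get_datadir() reconstructed from the tests above; the real
# implementation is not shown on this page.
import pathlib


def get_datadir() -> str:
    env = get_exec_env()  # same module, so the tests can patch it in place
    if env in ['kaggle-Interactive', 'kaggle-Batch']:  # Batch branch assumed
        return '/kaggle/working/data/train_v1'
    elif env == 'colab':
        return ''  # Example 6 asserts the empty string for colab
    elif env == 'local':
        return str(pathlib.Path('./data/train_v1').resolve())
    else:
        raise ValueError(f'Unknown environment: {env}')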
Example No. 7
import pprint
from pathlib import Path

import mlflow
import numpy as np
import pandas as pd
import torch
from omegaconf import DictConfig

# Project helpers (seed_everything, get_head_commit, get_exec_env,
# has_changes_to_commit, get_datadir, get_device, NaFiller, get_model and the
# train_* functions) are imported from elsewhere in this repository.


def main(cfg: DictConfig) -> None:
    pprint.pprint(dict(cfg))

    # set random seed
    seed_everything(**cfg.random_seed)

    commit = get_head_commit()
    # refuse to run a production experiment with uncommitted changes
    if get_exec_env() == 'local':
        if cfg.experiment.tags.exec == 'prd' and has_changes_to_commit():
            raise Exception(
                'Changes must be committed before running production!')

    DATA_DIR = get_datadir()
    OUT_DIR = f'{DATA_DIR}/{cfg.experiment.name}/{cfg.experiment.tags.exec}{cfg.runno}'
    Path(OUT_DIR).mkdir(exist_ok=True, parents=True)

    device = get_device()

    # MLflow setup order matters: uri > experiment > run > everything else
    tracking_uri = 'http://mlflow-tracking-server:5000'
    mlflow.set_tracking_uri(
        tracking_uri
    )  # the uri must be set before set_experiment; artifact_uri is defined at the tracking server
    mlflow.set_experiment(cfg.experiment.name)
    mlflow.start_run()
    mlflow.set_tags(cfg.experiment.tags)
    if commit is not None:
        mlflow.set_tag('commit', commit)
    else:
        print('No commit hash')
    if get_exec_env() == 'local':
        mlflow.log_artifacts('.hydra/')
    else:
        print(
            'Note: configuration yaml is not logged in ipykernel environment')

    mlflow.set_tag('cv', cfg.cv.name)
    mlflow.set_tag('model', cfg.model.name)

    mlflow.log_param('feature_engineering', cfg.feature_engineering)
    mlflow.log_param('feature.name', [f.name for f in cfg.features])
    mlflow.log_params(cfg.cv.param)
    mlflow.log_params(cfg.model.model_param)
    mlflow.log_params(cfg.model.train_param)

    train = pd.DataFrame()

    # load features and collect the column names
    feat_cols = []
    for f in cfg.features:
        df = pd.read_pickle(f'{DATA_DIR}/{f.path}').loc[:, f.cols]
        train = pd.concat([train, df], axis=1)
        feat_cols += f.cols
        print(f'Feature: {f.name}, shape: {df.shape}')

    # load info
    if cfg.info.path is not None:
        df = pd.read_pickle(f'{DATA_DIR}/{cfg.info.path}').loc[:, cfg.info.cols]
        train = pd.concat([train, df], axis=1)

    # load target
    df = pd.read_pickle(f'{DATA_DIR}/{cfg.target.path}').loc[:, cfg.target.col]
    train = pd.concat([train, df], axis=1)

    print(f'Input feature shape: {train.shape}')

    # feature engineering
    nfl = NaFiller(cfg.feature_engineering.method_fillna,
                   feat_cols)  # fill missing values
    pipe = [nfl]
    for p in pipe:
        train = p.fit_transform(train)

    if nfl.mean_ is not None:
        np.save(f'{OUT_DIR}/nafiller_mean.npy', nfl.mean_.values)

    # Train
    if cfg.option.train:
        if cfg.cv.name == 'nocv':
            train_full(train, feat_cols, cfg.target.col, cfg.model.name,
                       cfg.model.model_param, cfg.model.train_param, OUT_DIR)
        elif cfg.cv.name == 'KFold':
            if cfg.model.type == 'pytorch-lightning':
                train_torch_KFold(train, feat_cols, [cfg.target.col],
                                  cfg.model.name, cfg.model.model_param,
                                  cfg.model.train_param, cfg.cv.param,
                                  cfg.optimizer, cfg.scheduler,
                                  cfg.loss_function, OUT_DIR)
            else:
                train_gbdt_KFold(train, feat_cols, cfg.target.col,
                                 cfg.model.name, cfg.model.model_param,
                                 cfg.model.train_param, cfg.cv.param, OUT_DIR)
        else:
            raise ValueError(f'Invalid cv: {cfg.cv.name}')

    # Predict
    if cfg.option.predict:
        print('Start predicting')
        # load data
        test = pd.read_pickle(f'{DATA_DIR}/{cfg.test.path}')
        sample_submission = pd.read_csv(
            f'{DATA_DIR}/raw/gender_submission.csv')
        y_pred = np.zeros(len(test))

        # feature engineering
        for p in pipe:
            test = p.transform(test)

        # load models
        models = []
        if cfg.model.type == 'pytorch-lightning':
            model_paths = [
                f'{OUT_DIR}/model_{i}.pth'
                for i in range(cfg.cv.param.n_splits)
            ]
            for model_path in model_paths:
                torch.cuda.empty_cache()
                model = get_model(cfg.model.name,
                                  cfg.model.model_param,
                                  feat_cols=feat_cols,
                                  target_cols=[cfg.target.col],
                                  device=device)
                model.to(device)
                model.load_state_dict(
                    torch.load(model_path, map_location=torch.device('cpu')))
                model.eval()
                models.append(model)
        elif cfg.model.type == 'pytorch':
            raise NotImplementedError()
        elif cfg.model.type == 'sklearn':
            model_paths = [
                f'{OUT_DIR}/model_{i}.pkl'
                for i in range(cfg.cv.param.n_splits)
            ]
            for model_path in model_paths:
                model = pd.read_pickle(model_path)
                models.append(model)
        else:
            raise ValueError(f'Invalid model.type: {cfg.model.type}')

        # ensemble models
        for model in models:
            if cfg.model.type in ['pytorch', 'pytorch-lightning']:
                # 1. create the prediction as a torch tensor
                # 2. convert torch.Tensor (418, 1) -> np.ndarray (418, 1) -> np.ndarray (418,)
                # 3. divide by len(models) to average across folds
                y_pred += model(torch.tensor(test[feat_cols].values, dtype=torch.float).to(device)) \
                    .sigmoid().detach().cpu() \
                    .numpy()[:, 0] \
                    / len(models)
            elif cfg.model.type == 'sklearn':
                y_pred += model.predict(test[feat_cols].values) / len(models)
            else:
                raise ValueError(f'Invalid model.type: {cfg.model.type}')

        y_pred = np.where(y_pred >= 0.5, 1, 0).astype(int)
        pred_df = pd.DataFrame(data={
            'PassengerId': test['PassengerId'].values,
            'Survived': y_pred
        })

        if pred_df.shape != sample_submission.shape:
            raise Exception(f'Incorrect pred_df.shape: {pred_df.shape}')

        submission_path = f'{OUT_DIR}/submission.csv'
        pred_df.to_csv(submission_path, index=False)
        print('End predicting')

    mlflow.log_artifacts(OUT_DIR)
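Example 7 also depends on a custom NaFiller transformer that is not shown on this page. From its usage alone (a constructor taking the fill method and the feature columns, fit_transform/transform, and a mean_ attribute that is only set when means are computed), a minimal sklearn-style sketch could look like the following; the 'mean' method name and the fallback behavior are assumptions:

# Minimal sketch of NaFiller as used in Example 7; the real implementation is
# not shown on this page, so details here are assumptions.
import pandas as pd


class NaFiller:
    def __init__(self, method: str, cols: list):
        self.method = method  # e.g. cfg.feature_engineering.method_fillna
        self.cols = cols
        self.mean_ = None  # Example 7 saves this to nafiller_mean.npy when set

    def fit(self, df: pd.DataFrame) -> 'NaFiller':
        if self.method == 'mean':  # the method name 'mean' is an assumption
            self.mean_ = df[self.cols].mean()
        return self

    def transform(self, df: pd.DataFrame) -> pd.DataFrame:
        df = df.copy()
        if self.method == 'mean':
            df[self.cols] = df[self.cols].fillna(self.mean_)
        else:
            df[self.cols] = df[self.cols].fillna(0)  # fallback is an assumption
        return df

    def fit_transform(self, df: pd.DataFrame) -> pd.DataFrame:
        return self.fit(df).transform(df)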