def main():
    '''
    Convert the raw train/test CSV files into pickled DataFrames.

    Reads ``train.csv`` and ``test.csv`` from ``<datadir>/raw`` and writes
    ``train.pkl`` and ``test.pkl`` to ``<datadir>/processed/raw_pickle_v1``,
    creating the output directory when it does not exist yet.
    '''
    data_root = get_datadir()
    raw_dir = f'{data_root}/raw'
    pickle_dir = f'{data_root}/processed/raw_pickle_v1'
    Path(pickle_dir).mkdir(exist_ok=True, parents=True)

    # Read every split first, then write, so nothing is written if a CSV
    # cannot be read.
    frames = {
        split: pd.read_csv(f'{raw_dir}/{split}.csv')
        for split in ('train', 'test')
    }
    for split, frame in frames.items():
        frame.to_pickle(f'{pickle_dir}/{split}.pkl')
def main():
    '''
    Download raw data from kaggle & unzip

    The files end up in ``<datadir>/raw`` (created if missing).

    * On Kaggle ('kaggle-Interactive' / 'kaggle-Batch') the CSV files of the
      attached input dataset are copied into the output directory.
    * On 'colab' / 'local' the competition archive is downloaded with the
      ``kaggle`` command line tool, extracted, and the archive is removed.

    Raises:
        ValueError: if the execution environment is none of the above.
        subprocess.CalledProcessError: if the ``kaggle`` download fails.
        FileNotFoundError: if the ``kaggle`` executable is not installed.
    '''
    # Local imports keep this block self-contained; they are only needed in
    # the download branch.
    import subprocess
    import zipfile

    DATA_DIR = get_datadir()
    ENV = get_exec_env()
    OUT_DIR = f'{DATA_DIR}/raw'
    Path(OUT_DIR).mkdir(exist_ok=True, parents=True)

    if ENV in ('kaggle-Interactive', 'kaggle-Batch'):
        # NOTE(review): this branch copies the jane-street dataset while the
        # other branch downloads titanic -- confirm which one is intended.
        for f in glob.glob(
                f'{DATA_DIR}/../../input/jane-street-market-prediction/*.csv'):
            shutil.copy2(f, OUT_DIR)
    elif ENV in ('colab', 'local'):
        archive = Path(OUT_DIR) / 'titanic.zip'
        # Argument list instead of a shell string: a path containing spaces
        # or shell metacharacters is passed through unchanged, and
        # check=True stops here when the download fails rather than
        # continuing with a missing archive.
        subprocess.run(
            ['kaggle', 'competitions', 'download', '-c', 'titanic',
             '-p', OUT_DIR],
            check=True)
        # Portable replacement for `unzip` + `rm`; existing files are
        # overwritten without prompting.
        with zipfile.ZipFile(archive) as zf:
            zf.extractall(OUT_DIR)
        archive.unlink()
    else:
        raise ValueError(f'Invalid execution environment: {ENV}')
def test_kaggleInteractive(self, mocker):
    '''get_datadir() returns the Kaggle working directory when the
    execution environment is reported as 'kaggle-Interactive'.'''
    mocker.patch(
        'src.train_v1.util.get_environment.get_exec_env',
        return_value='kaggle-Interactive',
    )
    actual = get_datadir()
    assert actual == '/kaggle/working/data/train_v1'
def test_other(self, mocker):
    '''get_datadir() raises ValueError when the execution environment is
    reported as an empty string.'''
    mocker.patch(
        'src.train_v1.util.get_environment.get_exec_env',
        return_value='',
    )
    with pytest.raises(ValueError):
        get_datadir()
def test_local(self, mocker):
    '''get_datadir() returns the absolute form of ./data/train_v1 when the
    execution environment is reported as 'local'.'''
    mocker.patch(
        'src.train_v1.util.get_environment.get_exec_env',
        return_value='local',
    )
    actual = get_datadir()
    # Resolved against the current working directory of the test run.
    expected = str(pathlib.Path('./data/train_v1').resolve())
    assert actual == expected
def test_colab(self, mocker):
    '''get_datadir() returns an empty string when the execution
    environment is reported as 'colab'.'''
    mocker.patch(
        'src.train_v1.util.get_environment.get_exec_env',
        return_value='colab',
    )
    actual = get_datadir()
    assert actual == ''
def main(cfg: DictConfig) -> None:
    '''
    Run one experiment described by ``cfg`` and track it in MLflow.

    Steps, in order:
      1. seed the random generators and read the current commit hash;
      2. refuse to run a local 'prd' experiment with uncommitted changes;
      3. create the output directory
         ``<datadir>/<experiment.name>/<experiment.tags.exec><runno>``;
      4. open an MLflow run and log tags / parameters;
      5. load features, optional info columns and the target;
      6. fill missing values with ``NaFiller``;
      7. train when ``cfg.option.train`` is set;
      8. predict and write ``submission.csv`` when ``cfg.option.predict``
         is set;
      9. log the whole output directory as MLflow artifacts.

    Args:
        cfg: experiment configuration. Sections read here: ``random_seed``,
            ``experiment``, ``runno``, ``cv``, ``model``, ``features``,
            ``info``, ``target``, ``test``, ``feature_engineering``,
            ``option``, ``optimizer``, ``scheduler``, ``loss_function``.

    Raises:
        Exception: local 'prd' run with uncommitted changes, or the
            prediction frame does not have the shape of the sample
            submission.
        ValueError: unknown ``cfg.cv.name`` or ``cfg.model.type``.
        NotImplementedError: ``cfg.model.type == 'pytorch'`` at predict time.
    '''
    pprint.pprint(dict(cfg))

    # set random seed
    seed_everything(**cfg.random_seed)

    commit = get_head_commit()
    exec_env = get_exec_env()

    # check for changes not commited
    if (exec_env == 'local' and cfg.experiment.tags.exec == 'prd'
            and has_changes_to_commit()):
        raise Exception('Changes must be commited before running production!')

    data_dir = get_datadir()
    out_dir = f'{data_dir}/{cfg.experiment.name}/{cfg.experiment.tags.exec}{cfg.runno}'
    Path(out_dir).mkdir(exist_ok=True, parents=True)

    device = get_device()

    # follow these sequences: uri > experiment > run > others
    # uri must be set before set_experiment. artifact_uri is defined at
    # tracking server
    tracking_uri = 'http://mlflow-tracking-server:5000'
    mlflow.set_tracking_uri(tracking_uri)
    mlflow.set_experiment(cfg.experiment.name)

    # The context manager ends the run when main() returns or raises, so a
    # second call in the same process can start a fresh run and a crashed
    # experiment is not left open.
    with mlflow.start_run():
        _log_run_metadata(cfg, commit, exec_env)

        train, feat_cols = _load_train_frame(cfg, data_dir)
        print(f'Input feature shape: {train.shape}')

        # feature engineering
        nfl = NaFiller(cfg.feature_engineering.method_fillna,
                       feat_cols)  # fill missing values
        pipe = [nfl]
        for p in pipe:
            train = p.fit_transform(train)
        if nfl.mean_ is not None:
            # Persist the fitted means next to the models.
            np.save(f'{out_dir}/nafiller_mean.npy', nfl.mean_.values)

        # Train
        if cfg.option.train:
            _train_models(cfg, train, feat_cols, out_dir)

        # Predict
        if cfg.option.predict:
            print('Start predicting')

            # load data
            test = pd.read_pickle(f'{data_dir}/{cfg.test.path}')
            sample_submission = pd.read_csv(
                f'{data_dir}/raw/gender_submission.csv')

            # feature engineering (reuse the transformers fitted on train)
            for p in pipe:
                test = p.transform(test)

            models = _load_models(cfg, out_dir, feat_cols, device)
            y_pred = _ensemble_predict(cfg, models, test, feat_cols, device)
            # Threshold the averaged score into a 0/1 label.
            y_pred = np.where(y_pred >= 0.5, 1, 0).astype(int)

            pred_df = pd.DataFrame(data={
                'PassengerId': test['PassengerId'].values,
                'Survived': y_pred
            })
            if pred_df.shape != sample_submission.shape:
                raise Exception(f'Incorrect pred_df.shape: {pred_df.shape}')

            submission_path = f'{out_dir}/submission.csv'
            pred_df.to_csv(submission_path, index=False)
            print('End predicting')

        mlflow.log_artifacts(out_dir)

    return None


def _log_run_metadata(cfg: DictConfig, commit, exec_env) -> None:
    '''Log tags, the hydra config folder and parameters to the active run.'''
    mlflow.set_tags(cfg.experiment.tags)
    if commit is not None:
        mlflow.set_tag('commit', commit)
    else:
        print('No commit hash')

    if exec_env == 'local':
        mlflow.log_artifacts('.hydra/')
    else:
        print(
            'Note: configuration yaml is not logged in ipykernel environment')

    mlflow.set_tag('cv', cfg.cv.name)
    mlflow.set_tag('model', cfg.model.name)
    mlflow.log_param('feature_engineering', cfg.feature_engineering)
    mlflow.log_param('feature.name', [f.name for f in cfg.features])
    mlflow.log_params(cfg.cv.param)
    mlflow.log_params(cfg.model.model_param)
    mlflow.log_params(cfg.model.train_param)


def _load_train_frame(cfg: DictConfig, data_dir) -> tuple:
    '''Return (train frame, feature column names) built from the pickles
    listed in ``cfg.features``, ``cfg.info`` and ``cfg.target``.'''
    train = pd.DataFrame()

    # load feature
    feat_cols = []
    for f in cfg.features:
        df = pd.read_pickle(f'{data_dir}/{f.path}').loc[:, f.cols]
        train = pd.concat([train, df], axis=1)
        feat_cols += f.cols
        print(f'Feature: {f.name}, shape: {df.shape}')

    # load info
    if cfg.info.path is not None:
        df = pd.read_pickle(f'{data_dir}/{cfg.info.path}').loc[:, cfg.info.cols]
        train = pd.concat([train, df], axis=1)

    # load target
    df = pd.read_pickle(f'{data_dir}/{cfg.target.path}').loc[:, cfg.target.col]
    train = pd.concat([train, df], axis=1)

    return train, feat_cols


def _train_models(cfg: DictConfig, train, feat_cols, out_dir) -> None:
    '''Dispatch to the training routine selected by ``cfg.cv.name`` and
    ``cfg.model.type``.'''
    if cfg.cv.name == 'nocv':
        train_full(train, feat_cols, cfg.target.col, cfg.model.name,
                   cfg.model.model_param, cfg.model.train_param, out_dir)
    elif cfg.cv.name == 'KFold':
        if cfg.model.type == 'pytorch-lightning':
            # The torch trainer takes a list of target columns.
            train_torch_KFold(train, feat_cols, [cfg.target.col],
                              cfg.model.name, cfg.model.model_param,
                              cfg.model.train_param, cfg.cv.param,
                              cfg.optimizer, cfg.scheduler,
                              cfg.loss_function, out_dir)
        else:
            train_gbdt_KFold(train, feat_cols, cfg.target.col,
                             cfg.model.name, cfg.model.model_param,
                             cfg.model.train_param, cfg.cv.param, out_dir)
    else:
        raise ValueError(f'Invalid cv: {cfg.cv.name}')


def _load_models(cfg: DictConfig, out_dir, feat_cols, device) -> list:
    '''Load one saved model per CV split (``cfg.cv.param.n_splits``) from
    ``out_dir``.'''
    models = []
    if cfg.model.type == 'pytorch-lightning':
        model_paths = [
            f'{out_dir}/model_{i}.pth' for i in range(cfg.cv.param.n_splits)
        ]
        for model_path in model_paths:
            torch.cuda.empty_cache()
            model = get_model(cfg.model.name,
                              cfg.model.model_param,
                              feat_cols=feat_cols,
                              target_cols=[cfg.target.col],
                              device=device)
            model.to(device)
            model.load_state_dict(
                torch.load(model_path, map_location=torch.device('cpu')))
            model.eval()
            models.append(model)
    elif cfg.model.type == 'pytorch':
        raise NotImplementedError()
    elif cfg.model.type == 'sklearn':
        model_paths = [
            f'{out_dir}/model_{i}.pkl' for i in range(cfg.cv.param.n_splits)
        ]
        for model_path in model_paths:
            # Pass the path so pandas opens and closes the file itself.
            # Unpickling executes code: only load files written by this
            # pipeline.
            models.append(pd.read_pickle(model_path))
    else:
        raise ValueError(f'Invalid model.type: {cfg.model.type}')
    return models


def _ensemble_predict(cfg: DictConfig, models, test, feat_cols, device):
    '''Return the mean prediction of ``models`` on ``test[feat_cols]`` as a
    1-D float array with one entry per test row.'''
    y_pred = np.zeros(len(test))
    # The feature matrix is the same for every model; build it once.
    features = test[feat_cols].values

    if cfg.model.type in ['pytorch', 'pytorch-lightning']:
        inputs = torch.tensor(features, dtype=torch.float).to(device)
        # Inference only: no autograd graph is needed.
        with torch.no_grad():
            for model in models:
                # 1. create prediction as torch.tensor
                # 2. convert torch.tensor(n, 1) -> np.ndarray(n, 1)
                #    -> np.ndarray(n,)
                # 3. divide by len(models)
                y_pred += (model(inputs).sigmoid().detach().cpu()
                           .numpy()[:, 0] / len(models))
    elif cfg.model.type == 'sklearn':
        for model in models:
            y_pred += model.predict(features) / len(models)
    else:
        raise ValueError(f'Invalid model.type: {cfg.model.type}')

    return y_pred