def __init__(self, model):
    """Set up training state: data, optimizer, ignite engines, logging, checkpoints.

    Args:
        model: model exposing a ``decoder`` submodule whose parameters are optimized
               (the rest of the model is left untouched by the optimizer).
    """
    # BUG FIX: the original device string "cuda: 0" contains a space;
    # torch.device() rejects it with "Invalid device string".
    self.device = th.device("cuda:0")
    self.model = model
    self.loss_fn = nn.NLLLoss()
    self.train_data = read_data("train")
    self.val_data = read_data("val")
    # Only decoder parameters are trained.
    self.optimizer = optim.Adam(self.model.decoder.parameters(), lr=3e-4)
    self.scheduler = optim.lr_scheduler.ExponentialLR(self.optimizer, 0.995)
    self.beam_searcher = BeamSearcher(self.model)
    self.stochastic_searcher = StochasticSearcher(self.model)
    self.trainer = create_supervised_trainer(
        self.model,
        self.optimizer,
        self.loss_fn,
        device=self.device,
        non_blocking=True,
    )
    self.evaluator = create_supervised_evaluator(
        self.model,
        metrics={"loss": Loss(self.loss_fn)},
        device=self.device,
        non_blocking=True,
    )
    # Timestamped run directory for TensorBoard logs.
    time = datetime.now().strftime("%Y%m%d%H%M")
    self.writer = SummaryWriter(f"./logs/{time}")
    self.saver = ModelCheckpoint("./models", "model", save_interval=5)
    self.register_events()
def __init__(self, model):
    """Set up training state on GPU 1: model, optimizer, loaders, searchers.

    Args:
        model: model exposing a ``decoder`` submodule whose parameters are optimized.
    """
    # BUG FIX: the original device string "cuda: 1" contains a space;
    # torch.device() rejects it with "Invalid device string".
    self.device = th.device("cuda:1")
    self.model = model.to(self.device)
    self.optimizer = optim.Adam(self.model.decoder.parameters(), 3e-4)
    self.scheduler = optim.lr_scheduler.ExponentialLR(self.optimizer, 0.995)
    # Timestamped run name shared by log directory and (presumably) checkpoints.
    self.name = datetime.now().strftime("%Y%m%d%H%M")
    self.writer = SummaryWriter(f"./logs/{self.name}")
    self.train_loader = DataLoader(
        read_data("train", "train"),
        batch_size=32,
        shuffle=True,
        num_workers=4,
        pin_memory=True,
        collate_fn=collate_fn,
    )
    self.beam_searcher = BeamSearcher(self.model)
    self.greedy_searcher = GreedySearcher(self.model)
    self.stochastic_searcher = StochasticSearcher(self.model)
    self.teacher_searcher = TeacherSearcher(self.model)
    # Validation loaders are capped at 5000 samples each to keep eval fast.
    train_set = read_data("train", "val")
    val_set = read_data("val", "val")
    self.val_loaders = {
        "train": DataLoader(
            Subset(train_set, list(range(5000))),
            batch_size=32,
            num_workers=4,
            pin_memory=True,
            collate_fn=collate_fn,
        ),
        "val": DataLoader(
            Subset(val_set, list(range(5000))),
            batch_size=32,
            num_workers=4,
            pin_memory=True,
            collate_fn=collate_fn,
        ),
    }
    # Teacher-forcing probability; presumably decayed elsewhere during training.
    self.teacher_power = 1.0
def predict_pipeline(training_pipeline_params):
    """Load a pretrained model, build features for the input data, predict,
    and write predictions to CSV.

    Returns the raw predictions.
    """
    # BUG FIX: the original leaked the file handle from a bare
    # pickle.load(open(...)); use a context manager. Also removed the unused
    # local `path = os.getcwd()`.
    # NOTE(review): pickle.load on a configured path — ensure the file is trusted.
    with open(training_pipeline_params.pretrained_model_path, 'rb') as model_file:
        model = pickle.load(model_file)
    logger.info(f"pretrained model {model} extracted")
    logger.info(
        f"start predict pipeline with params {training_pipeline_params}")
    data = read_data(training_pipeline_params.input_data_path)
    logger.info(f"data.shape is {data.shape}")
    data = drop_columns(data, training_pipeline_params.feature_params)
    logger.info(f"data.shape after dropping some columns is {data.shape}")
    # NOTE(review): the transformer is re-fit on prediction data here rather
    # than loaded from training — confirm this matches how the model was trained.
    transformer = build_transformer(training_pipeline_params.feature_params)
    transformer.fit(data)
    pred_features = make_features(transformer, data)
    predicts = predict_model(
        model,
        pred_features,
        training_pipeline_params.feature_params.use_log_trick,
    )
    predictions_path = training_pipeline_params.predictions_path
    # index=False is the documented way to omit the index column
    # (the original passed index=None).
    pd.DataFrame(predicts, columns=['predictions']).to_csv(
        predictions_path, index=False, mode='w')
    logger.info(f"predictions are written to {predictions_path}")
    return predicts
def train_pipeline(params: PipelineParams):
    """End-to-end training: split, transform, fit, evaluate, persist artifacts.

    Returns a tuple of (path to the dumped model, validation metrics dict).
    """
    logger.info(f"Start train with params {params}.")

    # Load and split the raw data.
    data = read_data(params.train_data_path)
    logger.info(f"Data shape is {data.shape}")
    data_train, data_val = split_train_val_data(data, params.split_params)
    logger.info(f"Train data shape is {data_train.shape}")
    logger.info(f"Validation data shape is {data_val.shape}")

    # Build training features; the transformer is fit on train data only.
    target_train = extract_target(data_train, params.features_params)
    data_train = data_train.drop(columns=['target'])
    transformer = build_transformer(params.features_params)
    transformer.fit(data_train)
    features_train = make_features(transformer, data_train)
    logger.info(f"Train features shape is {features_train.shape}")

    # Build validation features with the already-fit transformer.
    target_val = extract_target(data_val, params.features_params)
    data_val = data_val.drop(columns=['target'])
    features_val = make_features(transformer, data_val)
    logger.info(f"Validation features shape is {features_val.shape}")

    # Fit and evaluate.
    model = train_model(features_train, target_train, params.train_params)
    predicts = predict_model(model, features_val)
    metrics = evaluate_model(predicts, target_val)
    with open(params.metric_path, "w") as metric_file:
        json.dump(metrics, metric_file)
    logger.info(f"Metrics are: {metrics}")

    # Persist the model and the fitted transformer.
    path_to_model = dump_model(model, params.model_path)
    logger.info(f"Model saved at {params.model_path}")
    with open(params.transformer_path, "wb") as tr:
        pickle.dump(transformer, tr)
    logger.info(f"Feature transformer saved at {params.transformer_path}")

    logger.info("Finished.")
    return path_to_model, metrics
def predict_pipeline(params: PredictionPipelineParams) -> pd.DataFrame:
    """Load the persisted model and transformer, predict on the input data,
    save the predictions to CSV, and return them as a DataFrame."""
    logger.info(f"start predict pipeline")

    logger.info(f"open data")
    df = read_data(params.input_data_predict_path)
    logger.debug(f"data shape: {df.shape}")

    logger.info(f"load model")
    with open(params.output_model_path, "rb") as f:
        model = pickle.load(f)

    logger.info(f"load transformer")
    with open(params.output_transformer_path, "rb") as f:
        transformer = pickle.load(f)

    logger.info(f"create features")
    transformed_df = make_features(transformer, df.drop(columns=['target']))

    logger.info(f"prediction")
    predicts = model.predict(transformed_df)

    # Build the result frame once and reuse it for saving and returning.
    result = pd.DataFrame(predicts, columns=["target"])
    logger.info(f"save predictions")
    result.to_csv(params.output_data_predict_path, index=False)

    logger.info(f"predict pipeline is finished")
    return result
def main(data_path, model_path, output_path):
    """Load a pickled model, predict on the data at *data_path*
    (dropping any 'target' column first), and write results to *output_path*."""
    frame = read_data(data_path)
    with open(model_path, "rb") as model_file:
        model = pickle.load(model_file)
    # The target column must not be fed to the model if it is present.
    if "target" in frame:
        frame = frame.drop("target", axis=1)
    predict(model, frame).to_csv(output_path)
def train_pipeline(cfg):
    """Config-driven training: read, split, preprocess, fit, optionally
    serialize, then reload and report train/val ROC AUC."""
    # 1. read data
    logger.info(
        f"start train pipeline with config: \n\n{OmegaConf.to_yaml(cfg)} \n")
    data = read_data(cfg.input_data_path)
    logger.info(f"data.shape is {data.shape}")

    # 2. split strategy
    X_train, X_val = split_train_val_data(data, cfg.splitting_strategy,
                                          cfg.splitting_params)
    logger.info(
        f"X_train.shape is {X_train.shape} X_val.shape is {X_val.shape}")

    # 3. preprocess data
    logger.info("preprocess data...")
    transformer = RawDataPreprocessor()
    X_train = transformer.fit_transform(X_train)
    X_val = transformer.transform(X_val)
    selected_features = select_features(X_train, strategy='default')

    # 4. train — model chosen by config name
    if cfg.model.name == "rf":
        model = RandomForestClassifier(**cfg.model.train_params.model_params)
    elif cfg.model.name == "lr":
        model = LogisticRegression(**cfg.model.train_params.model_params)
    else:
        raise NotImplementedError(f"unknown model name: {cfg.model.name}")

    # BUG FIX: the original used `assert 1 == 0` for input validation, which
    # is silently stripped under `python -O`; raise an explicit error instead.
    if cfg.serialize_model and not cfg.fit_model:
        raise ValueError("you're trying to save model without fit() it!")

    if cfg.fit_model:
        logger.info("fit model")
        model.fit(X_train[selected_features], X_train['target'])

    # 5. save model
    if cfg.serialize_model:
        serialize_model(cfg.model_path, model, transformer, selected_features)

    # 6. validate using the serialized artifacts (round-trip check)
    logger.info("load model for validation")
    model, transformer, selected_features = load_model(cfg.model_path)
    train_preds = model.predict_proba(X_train[selected_features])[:, 1]
    train_score = roc_auc_score(X_train['target'], train_preds)
    if len(X_val) == 0:
        # No validation split configured — report NaN instead of failing.
        val_preds = None
        val_score = np.nan  # np.NaN alias was removed in NumPy 2.0
    else:
        val_preds = model.predict_proba(X_val[selected_features])[:, 1]
        val_score = roc_auc_score(X_val['target'], val_preds)
    logger.info(f'ROC AUC train: {train_score:.5f} val: {val_score:.5f}')
def main(cfg: Config) -> None:
    """Build an EDA report from the configured dataset and write it to the
    configured report directory (created if missing)."""
    logger.info("Started visualization.eda")
    logger.debug(f"App config: \n{OmegaConf.to_yaml(cfg)}")

    frame = read_data(get_path_from_root(cfg.eda.input_data_path))
    # The report class itself comes from config (hydra instantiation).
    report = hydra.utils.instantiate(cfg.eda.eda_class, frame)
    logger.info("Report is ready, writing to file")

    report_dir = get_path_from_root(cfg.eda.report_dir)
    Path(report_dir).mkdir(parents=True, exist_ok=True)
    report_path = os.path.join(report_dir, cfg.eda.report_file_name)
    report.to_file(report_path)
    logger.info("Finished visualization.eda")
def train_pipeline(cfg: Config) -> None:
    """Full training pipeline: split, transform, fit, evaluate, and
    optionally track the experiment and save the model bundle."""
    logger.info("Started train pipeline")
    logger.debug(f"App config: \n{OmegaConf.to_yaml(cfg)}")

    data = read_data(get_path_from_root(cfg.main.input_data_path))
    logger.debug(f"Data shape is {data.shape}")

    # Guard clause: only the simple split strategy is supported.
    if cfg.split.name != "simple_split":
        error_msg = f"Wrong split strategy {cfg.split.name}"
        logger.error(error_msg)
        raise ValueError(error_msg)
    train_data, val_data = split_train_val_data(
        data, typing.cast(SimpleSplitConfig, cfg.split)
    )

    train_features, train_target = separate_target(train_data, cfg.main.target_name)
    val_features, val_target = separate_target(val_data, cfg.main.target_name)

    logger.info("Started transforming data")
    # The transformer is fit on training data only, then applied to both splits.
    transformer = HeartDatasetTransformer(cfg=cfg.transformer).fit(
        train_features, train_target
    )
    train_features, train_target = transformer.transform(train_features, train_target)
    val_features, val_target = transformer.transform(val_features, val_target)
    logger.debug(
        "Transformed data shape\n"
        f"train_features: {train_features.shape}\n"
        f"train_target: {train_target.shape}\n"
        f"val_features: {val_features.shape}\n"
        f"val_target: {val_target.shape}"
    )
    logger.info("Finished transforming data")

    logger.info("Started training a classifier")
    # The classifier class comes from config (hydra instantiation).
    classifier = hydra.utils.instantiate(cfg.model).fit(train_features, train_target)
    logger.info("Finished training a classifier")

    logger.info("Started evaluating the classifier")
    val_predictions = classifier.predict(val_features)
    metrics = classification_report(val_target, val_predictions, output_dict=True)
    logger.debug(f"Metrics: \n{yaml.dump(metrics)}")
    logger.info("Finished evaluating the classifier")

    # The saved artifact bundles the classifier with its transformer.
    model = {"classifier": classifier, "transformer": transformer}
    if cfg.main.track.track_experiment:
        logger.info("Start saving experiment info")
        track_experiment(model, cfg, metrics)
        logger.info("Finished saving experiment info")
    if cfg.main.save_model.overwrite_main_model:
        logger.info("Start saving model")
        save_model(model, cfg.main.save_model)
        logger.info("Finished saving model")
    logger.info("Finished train pipeline")
def train_pipeline(training_pipeline_params: TrainingPipelineParams,
                   model: SklearnClassifierModel):
    """Train *model* on the configured dataset, evaluate on a held-out split,
    write metrics to disk, and serialize the fitted model.

    Returns a tuple of (path to serialized model, metrics dict).
    """
    logger.info(f"start train pipeline with params {training_pipeline_params}")
    # Short aliases for the two param groups used throughout.
    feature_params = training_pipeline_params.feature_params

    data = read_data(training_pipeline_params.input_data_path)
    logger.info(f"data.shape is {data.shape}")
    data = drop_columns(data, feature_params)
    logger.info(f"data.shape after dropping some columns is {data.shape}")

    train_df, val_df = split_train_val_data(
        data, training_pipeline_params.splitting_params)
    logger.info(f"train_df.shape is {train_df.shape}")
    logger.info(f"val_df.shape is {val_df.shape}")
    if train_df.shape[0] < NOT_ENOUGH_DATA_THRESHOLD:
        msg = "No enough data to build good model"
        logger.warning(msg)
        warning_logger.warning(msg)

    # Fit the feature transformer on training data only.
    transformer = build_transformer(feature_params)
    transformer.fit(train_df)
    train_features = make_features(transformer, train_df)
    train_target = extract_target(train_df, feature_params)
    logger.info(f"train_features.shape is {train_features.shape}")

    model = train_model(train_features, train_target, model)

    val_features = make_features(transformer, val_df)
    val_target = extract_target(val_df, feature_params)
    logger.info(f"val_features.shape is {val_features.shape}")

    predicts = predict_model(model, val_features, feature_params.use_log_trick)
    metrics = evaluate_model(
        predicts, val_target, use_log_trick=feature_params.use_log_trick)
    with open(training_pipeline_params.metric_path, "w") as metric_file:
        json.dump(metrics, metric_file)
    logger.info(f"metrics is {metrics}")

    path_to_model = serialize_model(
        model, training_pipeline_params.output_model_path)
    return path_to_model, metrics
def predict_pipeline(cfg):
    """Batch prediction: load the serialized model bundle, transform the raw
    input data, and write positive-class probabilities to CSV."""
    # BUG FIX: the original used `assert` for input validation, which is
    # silently stripped under `python -O`; raise an explicit error instead.
    if not os.path.exists(cfg.model_path):
        raise FileNotFoundError(f"model not found at {cfg.model_path}")

    logger.info("loading model for predict...")
    model, transformer, selected_features = load_model(cfg.model_path)

    X_test = read_data(cfg.predict_raw_data_path)
    X_test = transformer.transform(X_test)

    logger.info("predict probabilities")
    # Probability of the positive class only.
    test_preds = model.predict_proba(X_test[selected_features])[:, 1]
    test_preds_df = pd.DataFrame(data=test_preds, columns=['preds'])
    logger.info(f"save predicts to {cfg.predict_out_data_path}")
    test_preds_df.to_csv(cfg.predict_out_data_path, index=False)
def train_pipeline(params: TrainingPipelineParams) -> float:
    """Train a classifier, evaluate ROC-AUC on the held-out split, persist
    the model and transformer, and return the score."""
    logger.info(f"start train pipeline")
    df = read_data(params.input_data_path)
    logger.info(f"load data, shape: {df.shape}")

    # BUG FIX: log message typo — "spit" -> "split".
    logger.info(f"train/test split")
    train_df, test_df = split_train_val_data(df, params.split_params)
    logger.debug(f"train shape: {train_df.shape}")
    logger.debug(f"test shape: {test_df.shape}")

    logger.info(f"feature engineering")
    # The transformer is fit on the feature columns only (target dropped).
    transformer = build_transformer(params.feature_params)
    transformer.fit(train_df.drop(columns=['target']))

    logger.info(f"create train features and target")
    train_features = make_features(transformer, train_df.drop(columns=['target']))
    train_target = extract_target(train_df, params.feature_params)

    logger.info(f"fit model")
    model = Classifier(params.model_params)
    model.fit(train_features, train_target)
    logger.info(f"model is fitted")

    logger.info(f"create test features and target")
    test_features = make_features(transformer, test_df.drop(columns=['target']))
    test_target = extract_target(test_df, params.feature_params)

    logger.info(f"made predictions")
    pred = model.predict(test_features)
    score = get_score(test_target, pred)
    logger.debug(f"ROC-AUC: {score}")

    logger.info(f"save model")
    model.dump(params.output_model_path)
    logger.info(f"save transformer")
    with open(params.output_transformer_path, "wb") as f:
        pickle.dump(transformer, f)

    logger.info(f"train pipeline is finished")
    return score
def dataset(dataset_path) -> pd.DataFrame:
    """Return the dataset loaded from *dataset_path*.

    Presumably a pytest fixture (decorator not visible here) — TODO confirm.
    """
    loaded = read_data(dataset_path)
    return loaded
def test_read_data(dataset_path: str, target_col: str):
    """Smoke test: read_data yields a non-trivial dataset containing the target column."""
    loaded = read_data(dataset_path)
    assert len(loaded) > 10
    assert target_col in loaded.keys()