Example #1
 def __init__(self, model):
     self.model = model
     self.loss_fn = nn.NLLLoss()
     self.train_data = read_data("train")
     self.val_data = read_data("val")
     self.optimizer = optim.Adam(model.decoder.parameters(), lr=3e-4)
     self.scheduler = optim.lr_scheduler.ExponentialLR(
         self.optimizer, 0.995)
     self.device = th.device("cuda:0")
     self.beam_searcher = BeamSearcher(self.model)
     self.stochastic_searcher = StochasticSearcher(self.model)
     self.trainer = create_supervised_trainer(
         self.model,
         self.optimizer,
         self.loss_fn,
         device=self.device,
         non_blocking=True,
     )
     self.evaluator = create_supervised_evaluator(
         self.model,
         metrics={"loss": Loss(self.loss_fn)},
         device=self.device,
         non_blocking=True,
     )
     time = datetime.now().strftime("%Y%m%d%H%M")
     self.writer = SummaryWriter(f"./logs/{time}")
     self.saver = ModelCheckpoint("./models", "model", save_interval=5)
     self.register_events()
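For context, a minimal launch sketch, assuming the __init__ above belongs to a class named Trainer and that read_data returns a map-style Dataset (the DataLoader wrapping, batch size, and max_epochs are my additions, not part of the snippet):

from torch.utils.data import DataLoader

trainer = Trainer(model)  # assumed name of the class whose __init__ is shown above
loader = DataLoader(trainer.train_data, batch_size=32, shuffle=True)
# create_supervised_trainer returns an ignite Engine; run() drives the epochs
trainer.trainer.run(loader, max_epochs=10)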
Example #2
 def __init__(self, model):
     self.device = th.device("cuda:1")
     self.model = model.to(self.device)
     self.optimizer = optim.Adam(self.model.decoder.parameters(), 3e-4)
     self.scheduler = optim.lr_scheduler.ExponentialLR(
         self.optimizer, 0.995)
     self.name = datetime.now().strftime("%Y%m%d%H%M")
     self.writer = SummaryWriter(f"./logs/{self.name}")
     self.train_loader = DataLoader(read_data("train", "train"),
                                    batch_size=32,
                                    shuffle=True,
                                    num_workers=4,
                                    pin_memory=True,
                                    collate_fn=collate_fn)
     self.beam_searcher = BeamSearcher(self.model)
     self.greedy_searcher = GreedySearcher(self.model)
     self.stochastic_searcher = StochasticSearcher(self.model)
     self.teacher_searcher = TeacherSearcher(self.model)
     train_set = read_data("train", "val")
     val_set = read_data("val", "val")
     self.val_loaders = {
         "train": DataLoader(Subset(train_set, list(range(5000))),
                             batch_size=32,
                             num_workers=4,
                             pin_memory=True,
                             collate_fn=collate_fn),
         "val": DataLoader(Subset(val_set, list(range(5000))),
                           batch_size=32,
                           num_workers=4,
                           pin_memory=True,
                           collate_fn=collate_fn),
     }
     self.teacher_power = 1.0
Example #3
def predict_pipeline(training_pipeline_params):
    with open(training_pipeline_params.pretrained_model_path, 'rb') as f:
        model = pickle.load(f)
    logger.info(f"pretrained model {model} loaded")

    logger.info(
        f"start predict pipeline with params {training_pipeline_params}")
    data = read_data(training_pipeline_params.input_data_path)
    logger.info(f"data.shape is {data.shape}")
    data = drop_columns(data, training_pipeline_params.feature_params)
    logger.info(f"data.shape after dropping some columns is {data.shape}")

    transformer = build_transformer(training_pipeline_params.feature_params)
    transformer.fit(data)
    pred_features = make_features(transformer, data)

    predicts = predict_model(
        model,
        pred_features,
        training_pipeline_params.feature_params.use_log_trick,
    )
    predictions_path = training_pipeline_params.predictions_path
    pd.DataFrame(predicts, columns=['predictions']).to_csv(predictions_path,
                                                           index=False,
                                                           mode='w')
    logger.info(f"predictions are written to {predictions_path}")

    return predicts
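The use_log_trick flag passed to predict_model above most likely undoes a log-transform of the target at prediction time. The actual implementation is not shown, so this is only a sketch of the common pattern:

import numpy as np

def predict_model(model, features, use_log_trick: bool = False) -> np.ndarray:
    # Sketch (assumption): the target was trained as log1p(y),
    # so invert the transform on the way out with expm1
    predicts = model.predict(features)
    if use_log_trick:
        predicts = np.expm1(predicts)
    return predicts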
Example #4
def train_pipeline(params: PipelineParams):
    logger.info(f"Start train with params {params}.")
    data = read_data(params.train_data_path)
    logger.info(f"Data shape is {data.shape}")
    data_train, data_val = split_train_val_data(data, params.split_params)
    logger.info(f"Train data shape is {data_train.shape}")
    logger.info(f"Validation data shape is {data_val.shape}")
    target_train = extract_target(data_train, params.features_params)
    data_train = data_train.drop(columns=['target'])
    transformer = build_transformer(params.features_params)
    transformer.fit(data_train)
    features_train = make_features(transformer, data_train)
    logger.info(f"Train features shape is {features_train.shape}")
    target_val = extract_target(data_val, params.features_params)
    data_val = data_val.drop(columns=['target'])
    features_val = make_features(transformer, data_val)
    logger.info(f"Validation features shape is {features_val.shape}")

    model = train_model(features_train, target_train, params.train_params)
    predicts = predict_model(model, features_val)
    metrics = evaluate_model(predicts, target_val)
    with open(params.metric_path, "w") as metric_file:
        json.dump(metrics, metric_file)
    logger.info(f"Metrics are: {metrics}")
    path_to_model = dump_model(model, params.model_path)
    logger.info(f"Model saved at {params.model_path}")
    with open(params.transformer_path, "wb") as tr:
        pickle.dump(transformer, tr)
    logger.info(f"Feature transformer saved at {params.transformer_path}")
    logger.info("Finished.")
    return path_to_model, metrics
Example #5
def predict_pipeline(params: PredictionPipelineParams) -> pd.DataFrame:
    logger.info("start predict pipeline")

    logger.info("open data")
    df = read_data(params.input_data_predict_path)
    logger.debug(f"data shape: {df.shape}")

    logger.info("load model")
    with open(params.output_model_path, "rb") as f:
        model = pickle.load(f)

    logger.info("load transformer")
    with open(params.output_transformer_path, "rb") as f:
        transformer = pickle.load(f)

    logger.info("create features")
    transformed_df = make_features(transformer, df.drop(columns=['target']))

    logger.info("prediction")
    predicts = model.predict(transformed_df)

    logger.info("save predictions")
    predictions = pd.DataFrame(predicts, columns=["target"])
    predictions.to_csv(params.output_data_predict_path, index=False)

    logger.info("predict pipeline is finished")
    return predictions
Example #6
def main(data_path, model_path, output_path):
    data = read_data(data_path)
    with open(model_path, "rb") as model_file:
        model = pickle.load(model_file)
    if "target" in data:
        data = data.drop("target", axis=1)
    predictions = predict(model, data)
    predictions.to_csv(output_path)
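The predict helper used here is not shown; judging by the .to_csv call it returns a pandas object, so a minimal sketch could be (return type and column name are assumptions):

import pandas as pd

def predict(model, data: pd.DataFrame) -> pd.Series:
    # Sketch (assumption): wrap the raw model output so the caller can .to_csv() it
    return pd.Series(model.predict(data), index=data.index, name="prediction")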
Example #7
def train_pipeline(cfg):

    # 1. read data
    logger.info(
        f"start train pipeline with config: \n\n{OmegaConf.to_yaml(cfg)} \n")

    data = read_data(cfg.input_data_path)
    logger.info(f"data.shape is {data.shape}")

    # 2. split strategy
    X_train, X_val = split_train_val_data(data, cfg.splitting_strategy,
                                          cfg.splitting_params)
    logger.info(
        f"X_train.shape is {X_train.shape} X_val.shape is {X_val.shape}")

    # 3. preprocess data
    logger.info("preprocess data...")
    transformer = RawDataPreprocessor()
    X_train = transformer.fit_transform(X_train)
    X_val = transformer.transform(X_val)

    selected_features = select_features(X_train, strategy='default')

    # 4. train
    if cfg.model.name == "rf":
        model = RandomForestClassifier(**cfg.model.train_params.model_params)
    elif cfg.model.name == "lr":
        model = LogisticRegression(**cfg.model.train_params.model_params)
    else:
        raise NotImplementedError()

    if cfg.fit_model:
        logger.info("fit model")
        model.fit(X_train[selected_features], X_train['target'])

    # 5. save model
    if not cfg.fit_model and cfg.serialize_model:
        raise ValueError("trying to serialize a model that was never fitted")

    if cfg.serialize_model:
        serialize_model(cfg.model_path, model, transformer, selected_features)

    # 6. validate
    logger.info("load model for validation")
    model, transformer, selected_features = load_model(cfg.model_path)

    train_preds = model.predict_proba(X_train[selected_features])[:, 1]
    train_score = roc_auc_score(X_train['target'], train_preds)

    if len(X_val) == 0:
        val_preds = None
        val_score = np.nan
    else:
        val_preds = model.predict_proba(X_val[selected_features])[:, 1]
        val_score = roc_auc_score(X_val['target'], val_preds)

    logger.info(f'ROC AUC train: {train_score:.5f} val: {val_score:.5f}')
Example #8
def main(cfg: Config) -> None:
    logger.info("Started visualization.eda")
    logger.debug(f"App config: \n{OmegaConf.to_yaml(cfg)}")
    data = read_data(get_path_from_root(cfg.eda.input_data_path))
    report = hydra.utils.instantiate(cfg.eda.eda_class, data)
    logger.info("Report is ready, writing to file")
    report_dir = get_path_from_root(cfg.eda.report_dir)
    Path(report_dir).mkdir(parents=True, exist_ok=True)
    report.to_file(os.path.join(report_dir, cfg.eda.report_file_name))
    logger.info("Finished visualization.eda")
Example #9
def train_pipeline(cfg: Config) -> None:
    logger.info("Started train pipeline")
    logger.debug(f"App config: \n{OmegaConf.to_yaml(cfg)}")
    data = read_data(get_path_from_root(cfg.main.input_data_path))
    logger.debug(f"Data shape is {data.shape}")
    if cfg.split.name == "simple_split":
        train_data, val_data = split_train_val_data(
            data, typing.cast(SimpleSplitConfig, cfg.split)
        )
    else:
        error_msg = f"Wrong split strategy {cfg.split.name}"
        logger.error(error_msg)
        raise ValueError(error_msg)

    train_features, train_target = separate_target(train_data, cfg.main.target_name)
    val_features, val_target = separate_target(val_data, cfg.main.target_name)

    logger.info("Started transforming data")
    transformer = HeartDatasetTransformer(cfg=cfg.transformer).fit(
        train_features, train_target
    )
    train_features, train_target = transformer.transform(train_features, train_target)
    val_features, val_target = transformer.transform(val_features, val_target)
    logger.debug(
        "Transformed data shape\n"
        f"train_features: {train_features.shape}\n"
        f"train_target: {train_target.shape}\n"
        f"val_features: {val_features.shape}\n"
        f"val_target: {val_target.shape}"
    )
    logger.info("Finished transforming data")

    logger.info("Started training a classifier")
    classifier = hydra.utils.instantiate(cfg.model).fit(train_features, train_target)
    logger.info("Finished training a classifier")

    logger.info("Started evaluating the classifier")
    val_predictions = classifier.predict(val_features)
    metrics = classification_report(val_target, val_predictions, output_dict=True)
    logger.debug(f"Metrics: \n{yaml.dump(metrics)}")
    logger.info("Finished evaluating the classifier")

    model = {"classifier": classifier, "transformer": transformer}

    if cfg.main.track.track_experiment:
        logger.info("Start saving experiment info")
        track_experiment(model, cfg, metrics)
        logger.info("Finished saving experiment info")

    if cfg.main.save_model.overwrite_main_model:
        logger.info("Start saving model")
        save_model(model, cfg.main.save_model)
        logger.info("Finished saving model")

    logger.info("Finished train pipeline")
Example #10
def train_pipeline(training_pipeline_params: TrainingPipelineParams, model: SklearnClassifierModel):
    logger.info(f"start train pipeline with params {training_pipeline_params}")
    data = read_data(training_pipeline_params.input_data_path)
    logger.info(f"data.shape is {data.shape}")
    data = drop_columns(data, training_pipeline_params.feature_params)
    logger.info(f"data.shape after dropping some columns is {data.shape}")
    train_df, val_df = split_train_val_data(
        data, training_pipeline_params.splitting_params
    )
    logger.info(f"train_df.shape is {train_df.shape}")
    logger.info(f"val_df.shape is {val_df.shape}")

    if train_df.shape[0] < NOT_ENOUGH_DATA_THRESHOLD:
        msg = "No enough data to build good model"
        logger.warning(msg)
        warning_logger.warning(msg)

    transformer = build_transformer(training_pipeline_params.feature_params)
    transformer.fit(train_df)
    train_features = make_features(transformer, train_df)
    train_target = extract_target(train_df, training_pipeline_params.feature_params)

    logger.info(f"train_features.shape is {train_features.shape}")

    model = train_model(
        train_features, train_target, model
    )

    val_features = make_features(transformer, val_df)
    val_target = extract_target(val_df, training_pipeline_params.feature_params)

    logger.info(f"val_features.shape is {val_features.shape}")
    predicts = predict_model(
        model,
        val_features,
        training_pipeline_params.feature_params.use_log_trick,
    )

    metrics = evaluate_model(
        predicts,
        val_target,
        use_log_trick=training_pipeline_params.feature_params.use_log_trick,
    )

    with open(training_pipeline_params.metric_path, "w") as metric_file:
        json.dump(metrics, metric_file)
    logger.info(f"metrics is {metrics}")

    path_to_model = serialize_model(model, training_pipeline_params.output_model_path)

    return path_to_model, metrics
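serialize_model here takes a model and an output path and returns the path (note that Example #7 calls a serialize_model with a different signature, so these are different helpers). A plausible pickle-based sketch for this variant, purely an assumption:

import pickle

def serialize_model(model, output_path: str) -> str:
    # Sketch (assumption): dump the fitted model with pickle and return the path
    with open(output_path, "wb") as f:
        pickle.dump(model, f)
    return output_path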
Example #11
def predict_pipeline(cfg):
    assert os.path.exists(cfg.model_path)

    logger.info("loading model for predict...")
    model, transformer, selected_features = load_model(cfg.model_path)

    X_test = read_data(cfg.predict_raw_data_path)
    X_test = transformer.transform(X_test)
    
    logger.info("predict probabilities")
    test_preds = model.predict_proba(X_test[selected_features])[:, 1]
    test_preds_df = pd.DataFrame(data=test_preds, columns=['preds'])

    logger.info(f"save predicts to {cfg.predict_out_data_path}")
    test_preds_df.to_csv(cfg.predict_out_data_path, index=False)
Example #12
def train_pipeline(params: TrainingPipelineParams) -> float:
    logger.info("start train pipeline")

    df = read_data(params.input_data_path)
    logger.info(f"load data, shape: {df.shape}")

    logger.info(f"train/test spit")
    train_df, test_df = split_train_val_data(df, params.split_params)
    logger.debug(f"train shape: {train_df.shape}")
    logger.debug(f"test shape: {test_df.shape}")

    logger.info(f"feature engineering")
    transformer = build_transformer(params.feature_params)
    transformer.fit(train_df.drop(columns=['target']))

    logger.info(f"create train features and target")
    train_features = make_features(transformer,
                                   train_df.drop(columns=['target']))
    train_target = extract_target(train_df, params.feature_params)

    logger.info(f"fit model")
    model = Classifier(params.model_params)
    model.fit(train_features, train_target)
    logger.info(f"model is fitted")

    logger.info(f"create test features and target")
    test_features = make_features(transformer,
                                  test_df.drop(columns=['target']))
    test_target = extract_target(test_df, params.feature_params)

    logger.info(f"made predictions")
    pred = model.predict(test_features)

    score = get_score(test_target, pred)
    logger.debug(f"ROC-AUC: {score}")

    logger.info(f"save model")
    model.dump(params.output_model_path)

    logger.info(f"save transformer")
    with open(params.output_transformer_path, "wb") as f:
        pickle.dump(transformer, f)

    logger.info(f"train pipeline is finished")
    return score
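The Classifier wrapper is not part of the snippet; a minimal sketch consistent with the calls above (fit, predict, dump), with the underlying estimator chosen arbitrarily:

import pickle
from sklearn.linear_model import LogisticRegression

class Classifier:
    # Sketch (assumption): thin wrapper matching the fit/predict/dump calls above
    def __init__(self, model_params: dict):
        self.model = LogisticRegression(**model_params)  # estimator choice is arbitrary

    def fit(self, features, target):
        self.model.fit(features, target)

    def predict(self, features):
        return self.model.predict(features)

    def dump(self, path: str) -> None:
        with open(path, "wb") as f:
            pickle.dump(self.model, f)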
Example #13
def dataset(dataset_path) -> pd.DataFrame:
    return read_data(dataset_path)
def test_read_data(dataset_path: str, target_col: str):
    data = read_data(dataset_path)
    assert len(data) > 10
    assert target_col in data.columns
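Every example above relies on a project-local read_data. Given that callers inspect .shape and the test checks column membership, the simplest implementation consistent with this usage is a thin pandas.read_csv wrapper (an assumption, since the real helper is not shown and some examples pass it extra arguments):

import pandas as pd

def read_data(path: str) -> pd.DataFrame:
    # Sketch (assumption): load a CSV into a DataFrame, matching the
    # .shape and column checks performed by the examples and the test above
    return pd.read_csv(path)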