def test_save_and_load_dict():
    with tempfile.TemporaryDirectory() as dp:
        d = {"hello": "world"}
        fp = Path(dp, "d.json")
        utils.save_dict(d=d, filepath=fp)
        d = utils.load_dict(filepath=fp)
        assert d["hello"] == "world"
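# Hedged companion test sketch (not in the original): save_dict is also called with list
# payloads elsewhere in this module (e.g. the feature records saved by compute_features
# below), so this assumes save_dict/load_dict round-trip lists the same way plain json does.
def test_save_and_load_list_of_dicts():
    with tempfile.TemporaryDirectory() as dp:
        records = [{"id": 1, "text": "hello"}, {"id": 2, "text": "world"}]
        fp = Path(dp, "records.json")
        utils.save_dict(d=records, filepath=fp)
        loaded = utils.load_dict(filepath=fp)
        assert loaded == records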
def compute_features(params: Namespace):
    """Compute features to use for training.

    Args:
        params (Namespace): Input parameters for operations.

    Returns:
        The features DataFrame and its records as a list of dicts.
    """
    # Set up
    utils.set_seed(seed=params.seed)

    # Load data
    projects_url = (
        "https://raw.githubusercontent.com/GokuMohandas/MadeWithML/main/datasets/projects.json"
    )
    projects = utils.load_json_from_url(url=projects_url)
    df = pd.DataFrame(projects)

    # Compute features
    df["text"] = df.title + " " + df.description
    df.drop(columns=["title", "description"], inplace=True)
    df = df[["id", "created_on", "text", "tags"]]

    # Save
    features = df.to_dict(orient="records")
    df_dict_fp = Path(config.DATA_DIR, "features.json")
    utils.save_dict(d=features, filepath=df_dict_fp)

    return df, features
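# Hedged usage sketch (not part of the original module): compute_features only needs a
# Namespace with a `seed` attribute here, typically loaded from config/params.json.
# Requires network access since the projects dataset is fetched from GitHub.
def example_compute_features():  # pragma: no cover, illustrative only
    params = Namespace(**utils.load_dict(filepath=Path(config.CONFIG_DIR, "params.json")))
    df, features = compute_features(params=params)
    logger.info(f"Computed features for {len(df)} projects.")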
def behavioral_reevaluation(
    model_dir: Path = config.MODEL_DIR,
):  # pragma: no cover, requires changing existing runs
    """Reevaluate existing runs on current behavioral tests in eval.py.

    This is possible since behavioral tests are inputs applied to black box
    models and compared with expected outputs. There is no dependency on
    data or model versions.

    Args:
        model_dir (Path): Location of model artifacts. Defaults to config.MODEL_DIR.

    Raises:
        ValueError: Run id doesn't exist in experiment.
    """
    # Generate behavioral report
    artifacts = main.load_artifacts(model_dir=model_dir)
    artifacts["performance"]["behavioral"] = eval.get_behavioral_report(artifacts=artifacts)
    mlflow.log_metric("behavioral_score", artifacts["performance"]["behavioral"]["score"])

    # Log updated performance
    utils.save_dict(artifacts["performance"], Path(model_dir, "performance.json"))
def train_model(
    params_fp: Path = Path(config.CONFIG_DIR, "params.json"),
    model_dir: Optional[Path] = Path(config.MODEL_DIR),
    experiment_name: Optional[str] = "best",
    run_name: Optional[str] = "model",
) -> None:
    """Train a model using the specified parameters.

    Args:
        params_fp (Path, optional): Parameters to use for training. Defaults to `config/params.json`.
        model_dir (Path, optional): Location of model artifacts. Defaults to config.MODEL_DIR.
        experiment_name (str, optional): Name of the experiment to save the run to. Defaults to `best`.
        run_name (str, optional): Name of the run. Defaults to `model`.
    """
    # Set experiment and start run
    params = Namespace(**utils.load_dict(filepath=params_fp))

    # Start run
    mlflow.set_experiment(experiment_name=experiment_name)
    with mlflow.start_run(run_name=run_name):
        # Train
        artifacts = main.run(params=params)

        # Set tags
        tags = {}
        mlflow.set_tags(tags)

        # Log metrics
        performance = artifacts["performance"]
        logger.info(json.dumps(performance["overall"], indent=2))
        metrics = {
            "precision": performance["overall"]["precision"],
            "recall": performance["overall"]["recall"],
            "f1": performance["overall"]["f1"],
            "best_val_loss": artifacts["loss"],
            "behavioral_score": performance["behavioral"]["score"],
            "slices_f1": performance["slices"]["overall"]["f1"],
        }
        mlflow.log_metrics(metrics)

        # Log artifacts
        with tempfile.TemporaryDirectory() as dp:
            artifacts["label_encoder"].save(Path(dp, "label_encoder.json"))
            artifacts["tokenizer"].save(Path(dp, "tokenizer.json"))
            torch.save(artifacts["model"].state_dict(), Path(dp, "model.pt"))
            utils.save_dict(performance, Path(dp, "performance.json"))
            mlflow.log_artifacts(dp)
        mlflow.log_params(vars(artifacts["params"]))

    # Save for repo
    with open(Path(model_dir, "params.json"), "w") as fp:
        json.dump(vars(params), fp=fp, indent=2, cls=NumpyEncoder)
    artifacts["label_encoder"].save(Path(model_dir, "label_encoder.json"))
    artifacts["tokenizer"].save(Path(model_dir, "tokenizer.json"))
    torch.save(artifacts["model"].state_dict(), Path(model_dir, "model.pt"))
    utils.save_dict(performance, Path(model_dir, "performance.json"))
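# Hedged usage sketch (not in the original): training with the default params file but a
# custom experiment/run name; assumes an MLflow tracking URI has already been configured.
def example_train_model():  # pragma: no cover, illustrative only
    train_model(
        params_fp=Path(config.CONFIG_DIR, "params.json"),
        model_dir=config.MODEL_DIR,
        experiment_name="baselines",
        run_name="cnn",
    )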
def download_auxiliary_data():
    """Load auxiliary data from URL and save to local drive."""
    # Download auxiliary data
    tags_url = "https://raw.githubusercontent.com/GokuMohandas/MadeWithML/main/datasets/tags.json"
    tags = utils.load_json_from_url(url=tags_url)

    # Save data
    tags_fp = Path(config.DATA_DIR, "tags.json")
    utils.save_dict(d=tags, filepath=tags_fp)
    logger.info("✅ Auxiliary data downloaded!")
def update_behavioral_report(run_id):
    """Regenerate and log the behavioral report for an existing MLflow run."""
    with mlflow.start_run(run_id=run_id):
        # Generate behavioral report
        artifacts = main.load_artifacts(run_id=run_id)
        behavioral_report = eval.get_behavioral_report(artifacts=artifacts)
        mlflow.log_metric("behavioral_score", behavioral_report["score"])

        # Log artifacts
        with tempfile.TemporaryDirectory() as dp:
            utils.save_dict(behavioral_report, Path(dp, "behavioral_report.json"))
            mlflow.log_artifacts(dp)

    logger.info(f"Updated behavioral report for run_id {run_id}")
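# Hedged sketch (assumption, not from the original): update_behavioral_report can be applied
# to every run in an experiment. mlflow.search_runs returns a pandas DataFrame with a
# `run_id` column; the experiment name "best" is assumed from train_model above.
def update_all_behavioral_reports(experiment_name="best"):  # pragma: no cover, illustrative only
    experiment = mlflow.get_experiment_by_name(experiment_name)
    runs = mlflow.search_runs(experiment_ids=[experiment.experiment_id])
    for run_id in runs["run_id"]:
        update_behavioral_report(run_id=run_id)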
def train_model(
    args_fp: Path = Path(config.CONFIG_DIR, "args.json"),
    experiment_name: Optional[str] = "best",
    run_name: Optional[str] = "model",
) -> None:
    """Train a model using the specified parameters.

    Args:
        args_fp (Path, optional): Location of arguments to use for training. Defaults to `config/args.json`.
        experiment_name (str, optional): Name of the experiment to save the run to. Defaults to `best`.
        run_name (str, optional): Name of the run. Defaults to `model`.
    """
    # Set experiment and start run
    args = Namespace(**utils.load_dict(filepath=args_fp))

    # Start run
    mlflow.set_experiment(experiment_name=experiment_name)
    with mlflow.start_run(run_name=run_name) as run:  # NOQA: F841 (assigned to but never used)
        # Train
        artifacts = main.run(args=args)

        # Set tags
        tags = {"data_version": artifacts["data_version"]}
        mlflow.set_tags(tags)

        # Log metrics
        performance = artifacts["performance"]
        logger.info(json.dumps(performance["overall"], indent=2))
        metrics = {
            "precision": performance["overall"]["precision"],
            "recall": performance["overall"]["recall"],
            "f1": performance["overall"]["f1"],
            "best_val_loss": artifacts["loss"],
            "behavioral_score": artifacts["behavioral_report"]["score"],
            "slices_f1": performance["slices"]["f1"],
        }
        mlflow.log_metrics(metrics)

        # Log artifacts
        with tempfile.TemporaryDirectory() as dp:
            artifacts["label_encoder"].save(Path(dp, "label_encoder.json"))
            artifacts["tokenizer"].save(Path(dp, "tokenizer.json"))
            torch.save(artifacts["model"].state_dict(), Path(dp, "model.pt"))
            utils.save_dict(performance, Path(dp, "performance.json"))
            utils.save_dict(artifacts["behavioral_report"], Path(dp, "behavioral_report.json"))
            mlflow.log_artifacts(dp)
        mlflow.log_params(vars(artifacts["args"]))
def download_data():
    """Download data from online to local drive.

    Note:
        We could've just copied files from `datasets` but we'll use this later on with other data sources.
    """
    # Download data
    projects_url = "https://raw.githubusercontent.com/GokuMohandas/applied-ml/main/datasets/projects.json"
    tags_url = "https://raw.githubusercontent.com/GokuMohandas/applied-ml/main/datasets/tags.json"
    projects = utils.load_json_from_url(url=projects_url)
    tags = utils.load_json_from_url(url=tags_url)

    # Save data
    projects_fp = Path(config.DATA_DIR, "projects.json")
    tags_fp = Path(config.DATA_DIR, "tags.json")
    utils.save_dict(d=projects, filepath=projects_fp)
    utils.save_dict(d=tags, filepath=tags_fp)
    logger.info("✅ Data downloaded!")
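# utils.load_json_from_url is used throughout this module but not defined here; a minimal
# sketch of what such a helper might look like (the real implementation in utils may differ).
def load_json_from_url_sketch(url):  # pragma: no cover, illustrative only
    import json
    from urllib.request import urlopen

    # Fetch the raw bytes over HTTP(S) and parse them as JSON
    with urlopen(url) as response:
        return json.loads(response.read().decode("utf-8"))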
def optimize(
    params_fp: Path = Path(config.CONFIG_DIR, "params.json"),
    study_name: Optional[str] = "optimization",
    num_trials: int = 100,
) -> None:
    """Optimize a subset of hyperparameters towards an objective.

    This saves the best trial's parameters into `config/params.json`.

    Args:
        params_fp (Path, optional): Location of parameters (just using num_samples, num_epochs, etc.)
            to use for training. Defaults to `config/params.json`.
        study_name (str, optional): Name of the study to save trial runs under. Defaults to `optimization`.
        num_trials (int, optional): Number of trials to run. Defaults to 100.
    """
    # Starting parameters (not actually used but needed for set up)
    params = Namespace(**utils.load_dict(filepath=params_fp))

    # Optimize
    pruner = optuna.pruners.MedianPruner(n_startup_trials=5, n_warmup_steps=5)
    study = optuna.create_study(study_name=study_name, direction="maximize", pruner=pruner)
    mlflow_callback = MLflowCallback(tracking_uri=mlflow.get_tracking_uri(), metric_name="f1")
    study.optimize(
        lambda trial: main.objective(params, trial),
        n_trials=num_trials,
        callbacks=[mlflow_callback],
    )

    # All trials
    trials_df = study.trials_dataframe()
    trials_df = trials_df.sort_values(["value"], ascending=False)

    # Best trial
    logger.info(f"Best value (f1): {study.best_trial.value}")
    params = {**params.__dict__, **study.best_trial.params}
    params["threshold"] = study.best_trial.user_attrs["threshold"]
    utils.save_dict(params, params_fp, cls=NumpyEncoder)
    logger.info(json.dumps(params, indent=2, cls=NumpyEncoder))
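# Hedged usage sketch (not in the original): a short optimization pass before a full run.
# Note that optimize() overwrites config/params.json with the best trial's parameters.
def example_optimize():  # pragma: no cover, illustrative only
    optimize(
        params_fp=Path(config.CONFIG_DIR, "params.json"),
        study_name="optimization",
        num_trials=10,
    )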
def train_model(args_fp: Path = Path(config.CONFIG_DIR, "args.json")) -> None:
    """Train a model using the specified parameters.

    Args:
        args_fp (Path, optional): Location of arguments to use for training. Defaults to `config/args.json`.
    """
    # Set experiment and start run
    args = Namespace(**utils.load_dict(filepath=args_fp))

    # Start run
    mlflow.set_experiment(experiment_name="best")
    with mlflow.start_run(run_name="cnn") as run:  # NOQA: F841 (assigned to but never used)
        # Train
        artifacts = train.run(args=args)

        # Log key metrics
        performance = artifacts["performance"]
        loss = artifacts["loss"]
        mlflow.log_metrics({"precision": performance["overall"]["precision"]})
        mlflow.log_metrics({"recall": performance["overall"]["recall"]})
        mlflow.log_metrics({"f1": performance["overall"]["f1"]})
        mlflow.log_metrics({"best_val_loss": loss})

        # Log artifacts
        args = artifacts["args"]
        model = artifacts["model"]
        label_encoder = artifacts["label_encoder"]
        tokenizer = artifacts["tokenizer"]
        with tempfile.TemporaryDirectory() as fp:
            label_encoder.save(Path(fp, "label_encoder.json"))
            tokenizer.save(Path(fp, "tokenizer.json"))
            torch.save(model.state_dict(), Path(fp, "model.pt"))
            utils.save_dict(performance, Path(fp, "performance.json"))
            mlflow.log_artifacts(fp)
        mlflow.log_params(vars(args))

    logger.info(json.dumps(performance["overall"], indent=2))