def predict_tags(
    text: str = "Transfer learning with BERT for self-supervised learning",
    run_id: str = "",
) -> Dict:
    """Predict tags for a given input text using a trained model.

    Warning:
        Make sure that you have a trained model first!

    Args:
        text (str, optional): Input text to predict tags for.
            Defaults to "Transfer learning with BERT for self-supervised learning".
        run_id (str, optional): ID of the run to load model artifacts from.
            Defaults to the model with the lowest `best_val_loss` from the `best` experiment.

    Returns:
        Predicted tags for the input text.
    """
    # Get best run
    if not run_id:
        experiment_id = mlflow.get_experiment_by_name("best").experiment_id
        all_runs = mlflow.search_runs(
            experiment_ids=experiment_id,
            order_by=["metrics.best_val_loss ASC"],
        )
        run_id = all_runs.iloc[0].run_id

    # Predict
    artifacts = predict.load_artifacts(run_id=run_id)
    prediction = predict.predict(texts=[text], artifacts=artifacts)
    logger.info(json.dumps(prediction, indent=2))
    return prediction
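# Usage sketch for `predict_tags` (assumes a trained run already exists under the
# `best` experiment; the import path is hypothetical and depends on your package layout):
#
#   from tagifai import main  # hypothetical module path
#   main.predict_tags(text="Transfer learning with BERT")          # best run by `best_val_loss`
#   main.predict_tags(text="GANs in PyTorch", run_id="<run_id>")   # or pin a specific run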
def health_check():
    """Query the running API's health check endpoint and log the response."""
    response = requests.get(
        f"http://0.0.0.0:{os.environ.get('PORT', 5000)}/",
        headers=headers,
    )
    results = json.loads(response.text)
    logger.info(json.dumps(results, indent=4))
def set_artifact_metadata():
    """Set the artifact URI for all experiments and runs.
    Used when transferring experiments from other locations (e.g., Colab).

    Note:
        Check out the [optimize.ipynb](https://colab.research.google.com/github/GokuMohandas/applied-ml/blob/main/notebooks/optimize.ipynb){:target="_blank"}
        notebook for how to train on Google Colab and transfer to local.
    """

    def set_artifact_location(var, fp):
        """Set variable's yaml value on file at fp."""
        with open(fp) as f:
            metadata = yaml.load(f, Loader=yaml.SafeLoader)

        # Set new value
        experiment_id = metadata[var].split("/")[-1]
        artifact_location = Path("file://", config.EXPERIMENTS_DIR, experiment_id)
        metadata[var] = str(artifact_location)

        with open(fp, "w") as f:
            yaml.dump(metadata, f)

    def set_artifact_uri(var, fp):
        """Set variable's yaml value on file at fp."""
        with open(fp) as f:
            metadata = yaml.load(f, Loader=yaml.SafeLoader)

        # Set new value
        experiment_id = metadata[var].split("/")[-3]
        run_id = metadata[var].split("/")[-2]
        artifact_uri = Path(
            "file://",
            config.EXPERIMENTS_DIR,
            experiment_id,
            run_id,
            "artifacts",
        )
        metadata[var] = str(artifact_uri)

        with open(fp, "w") as f:
            yaml.dump(metadata, f)

    # Set artifact location
    experiment_meta_yamls = list(Path(config.EXPERIMENTS_DIR).glob("*/meta.yaml"))
    for meta_yaml in experiment_meta_yamls:
        set_artifact_location(var="artifact_location", fp=meta_yaml)
        logger.info(f"Set artifact location for {meta_yaml}")

    # Set artifact URI
    run_meta_yamls = list(Path(config.EXPERIMENTS_DIR).glob("*/*/meta.yaml"))
    for meta_yaml in run_meta_yamls:
        set_artifact_uri(var="artifact_uri", fp=meta_yaml)
        logger.info(f"Set artifact URI for {meta_yaml}")
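# Illustrative effect of `set_artifact_metadata` on an experiment-level meta.yaml
# (paths are hypothetical; only the key touched above is shown):
#
#   artifact_location: file:///content/experiments/1            # before (e.g., from Colab)
#   artifact_location: file:///path/to/local/experiments/1      # after (under config.EXPERIMENTS_DIR)
#
# Run-level meta.yaml files get the analogous rewrite of `artifact_uri`, ending in
# .../<experiment_id>/<run_id>/artifacts.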
def download_data():
    """Download data from online to local drive.

    Note:
        We could've just copied files from `datasets` but we'll use this later on
        with other data sources.
    """
    # Download data
    projects_url = "https://raw.githubusercontent.com/GokuMohandas/applied-ml/main/datasets/projects.json"
    tags_url = "https://raw.githubusercontent.com/GokuMohandas/applied-ml/main/datasets/tags.json"
    projects = utils.load_json_from_url(url=projects_url)
    tags = utils.load_json_from_url(url=tags_url)

    # Save data
    projects_fp = Path(config.DATA_DIR, "projects.json")
    tags_fp = Path(config.DATA_DIR, "tags.json")
    utils.save_dict(d=projects, filepath=projects_fp)
    utils.save_dict(d=tags, filepath=tags_fp)
    logger.info("✅ Data downloaded!")
def clean_experiments(experiments_to_keep: str = "best"):
    """Remove all experiments besides the ones specified in `experiments_to_keep`.

    Args:
        experiments_to_keep (str): comma separated string of experiments to keep.
    """
    # Get experiments to keep (ignore empty entries so an empty string can't pass the check below)
    experiments_to_keep = list(set([exp.strip() for exp in experiments_to_keep.split(",") if exp.strip()]))
    if not len(experiments_to_keep):
        raise ValueError("You must keep at least one experiment.")

    # Filter and delete
    client = mlflow.tracking.MlflowClient()
    for experiment in client.list_experiments():
        if experiment.name not in experiments_to_keep:
            logger.info(f"Deleting Experiment {experiment.name}")
            client.delete_experiment(experiment_id=experiment.experiment_id)

    # Delete MLflow trash
    shutil.rmtree(Path(config.EXPERIMENTS_DIR, ".trash"))
    logger.info(f"Cleared experiments besides {experiments_to_keep}")
def predict():
    """Send a sample request to the running API's `/predict` endpoint and log the results."""
    data = {
        "run_id": "",
        "texts": [
            {"text": "Transfer learning with transformers for self-supervised learning."},
            {"text": "Generative adversarial networks in both PyTorch and TensorFlow."},
        ],
    }
    response = requests.post(
        f"http://0.0.0.0:{os.environ.get('PORT', 5000)}/predict",
        headers=headers,
        data=json.dumps(data),
    )
    results = json.loads(response.text)
    logger.info(json.dumps(results, indent=4))
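# Roughly equivalent request from the shell (a sketch; assumes the service is running
# locally on the default port used above and expects JSON):
#
#   curl -X POST "http://0.0.0.0:5000/predict" \
#        -H "Content-Type: application/json" \
#        -d '{"run_id": "", "texts": [{"text": "Transfer learning with transformers."}]}'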
def train_model(args_fp: Path = Path(config.CONFIG_DIR, "args.json")) -> None:
    """Train a model using the specified parameters.

    Args:
        args_fp (Path, optional): Location of arguments to use for training.
            Defaults to `config/args.json`.
    """
    # Load arguments
    args = Namespace(**utils.load_dict(filepath=args_fp))

    # Set experiment and start run
    mlflow.set_experiment(experiment_name="best")
    with mlflow.start_run(run_name="cnn") as run:  # NOQA: F841 (assigned to but never used)
        # Train
        artifacts = train.run(args=args)

        # Log key metrics
        performance = artifacts["performance"]
        loss = artifacts["loss"]
        mlflow.log_metrics({"precision": performance["overall"]["precision"]})
        mlflow.log_metrics({"recall": performance["overall"]["recall"]})
        mlflow.log_metrics({"f1": performance["overall"]["f1"]})
        mlflow.log_metrics({"best_val_loss": loss})

        # Log artifacts
        args = artifacts["args"]
        model = artifacts["model"]
        label_encoder = artifacts["label_encoder"]
        tokenizer = artifacts["tokenizer"]
        with tempfile.TemporaryDirectory() as fp:
            label_encoder.save(Path(fp, "label_encoder.json"))
            tokenizer.save(Path(fp, "tokenizer.json"))
            torch.save(model.state_dict(), Path(fp, "model.pt"))
            utils.save_dict(performance, Path(fp, "performance.json"))
            mlflow.log_artifacts(fp)
        mlflow.log_params(vars(args))

    logger.info(json.dumps(performance["overall"], indent=2))
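# After `train_model` completes, the metrics, params, and artifacts logged above can be
# browsed in the MLflow UI; a sketch, assuming runs are stored under config.EXPERIMENTS_DIR
# on the local filesystem:
#
#   mlflow ui --backend-store-uri file:///path/to/experiments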
def optimize(num_trials: int = 100) -> None:
    """Optimize a subset of hyperparameters towards an objective.

    This saves the best trial's arguments into `config/args.json`.

    Args:
        num_trials (int, optional): Number of trials to run. Defaults to 100.
    """
    # Starting arguments (individual trials will override a subset of these)
    args_fp = Path(config.CONFIG_DIR, "args.json")
    args = Namespace(**utils.load_dict(filepath=args_fp))

    # Optimize
    pruner = optuna.pruners.MedianPruner(n_startup_trials=5, n_warmup_steps=5)
    study = optuna.create_study(study_name="optimization", direction="maximize", pruner=pruner)
    mlflow_callback = MLflowCallback(tracking_uri=mlflow.get_tracking_uri(), metric_name="f1")
    study.optimize(
        lambda trial: train.objective(args, trial),
        n_trials=num_trials,
        callbacks=[mlflow_callback],
    )

    # All trials
    trials_df = study.trials_dataframe()
    trials_df = trials_df.sort_values(["value"], ascending=False)  # sort by metric
    trials_df.to_csv(Path(config.EXPERIMENTS_DIR, "trials.csv"), index=False)  # save

    # Best trial
    logger.info(f"Best value (f1): {study.best_trial.value}")
    params = {**args.__dict__, **study.best_trial.params}
    params["threshold"] = study.best_trial.user_attrs["threshold"]
    with open(Path(config.CONFIG_DIR, "args.json"), "w") as fp:
        json.dump(params, fp=fp, indent=2, cls=NumpyEncoder)
    logger.info(json.dumps(params, indent=2, cls=NumpyEncoder))
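# A typical end-to-end flow with the commands above (a sketch, not a prescribed recipe):
#
#   download_data()                # fetch projects.json and tags.json
#   optimize(num_trials=100)       # writes the best trial's params to config/args.json
#   train_model()                  # trains with config/args.json, logs to the `best` experiment
#   predict_tags(text="Transfer learning with BERT")  # uses the best run by `best_val_loss`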