def predict_tags( text: Optional[ str] = "Transfer learning with BERT for self-supervised learning", run_id: str = open(Path(config.MODEL_DIR, "run_id.txt")).read(), ) -> Dict: """Predict tags for a give input text using a trained model. Warning: Make sure that you have a trained model first! Args: text (str, optional): Input text to predict tags for. Defaults to "Transfer learning with BERT for self-supervised learning". run_id (str): ID of the model run to load artifacts. Defaults to run ID in config.MODEL_DIR. Raises: ValueError: Run id doesn't exist in experiment. Returns: Predicted tags for input text. """ # Predict artifacts = main.load_artifacts(run_id=run_id) prediction = predict.predict(texts=[text], artifacts=artifacts) logger.info(json.dumps(prediction, indent=2)) return prediction
def predict_tags(
    text: Optional[str] = "Transfer learning with BERT for self-supervised learning",
    model_dir: Path = config.MODEL_DIR,
) -> Dict:
    """Predict tags for a given input text using a trained model.

    Warning:
        Make sure that you have a trained model first!

    Args:
        text (str, optional): Input text to predict tags for.
            Defaults to "Transfer learning with BERT for self-supervised learning".
        model_dir (Path): Location of model artifacts. Defaults to config.MODEL_DIR.

    Returns:
        Predicted tags for input text.
    """
    # Predict
    artifacts = main.load_artifacts(model_dir=model_dir)
    prediction = predict.predict(texts=[text], artifacts=artifacts)
    logger.info(json.dumps(prediction, indent=2))
    return prediction
def predict_tags( text: str = "Transfer learning with BERT for self-supervised learning", run_id: str = "", ) -> Dict: """Predict tags for a give input text using a trained model. Warning: Make sure that you have a trained model first! Args: text (str, optional): Input text to predict tags for. Defaults to "Transfer learning with BERT for self-supervised learning". run_id (str, optional): ID of the run to load model artifacts from. Defaults to model with lowest `best_val_loss` from the `best` experiment. Returns: Predicted tags for input text. """ # Get best run if not run_id: experiment_id = mlflow.get_experiment_by_name("best").experiment_id all_runs = mlflow.search_runs( experiment_ids=experiment_id, order_by=["metrics.best_val_loss ASC"], ) run_id = all_runs.iloc[0].run_id # Predict prediction = predict.predict(texts=[text], run_id=run_id) logger.info(json.dumps(prediction, indent=2)) return prediction
def train_model(
    params_fp: Path = Path(config.CONFIG_DIR, "params.json"),
    model_dir: Optional[Path] = Path(config.MODEL_DIR),
    experiment_name: Optional[str] = "best",
    run_name: Optional[str] = "model",
) -> None:
    """Train a model using the specified parameters.

    Args:
        params_fp (Path, optional): Parameters to use for training.
            Defaults to `config/params.json`.
        model_dir (Path): Location of model artifacts. Defaults to config.MODEL_DIR.
        experiment_name (str, optional): Name of the experiment to save the run to.
            Defaults to `best`.
        run_name (str, optional): Name of the run. Defaults to `model`.
    """
    # Load parameters
    params = Namespace(**utils.load_dict(filepath=params_fp))

    # Start run
    mlflow.set_experiment(experiment_name=experiment_name)
    with mlflow.start_run(run_name=run_name):
        run_id = mlflow.active_run().info.run_id

        # Train
        artifacts = main.run(params=params)

        # Set tags
        tags = {}
        mlflow.set_tags(tags)

        # Log metrics
        performance = artifacts["performance"]
        logger.info(json.dumps(performance["overall"], indent=2))
        metrics = {
            "precision": performance["overall"]["precision"],
            "recall": performance["overall"]["recall"],
            "f1": performance["overall"]["f1"],
            "best_val_loss": artifacts["loss"],
            "behavioral_score": performance["behavioral"]["score"],
            "slices_f1": performance["slices"]["overall"]["f1"],
        }
        mlflow.log_metrics(metrics)

        # Log artifacts
        with tempfile.TemporaryDirectory() as dp:
            utils.save_dict(vars(artifacts["params"]), Path(dp, "params.json"), cls=NumpyEncoder)
            utils.save_dict(performance, Path(dp, "performance.json"))
            artifacts["label_encoder"].save(Path(dp, "label_encoder.json"))
            artifacts["tokenizer"].save(Path(dp, "tokenizer.json"))
            torch.save(artifacts["model"].state_dict(), Path(dp, "model.pt"))
            mlflow.log_artifacts(dp)
        mlflow.log_params(vars(artifacts["params"]))

    # Save for repo
    with open(Path(model_dir, "run_id.txt"), "w") as fp:
        fp.write(run_id)
    utils.save_dict(vars(params), Path(model_dir, "params.json"), cls=NumpyEncoder)
    utils.save_dict(performance, Path(model_dir, "performance.json"))
def health_check():
    response = requests.get(
        f"http://0.0.0.0:{os.environ.get('PORT', 5000)}/",
        headers=headers,
    )
    results = json.loads(response.text)
    logger.info(json.dumps(results, indent=4))
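# The `health_check` and `predict` client snippets reference a module-level
# `headers` dict that isn't shown in this section; a minimal assumption for
# JSON requests:
headers = {"Content-Type": "application/json"}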
def load_best_artifacts():
    global runs, run_ids, best_artifacts, best_run_id
    runs = utils.get_sorted_runs(experiment_name="best", order_by=["metrics.f1 DESC"])
    run_ids = [run["run_id"] for run in runs]
    best_run_id = run_ids[0]
    best_artifacts = predict.load_artifacts(run_id=best_run_id)
    logger.info("Loaded trained model and other required artifacts for inference!")
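# `utils.get_sorted_runs` isn't shown in this section; a minimal sketch of
# what it might look like, assuming MLflow's `search_runs` (which returns a
# pandas DataFrame with a `run_id` column):
import mlflow


def get_sorted_runs(experiment_name: str, order_by: list) -> list:
    """Return all runs for an experiment as dicts, sorted by the given criteria."""
    experiment_id = mlflow.get_experiment_by_name(experiment_name).experiment_id
    runs_df = mlflow.search_runs(experiment_ids=[experiment_id], order_by=order_by)
    return runs_df.to_dict(orient="records")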
def set_artifact_metadata():
    """Set the artifact URI for all experiments and runs.
    Used when transferring experiments from other locations (ex. Colab).

    Note:
        Check out the [optimize.ipynb](https://colab.research.google.com/github/GokuMohandas/applied-ml/blob/main/notebooks/optimize.ipynb){:target="_blank"}
        notebook for how to train on Google Colab and transfer to local.
    """

    def set_artifact_location(var, fp):
        """Set variable's YAML value in the file at `fp`."""
        with open(fp) as f:
            metadata = yaml.safe_load(f)

        # Set new value
        experiment_id = metadata[var].split("/")[-1]
        artifact_location = Path("file://", config.EXPERIMENTS_DIR, experiment_id)
        metadata[var] = str(artifact_location)

        with open(fp, "w") as f:
            yaml.dump(metadata, f)

    def set_artifact_uri(var, fp):
        """Set variable's YAML value in the file at `fp`."""
        with open(fp) as f:
            metadata = yaml.safe_load(f)

        # Set new value
        experiment_id = metadata[var].split("/")[-3]
        run_id = metadata[var].split("/")[-2]
        artifact_uri = Path(
            "file://",
            config.EXPERIMENTS_DIR,
            experiment_id,
            run_id,
            "artifacts",
        )
        metadata[var] = str(artifact_uri)

        with open(fp, "w") as f:
            yaml.dump(metadata, f)

    # Set artifact location
    experiment_meta_yamls = list(Path(config.EXPERIMENTS_DIR).glob("*/meta.yaml"))
    for meta_yaml in experiment_meta_yamls:
        set_artifact_location(var="artifact_location", fp=meta_yaml)
        logger.info(f"Set artifact location for {meta_yaml}")

    # Set artifact URI
    run_meta_yamls = list(Path(config.EXPERIMENTS_DIR).glob("*/*/meta.yaml"))
    for meta_yaml in run_meta_yamls:
        set_artifact_uri(var="artifact_uri", fp=meta_yaml)
        logger.info(f"Set artifact URI for {meta_yaml}")
def update_behavioral_report(run_id):
    with mlflow.start_run(run_id=run_id):
        # Generate behavioral report
        artifacts = main.load_artifacts(run_id=run_id)
        behavioral_report = eval.get_behavioral_report(artifacts=artifacts)
        mlflow.log_metric("behavioral_score", behavioral_report["score"])

        # Log artifacts
        with tempfile.TemporaryDirectory() as dp:
            utils.save_dict(behavioral_report, Path(dp, "behavioral_report.json"))
            mlflow.log_artifacts(dp)

    logger.info(f"Updated behavioral report for run_id {run_id}")
def performance(
    author: str = config.AUTHOR,
    repo: str = config.REPO,
    tag: str = "workspace",
    verbose: bool = True,
):
    """Load performance metrics from the workspace or from a tagged release."""
    if tag == "workspace":
        performance = utils.load_dict(filepath=Path(config.MODEL_DIR, "performance.json"))
    else:
        url = f"https://raw.githubusercontent.com/{author}/{repo}/{tag}/model/performance.json"
        performance = utils.load_json_from_url(url=url)
    if verbose:
        logger.info(json.dumps(performance, indent=2))
    return performance
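# `utils.load_json_from_url` is used throughout but not shown; one plausible
# implementation using only the standard library:
import json
from urllib.request import urlopen


def load_json_from_url(url: str):
    """Load a JSON payload from a URL."""
    with urlopen(url) as response:
        return json.loads(response.read())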
def train_model(
    args_fp: Path = Path(config.CONFIG_DIR, "args.json"),
    experiment_name: Optional[str] = "best",
    run_name: Optional[str] = "model",
) -> None:
    """Train a model using the specified parameters.

    Args:
        args_fp (Path, optional): Location of arguments to use for training.
            Defaults to `config/args.json`.
        experiment_name (str, optional): Name of the experiment to save the run to.
            Defaults to `best`.
        run_name (str, optional): Name of the run. Defaults to `model`.
    """
    # Load arguments
    args = Namespace(**utils.load_dict(filepath=args_fp))

    # Start run
    mlflow.set_experiment(experiment_name=experiment_name)
    with mlflow.start_run(run_name=run_name) as run:  # NOQA: F841 (assigned to but never used)
        # Train
        artifacts = main.run(args=args)

        # Set tags
        tags = {"data_version": artifacts["data_version"]}
        mlflow.set_tags(tags)

        # Log metrics
        performance = artifacts["performance"]
        logger.info(json.dumps(performance["overall"], indent=2))
        metrics = {
            "precision": performance["overall"]["precision"],
            "recall": performance["overall"]["recall"],
            "f1": performance["overall"]["f1"],
            "best_val_loss": artifacts["loss"],
            "behavioral_score": artifacts["behavioral_report"]["score"],
            "slices_f1": performance["slices"]["f1"],
        }
        mlflow.log_metrics(metrics)

        # Log artifacts
        with tempfile.TemporaryDirectory() as dp:
            artifacts["label_encoder"].save(Path(dp, "label_encoder.json"))
            artifacts["tokenizer"].save(Path(dp, "tokenizer.json"))
            torch.save(artifacts["model"].state_dict(), Path(dp, "model.pt"))
            utils.save_dict(performance, Path(dp, "performance.json"))
            utils.save_dict(artifacts["behavioral_report"], Path(dp, "behavioral_report.json"))
            mlflow.log_artifacts(dp)
        mlflow.log_params(vars(artifacts["args"]))
def predict():
    data = {
        "run_id": "",
        "texts": [
            {"text": "Transfer learning with transformers for self-supervised learning."},
            {"text": "Generative adversarial networks in both PyTorch and TensorFlow."},
        ],
    }
    response = requests.post(
        f"http://0.0.0.0:{os.environ.get('PORT', 5000)}/predict",
        headers=headers,
        data=json.dumps(data),
    )
    results = json.loads(response.text)
    logger.info(json.dumps(results, indent=4))
def predict_tags(
    text: Optional[str] = "Transfer learning with BERT for self-supervised learning",
    experiment_name: Optional[str] = "best",
    run_id: Optional[str] = "",
) -> Dict:
    """Predict tags for a given input text using a trained model.

    Warning:
        Make sure that you have a trained model first!

    Args:
        text (str, optional): Input text to predict tags for.
            Defaults to "Transfer learning with BERT for self-supervised learning".
        experiment_name (str, optional): Name of the experiment to fetch the run from.
        run_id (str, optional): ID of the run to load model artifacts from.
            Defaults to the run with the highest F1 score.

    Raises:
        ValueError: Run id doesn't exist in experiment.

    Returns:
        Predicted tags for input text.
    """
    # Get sorted runs
    runs = utils.get_sorted_runs(
        experiment_name=experiment_name,
        order_by=["metrics.f1 DESC"],
    )
    run_ids = [run["run_id"] for run in runs]

    # Get best run
    if not run_id:
        run_id = run_ids[0]

    # Validate run id
    if run_id not in run_ids:  # pragma: no cover, simple value check
        raise ValueError(f"Run_id {run_id} does not exist in experiment {experiment_name}")

    # Predict
    artifacts = main.load_artifacts(run_id=run_id)
    prediction = predict.predict(texts=[text], artifacts=artifacts)
    logger.info(json.dumps(prediction, indent=2))
    return prediction
def download_data():
    """Download data from online to local drive.

    Note:
        We could've just copied files from `datasets` but we'll use this
        later on with other data sources.
    """
    # Download data
    projects_url = "https://raw.githubusercontent.com/GokuMohandas/applied-ml/main/datasets/projects.json"
    tags_url = "https://raw.githubusercontent.com/GokuMohandas/applied-ml/main/datasets/tags.json"
    projects = utils.load_json_from_url(url=projects_url)
    tags = utils.load_json_from_url(url=tags_url)

    # Save data
    projects_fp = Path(config.DATA_DIR, "projects.json")
    tags_fp = Path(config.DATA_DIR, "tags.json")
    utils.save_dict(d=projects, filepath=projects_fp)
    utils.save_dict(d=tags, filepath=tags_fp)
    logger.info("✅ Data downloaded!")
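# The `utils.save_dict`/`utils.load_dict` helpers used above aren't shown;
# minimal sketches consistent with how they're called in this section
# (note the optional `cls` encoder, e.g. NumpyEncoder, used elsewhere):
import json


def save_dict(d: dict, filepath, cls=None) -> None:
    """Save a dictionary as indented JSON to a specific location."""
    with open(filepath, "w") as fp:
        json.dump(d, indent=2, fp=fp, cls=cls)


def load_dict(filepath) -> dict:
    """Load a JSON file into a dictionary."""
    with open(filepath) as fp:
        return json.load(fp)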
def optimize(
    params_fp: Path = Path(config.CONFIG_DIR, "params.json"),
    study_name: Optional[str] = "optimization",
    num_trials: int = 100,
) -> None:
    """Optimize a subset of hyperparameters towards an objective.

    This saves the best trial's parameters into `config/params.json`.

    Args:
        params_fp (Path, optional): Location of parameters (only `num_samples`,
            `num_epochs`, etc. are used) for training. Defaults to `config/params.json`.
        study_name (str, optional): Name of the study to save trial runs under.
            Defaults to `optimization`.
        num_trials (int, optional): Number of trials to run. Defaults to 100.
    """
    # Starting parameters (not actually used but needed for set up)
    params = Namespace(**utils.load_dict(filepath=params_fp))

    # Optimize
    pruner = optuna.pruners.MedianPruner(n_startup_trials=5, n_warmup_steps=5)
    study = optuna.create_study(study_name=study_name, direction="maximize", pruner=pruner)
    mlflow_callback = MLflowCallback(tracking_uri=mlflow.get_tracking_uri(), metric_name="f1")
    study.optimize(
        lambda trial: main.objective(params, trial),
        n_trials=num_trials,
        callbacks=[mlflow_callback],
    )

    # All trials
    trials_df = study.trials_dataframe()
    trials_df = trials_df.sort_values(["value"], ascending=False)

    # Best trial
    logger.info(f"Best value (f1): {study.best_trial.value}")
    params = {**params.__dict__, **study.best_trial.params}
    params["threshold"] = study.best_trial.user_attrs["threshold"]
    with open(params_fp, "w") as fp:
        json.dump(params, fp=fp, indent=2, cls=NumpyEncoder)
    logger.info(json.dumps(params, indent=2, cls=NumpyEncoder))
def optimize(num_trials: int = 100) -> None:
    """Optimize a subset of hyperparameters towards an objective.

    This saves the best trial's arguments into `config/args.json`.

    Args:
        num_trials (int, optional): Number of trials to run. Defaults to 100.
    """
    # Starting arguments (not actually used but needed for set up)
    args_fp = Path(config.CONFIG_DIR, "args.json")
    args = Namespace(**utils.load_dict(filepath=args_fp))

    # Optimize
    pruner = optuna.pruners.MedianPruner(n_startup_trials=5, n_warmup_steps=5)
    study = optuna.create_study(study_name="optimization", direction="maximize", pruner=pruner)
    mlflow_callback = MLflowCallback(tracking_uri=mlflow.get_tracking_uri(), metric_name="f1")
    study.optimize(
        lambda trial: train.objective(args, trial),
        n_trials=num_trials,
        callbacks=[mlflow_callback],
    )

    # All trials
    trials_df = study.trials_dataframe()
    trials_df = trials_df.sort_values(["value"], ascending=False)  # sort by metric
    trials_df.to_csv(Path(config.EXPERIMENTS_DIR, "trials.csv"), index=False)  # save

    # Best trial
    logger.info(f"Best value (f1): {study.best_trial.value}")
    params = {**args.__dict__, **study.best_trial.params}
    params["threshold"] = study.best_trial.user_attrs["threshold"]
    with open(args_fp, "w") as fp:
        json.dump(params, fp=fp, indent=2, cls=NumpyEncoder)
    logger.info(json.dumps(params, indent=2, cls=NumpyEncoder))
def objective(args: Namespace, trial: optuna.trial._trial.Trial) -> float:
    """Objective function for optimization trials.

    Args:
        args (Namespace): Input arguments for each trial
            (see `config/args.json` for argument names).
        trial (optuna.trial._trial.Trial): Optuna optimization trial.

    Returns:
        F1 score from evaluating the trained model on the test data split.
    """
    # Parameters (to tune)
    args.embedding_dim = trial.suggest_int("embedding_dim", 128, 512)
    args.num_filters = trial.suggest_int("num_filters", 128, 512)
    args.hidden_dim = trial.suggest_int("hidden_dim", 128, 512)
    args.dropout_p = trial.suggest_uniform("dropout_p", 0.3, 0.8)
    args.lr = trial.suggest_loguniform("lr", 5e-5, 5e-4)

    # Train (can move some of these outside for efficiency)
    logger.info(f"\nTrial {trial.number}:")
    logger.info(json.dumps(trial.params, indent=2))
    artifacts = run(args=args, trial=trial)

    # Set additional attributes
    args = artifacts["args"]
    performance = artifacts["performance"]
    logger.info(json.dumps(performance["overall"], indent=2))
    trial.set_user_attr("threshold", args.threshold)
    trial.set_user_attr("precision", performance["overall"]["precision"])
    trial.set_user_attr("recall", performance["overall"]["recall"])
    trial.set_user_attr("f1", performance["overall"]["f1"])

    return performance["overall"]["f1"]
def diff(
    author: str = config.AUTHOR,
    repo: str = config.REPO,
    tag_a: str = "workspace",
    tag_b: str = "",
):  # pragma: no cover, can't be certain what diffs will exist
    """Compare parameters and performance between two tags."""
    # Tag b
    if tag_b == "":
        tags_url = f"https://api.github.com/repos/{author}/{repo}/tags"
        tag_b = utils.load_json_from_url(url=tags_url)[0]["name"]
    logger.info(f"Comparing {tag_a} with {tag_b}:")

    # Params
    params_a = params(author=author, repo=repo, tag=tag_a, verbose=False)
    params_b = params(author=author, repo=repo, tag=tag_b, verbose=False)
    params_diff = utils.dict_diff(d_a=params_a, d_b=params_b, d_a_name=tag_a, d_b_name=tag_b)
    logger.info(f"Parameter differences: {json.dumps(params_diff, indent=2)}")

    # Performance
    performance_a = performance(author=author, repo=repo, tag=tag_a, verbose=False)
    performance_b = performance(author=author, repo=repo, tag=tag_b, verbose=False)
    performance_diff = utils.dict_diff(
        d_a=performance_a, d_b=performance_b, d_a_name=tag_a, d_b_name=tag_b
    )
    logger.info(f"Performance differences: {json.dumps(performance_diff, indent=2)}")

    return params_diff, performance_diff
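# `utils.dict_diff` isn't shown; a minimal sketch that matches how it's
# called above, assuming flat dictionaries:
def dict_diff(d_a: dict, d_b: dict, d_a_name: str, d_b_name: str) -> dict:
    """Return the keys whose values differ between two dictionaries."""
    diff = {}
    for key in d_a.keys() & d_b.keys():
        if d_a[key] != d_b[key]:
            diff[key] = {d_a_name: d_a[key], d_b_name: d_b[key]}
    return diff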
def train_model(args_fp: Path = Path(config.CONFIG_DIR, "args.json")) -> None:
    """Train a model using the specified parameters.

    Args:
        args_fp (Path, optional): Location of arguments to use for training.
            Defaults to `config/args.json`.
    """
    # Load arguments
    args = Namespace(**utils.load_dict(filepath=args_fp))

    # Start run
    mlflow.set_experiment(experiment_name="best")
    with mlflow.start_run(run_name="cnn") as run:  # NOQA: F841 (assigned to but never used)
        # Train
        artifacts = train.run(args=args)

        # Log key metrics
        performance = artifacts["performance"]
        loss = artifacts["loss"]
        mlflow.log_metrics({"precision": performance["overall"]["precision"]})
        mlflow.log_metrics({"recall": performance["overall"]["recall"]})
        mlflow.log_metrics({"f1": performance["overall"]["f1"]})
        mlflow.log_metrics({"best_val_loss": loss})

        # Log artifacts
        args = artifacts["args"]
        model = artifacts["model"]
        label_encoder = artifacts["label_encoder"]
        tokenizer = artifacts["tokenizer"]
        with tempfile.TemporaryDirectory() as fp:
            label_encoder.save(Path(fp, "label_encoder.json"))
            tokenizer.save(Path(fp, "tokenizer.json"))
            torch.save(model.state_dict(), Path(fp, "model.pt"))
            utils.save_dict(performance, Path(fp, "performance.json"))
            mlflow.log_artifacts(fp)
        mlflow.log_params(vars(args))

    logger.info(json.dumps(performance["overall"], indent=2))
def train(
    self,
    num_epochs: int,
    patience: int,
    train_dataloader: torch.utils.data.DataLoader,
    val_dataloader: torch.utils.data.DataLoader,
) -> Tuple:
    """Training loop.

    Args:
        num_epochs (int): Maximum number of epochs to train for
            (can stop earlier based on performance).
        patience (int): Number of acceptable epochs of continuously degrading
            performance before stopping early.
        train_dataloader (torch.utils.data.DataLoader): Dataloader with the training data split.
        val_dataloader (torch.utils.data.DataLoader): Dataloader with the validation data split.

    Raises:
        optuna.TrialPruned: Early stopping of the optimization trial if performance is poor.

    Returns:
        The best validation loss and the trained model from that point.
    """
    best_val_loss = np.inf
    best_model = None
    _patience = patience
    for epoch in range(num_epochs):
        # Steps
        train_loss = self.train_step(dataloader=train_dataloader)
        val_loss, _, _ = self.eval_step(dataloader=val_dataloader)
        self.scheduler.step(val_loss)

        # Pruning based on the intermediate value
        if self.trial:
            self.trial.report(val_loss, epoch)
            if self.trial.should_prune():  # pragma: no cover, optuna pruning
                logger.info("Unpromising trial pruned!")
                raise optuna.TrialPruned()

        # Early stopping
        if val_loss < best_val_loss:
            best_val_loss = val_loss
            best_model = self.model
            _patience = patience  # reset _patience
        else:  # pragma: no cover, simple subtraction
            _patience -= 1
        if not _patience:  # pragma: no cover, simple break
            logger.info("Stopping early!")
            break

        # Logging
        logger.info(
            f"Epoch: {epoch+1} | "
            f"train_loss: {train_loss:.5f}, "
            f"val_loss: {val_loss:.5f}, "
            f"lr: {self.optimizer.param_groups[0]['lr']:.2E}, "
            f"_patience: {_patience}"
        )
    return best_val_loss, best_model
def clean_experiments(experiments_to_keep: str = "best"):
    """Removes all experiments besides the ones specified in `experiments_to_keep`.

    Args:
        experiments_to_keep (str): Comma-separated string of experiments to keep.
    """
    # Get experiments to keep
    experiments_to_keep = list(set([exp.strip() for exp in experiments_to_keep.split(",")]))
    if not len(experiments_to_keep) or not experiments_to_keep[0]:
        raise ValueError("You must keep at least one experiment.")

    # Filter and delete
    client = mlflow.tracking.MlflowClient()
    for experiment in client.list_experiments():
        if experiment.name not in experiments_to_keep:  # pragma: no cover, mlflow function
            logger.info(f"Deleting Experiment {experiment.name}")
            client.delete_experiment(experiment_id=experiment.experiment_id)

    # Delete MLFlow trash
    shutil.rmtree(Path(config.EXPERIMENTS_DIR, ".trash"))
    logger.info(f"Cleared experiments besides {experiments_to_keep}")
# Create datasets and dataloaders
train_dataset = data.CNNTextDataset(X=X_train, y=y_train, max_filter_size=args.max_filter_size)
val_dataset = data.CNNTextDataset(X=X_val, y=y_val, max_filter_size=args.max_filter_size)
test_dataset = data.CNNTextDataset(X=X_test, y=y_test, max_filter_size=args.max_filter_size)
train_dataloader = train_dataset.create_dataloader(batch_size=args.batch_size)
val_dataloader = val_dataset.create_dataloader(batch_size=args.batch_size)
test_dataloader = test_dataset.create_dataloader(batch_size=args.batch_size)

# Load artifacts
runs = utils.get_sorted_runs(experiment_name="best", order_by=["metrics.f1 DESC"])
run_ids = [run["run_id"] for run in runs]
artifacts = main.load_artifacts(run_id=run_ids[0], device=torch.device("cpu"))

# Evaluation
device = torch.device("cpu")
performance, behavioral_report = evaluate(
    artifacts=artifacts,
    dataloader=test_dataloader,
    df=test_df,
    device=device,
)
logger.info(json.dumps(performance, indent=2))
logger.info(json.dumps(behavioral_report, indent=2))
def run(args: Namespace, trial: optuna.trial._trial.Trial = None) -> Dict:
    """Operations for training.

    1. Set seed
    2. Set device
    3. Load data
    4. Clean data
    5. Preprocess data
    6. Encode labels
    7. Split data
    8. Tokenize inputs
    9. Create dataloaders
    10. Initialize model
    11. Train model
    12. Evaluate model

    Args:
        args (Namespace): Input arguments for operations.
        trial (optuna.trial._trial.Trial, optional): Optuna optimization trial.
            Defaults to None.

    Returns:
        Artifacts to save and load for later.
    """
    # 1. Set seed
    utils.set_seed(seed=args.seed)

    # 2. Set device
    device = utils.set_device(cuda=args.cuda)

    # 3. Load data
    df, projects_dict, tags_dict = data.load(shuffle=args.shuffle, num_samples=args.num_samples)

    # 4. Clean data
    df, tags_dict, tags_above_frequency = data.clean(
        df=df, tags_dict=tags_dict, min_tag_freq=args.min_tag_freq
    )

    # 5. Preprocess data
    df.text = df.text.apply(data.preprocess, lower=args.lower, stem=args.stem)

    # 6. Encode labels
    y, class_weights, label_encoder = data.encode_labels(labels=df.tags)

    # 7. Split data
    utils.set_seed(seed=args.seed)  # needed for skmultilearn
    X_train, X_val, X_test, y_train, y_val, y_test = data.split(
        X=df.text.to_numpy(), y=y, train_size=args.train_size
    )

    # 8. Tokenize inputs
    X_train, tokenizer = data.tokenize_text(X=X_train, char_level=args.char_level)
    X_val, _ = data.tokenize_text(X=X_val, char_level=args.char_level, tokenizer=tokenizer)
    X_test, _ = data.tokenize_text(X=X_test, char_level=args.char_level, tokenizer=tokenizer)

    # 9. Create dataloaders
    train_dataloader = data.get_dataloader(
        data=[X_train, y_train],
        max_filter_size=args.max_filter_size,
        batch_size=args.batch_size,
    )
    val_dataloader = data.get_dataloader(
        data=[X_val, y_val],
        max_filter_size=args.max_filter_size,
        batch_size=args.batch_size,
    )
    test_dataloader = data.get_dataloader(
        data=[X_test, y_test],
        max_filter_size=args.max_filter_size,
        batch_size=args.batch_size,
    )

    # 10. Initialize model
    model = models.initialize_model(
        args=args,
        vocab_size=len(tokenizer),
        num_classes=len(label_encoder),
        device=device,
    )

    # 11. Train model
    logger.info(f"Arguments: {json.dumps(args.__dict__, indent=2, cls=NumpyEncoder)}")
    args, model, loss = train(
        args=args,
        train_dataloader=train_dataloader,
        val_dataloader=val_dataloader,
        model=model,
        device=device,
        class_weights=class_weights,
        trial=trial,
    )

    # 12. Evaluate model
    device = torch.device("cpu")
    performance = evaluate(
        dataloader=test_dataloader,
        model=model.to(device),
        device=device,
        threshold=args.threshold,
        classes=label_encoder.classes,
    )

    return {
        "args": args,
        "label_encoder": label_encoder,
        "tokenizer": tokenizer,
        "model": model,
        "loss": loss,
        "performance": performance,
    }
def diff(commit_a: str = "workspace", commit_b: str = "head"):  # pragma: no cover
    """Compare relevant differences (params, metrics) between commits.

    Inspired by DVC's `dvc metrics diff` but repurposed to display
    diffs pertinent to our experiments.

    Args:
        commit_a (str, optional): Primary commit. Defaults to "workspace".
        commit_b (str, optional): Commit to compare to. Defaults to "head".

    Raises:
        Exception: If the two commits don't share the same metrics to compare.
    """
    diffs = {}
    commits = ["a", "b"]
    if commit_a.lower() in ("head", "current"):
        commit_a = "main"
    if commit_b.lower() in ("head", "current"):
        commit_b = "main"

    # Get params
    params = {"a": {}, "b": {}}
    for i, commit in enumerate([commit_a, commit_b]):
        if commit == "workspace":
            params[commits[i]] = utils.load_dict(filepath=Path(config.CONFIG_DIR, "params.json"))
            continue
        params_url = f"https://raw.githubusercontent.com/GokuMohandas/applied-ml/{commit}/model/params.json"
        params[commits[i]] = utils.load_json_from_url(url=params_url)

    # Parameter differences
    diffs["params"] = {}
    for arg in params["a"]:
        a = params["a"][arg]
        b = params["b"][arg]
        if a != b:
            diffs["params"][arg] = {commit_a: a, commit_b: b}
    logger.info(f"Parameter differences:\n{json.dumps(diffs['params'], indent=2)}")

    # Get metrics
    metrics = {"a": {}, "b": {}}
    for i, commit in enumerate([commit_a, commit_b]):
        if commit == "workspace":
            metrics[commits[i]] = utils.load_dict(filepath=Path(config.MODEL_DIR, "performance.json"))
            continue
        metrics_url = f"https://raw.githubusercontent.com/GokuMohandas/applied-ml/{commit}/model/performance.json"
        metrics[commits[i]] = utils.load_json_from_url(url=metrics_url)

    # Recursively flatten
    metrics_a = pd.json_normalize(metrics["a"], sep=".").to_dict(orient="records")[0]
    metrics_b = pd.json_normalize(metrics["b"], sep=".").to_dict(orient="records")[0]
    if metrics_a.keys() != metrics_b.keys():
        raise Exception("Cannot compare these commits because they have different metrics.")

    # Metric differences
    diffs["metrics"] = {}
    diffs["metrics"]["improvements"] = {}
    diffs["metrics"]["regressions"] = {}
    for metric in metrics_a:
        if (
            (metric in metrics_b)
            and (metrics_a[metric] != metrics_b[metric])
            and (isinstance(metrics_a[metric], numbers.Number))
            and (metric.split(".")[-1] != "num_samples")
        ):
            item = {
                commit_a: metrics_a[metric],
                commit_b: metrics_b[metric],
                "diff": metrics_a[metric] - metrics_b[metric],
            }
            if item["diff"] >= 0.0:
                diffs["metrics"]["improvements"][metric] = item
            else:
                diffs["metrics"]["regressions"][metric] = item
    logger.info(f"Metric differences:\n{json.dumps(diffs['metrics'], indent=2)}")

    return diffs
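# Illustration of the "recursively flatten" step above: `pd.json_normalize`
# turns nested metric dicts into a single record with dotted keys (the
# values here are made up):
import pandas as pd

nested = {"overall": {"f1": 0.6, "precision": 0.7}, "slices": {"f1": 0.5}}
flat = pd.json_normalize(nested, sep=".").to_dict(orient="records")[0]
# -> {"overall.f1": 0.6, "overall.precision": 0.7, "slices.f1": 0.5}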
def load_artifacts():
    global artifacts
    run_id = Path(config.MODEL_DIR, "run_id.txt").read_text()
    artifacts = main.load_artifacts(run_id=run_id)
    logger.info("Ready for inference!")
def load_artifacts():
    global artifacts
    artifacts = main.load_artifacts(model_dir=config.MODEL_DIR)
    logger.info("Ready for inference!")
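# `main.load_artifacts` presumably mirrors what `train_model` saves
# (params.json, label_encoder.json, tokenizer.json, model.pt,
# performance.json); a sketch under that assumption -- the `.load`
# classmethods below are hypothetical:
def load_artifacts(model_dir: Path, device: torch.device = torch.device("cpu")) -> Dict:
    """Load saved artifacts from a model directory."""
    params = Namespace(**utils.load_dict(filepath=Path(model_dir, "params.json")))
    label_encoder = data.MultiLabelLabelEncoder.load(fp=Path(model_dir, "label_encoder.json"))
    tokenizer = data.Tokenizer.load(fp=Path(model_dir, "tokenizer.json"))
    model = models.initialize_model(
        params=params,
        vocab_size=len(tokenizer),
        num_classes=len(label_encoder),
        device=device,
    )
    model.load_state_dict(torch.load(Path(model_dir, "model.pt"), map_location=device))
    performance = utils.load_dict(filepath=Path(model_dir, "performance.json"))
    return {
        "params": params,
        "label_encoder": label_encoder,
        "tokenizer": tokenizer,
        "model": model,
        "performance": performance,
    }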
def run(params: Namespace, trial: optuna.trial._trial.Trial = None) -> Dict:
    """Operations for training.

    Args:
        params (Namespace): Input parameters for operations.
        trial (optuna.trial._trial.Trial, optional): Optuna optimization trial.
            Defaults to None.

    Returns:
        Artifacts to save and load for later.
    """
    # 1. Set seed
    utils.set_seed(seed=params.seed)

    # 2. Set device
    device = utils.set_device(cuda=params.cuda)

    # 3. Load data
    projects_fp = Path(config.DATA_DIR, "projects.json")
    tags_fp = Path(config.DATA_DIR, "tags.json")
    projects = utils.load_dict(filepath=projects_fp)
    tags_dict = utils.list_to_dict(utils.load_dict(filepath=tags_fp), key="tag")
    df = pd.DataFrame(projects)
    if params.shuffle:
        df = df.sample(frac=1).reset_index(drop=True)
    df = df[: params.subset]  # None = all samples

    # 4. Prepare data (feature engineering, filter, clean)
    df, tags_above_freq, tags_below_freq = data.prepare(
        df=df,
        include=list(tags_dict.keys()),
        exclude=config.EXCLUDED_TAGS,
        min_tag_freq=params.min_tag_freq,
    )
    params.num_samples = len(df)

    # 5. Preprocess data
    df.text = df.text.apply(data.preprocess, lower=params.lower, stem=params.stem)

    # 6. Encode labels
    labels = df.tags
    label_encoder = data.MultiLabelLabelEncoder()
    label_encoder.fit(labels)
    y = label_encoder.encode(labels)

    # Class weights
    all_tags = list(itertools.chain.from_iterable(labels.values))
    counts = np.bincount([label_encoder.class_to_index[class_] for class_ in all_tags])
    class_weights = {i: 1.0 / count for i, count in enumerate(counts)}

    # 7. Split data
    utils.set_seed(seed=params.seed)  # needed for skmultilearn
    X = df.text.to_numpy()
    X_train, X_, y_train, y_ = data.iterative_train_test_split(X=X, y=y, train_size=params.train_size)
    X_val, X_test, y_val, y_test = data.iterative_train_test_split(X=X_, y=y_, train_size=0.5)
    test_df = pd.DataFrame({"text": X_test, "tags": label_encoder.decode(y_test)})

    # 8. Tokenize inputs
    tokenizer = data.Tokenizer(char_level=params.char_level)
    tokenizer.fit_on_texts(texts=X_train)
    X_train = np.array(tokenizer.texts_to_sequences(X_train), dtype=object)
    X_val = np.array(tokenizer.texts_to_sequences(X_val), dtype=object)
    X_test = np.array(tokenizer.texts_to_sequences(X_test), dtype=object)

    # 9. Create dataloaders
    train_dataset = data.CNNTextDataset(X=X_train, y=y_train, max_filter_size=params.max_filter_size)
    val_dataset = data.CNNTextDataset(X=X_val, y=y_val, max_filter_size=params.max_filter_size)
    train_dataloader = train_dataset.create_dataloader(batch_size=params.batch_size)
    val_dataloader = val_dataset.create_dataloader(batch_size=params.batch_size)

    # 10. Initialize model
    model = models.initialize_model(
        params=params,
        vocab_size=len(tokenizer),
        num_classes=len(label_encoder),
        device=device,
    )

    # 11. Train model
    logger.info(f"Parameters: {json.dumps(params.__dict__, indent=2, cls=NumpyEncoder)}")
    params, model, loss = train.train(
        params=params,
        train_dataloader=train_dataloader,
        val_dataloader=val_dataloader,
        model=model,
        device=device,
        class_weights=class_weights,
        trial=trial,
    )

    # 12. Evaluate model
    artifacts = {
        "params": params,
        "label_encoder": label_encoder,
        "tokenizer": tokenizer,
        "model": model,
        "loss": loss,
    }
    device = torch.device("cpu")
    y_true, y_pred, performance = eval.evaluate(df=test_df, artifacts=artifacts)
    artifacts["performance"] = performance

    return artifacts