def load(shuffle: bool, num_samples: int = 0) -> Tuple[pd.DataFrame, Dict, Dict]:
    """Load the data from local drive to a Pandas DataFrame.

    Args:
        shuffle (bool): Shuffle the data.
        num_samples (int, optional): Number of samples to include (used for quick testing).
            Defaults to 0, which includes all samples.

    Returns:
        DataFrame, projects and tags dictionaries.
    """
    # Load data
    projects_fp = Path(config.DATA_DIR, "projects.json")
    tags_fp = Path(config.DATA_DIR, "tags.json")
    projects_dict = utils.load_dict(filepath=projects_fp)
    tags_dict = utils.load_dict(filepath=tags_fp)

    # Create dataframe
    df = pd.DataFrame(projects_dict)

    # Shuffle since projects are chronologically organized
    if shuffle:
        df = df.sample(frac=1).reset_index(drop=True)

    # Subset
    if num_samples:
        df = df[:num_samples]

    return df, projects_dict, tags_dict
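# Usage sketch (hypothetical; assumes load() above is in scope and that
# config.DATA_DIR contains projects.json and tags.json):
df, projects_dict, tags_dict = load(shuffle=True, num_samples=100)
print(f"Loaded {len(df)} sampled projects")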
def load_artifacts(run_id: str, device: torch.device = torch.device("cpu")) -> Dict:
    """Load artifacts for current model.

    Args:
        run_id (str): ID of the model run to load artifacts from.
        device (torch.device): Device to run model on. Defaults to CPU.

    Returns:
        Artifacts needed for inference.
    """
    # Load artifacts
    artifact_uri = mlflow.get_run(run_id=run_id).info.artifact_uri.split("file://")[-1]
    params = Namespace(**utils.load_dict(filepath=Path(artifact_uri, "params.json")))
    label_encoder = data.MultiLabelLabelEncoder.load(fp=Path(artifact_uri, "label_encoder.json"))
    tokenizer = data.Tokenizer.load(fp=Path(artifact_uri, "tokenizer.json"))
    model_state = torch.load(Path(artifact_uri, "model.pt"), map_location=device)
    performance = utils.load_dict(filepath=Path(artifact_uri, "performance.json"))

    # Initialize model
    model = models.initialize_model(
        params=params, vocab_size=len(tokenizer), num_classes=len(label_encoder)
    )
    model.load_state_dict(model_state)

    return {
        "params": params,
        "label_encoder": label_encoder,
        "tokenizer": tokenizer,
        "model": model,
        "performance": performance,
    }
def load_artifacts(
    model_dir: Path = config.MODEL_DIR, device: torch.device = torch.device("cpu")
) -> Dict:
    """Load artifacts for current model.

    Args:
        model_dir (Path): Location of model artifacts. Defaults to config.MODEL_DIR.
        device (torch.device): Device to run model on. Defaults to CPU.

    Returns:
        Artifacts needed for inference.
    """
    # Load artifacts
    params = Namespace(**utils.load_dict(filepath=Path(model_dir, "params.json")))
    label_encoder = data.MultiLabelLabelEncoder.load(fp=Path(model_dir, "label_encoder.json"))
    tokenizer = data.Tokenizer.load(fp=Path(model_dir, "tokenizer.json"))
    model_state = torch.load(Path(model_dir, "model.pt"), map_location=device)
    performance = utils.load_dict(filepath=Path(model_dir, "performance.json"))

    # Initialize model
    model = models.initialize_model(
        params=params, vocab_size=len(tokenizer), num_classes=len(label_encoder)
    )
    model.load_state_dict(model_state)

    return {
        "params": params,
        "label_encoder": label_encoder,
        "tokenizer": tokenizer,
        "model": model,
        "performance": performance,
    }
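# Usage sketch (hypothetical): load the saved artifacts from config.MODEL_DIR
# and inspect the overall performance stored alongside the model.
artifacts = load_artifacts(model_dir=config.MODEL_DIR)
print(json.dumps(artifacts["performance"]["overall"], indent=2))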
def train_model(
    params_fp: Path = Path(config.CONFIG_DIR, "params.json"),
    model_dir: Optional[Path] = Path(config.MODEL_DIR),
    experiment_name: Optional[str] = "best",
    run_name: Optional[str] = "model",
) -> None:
    """Train a model using the specified parameters.

    Args:
        params_fp (Path, optional): Parameters to use for training.
            Defaults to `config/params.json`.
        model_dir (Path): Location of model artifacts. Defaults to config.MODEL_DIR.
        experiment_name (str, optional): Name of the experiment to save the run to.
            Defaults to `best`.
        run_name (str, optional): Name of the run. Defaults to `model`.
    """
    # Set experiment and start run
    params = Namespace(**utils.load_dict(filepath=params_fp))

    # Start run
    mlflow.set_experiment(experiment_name=experiment_name)
    with mlflow.start_run(run_name=run_name):
        run_id = mlflow.active_run().info.run_id

        # Train
        artifacts = main.run(params=params)

        # Set tags
        tags = {}
        mlflow.set_tags(tags)

        # Log metrics
        performance = artifacts["performance"]
        logger.info(json.dumps(performance["overall"], indent=2))
        metrics = {
            "precision": performance["overall"]["precision"],
            "recall": performance["overall"]["recall"],
            "f1": performance["overall"]["f1"],
            "best_val_loss": artifacts["loss"],
            "behavioral_score": performance["behavioral"]["score"],
            "slices_f1": performance["slices"]["overall"]["f1"],
        }
        mlflow.log_metrics(metrics)

        # Log artifacts
        with tempfile.TemporaryDirectory() as dp:
            utils.save_dict(vars(artifacts["params"]), Path(dp, "params.json"), cls=NumpyEncoder)
            utils.save_dict(performance, Path(dp, "performance.json"))
            artifacts["label_encoder"].save(Path(dp, "label_encoder.json"))
            artifacts["tokenizer"].save(Path(dp, "tokenizer.json"))
            torch.save(artifacts["model"].state_dict(), Path(dp, "model.pt"))
            mlflow.log_artifacts(dp)
        mlflow.log_params(vars(artifacts["params"]))

    # Save for repo
    with open(Path(model_dir, "run_id.txt"), "w") as fp:
        fp.write(run_id)
    utils.save_dict(vars(params), Path(model_dir, "params.json"), cls=NumpyEncoder)
    utils.save_dict(performance, Path(model_dir, "performance.json"))
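# Usage sketch (hypothetical): train with the default parameters file and log
# the run to the "best" MLflow experiment.
train_model(
    params_fp=Path(config.CONFIG_DIR, "params.json"),
    model_dir=config.MODEL_DIR,
    experiment_name="best",
    run_name="model",
)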
def test_save_and_load_dict():
    with tempfile.TemporaryDirectory() as dp:
        d = {"hello": "world"}
        fp = Path(dp, "d.json")
        utils.save_dict(d=d, filepath=fp)
        d = utils.load_dict(filepath=fp)
        assert d["hello"] == "world"
def performance(
    author: str = config.AUTHOR,
    repo: str = config.REPO,
    tag: str = "workspace",
    verbose: bool = True,
):
    """Fetch performance metrics from the local workspace or from a tagged release on GitHub."""
    if tag == "workspace":
        performance = utils.load_dict(filepath=Path(config.MODEL_DIR, "performance.json"))
    else:
        url = f"https://raw.githubusercontent.com/{author}/{repo}/{tag}/model/performance.json"
        performance = utils.load_json_from_url(url=url)
    if verbose:
        logger.info(json.dumps(performance, indent=2))
    return performance
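# Usage sketch (hypothetical): compare the local workspace's metrics against a
# tagged release on GitHub (the tag name below is illustrative).
performance(tag="workspace")  # local model/performance.json
performance(tag="v1")         # hypothetical release tag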
def train_model(
    args_fp: Path = Path(config.CONFIG_DIR, "args.json"),
    experiment_name: Optional[str] = "best",
    run_name: Optional[str] = "model",
) -> None:
    """Train a model using the specified parameters.

    Args:
        args_fp (Path, optional): Location of arguments to use for training.
            Defaults to `config/args.json`.
        experiment_name (str, optional): Name of the experiment to save the run to.
            Defaults to `best`.
        run_name (str, optional): Name of the run. Defaults to `model`.
    """
    # Set experiment and start run
    args = Namespace(**utils.load_dict(filepath=args_fp))

    # Start run
    mlflow.set_experiment(experiment_name=experiment_name)
    with mlflow.start_run(run_name=run_name) as run:  # NOQA: F841 (assigned to but never used)
        # Train
        artifacts = main.run(args=args)

        # Set tags
        tags = {"data_version": artifacts["data_version"]}
        mlflow.set_tags(tags)

        # Log metrics
        performance = artifacts["performance"]
        logger.info(json.dumps(performance["overall"], indent=2))
        metrics = {
            "precision": performance["overall"]["precision"],
            "recall": performance["overall"]["recall"],
            "f1": performance["overall"]["f1"],
            "best_val_loss": artifacts["loss"],
            "behavioral_score": artifacts["behavioral_report"]["score"],
            "slices_f1": performance["slices"]["f1"],
        }
        mlflow.log_metrics(metrics)

        # Log artifacts
        with tempfile.TemporaryDirectory() as dp:
            artifacts["label_encoder"].save(Path(dp, "label_encoder.json"))
            artifacts["tokenizer"].save(Path(dp, "tokenizer.json"))
            torch.save(artifacts["model"].state_dict(), Path(dp, "model.pt"))
            utils.save_dict(performance, Path(dp, "performance.json"))
            utils.save_dict(artifacts["behavioral_report"], Path(dp, "behavioral_report.json"))
            mlflow.log_artifacts(dp)
        mlflow.log_params(vars(artifacts["args"]))
def compute_features(
    params_fp: Path = Path(config.CONFIG_DIR, "params.json"),
) -> None:
    """Compute and save features for training.

    Args:
        params_fp (Path, optional): Location of parameters (just using num_samples,
            num_epochs, etc.) to use for training. Defaults to `config/params.json`.
    """
    # Parameters
    params = Namespace(**utils.load_dict(filepath=params_fp))

    # Compute features
    data.compute_features(params=params)
    logger.info("✅ Computed features!")
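# Usage sketch (hypothetical): recompute features with the default parameters file.
compute_features(params_fp=Path(config.CONFIG_DIR, "params.json"))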
def optimize(
    params_fp: Path = Path(config.CONFIG_DIR, "params.json"),
    study_name: Optional[str] = "optimization",
    num_trials: int = 100,
) -> None:
    """Optimize a subset of hyperparameters towards an objective.

    This saves the best trial's parameters into `config/params.json`.

    Args:
        params_fp (Path, optional): Location of parameters (just using num_samples,
            num_epochs, etc.) to use for training. Defaults to `config/params.json`.
        study_name (str, optional): Name of the study to save trial runs under.
            Defaults to `optimization`.
        num_trials (int, optional): Number of trials to run. Defaults to 100.
    """
    # Starting parameters (not actually used but needed for set up)
    params = Namespace(**utils.load_dict(filepath=params_fp))

    # Optimize
    pruner = optuna.pruners.MedianPruner(n_startup_trials=5, n_warmup_steps=5)
    study = optuna.create_study(study_name=study_name, direction="maximize", pruner=pruner)
    mlflow_callback = MLflowCallback(tracking_uri=mlflow.get_tracking_uri(), metric_name="f1")
    study.optimize(
        lambda trial: main.objective(params, trial),
        n_trials=num_trials,
        callbacks=[mlflow_callback],
    )

    # All trials
    trials_df = study.trials_dataframe()
    trials_df = trials_df.sort_values(["value"], ascending=False)

    # Best trial
    logger.info(f"Best value (f1): {study.best_trial.value}")
    params = {**params.__dict__, **study.best_trial.params}
    params["threshold"] = study.best_trial.user_attrs["threshold"]
    with open(params_fp, "w") as fp:
        json.dump(params, fp=fp, indent=2, cls=NumpyEncoder)
    logger.info(json.dumps(params, indent=2, cls=NumpyEncoder))
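# Usage sketch (hypothetical): a short optimization run; the best trial's
# parameters are written back into config/params.json by the function above.
optimize(params_fp=Path(config.CONFIG_DIR, "params.json"), study_name="optimization", num_trials=20)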
def optimize(num_trials: int = 100) -> None:
    """Optimize a subset of hyperparameters towards an objective.

    This saves the best trial's arguments into `config/args.json`.

    Args:
        num_trials (int, optional): Number of trials to run. Defaults to 100.
    """
    # Starting arguments (not actually used but needed for set up)
    args_fp = Path(config.CONFIG_DIR, "args.json")
    args = Namespace(**utils.load_dict(filepath=args_fp))

    # Optimize
    pruner = optuna.pruners.MedianPruner(n_startup_trials=5, n_warmup_steps=5)
    study = optuna.create_study(study_name="optimization", direction="maximize", pruner=pruner)
    mlflow_callback = MLflowCallback(tracking_uri=mlflow.get_tracking_uri(), metric_name="f1")
    study.optimize(
        lambda trial: train.objective(args, trial),
        n_trials=num_trials,
        callbacks=[mlflow_callback],
    )

    # All trials
    trials_df = study.trials_dataframe()
    trials_df = trials_df.sort_values(["value"], ascending=False)  # sort by metric
    trials_df.to_csv(Path(config.EXPERIMENTS_DIR, "trials.csv"), index=False)  # save

    # Best trial
    logger.info(f"Best value (f1): {study.best_trial.value}")
    params = {**args.__dict__, **study.best_trial.params}
    params["threshold"] = study.best_trial.user_attrs["threshold"]
    with open(Path(config.CONFIG_DIR, "args.json"), "w") as fp:
        json.dump(params, fp=fp, indent=2, cls=NumpyEncoder)
    logger.info(json.dumps(params, indent=2, cls=NumpyEncoder))
def train_model(args_fp: Path = Path(config.CONFIG_DIR, "args.json")) -> None:
    """Train a model using the specified parameters.

    Args:
        args_fp (Path, optional): Location of arguments to use for training.
            Defaults to `config/args.json`.
    """
    # Set experiment and start run
    args = Namespace(**utils.load_dict(filepath=args_fp))

    # Start run
    mlflow.set_experiment(experiment_name="best")
    with mlflow.start_run(run_name="cnn") as run:  # NOQA: F841 (assigned to but never used)
        # Train
        artifacts = train.run(args=args)

        # Log key metrics
        performance = artifacts["performance"]
        loss = artifacts["loss"]
        mlflow.log_metrics({"precision": performance["overall"]["precision"]})
        mlflow.log_metrics({"recall": performance["overall"]["recall"]})
        mlflow.log_metrics({"f1": performance["overall"]["f1"]})
        mlflow.log_metrics({"best_val_loss": loss})

        # Log artifacts
        args = artifacts["args"]
        model = artifacts["model"]
        label_encoder = artifacts["label_encoder"]
        tokenizer = artifacts["tokenizer"]
        with tempfile.TemporaryDirectory() as fp:
            label_encoder.save(Path(fp, "label_encoder.json"))
            tokenizer.save(Path(fp, "tokenizer.json"))
            torch.save(model.state_dict(), Path(fp, "model.pt"))
            utils.save_dict(performance, Path(fp, "performance.json"))
            mlflow.log_artifacts(fp)
        mlflow.log_params(vars(args))

    logger.info(json.dumps(performance["overall"], indent=2))
def load_artifacts(
    run_id: str,
    device: torch.device = torch.device("cpu"),
) -> Dict:
    """Load artifacts for a particular `run_id`.

    Args:
        run_id (str): ID of the run to load model artifacts from.
        device (torch.device): Device to run model on. Defaults to CPU.

    Returns:
        Artifacts needed for inference.
    """
    # Load artifacts
    client = mlflow.tracking.MlflowClient()
    with tempfile.TemporaryDirectory() as fp:
        client.download_artifacts(run_id=run_id, path="", dst_path=fp)
        label_encoder = data.LabelEncoder.load(fp=Path(fp, "label_encoder.json"))
        tokenizer = data.Tokenizer.load(fp=Path(fp, "tokenizer.json"))
        model_state = torch.load(Path(fp, "model.pt"), map_location=device)
        performance = utils.load_dict(filepath=Path(fp, "performance.json"))

    # Load model
    run = mlflow.get_run(run_id=run_id)
    args = Namespace(**run.data.params)
    model = models.initialize_model(
        args=args, vocab_size=len(tokenizer), num_classes=len(label_encoder)
    )
    model.load_state_dict(model_state)

    return {
        "args": args,
        "label_encoder": label_encoder,
        "tokenizer": tokenizer,
        "model": model,
        "performance": performance,
    }
def df():
    # Load features
    params_fp = Path(config.CONFIG_DIR, "params.json")
    params = Namespace(**utils.load_dict(filepath=params_fp))
    df, _ = data.compute_features(params=params)
    return df
def df():
    projects_fp = Path(config.DATA_DIR, "projects.json")
    projects_dict = utils.load_dict(filepath=projects_fp)
    df = pd.DataFrame(projects_dict)
    return df
def tags():
    tags_fp = Path(config.DATA_DIR, "tags.json")
    tags_dict = utils.list_to_dict(utils.load_dict(filepath=tags_fp), key="tag")
    tags = list(tags_dict.keys())
    return tags
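# Hypothetical test built on the functions above, assuming they're registered
# as pytest fixtures (e.g. decorated with @pytest.fixture) so pytest injects
# `df` and `tags` by parameter name. Only shape-level checks, since the exact
# projects.json schema isn't shown here.
def test_fixtures(df, tags):
    assert not df.empty
    assert len(tags) > 0
    assert all(isinstance(tag, str) for tag in tags)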
if __name__ == "__main__":  # pragma: no cover, playground for eval components
    import json
    from argparse import Namespace
    from pathlib import Path

    import numpy as np
    import pandas as pd

    from tagifai import config, data, main, utils
    from tagifai.config import logger

    # Set experiment and start run
    args_fp = Path(config.CONFIG_DIR, "args.json")
    args = Namespace(**utils.load_dict(filepath=args_fp))

    # 1. Set seed
    utils.set_seed(seed=args.seed)

    # 2. Set device
    device = utils.set_device(cuda=args.cuda)

    # 3. Load data
    projects_fp = Path(config.DATA_DIR, "projects.json")
    tags_fp = Path(config.DATA_DIR, "tags.json")
    projects = utils.load_dict(filepath=projects_fp)
    tags_dict = utils.list_to_dict(utils.load_dict(filepath=tags_fp), key="tag")
    df = pd.DataFrame(projects)
    if args.shuffle:
        df = df.sample(frac=1).reset_index(drop=True)
def diff(commit_a: str = "workspace", commit_b: str = "head"):  # pragma: no cover
    """Compare relevant differences (params, metrics) between commits.

    Inspired by DVC's `dvc metrics diff` but repurposed to display
    diffs pertinent to our experiments.

    Args:
        commit_a (str, optional): Primary commit. Defaults to "workspace".
        commit_b (str, optional): Commit to compare to. Defaults to "head".

    Raises:
        Exception: If the two commits don't share the same set of metrics.
    """
    diffs = {}
    commits = ["a", "b"]
    if commit_a.lower() in ("head", "current"):
        commit_a = "main"
    if commit_b.lower() in ("head", "current"):
        commit_b = "main"

    # Get params
    params = {"a": {}, "b": {}}
    for i, commit in enumerate([commit_a, commit_b]):
        if commit == "workspace":
            params[commits[i]] = utils.load_dict(filepath=Path(config.CONFIG_DIR, "params.json"))
            continue
        params_url = (
            f"https://raw.githubusercontent.com/GokuMohandas/applied-ml/{commit}/model/params.json"
        )
        params[commits[i]] = utils.load_json_from_url(url=params_url)

    # Parameter differences
    diffs["params"] = {}
    for arg in params["a"]:
        a = params["a"][arg]
        b = params["b"][arg]
        if a != b:
            diffs["params"][arg] = {commit_a: a, commit_b: b}
    logger.info(f"Parameter differences:\n{json.dumps(diffs['params'], indent=2)}")

    # Get metrics
    metrics = {"a": {}, "b": {}}
    for i, commit in enumerate([commit_a, commit_b]):
        if commit == "workspace":
            metrics[commits[i]] = utils.load_dict(filepath=Path(config.MODEL_DIR, "performance.json"))
            continue
        metrics_url = f"https://raw.githubusercontent.com/GokuMohandas/applied-ml/{commit}/model/performance.json"
        metrics[commits[i]] = utils.load_json_from_url(url=metrics_url)

    # Recursively flatten
    metrics_a = pd.json_normalize(metrics["a"], sep=".").to_dict(orient="records")[0]
    metrics_b = pd.json_normalize(metrics["b"], sep=".").to_dict(orient="records")[0]
    if metrics_a.keys() != metrics_b.keys():
        raise Exception("Cannot compare these commits because they have different metrics.")

    # Metric differences
    diffs["metrics"] = {}
    diffs["metrics"]["improvements"] = {}
    diffs["metrics"]["regressions"] = {}
    for metric in metrics_a:
        if (
            (metric in metrics_b)
            and (metrics_a[metric] != metrics_b[metric])
            and (isinstance(metrics_a[metric], numbers.Number))
            and (metric.split(".")[-1] != "num_samples")
        ):
            item = {
                commit_a: metrics_a[metric],
                commit_b: metrics_b[metric],
                "diff": metrics_a[metric] - metrics_b[metric],
            }
            if item["diff"] >= 0.0:
                diffs["metrics"]["improvements"][metric] = item
            else:
                diffs["metrics"]["regressions"][metric] = item
    logger.info(f"Metric differences:\n{json.dumps(diffs['metrics'], indent=2)}")

    return diffs
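# Usage sketch (hypothetical): compare the current workspace against the main
# branch ("head" is aliased to "main" inside diff()).
diffs = diff(commit_a="workspace", commit_b="head")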
def train(params: Namespace, trial: optuna.trial._trial.Trial = None) -> Dict:
    """Operations for training.

    Args:
        params (Namespace): Input parameters for operations.
        trial (optuna.trial._trial.Trial, optional): Optuna optimization trial.
            Defaults to None.

    Returns:
        Artifacts to save and load for later.
    """
    # Set up
    utils.set_seed(seed=params.seed)
    device = utils.set_device(cuda=params.cuda)

    # Load features
    features_fp = Path(config.DATA_DIR, "features.json")
    tags_fp = Path(config.DATA_DIR, "tags.json")
    features = utils.load_dict(filepath=features_fp)
    tags_dict = utils.list_to_dict(utils.load_dict(filepath=tags_fp), key="tag")
    df = pd.DataFrame(features)
    if params.shuffle:
        df = df.sample(frac=1).reset_index(drop=True)
    df = df[:params.subset]  # None = all samples

    # Prepare data (filter, clean, etc.)
    df, tags_above_freq, tags_below_freq = data.prepare(
        df=df,
        include=list(tags_dict.keys()),
        exclude=config.EXCLUDED_TAGS,
        min_tag_freq=params.min_tag_freq,
    )
    params.num_samples = len(df)

    # Preprocess data
    df.text = df.text.apply(data.preprocess, lower=params.lower, stem=params.stem)

    # Encode labels
    labels = df.tags
    label_encoder = data.MultiLabelLabelEncoder()
    label_encoder.fit(labels)
    y = label_encoder.encode(labels)

    # Class weights
    all_tags = list(itertools.chain.from_iterable(labels.values))
    counts = np.bincount([label_encoder.class_to_index[class_] for class_ in all_tags])
    class_weights = {i: 1.0 / count for i, count in enumerate(counts)}

    # Split data
    utils.set_seed(seed=params.seed)  # needed for skmultilearn
    X = df.text.to_numpy()
    X_train, X_, y_train, y_ = data.iterative_train_test_split(
        X=X, y=y, train_size=params.train_size
    )
    X_val, X_test, y_val, y_test = data.iterative_train_test_split(X=X_, y=y_, train_size=0.5)
    test_df = pd.DataFrame({"text": X_test, "tags": label_encoder.decode(y_test)})

    # Tokenize inputs
    tokenizer = data.Tokenizer(char_level=params.char_level)
    tokenizer.fit_on_texts(texts=X_train)
    X_train = np.array(tokenizer.texts_to_sequences(X_train), dtype=object)
    X_val = np.array(tokenizer.texts_to_sequences(X_val), dtype=object)
    X_test = np.array(tokenizer.texts_to_sequences(X_test), dtype=object)

    # Create dataloaders
    train_dataset = data.CNNTextDataset(X=X_train, y=y_train, max_filter_size=params.max_filter_size)
    val_dataset = data.CNNTextDataset(X=X_val, y=y_val, max_filter_size=params.max_filter_size)
    train_dataloader = train_dataset.create_dataloader(batch_size=params.batch_size)
    val_dataloader = val_dataset.create_dataloader(batch_size=params.batch_size)

    # Initialize model
    model = models.initialize_model(
        params=params,
        vocab_size=len(tokenizer),
        num_classes=len(label_encoder),
        device=device,
    )

    # Train model
    logger.info(f"Parameters: {json.dumps(params.__dict__, indent=2, cls=NumpyEncoder)}")
    class_weights_tensor = torch.Tensor(np.array(list(class_weights.values())))
    loss_fn = nn.BCEWithLogitsLoss(weight=class_weights_tensor)
    optimizer = torch.optim.Adam(model.parameters(), lr=params.lr)
    scheduler = torch.optim.lr_scheduler.ReduceLROnPlateau(
        optimizer, mode="min", factor=0.05, patience=5
    )

    # Trainer module
    trainer = Trainer(
        model=model,
        device=device,
        loss_fn=loss_fn,
        optimizer=optimizer,
        scheduler=scheduler,
        trial=trial,
    )

    # Train
    best_val_loss, best_model = trainer.train(
        params.num_epochs, params.patience, train_dataloader, val_dataloader
    )

    # Find best threshold
    _, y_true, y_prob = trainer.eval_step(dataloader=train_dataloader)
    params.threshold = find_best_threshold(y_true=y_true, y_prob=y_prob)

    # Evaluate model
    artifacts = {
        "params": params,
        "label_encoder": label_encoder,
        "tokenizer": tokenizer,
        "model": best_model,
        "loss": best_val_loss,
    }
    device = torch.device("cpu")
    y_true, y_pred, performance = eval.evaluate(df=test_df, artifacts=artifacts)
    artifacts["performance"] = performance

    return artifacts
def run(params: Namespace, trial: optuna.trial._trial.Trial = None) -> Dict:
    """Operations for training.

    Args:
        params (Namespace): Input parameters for operations.
        trial (optuna.trial._trial.Trial, optional): Optuna optimization trial.
            Defaults to None.

    Returns:
        Artifacts to save and load for later.
    """
    # 1. Set seed
    utils.set_seed(seed=params.seed)

    # 2. Set device
    device = utils.set_device(cuda=params.cuda)

    # 3. Load data
    projects_fp = Path(config.DATA_DIR, "projects.json")
    tags_fp = Path(config.DATA_DIR, "tags.json")
    projects = utils.load_dict(filepath=projects_fp)
    tags_dict = utils.list_to_dict(utils.load_dict(filepath=tags_fp), key="tag")
    df = pd.DataFrame(projects)
    if params.shuffle:
        df = df.sample(frac=1).reset_index(drop=True)
    df = df[:params.subset]  # None = all samples

    # 4. Prepare data (feature engineering, filter, clean)
    df, tags_above_freq, tags_below_freq = data.prepare(
        df=df,
        include=list(tags_dict.keys()),
        exclude=config.EXCLUDED_TAGS,
        min_tag_freq=params.min_tag_freq,
    )
    params.num_samples = len(df)

    # 5. Preprocess data
    df.text = df.text.apply(data.preprocess, lower=params.lower, stem=params.stem)

    # 6. Encode labels
    labels = df.tags
    label_encoder = data.MultiLabelLabelEncoder()
    label_encoder.fit(labels)
    y = label_encoder.encode(labels)

    # Class weights
    all_tags = list(itertools.chain.from_iterable(labels.values))
    counts = np.bincount([label_encoder.class_to_index[class_] for class_ in all_tags])
    class_weights = {i: 1.0 / count for i, count in enumerate(counts)}

    # 7. Split data
    utils.set_seed(seed=params.seed)  # needed for skmultilearn
    X = df.text.to_numpy()
    X_train, X_, y_train, y_ = data.iterative_train_test_split(
        X=X, y=y, train_size=params.train_size
    )
    X_val, X_test, y_val, y_test = data.iterative_train_test_split(X=X_, y=y_, train_size=0.5)
    test_df = pd.DataFrame({"text": X_test, "tags": label_encoder.decode(y_test)})

    # 8. Tokenize inputs
    tokenizer = data.Tokenizer(char_level=params.char_level)
    tokenizer.fit_on_texts(texts=X_train)
    X_train = np.array(tokenizer.texts_to_sequences(X_train), dtype=object)
    X_val = np.array(tokenizer.texts_to_sequences(X_val), dtype=object)
    X_test = np.array(tokenizer.texts_to_sequences(X_test), dtype=object)

    # 9. Create dataloaders
    train_dataset = data.CNNTextDataset(X=X_train, y=y_train, max_filter_size=params.max_filter_size)
    val_dataset = data.CNNTextDataset(X=X_val, y=y_val, max_filter_size=params.max_filter_size)
    train_dataloader = train_dataset.create_dataloader(batch_size=params.batch_size)
    val_dataloader = val_dataset.create_dataloader(batch_size=params.batch_size)

    # 10. Initialize model
    model = models.initialize_model(
        params=params,
        vocab_size=len(tokenizer),
        num_classes=len(label_encoder),
        device=device,
    )

    # 11. Train model
    logger.info(f"Parameters: {json.dumps(params.__dict__, indent=2, cls=NumpyEncoder)}")
    params, model, loss = train.train(
        params=params,
        train_dataloader=train_dataloader,
        val_dataloader=val_dataloader,
        model=model,
        device=device,
        class_weights=class_weights,
        trial=trial,
    )

    # 12. Evaluate model
    artifacts = {
        "params": params,
        "label_encoder": label_encoder,
        "tokenizer": tokenizer,
        "model": model,
        "loss": loss,
    }
    device = torch.device("cpu")
    y_true, y_pred, performance = eval.evaluate(df=test_df, artifacts=artifacts)
    artifacts["performance"] = performance

    return artifacts
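# Usage sketch (hypothetical): run the full pipeline directly with parameters
# loaded from the default params file.
params = Namespace(**utils.load_dict(filepath=Path(config.CONFIG_DIR, "params.json")))
artifacts = run(params=params)
print(artifacts["performance"]["overall"])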
st.title("Tagifai · MLOps · Made With ML")
"""by [Goku Mohandas](https://twitter.com/GokuMohandas)"""
st.info("🔍 Explore the different pages below.")

# Pages
pages = ["Data", "Performance", "Inference", "Inspection"]
st.header("Pages")
selected_page = st.radio("Select a page:", pages, index=2)

if selected_page == "Data":
    st.header("Data")

    # Load data
    projects_fp = Path(config.DATA_DIR, "projects.json")
    tags_fp = Path(config.DATA_DIR, "tags.json")
    projects = utils.load_dict(filepath=projects_fp)
    tags_dict = utils.list_to_dict(utils.load_dict(filepath=tags_fp), key="tag")

    col1, col2 = st.beta_columns(2)
    with col1:
        st.subheader("Projects (sample)")
        st.write(projects[0])
    with col2:
        st.subheader("Tags")
        tag = st.selectbox("Choose a tag", list(tags_dict.keys()))
        st.write(tags_dict[tag])

    # Dataframe
    df = pd.DataFrame(projects)
    st.text(f"Projects (count: {len(df)}):")
    st.write(df)
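# To serve this page locally (hypothetical entry point; assumes the snippet
# above lives in streamlit/app.py):
#   streamlit run streamlit/app.py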
def get_performance(model_dir):
    performance = utils.load_dict(filepath=Path(model_dir, "performance.json"))
    return performance
def predict(texts: List, run_id: str) -> Dict:
    """Predict tags for an input text using the best model from the `best` experiment.

    Usage:

    ```python
    texts = ["Transfer learning with BERT."]
    predict(texts=texts, run_id="264ac530b78c42608e5dea1086bc2c73")
    ```
    <pre>
    [
      {
        "input_text": "Transfer learning with BERT.",
        "preprocessed_text": "transfer learning bert",
        "predicted_tags": [
          "attention",
          "language-modeling",
          "natural-language-processing",
          "transfer-learning",
          "transformers"
        ]
      }
    ]
    </pre>

    Note:
        The input argument `texts` can hold multiple input texts and so the resulting
        prediction dictionary will have `len(texts)` items.

    Args:
        texts (List): List of input text to predict tags for.
        run_id (str): ID of the run to load model artifacts from.

    Returns:
        Predicted tags for input texts.
    """
    # Load artifacts from run
    client = mlflow.tracking.MlflowClient()
    run = mlflow.get_run(run_id=run_id)
    device = torch.device("cpu")
    with tempfile.TemporaryDirectory() as fp:
        client.download_artifacts(run_id=run_id, path="", dst_path=fp)
        args = Namespace(**utils.load_dict(filepath=Path(config.CONFIG_DIR, "args.json")))
        label_encoder = data.LabelEncoder.load(fp=Path(fp, "label_encoder.json"))
        tokenizer = data.Tokenizer.load(fp=Path(fp, "tokenizer.json"))
        model_state = torch.load(Path(fp, "model.pt"), map_location=device)
        # performance = utils.load_dict(filepath=Path(fp, "performance.json"))

    # Load model
    args = Namespace(**run.data.params)
    model = train.initialize_model(
        args=args, vocab_size=len(tokenizer), num_classes=len(label_encoder)
    )
    model.load_state_dict(model_state)

    # Prepare data
    preprocessed_texts = [data.preprocess(text) for text in texts]
    X = np.array(tokenizer.texts_to_sequences(preprocessed_texts))
    y_filler = label_encoder.encode([np.array([label_encoder.classes[0]] * len(X))])
    dataset = data.CNNTextDataset(X=X, y=y_filler, max_filter_size=int(args.max_filter_size))
    dataloader = dataset.create_dataloader(batch_size=int(args.batch_size))

    # Get predictions
    trainer = train.Trainer(model=model, device=device)
    _, y_prob = trainer.predict_step(dataloader)
    y_pred = np.array([np.where(prob >= float(args.threshold), 1, 0) for prob in y_prob])
    tags = label_encoder.decode(y_pred)
    predictions = [
        {
            "input_text": texts[i],
            "preprocessed_text": preprocessed_texts[i],
            "predicted_tags": tags[i],
        }
        for i in range(len(tags))
    ]

    return predictions