def load_artifacts(run_id: str, device: torch.device = torch.device("cpu")) -> Dict:
    """Load artifacts for current model.

    Args:
        run_id (str): ID of the model run to load artifacts from.
        device (torch.device): Device to run model on. Defaults to CPU.

    Returns:
        Artifacts needed for inference.
    """
    # Load artifacts
    artifact_uri = mlflow.get_run(run_id=run_id).info.artifact_uri.split("file://")[-1]
    params = Namespace(**utils.load_dict(filepath=Path(artifact_uri, "params.json")))
    label_encoder = data.MultiLabelLabelEncoder.load(fp=Path(artifact_uri, "label_encoder.json"))
    tokenizer = data.Tokenizer.load(fp=Path(artifact_uri, "tokenizer.json"))
    model_state = torch.load(Path(artifact_uri, "model.pt"), map_location=device)
    performance = utils.load_dict(filepath=Path(artifact_uri, "performance.json"))

    # Initialize model
    model = models.initialize_model(
        params=params, vocab_size=len(tokenizer), num_classes=len(label_encoder)
    )
    model.load_state_dict(model_state)

    return {
        "params": params,
        "label_encoder": label_encoder,
        "tokenizer": tokenizer,
        "model": model,
        "performance": performance,
    }
def load_artifacts(
    model_dir: Path = config.MODEL_DIR, device: torch.device = torch.device("cpu")
) -> Dict:
    """Load artifacts for current model.

    Args:
        model_dir (Path): Location of model artifacts. Defaults to config.MODEL_DIR.
        device (torch.device): Device to run model on. Defaults to CPU.

    Returns:
        Artifacts needed for inference.
    """
    # Load artifacts
    params = Namespace(**utils.load_dict(filepath=Path(model_dir, "params.json")))
    label_encoder = data.MultiLabelLabelEncoder.load(fp=Path(model_dir, "label_encoder.json"))
    tokenizer = data.Tokenizer.load(fp=Path(model_dir, "tokenizer.json"))
    model_state = torch.load(Path(model_dir, "model.pt"), map_location=device)
    performance = utils.load_dict(filepath=Path(model_dir, "performance.json"))

    # Initialize model
    model = models.initialize_model(
        params=params, vocab_size=len(tokenizer), num_classes=len(label_encoder)
    )
    model.load_state_dict(model_state)

    return {
        "params": params,
        "label_encoder": label_encoder,
        "tokenizer": tokenizer,
        "model": model,
        "performance": performance,
    }
def load_artifacts(
    run_id: str,
    device: torch.device = torch.device("cpu"),
) -> Dict:
    """Load artifacts for a particular `run_id`.

    Args:
        run_id (str): ID of the run to load model artifacts from.
        device (torch.device): Device to run model on. Defaults to CPU.

    Returns:
        Artifacts needed for inference.
    """
    # Load artifacts from MLflow
    client = mlflow.tracking.MlflowClient()
    with tempfile.TemporaryDirectory() as fp:
        client.download_artifacts(run_id=run_id, path="", dst_path=fp)
        label_encoder = data.LabelEncoder.load(fp=Path(fp, "label_encoder.json"))
        tokenizer = data.Tokenizer.load(fp=Path(fp, "tokenizer.json"))
        model_state = torch.load(Path(fp, "model.pt"), map_location=device)
        performance = utils.load_dict(filepath=Path(fp, "performance.json"))

    # Load model
    run = mlflow.get_run(run_id=run_id)
    args = Namespace(**run.data.params)
    model = models.initialize_model(
        args=args, vocab_size=len(tokenizer), num_classes=len(label_encoder)
    )
    model.load_state_dict(model_state)

    return {
        "args": args,
        "label_encoder": label_encoder,
        "tokenizer": tokenizer,
        "model": model,
        "performance": performance,
    }
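# Sketch (not from the source): loading artifacts for inference from a finished MLflow run.
# The run_id value is a placeholder, and the printed keys assume the dictionary returned above.
best_run_id = "<mlflow-run-id>"  # replace with an actual run ID, e.g. from mlflow.search_runs()
artifacts = load_artifacts(run_id=best_run_id)
logger.info(artifacts["args"])         # hyperparameters the run was trained with
logger.info(artifacts["performance"])  # metrics computed at training time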
def setup_method(self):
    """Called before every method."""
    # Args
    self.max_filter_size = 4
    self.embedding_dim = 128
    self.num_filters = 100
    self.hidden_dim = 128
    self.dropout_p = 0.5
    args = Namespace(
        max_filter_size=self.max_filter_size,
        embedding_dim=self.embedding_dim,
        num_filters=self.num_filters,
        hidden_dim=self.hidden_dim,
        dropout_p=self.dropout_p,
    )

    # Model
    self.vocab_size = 1000
    self.num_classes = 10
    utils.set_seed()
    self.cnn = models.initialize_model(
        args=args, vocab_size=self.vocab_size, num_classes=self.num_classes
    )
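# Sketch (not from the source): how the setup_method above might sit inside a pytest class,
# with a basic sanity check on the initialized model. The class name and test body are
# illustrative assumptions, not part of the original test suite.
class TestCNN:
    # setup_method from above goes here.

    def test_initialization(self):
        """The initialized model should be a torch module with trainable parameters."""
        assert isinstance(self.cnn, torch.nn.Module)
        num_params = sum(p.numel() for p in self.cnn.parameters() if p.requires_grad)
        assert num_params > 0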
def run(args: Namespace, trial: optuna.trial._trial.Trial = None) -> Dict:
    """Operations for training.

    1. Set seed
    2. Set device
    3. Load data
    4. Clean data
    5. Preprocess data
    6. Encode labels
    7. Split data
    8. Tokenize inputs
    9. Create dataloaders
    10. Initialize model
    11. Train model
    12. Evaluate model

    Args:
        args (Namespace): Input arguments for operations.
        trial (optuna.trial._trial.Trial, optional): Optuna optimization trial. Defaults to None.

    Returns:
        Artifacts to save and load for later.
    """
    # 1. Set seed
    utils.set_seed(seed=args.seed)

    # 2. Set device
    device = utils.set_device(cuda=args.cuda)

    # 3. Load data
    df, projects_dict, tags_dict = data.load(
        shuffle=args.shuffle, num_samples=args.num_samples
    )

    # 4. Clean data
    df, tags_dict, tags_above_frequency = data.clean(
        df=df, tags_dict=tags_dict, min_tag_freq=args.min_tag_freq
    )

    # 5. Preprocess data
    df.text = df.text.apply(data.preprocess, lower=args.lower, stem=args.stem)

    # 6. Encode labels
    y, class_weights, label_encoder = data.encode_labels(labels=df.tags)

    # 7. Split data
    utils.set_seed(seed=args.seed)  # needed for skmultilearn
    X_train, X_val, X_test, y_train, y_val, y_test = data.split(
        X=df.text.to_numpy(), y=y, train_size=args.train_size
    )

    # 8. Tokenize inputs
    X_train, tokenizer = data.tokenize_text(X=X_train, char_level=args.char_level)
    X_val, _ = data.tokenize_text(X=X_val, char_level=args.char_level, tokenizer=tokenizer)
    X_test, _ = data.tokenize_text(X=X_test, char_level=args.char_level, tokenizer=tokenizer)

    # 9. Create dataloaders
    train_dataloader = data.get_dataloader(
        data=[X_train, y_train],
        max_filter_size=args.max_filter_size,
        batch_size=args.batch_size,
    )
    val_dataloader = data.get_dataloader(
        data=[X_val, y_val],
        max_filter_size=args.max_filter_size,
        batch_size=args.batch_size,
    )
    test_dataloader = data.get_dataloader(
        data=[X_test, y_test],
        max_filter_size=args.max_filter_size,
        batch_size=args.batch_size,
    )

    # 10. Initialize model
    model = models.initialize_model(
        args=args,
        vocab_size=len(tokenizer),
        num_classes=len(label_encoder),
        device=device,
    )

    # 11. Train model
    logger.info(f"Arguments: {json.dumps(args.__dict__, indent=2, cls=NumpyEncoder)}")
    args, model, loss = train(
        args=args,
        train_dataloader=train_dataloader,
        val_dataloader=val_dataloader,
        model=model,
        device=device,
        class_weights=class_weights,
        trial=trial,
    )

    # 12. Evaluate model
    device = torch.device("cpu")
    performance = evaluate(
        dataloader=test_dataloader,
        model=model.to(device),
        device=device,
        threshold=args.threshold,
        classes=label_encoder.classes,
    )

    return {
        "args": args,
        "label_encoder": label_encoder,
        "tokenizer": tokenizer,
        "model": model,
        "loss": loss,
        "performance": performance,
    }
def run(params: Namespace, trial: optuna.trial._trial.Trial = None) -> Dict:
    """Operations for training.

    Args:
        params (Namespace): Input parameters for operations.
        trial (optuna.trial._trial.Trial, optional): Optuna optimization trial. Defaults to None.

    Returns:
        Artifacts to save and load for later.
    """
    # 1. Set seed
    utils.set_seed(seed=params.seed)

    # 2. Set device
    device = utils.set_device(cuda=params.cuda)

    # 3. Load data
    projects_fp = Path(config.DATA_DIR, "projects.json")
    tags_fp = Path(config.DATA_DIR, "tags.json")
    projects = utils.load_dict(filepath=projects_fp)
    tags_dict = utils.list_to_dict(utils.load_dict(filepath=tags_fp), key="tag")
    df = pd.DataFrame(projects)
    if params.shuffle:
        df = df.sample(frac=1).reset_index(drop=True)
    df = df[: params.subset]  # None = all samples

    # 4. Prepare data (feature engineering, filter, clean)
    df, tags_above_freq, tags_below_freq = data.prepare(
        df=df,
        include=list(tags_dict.keys()),
        exclude=config.EXCLUDED_TAGS,
        min_tag_freq=params.min_tag_freq,
    )
    params.num_samples = len(df)

    # 5. Preprocess data
    df.text = df.text.apply(data.preprocess, lower=params.lower, stem=params.stem)

    # 6. Encode labels
    labels = df.tags
    label_encoder = data.MultiLabelLabelEncoder()
    label_encoder.fit(labels)
    y = label_encoder.encode(labels)

    # Class weights
    all_tags = list(itertools.chain.from_iterable(labels.values))
    counts = np.bincount([label_encoder.class_to_index[class_] for class_ in all_tags])
    class_weights = {i: 1.0 / count for i, count in enumerate(counts)}

    # 7. Split data
    utils.set_seed(seed=params.seed)  # needed for skmultilearn
    X = df.text.to_numpy()
    X_train, X_, y_train, y_ = data.iterative_train_test_split(
        X=X, y=y, train_size=params.train_size
    )
    X_val, X_test, y_val, y_test = data.iterative_train_test_split(
        X=X_, y=y_, train_size=0.5
    )
    test_df = pd.DataFrame({"text": X_test, "tags": label_encoder.decode(y_test)})

    # 8. Tokenize inputs
    tokenizer = data.Tokenizer(char_level=params.char_level)
    tokenizer.fit_on_texts(texts=X_train)
    X_train = np.array(tokenizer.texts_to_sequences(X_train), dtype=object)
    X_val = np.array(tokenizer.texts_to_sequences(X_val), dtype=object)
    X_test = np.array(tokenizer.texts_to_sequences(X_test), dtype=object)

    # 9. Create dataloaders
    train_dataset = data.CNNTextDataset(X=X_train, y=y_train, max_filter_size=params.max_filter_size)
    val_dataset = data.CNNTextDataset(X=X_val, y=y_val, max_filter_size=params.max_filter_size)
    train_dataloader = train_dataset.create_dataloader(batch_size=params.batch_size)
    val_dataloader = val_dataset.create_dataloader(batch_size=params.batch_size)

    # 10. Initialize model
    model = models.initialize_model(
        params=params,
        vocab_size=len(tokenizer),
        num_classes=len(label_encoder),
        device=device,
    )

    # 11. Train model
    logger.info(f"Parameters: {json.dumps(params.__dict__, indent=2, cls=NumpyEncoder)}")
    params, model, loss = train.train(
        params=params,
        train_dataloader=train_dataloader,
        val_dataloader=val_dataloader,
        model=model,
        device=device,
        class_weights=class_weights,
        trial=trial,
    )

    # 12. Evaluate model
    artifacts = {
        "params": params,
        "label_encoder": label_encoder,
        "tokenizer": tokenizer,
        "model": model,
        "loss": loss,
    }
    device = torch.device("cpu")
    y_true, y_pred, performance = eval.evaluate(df=test_df, artifacts=artifacts)
    artifacts["performance"] = performance

    return artifacts
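# Sketch (not from the source): a minimal driver for run(). The params file location
# (config.CONFIG_DIR / "params.json") is an assumption for illustration.
params_fp = Path(config.CONFIG_DIR, "params.json")
params = Namespace(**utils.load_dict(filepath=params_fp))
artifacts = run(params=params)
logger.info(json.dumps(artifacts["performance"], indent=2, cls=NumpyEncoder))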
def train(params: Namespace, trial: optuna.trial._trial.Trial = None) -> Dict:
    """Operations for training.

    Args:
        params (Namespace): Input parameters for operations.
        trial (optuna.trial._trial.Trial, optional): Optuna optimization trial. Defaults to None.

    Returns:
        Artifacts to save and load for later.
    """
    # Set up
    utils.set_seed(seed=params.seed)
    device = utils.set_device(cuda=params.cuda)

    # Load features
    features_fp = Path(config.DATA_DIR, "features.json")
    tags_fp = Path(config.DATA_DIR, "tags.json")
    features = utils.load_dict(filepath=features_fp)
    tags_dict = utils.list_to_dict(utils.load_dict(filepath=tags_fp), key="tag")
    df = pd.DataFrame(features)
    if params.shuffle:
        df = df.sample(frac=1).reset_index(drop=True)
    df = df[: params.subset]  # None = all samples

    # Prepare data (filter, clean, etc.)
    df, tags_above_freq, tags_below_freq = data.prepare(
        df=df,
        include=list(tags_dict.keys()),
        exclude=config.EXCLUDED_TAGS,
        min_tag_freq=params.min_tag_freq,
    )
    params.num_samples = len(df)

    # Preprocess data
    df.text = df.text.apply(data.preprocess, lower=params.lower, stem=params.stem)

    # Encode labels
    labels = df.tags
    label_encoder = data.MultiLabelLabelEncoder()
    label_encoder.fit(labels)
    y = label_encoder.encode(labels)

    # Class weights
    all_tags = list(itertools.chain.from_iterable(labels.values))
    counts = np.bincount([label_encoder.class_to_index[class_] for class_ in all_tags])
    class_weights = {i: 1.0 / count for i, count in enumerate(counts)}

    # Split data
    utils.set_seed(seed=params.seed)  # needed for skmultilearn
    X = df.text.to_numpy()
    X_train, X_, y_train, y_ = data.iterative_train_test_split(
        X=X, y=y, train_size=params.train_size
    )
    X_val, X_test, y_val, y_test = data.iterative_train_test_split(
        X=X_, y=y_, train_size=0.5
    )
    test_df = pd.DataFrame({"text": X_test, "tags": label_encoder.decode(y_test)})

    # Tokenize inputs
    tokenizer = data.Tokenizer(char_level=params.char_level)
    tokenizer.fit_on_texts(texts=X_train)
    X_train = np.array(tokenizer.texts_to_sequences(X_train), dtype=object)
    X_val = np.array(tokenizer.texts_to_sequences(X_val), dtype=object)
    X_test = np.array(tokenizer.texts_to_sequences(X_test), dtype=object)

    # Create dataloaders
    train_dataset = data.CNNTextDataset(X=X_train, y=y_train, max_filter_size=params.max_filter_size)
    val_dataset = data.CNNTextDataset(X=X_val, y=y_val, max_filter_size=params.max_filter_size)
    train_dataloader = train_dataset.create_dataloader(batch_size=params.batch_size)
    val_dataloader = val_dataset.create_dataloader(batch_size=params.batch_size)

    # Initialize model
    model = models.initialize_model(
        params=params,
        vocab_size=len(tokenizer),
        num_classes=len(label_encoder),
        device=device,
    )

    # Train model
    logger.info(f"Parameters: {json.dumps(params.__dict__, indent=2, cls=NumpyEncoder)}")
    class_weights_tensor = torch.Tensor(np.array(list(class_weights.values())))
    loss_fn = nn.BCEWithLogitsLoss(weight=class_weights_tensor)
    optimizer = torch.optim.Adam(model.parameters(), lr=params.lr)
    scheduler = torch.optim.lr_scheduler.ReduceLROnPlateau(
        optimizer, mode="min", factor=0.05, patience=5
    )

    # Trainer module
    trainer = Trainer(
        model=model,
        device=device,
        loss_fn=loss_fn,
        optimizer=optimizer,
        scheduler=scheduler,
        trial=trial,
    )

    # Train
    best_val_loss, best_model = trainer.train(
        params.num_epochs, params.patience, train_dataloader, val_dataloader
    )

    # Find best threshold
    _, y_true, y_prob = trainer.eval_step(dataloader=train_dataloader)
    params.threshold = find_best_threshold(y_true=y_true, y_prob=y_prob)

    # Evaluate model
    artifacts = {
        "params": params,
        "label_encoder": label_encoder,
        "tokenizer": tokenizer,
        "model": best_model,
        "loss": best_val_loss,
    }
    device = torch.device("cpu")
    y_true, y_pred, performance = eval.evaluate(df=test_df, artifacts=artifacts)
    artifacts["performance"] = performance

    return artifacts
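# Sketch (not from the source): hyperparameter optimization around train() via the trial
# argument it accepts. The tuned parameter, its range, the params file location, and the
# number of trials are illustrative assumptions.
def objective(trial: optuna.trial._trial.Trial, params: Namespace) -> float:
    params.dropout_p = trial.suggest_float("dropout_p", 0.3, 0.8)  # assumed search space
    artifacts = train(params=params, trial=trial)
    return artifacts["loss"]

params = Namespace(**utils.load_dict(filepath=Path(config.CONFIG_DIR, "params.json")))
study = optuna.create_study(direction="minimize")
study.optimize(lambda trial: objective(trial, params), n_trials=10)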