def setup_method(self):
    """Called before every method."""
    self.X = [[4, 2, 3, 0], [2, 4, 3, 3], [2, 3, 0, 0]]
    self.y = [[0, 1], [1, 1], [1, 0]]
    self.max_filter_size = 2
    self.batch_size = 1
    self.dataset = data.CNNTextDataset(X=self.X,
                                       y=self.y,
                                       max_filter_size=self.max_filter_size)
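A test method that exercises this fixture might look like the following. This is a minimal sketch; the method names and assertions are assumptions for illustration, not part of the original test suite.

```python
def test_len(self):
    # One dataset item per input sequence.
    assert len(self.dataset) == len(self.X)

def test_create_dataloader(self):
    # With batch_size=1, the dataloader should yield one batch per sample.
    dataloader = self.dataset.create_dataloader(batch_size=self.batch_size)
    assert len(dataloader) == len(self.X)
```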
def evaluate(
    df: pd.DataFrame,
    artifacts: Dict,
    device: torch.device = torch.device("cpu"),
) -> Tuple:
    """Evaluate performance on data.

    Args:
        df (pd.DataFrame): Dataframe (used for slicing).
        artifacts (Dict): Artifacts needed for inference.
        device (torch.device): Device to run model on. Defaults to CPU.

    Returns:
        Ground truth and predicted labels, performance.
    """
    # Artifacts
    params = artifacts["params"]
    model = artifacts["model"]
    tokenizer = artifacts["tokenizer"]
    label_encoder = artifacts["label_encoder"]
    model = model.to(device)
    classes = label_encoder.classes

    # Create dataloader
    X = np.array(tokenizer.texts_to_sequences(df.text.to_numpy()), dtype="object")
    y = label_encoder.encode(df.tags)
    dataset = data.CNNTextDataset(X=X, y=y, max_filter_size=int(params.max_filter_size))
    dataloader = dataset.create_dataloader(batch_size=int(params.batch_size))

    # Determine predictions using threshold
    trainer = train.Trainer(model=model, device=device)
    y_true, y_prob = trainer.predict_step(dataloader=dataloader)
    y_pred = np.array(
        [np.where(prob >= float(params.threshold), 1, 0) for prob in y_prob])

    # Evaluate performance
    performance = get_metrics(df=df, y_true=y_true, y_pred=y_pred, classes=classes)
    performance["behavioral"] = get_behavioral_report(artifacts=artifacts)

    return y_true, y_pred, performance
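As a usage sketch: the `load_artifacts` helper and run ID below follow the example shown in the `predict` docstring further down, and `test_df` is an illustrative test split, not a variable defined in this snippet.

```python
artifacts = load_artifacts(run_id="264ac530b78c42608e5dea1086bc2c73")
y_true, y_pred, performance = evaluate(df=test_df, artifacts=artifacts)
print(performance)  # overall metrics plus the attached "behavioral" report
```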
def predict(texts: List,
            artifacts: Dict,
            device: torch.device = torch.device("cpu")) -> Dict:
    """Predict tags for an input text using the best model from the `best` experiment.

    Usage:

    ```python
    texts = ["Transfer learning with BERT."]
    artifacts = load_artifacts(run_id="264ac530b78c42608e5dea1086bc2c73")
    predict(texts=texts, artifacts=artifacts)
    ```
    <pre>
    [
      {
          "input_text": "Transfer learning with BERT.",
          "preprocessed_text": "transfer learning bert",
          "predicted_tags": [
            "attention",
            "language-modeling",
            "natural-language-processing",
            "transfer-learning",
            "transformers"
          ]
      }
    ]
    </pre>

    Note:
        The input parameter `texts` can hold multiple input texts and so the resulting prediction dictionary will have `len(texts)` items.

    Args:
        texts (List): List of input texts to predict tags for.
        artifacts (Dict): Artifacts needed for inference.
        device (torch.device): Device to run model on. Defaults to CPU.

    Returns:
        Predicted tags for each of the input texts.
    """
    # Retrieve artifacts
    params = artifacts["params"]
    label_encoder = artifacts["label_encoder"]
    tokenizer = artifacts["tokenizer"]
    model = artifacts["model"]

    # Prepare data
    preprocessed_texts = [
        data.preprocess(
            text,
            lower=bool(strtobool(str(params.lower))),  # params.lower could be str/bool
            stem=bool(strtobool(str(params.stem))),
        ) for text in texts
    ]
    X = np.array(tokenizer.texts_to_sequences(preprocessed_texts), dtype="object")
    y_filler = np.zeros((len(X), len(label_encoder)))
    dataset = data.CNNTextDataset(X=X,
                                  y=y_filler,
                                  max_filter_size=int(params.max_filter_size))
    dataloader = dataset.create_dataloader(batch_size=int(params.batch_size))

    # Get predictions
    trainer = train.Trainer(model=model, device=device)
    _, y_prob = trainer.predict_step(dataloader)
    y_pred = [np.where(prob >= float(params.threshold), 1, 0) for prob in y_prob]
    tags = label_encoder.decode(y_pred)
    predictions = [{
        "input_text": texts[i],
        "preprocessed_text": preprocessed_texts[i],
        "predicted_tags": tags[i],
    } for i in range(len(tags))]

    return predictions
print(f"{len(cv_transformers_df)} projects") print(cv_transformers_df[["text", "tags"]].head()) short_text_df = slice_dataframe(test_df, short_text) print(f"{len(short_text_df)} projects") print(short_text_df[["text", "tags"]].head()) # 8. Tokenize inputs tokenizer = data.Tokenizer(char_level=args.char_level) tokenizer.fit_on_texts(texts=X_train) X_train = np.array(tokenizer.texts_to_sequences(X_train), dtype=object) X_val = np.array(tokenizer.texts_to_sequences(X_val), dtype=object) X_test = np.array(tokenizer.texts_to_sequences(X_test), dtype=object) # 9. Create dataloaders train_dataset = data.CNNTextDataset(X=X_train, y=y_train, max_filter_size=args.max_filter_size) val_dataset = data.CNNTextDataset(X=X_val, y=y_val, max_filter_size=args.max_filter_size) test_dataset = data.CNNTextDataset(X=X_test, y=y_test, max_filter_size=args.max_filter_size) train_dataloader = train_dataset.create_dataloader( batch_size=args.batch_size) val_dataloader = val_dataset.create_dataloader(batch_size=args.batch_size) test_dataloader = test_dataset.create_dataloader( batch_size=args.batch_size) # Load artifacts runs = utils.get_sorted_runs(experiment_name="best",
def run(params: Namespace, trial: optuna.trial._trial.Trial = None) -> Dict:
    """Operations for training.

    Args:
        params (Namespace): Input parameters for operations.
        trial (optuna.trial._trial.Trial, optional): Optuna optimization trial. Defaults to None.

    Returns:
        Artifacts to save and load for later.
    """
    # 1. Set seed
    utils.set_seed(seed=params.seed)

    # 2. Set device
    device = utils.set_device(cuda=params.cuda)

    # 3. Load data
    projects_fp = Path(config.DATA_DIR, "projects.json")
    tags_fp = Path(config.DATA_DIR, "tags.json")
    projects = utils.load_dict(filepath=projects_fp)
    tags_dict = utils.list_to_dict(utils.load_dict(filepath=tags_fp), key="tag")
    df = pd.DataFrame(projects)
    if params.shuffle:
        df = df.sample(frac=1).reset_index(drop=True)
    df = df[:params.subset]  # None = all samples

    # 4. Prepare data (feature engineering, filter, clean)
    df, tags_above_freq, tags_below_freq = data.prepare(
        df=df,
        include=list(tags_dict.keys()),
        exclude=config.EXCLUDED_TAGS,
        min_tag_freq=params.min_tag_freq,
    )
    params.num_samples = len(df)

    # 5. Preprocess data
    df.text = df.text.apply(data.preprocess, lower=params.lower, stem=params.stem)

    # 6. Encode labels
    labels = df.tags
    label_encoder = data.MultiLabelLabelEncoder()
    label_encoder.fit(labels)
    y = label_encoder.encode(labels)

    # Class weights
    all_tags = list(itertools.chain.from_iterable(labels.values))
    counts = np.bincount(
        [label_encoder.class_to_index[class_] for class_ in all_tags])
    class_weights = {i: 1.0 / count for i, count in enumerate(counts)}

    # 7. Split data
    utils.set_seed(seed=params.seed)  # needed for skmultilearn
    X = df.text.to_numpy()
    X_train, X_, y_train, y_ = data.iterative_train_test_split(
        X=X, y=y, train_size=params.train_size)
    X_val, X_test, y_val, y_test = data.iterative_train_test_split(
        X=X_, y=y_, train_size=0.5)
    test_df = pd.DataFrame({"text": X_test, "tags": label_encoder.decode(y_test)})

    # 8. Tokenize inputs
    tokenizer = data.Tokenizer(char_level=params.char_level)
    tokenizer.fit_on_texts(texts=X_train)
    X_train = np.array(tokenizer.texts_to_sequences(X_train), dtype=object)
    X_val = np.array(tokenizer.texts_to_sequences(X_val), dtype=object)
    X_test = np.array(tokenizer.texts_to_sequences(X_test), dtype=object)

    # 9. Create dataloaders
    train_dataset = data.CNNTextDataset(X=X_train,
                                        y=y_train,
                                        max_filter_size=params.max_filter_size)
    val_dataset = data.CNNTextDataset(X=X_val,
                                      y=y_val,
                                      max_filter_size=params.max_filter_size)
    train_dataloader = train_dataset.create_dataloader(batch_size=params.batch_size)
    val_dataloader = val_dataset.create_dataloader(batch_size=params.batch_size)

    # 10. Initialize model
    model = models.initialize_model(
        params=params,
        vocab_size=len(tokenizer),
        num_classes=len(label_encoder),
        device=device,
    )

    # 11. Train model
    logger.info(
        f"Parameters: {json.dumps(params.__dict__, indent=2, cls=NumpyEncoder)}")
    params, model, loss = train.train(
        params=params,
        train_dataloader=train_dataloader,
        val_dataloader=val_dataloader,
        model=model,
        device=device,
        class_weights=class_weights,
        trial=trial,
    )

    # 12. Evaluate model
    artifacts = {
        "params": params,
        "label_encoder": label_encoder,
        "tokenizer": tokenizer,
        "model": model,
        "loss": loss,
    }
    device = torch.device("cpu")
    y_true, y_pred, performance = eval.evaluate(df=test_df, artifacts=artifacts)
    artifacts["performance"] = performance

    return artifacts
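A call site for these operations might look like the following sketch. The `params.json` filepath is an assumption (mirroring the `args.json` pattern used in the MLflow-backed `predict` further down), not a path confirmed by this snippet.

```python
params = Namespace(**utils.load_dict(filepath=Path(config.CONFIG_DIR, "params.json")))
artifacts = run(params=params)
print(json.dumps(artifacts["performance"], indent=2))
```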
def train(params: Namespace, trial: optuna.trial._trial.Trial = None) -> Dict:
    """Operations for training.

    Args:
        params (Namespace): Input parameters for operations.
        trial (optuna.trial._trial.Trial, optional): Optuna optimization trial. Defaults to None.

    Returns:
        Artifacts to save and load for later.
    """
    # Set up
    utils.set_seed(seed=params.seed)
    device = utils.set_device(cuda=params.cuda)

    # Load features
    features_fp = Path(config.DATA_DIR, "features.json")
    tags_fp = Path(config.DATA_DIR, "tags.json")
    features = utils.load_dict(filepath=features_fp)
    tags_dict = utils.list_to_dict(utils.load_dict(filepath=tags_fp), key="tag")
    df = pd.DataFrame(features)
    if params.shuffle:
        df = df.sample(frac=1).reset_index(drop=True)
    df = df[:params.subset]  # None = all samples

    # Prepare data (filter, clean, etc.)
    df, tags_above_freq, tags_below_freq = data.prepare(
        df=df,
        include=list(tags_dict.keys()),
        exclude=config.EXCLUDED_TAGS,
        min_tag_freq=params.min_tag_freq,
    )
    params.num_samples = len(df)

    # Preprocess data
    df.text = df.text.apply(data.preprocess, lower=params.lower, stem=params.stem)

    # Encode labels
    labels = df.tags
    label_encoder = data.MultiLabelLabelEncoder()
    label_encoder.fit(labels)
    y = label_encoder.encode(labels)

    # Class weights
    all_tags = list(itertools.chain.from_iterable(labels.values))
    counts = np.bincount(
        [label_encoder.class_to_index[class_] for class_ in all_tags])
    class_weights = {i: 1.0 / count for i, count in enumerate(counts)}

    # Split data
    utils.set_seed(seed=params.seed)  # needed for skmultilearn
    X = df.text.to_numpy()
    X_train, X_, y_train, y_ = data.iterative_train_test_split(
        X=X, y=y, train_size=params.train_size)
    X_val, X_test, y_val, y_test = data.iterative_train_test_split(
        X=X_, y=y_, train_size=0.5)
    test_df = pd.DataFrame({"text": X_test, "tags": label_encoder.decode(y_test)})

    # Tokenize inputs
    tokenizer = data.Tokenizer(char_level=params.char_level)
    tokenizer.fit_on_texts(texts=X_train)
    X_train = np.array(tokenizer.texts_to_sequences(X_train), dtype=object)
    X_val = np.array(tokenizer.texts_to_sequences(X_val), dtype=object)
    X_test = np.array(tokenizer.texts_to_sequences(X_test), dtype=object)

    # Create dataloaders
    train_dataset = data.CNNTextDataset(X=X_train,
                                        y=y_train,
                                        max_filter_size=params.max_filter_size)
    val_dataset = data.CNNTextDataset(X=X_val,
                                      y=y_val,
                                      max_filter_size=params.max_filter_size)
    train_dataloader = train_dataset.create_dataloader(batch_size=params.batch_size)
    val_dataloader = val_dataset.create_dataloader(batch_size=params.batch_size)

    # Initialize model
    model = models.initialize_model(
        params=params,
        vocab_size=len(tokenizer),
        num_classes=len(label_encoder),
        device=device,
    )

    # Train model
    logger.info(
        f"Parameters: {json.dumps(params.__dict__, indent=2, cls=NumpyEncoder)}")
    class_weights_tensor = torch.Tensor(np.array(list(class_weights.values())))
    loss_fn = nn.BCEWithLogitsLoss(weight=class_weights_tensor)
    optimizer = torch.optim.Adam(model.parameters(), lr=params.lr)
    scheduler = torch.optim.lr_scheduler.ReduceLROnPlateau(optimizer,
                                                           mode="min",
                                                           factor=0.05,
                                                           patience=5)

    # Trainer module
    trainer = Trainer(
        model=model,
        device=device,
        loss_fn=loss_fn,
        optimizer=optimizer,
        scheduler=scheduler,
        trial=trial,
    )

    # Train
    best_val_loss, best_model = trainer.train(params.num_epochs, params.patience,
                                              train_dataloader, val_dataloader)

    # Find best threshold
    _, y_true, y_prob = trainer.eval_step(dataloader=train_dataloader)
    params.threshold = find_best_threshold(y_true=y_true, y_prob=y_prob)

    # Evaluate model
    artifacts = {
        "params": params,
        "label_encoder": label_encoder,
        "tokenizer": tokenizer,
        "model": best_model,
        "loss": best_val_loss,
    }
    device = torch.device("cpu")
    y_true, y_pred, performance = eval.evaluate(df=test_df, artifacts=artifacts)
    artifacts["performance"] = performance

    return artifacts
def predict(texts: List, run_id: str) -> Dict:
    """Predict tags for an input text using the best model from the `best` experiment.

    Usage:

    ```python
    texts = ["Transfer learning with BERT."]
    predict(texts=texts, run_id="264ac530b78c42608e5dea1086bc2c73")
    ```
    <pre>
    [
      {
          "input_text": "Transfer learning with BERT.",
          "preprocessed_text": "transfer learning bert",
          "predicted_tags": [
            "attention",
            "language-modeling",
            "natural-language-processing",
            "transfer-learning",
            "transformers"
          ]
      }
    ]
    </pre>

    Note:
        The input argument `texts` can hold multiple input texts and so the resulting prediction dictionary will have `len(texts)` items.

    Args:
        texts (List): List of input texts to predict tags for.
        run_id (str): ID of the run to load model artifacts from.

    Returns:
        Predicted tags for input texts.
    """
    # Load artifacts from run
    client = mlflow.tracking.MlflowClient()
    run = mlflow.get_run(run_id=run_id)
    device = torch.device("cpu")
    with tempfile.TemporaryDirectory() as fp:
        client.download_artifacts(run_id=run_id, path="", dst_path=fp)
        args = Namespace(**utils.load_dict(filepath=Path(config.CONFIG_DIR, "args.json")))
        label_encoder = data.LabelEncoder.load(fp=Path(fp, "label_encoder.json"))
        tokenizer = data.Tokenizer.load(fp=Path(fp, "tokenizer.json"))
        model_state = torch.load(Path(fp, "model.pt"), map_location=device)
        # performance = utils.load_dict(filepath=Path(fp, "performance.json"))

    # Load model
    args = Namespace(**run.data.params)
    model = train.initialize_model(args=args,
                                   vocab_size=len(tokenizer),
                                   num_classes=len(label_encoder))
    model.load_state_dict(model_state)

    # Prepare data
    preprocessed_texts = [data.preprocess(text) for text in texts]
    X = np.array(tokenizer.texts_to_sequences(preprocessed_texts), dtype="object")
    y_filler = label_encoder.encode(
        [np.array([label_encoder.classes[0]] * len(X))])
    dataset = data.CNNTextDataset(X=X,
                                  y=y_filler,
                                  max_filter_size=int(args.max_filter_size))
    dataloader = dataset.create_dataloader(batch_size=int(args.batch_size))

    # Get predictions
    trainer = train.Trainer(model=model, device=device)
    _, y_prob = trainer.predict_step(dataloader)
    y_pred = np.array(
        [np.where(prob >= float(args.threshold), 1, 0) for prob in y_prob])
    tags = label_encoder.decode(y_pred)
    predictions = [{
        "input_text": texts[i],
        "preprocessed_text": preprocessed_texts[i],
        "predicted_tags": tags[i],
    } for i in range(len(tags))]

    return predictions