def test_preprocess(text, lower, stem, stopwords, preprocessed_text):
    """Check that `data.preprocess` cleans text as expected for each fixture case."""
    cleaned = data.preprocess(
        text=text,
        lower=lower,
        stem=stem,
        stopwords=stopwords,
    )
    assert cleaned == preprocessed_text
collocations=False, width=500, height=300, ).generate(" ".join(text)) plt.axis("off") plt.imshow(cloud) st.pyplot(plt) # Preprocessing st.write("---") st.subheader("Preprocessing") text = st.text_input("Input text", "Conditional generation using Variational Autoencoders.") filters = st.text_input("filters", "[!\"'#$%&()*+,-./:;<=>?@\\[]^_`{|}~]") lower = st.checkbox("lower", True) stem = st.checkbox("stem", False) preprocessed_text = data.preprocess(text=text, lower=lower, stem=stem, filters=filters) st.text("Preprocessed text:") st.write(preprocessed_text) elif selected_page == "Performance": st.header("Performance") # Get tags and respective parameters and performance tags = get_tags(author=config.AUTHOR, repo=config.REPO) # Key metrics key_metrics = [ "overall.f1", "overall.precision", "overall.recall", "behavioral.score",
def predict(
    texts: List,
    artifacts: Dict,
    device: torch.device = torch.device("cpu"),
) -> Dict:
    """Predict tags for an input text using the best model from the `best` experiment.

    Usage:

    ```python
    texts = ["Transfer learning with BERT."]
    artifacts = load_artifacts(run_id="264ac530b78c42608e5dea1086bc2c73")
    predict(texts=texts, artifacts=artifacts)
    ```
    <pre>
    [
      {
          "input_text": "Transfer learning with BERT.",
          "preprocessed_text": "transfer learning bert",
          "predicted_tags": [
            "attention",
            "language-modeling",
            "natural-language-processing",
            "transfer-learning",
            "transformers"
          ]
      }
    ]
    </pre>

    Note:
        The input parameter `texts` can hold multiple input texts and so
        the resulting prediction dictionary will have `len(texts)` items.

    Args:
        texts (List): List of input texts to predict tags for.
        artifacts (Dict): Artifacts needed for inference.
        device (torch.device): Device to run model on. Defaults to CPU.

    Returns:
        Predicted tags for each of the input texts.
    """
    # Unpack the artifacts required for inference.
    params = artifacts["params"]
    label_encoder = artifacts["label_encoder"]
    tokenizer = artifacts["tokenizer"]
    model = artifacts["model"]

    # Clean each raw text. params.lower / params.stem may arrive as either
    # str or bool depending on how the params were serialized, so coerce
    # via strtobool on the string form.
    cleaned_texts = [
        data.preprocess(
            raw,
            lower=bool(strtobool(str(params.lower))),
            stem=bool(strtobool(str(params.stem))),
        )
        for raw in texts
    ]

    # Tokenize and wrap in a dataloader. dtype="object" keeps the ragged
    # token sequences intact; labels are zero placeholders since we only
    # need the dataset plumbing, not real targets.
    X = np.array(tokenizer.texts_to_sequences(cleaned_texts), dtype="object")
    y_blank = np.zeros((len(X), len(label_encoder)))
    dataset = data.CNNTextDataset(
        X=X, y=y_blank, max_filter_size=int(params.max_filter_size))
    dataloader = dataset.create_dataloader(batch_size=int(params.batch_size))

    # Run inference and binarize the per-class probabilities at the
    # configured threshold before decoding back to tag names.
    trainer = train.Trainer(model=model, device=device)
    _, y_prob = trainer.predict_step(dataloader)
    y_pred = [
        np.where(prob >= float(params.threshold), 1, 0) for prob in y_prob
    ]
    tags = label_encoder.decode(y_pred)

    return [
        {
            "input_text": texts[i],
            "preprocessed_text": cleaned_texts[i],
            "predicted_tags": tags[i],
        }
        for i in range(len(tags))
    ]
def predict(texts: List, run_id: str) -> Dict:
    """Predict tags for an input text using the best model from the `best` experiment.

    Usage:

    ```python
    texts = ["Transfer learning with BERT."]
    predict(texts=texts, run_id="264ac530b78c42608e5dea1086bc2c73")
    ```
    <pre>
    [
      {
          "input_text": "Transfer learning with BERT.",
          "preprocessed_text": "transfer learning bert",
          "predicted_tags": [
            "attention",
            "language-modeling",
            "natural-language-processing",
            "transfer-learning",
            "transformers"
          ]
      }
    ]
    </pre>

    Note:
        The input argument `texts` can hold multiple input texts and so
        the resulting prediction dictionary will have `len(texts)` items.

    Args:
        texts (List): List of input text to predict tags for.
        run_id (str): ID of the run to load model artifacts from.

    Returns:
        Predicted tags for input texts.
    """
    # Load artifacts from run. The temporary directory only needs to live
    # long enough to read the artifacts into memory.
    client = mlflow.tracking.MlflowClient()
    run = mlflow.get_run(run_id=run_id)
    device = torch.device("cpu")
    with tempfile.TemporaryDirectory() as fp:
        client.download_artifacts(run_id=run_id, path="", dst_path=fp)
        label_encoder = data.LabelEncoder.load(
            fp=Path(fp, "label_encoder.json"))
        tokenizer = data.Tokenizer.load(fp=Path(fp, "tokenizer.json"))
        model_state = torch.load(Path(fp, "model.pt"), map_location=device)

    # Rebuild the model from the hyperparameters logged with the run.
    # (A previous version also read args.json from the local config dir,
    # but that value was immediately overwritten by the run params, so the
    # dead load has been removed.)
    args = Namespace(**run.data.params)
    model = train.initialize_model(args=args,
                                   vocab_size=len(tokenizer),
                                   num_classes=len(label_encoder))
    model.load_state_dict(model_state)

    # Prepare data. dtype="object" keeps ragged token sequences intact
    # (consistent with the artifact-based predict); y is a filler since the
    # dataset requires labels but inference does not use them.
    preprocessed_texts = [data.preprocess(text) for text in texts]
    X = np.array(tokenizer.texts_to_sequences(preprocessed_texts),
                 dtype="object")
    y_filler = label_encoder.encode(
        [np.array([label_encoder.classes[0]] * len(X))])
    dataset = data.CNNTextDataset(X=X,
                                  y=y_filler,
                                  max_filter_size=int(args.max_filter_size))
    dataloader = dataset.create_dataloader(batch_size=int(args.batch_size))

    # Get predictions: threshold the per-class probabilities, then decode
    # the binary indicator vectors back into tag names.
    trainer = train.Trainer(model=model, device=device)
    _, y_prob = trainer.predict_step(dataloader)
    y_pred = np.array(
        [np.where(prob >= float(args.threshold), 1, 0) for prob in y_prob])
    tags = label_encoder.decode(y_pred)
    predictions = [{
        "input_text": texts[i],
        "preprocessed_text": preprocessed_texts[i],
        "predicted_tags": tags[i],
    } for i in range(len(tags))]

    return predictions