def test_predict_tags_mesh_cnn(mesh_cnn_path, mesh_label_binarizer_path):
    """Exercise ``predict_tags`` with the "mesh-cnn" approach.

    Checks the number of results returned for ``X``, that reported
    probabilities lie in [0, 1], and the tag counts obtained at the two
    extreme thresholds (0 and 1).
    """

    def run(**options):
        # Every call shares the same inputs, model, binarizer and approach;
        # only the extra options differ between the checks below.
        return predict_tags(
            X,
            mesh_cnn_path,
            mesh_label_binarizer_path,
            approach="mesh-cnn",
            **options,
        )

    # Default call: the test expects exactly 5 results for X.
    assert len(run()) == 5

    # With probabilities=True each result maps a tag to its probability.
    for doc_tags in run(probabilities=True):
        for _tag, prob in doc_tags.items():
            assert 0 <= prob <= 1.0

    # threshold=0 is expected to yield 5000 tags per result.
    for doc_tags in run(threshold=0):
        assert len(doc_tags) == 5000

    # threshold=1 is expected to yield no tags at all.
    for doc_tags in run(threshold=1):
        assert len(doc_tags) == 0
def test_predict_tags_science_ensemble(science_ensemble_path, label_binarizer_path):
    """Exercise ``predict_tags`` with the "science-ensemble" approach.

    Covers the default call, the ``probabilities=True`` output, and the
    tag counts expected at thresholds 0 and 1.
    """
    # Keyword arguments common to every call in this test.
    shared_kwargs = {
        "model_path": science_ensemble_path,
        "label_binarizer_path": label_binarizer_path,
        "approach": "science-ensemble",
    }

    predictions = predict_tags(X, **shared_kwargs)
    # The test expects exactly 5 results for X.
    assert len(predictions) == 5

    with_probs = predict_tags(X, probabilities=True, **shared_kwargs)
    for tag_to_prob in with_probs:
        # Each result is a mapping whose values must be valid probabilities.
        for _tag, prob in tag_to_prob.items():
            assert 0 <= prob <= 1.0

    # threshold=0 is expected to yield 24 tags per result.
    lowest_threshold = predict_tags(X, threshold=0, **shared_kwargs)
    for result in lowest_threshold:
        assert len(result) == 24

    # threshold=1 is expected to yield no tags.
    highest_threshold = predict_tags(X, threshold=1, **shared_kwargs)
    for result in highest_threshold:
        assert len(result) == 0
def test_predict_tags_tfidf_svm(tfidf_svm_path, label_binarizer_path):
    """Exercise ``predict_tags`` with the "tfidf-svm" approach.

    Verifies the result count for ``X``, the probability range when
    ``probabilities=True``, and the tag counts at thresholds 0 and 1.
    """

    def tag_with(**extra):
        # Fixed model/binarizer/approach; ``extra`` carries the per-check
        # options (probabilities or threshold).
        return predict_tags(
            X,
            model_path=tfidf_svm_path,
            label_binarizer_path=label_binarizer_path,
            approach="tfidf-svm",
            **extra,
        )

    default_result = tag_with()
    # The test expects exactly 5 results for X.
    assert len(default_result) == 5

    # Every reported probability must fall inside [0, 1].
    for tag_probs in tag_with(probabilities=True):
        for _tag, prob in tag_probs.items():
            assert 0 <= prob <= 1.0

    # (threshold, expected number of tags per result)
    for threshold, expected_count in ((0, 24), (1, 0)):
        for item_tags in tag_with(threshold=threshold):
            assert len(item_tags) == expected_count
def test_predict_tags_mesh_xlinear(mesh_xlinear_path, mesh_label_binarizer_path):
    """Exercise ``predict_tags`` with the "mesh-xlinear" approach.

    Checks the result count for ``X``, the probability range when
    ``probabilities=True``, and the tag counts at thresholds 0 and 1.
    """
    # We need to pass parameters because the load function is different
    # depending on the vectorizer library (pecos or sklearn)
    parameters = str({'vectorizer_library': 'sklearn'})

    def predict(**overrides):
        # All calls share model, binarizer, approach and parameters.
        return predict_tags(
            X,
            mesh_xlinear_path,
            mesh_label_binarizer_path,
            approach="mesh-xlinear",
            parameters=parameters,
            **overrides,
        )

    plain = predict()
    # The test expects exactly 5 results for X.
    assert len(plain) == 5

    scored = predict(probabilities=True)
    for tag_scores in scored:
        # Values of each mapping must be valid probabilities.
        for _tag, prob in tag_scores.items():
            assert 0 <= prob <= 1.0

    # threshold=0 is expected to yield 5000 tags per result.
    everything = predict(threshold=0)
    for per_text in everything:
        assert len(per_text) == 5000

    # threshold=1 is expected to yield no tags.
    nothing = predict(threshold=1)
    for per_text in nothing:
        assert len(per_text) == 0
def tag_grants(grants_path, tagged_grants_path, model_path,
               label_binarizer_path, approach, threshold=0.5,
               grant_id_field="grant_id",
               grant_text_fields=["title", "synopsis"],
               text_null_value="No Data Entered"):
    """Tag the grants read from ``grants_path`` and write the tags to a CSV.

    Grants are read in batches of 128 via ``yield_batched_grants``. For each
    grant the configured text fields are joined into a single string, which
    is passed to ``predict_tags`` with ``probabilities=True``. One CSV row
    (``Grant id``, ``Tag``, ``Prob``) is written per returned tag.

    Grants whose combined text is empty are skipped and produce no rows.

    Args:
        grants_path: forwarded to ``yield_batched_grants``.
        tagged_grants_path: path of the CSV file to (over)write.
        model_path: forwarded to ``predict_tags``.
        label_binarizer_path: forwarded to ``predict_tags``.
        approach: forwarded to ``predict_tags``.
        threshold: forwarded to ``predict_tags``.
        grant_id_field: key of each grant holding the id written to the
            ``Grant id`` column.
        grant_text_fields: keys of each grant whose values are joined to
            form the text to tag. The default list is only iterated, never
            mutated.
        text_null_value: placeholder string removed from every text field
            before tagging.

    Raises:
        KeyError: if a grant lacks one of ``grant_text_fields`` or
            ``grant_id_field`` (assuming grants are dicts).
    """
    # newline="" is required by the csv module so that the writer's own line
    # terminators are not translated again (blank rows on Windows otherwise).
    with open(tagged_grants_path, "w", newline="") as tagged_grants_tf:
        fieldnames = ["Grant id", "Tag", "Prob"]
        csv_writer = csv.DictWriter(
            tagged_grants_tf,
            fieldnames=fieldnames,
            delimiter=",",
            quotechar='"',
            quoting=csv.QUOTE_MINIMAL,
        )
        csv_writer.writeheader()

        for grants in yield_batched_grants(grants_path, 128):
            # Keep the grants and their texts in two parallel lists so that
            # predictions can be matched back to the right grant even when
            # some grants in the batch are skipped for having no text.
            grants_with_text = []
            grants_text = []
            for grant in grants:
                raw_text = " ".join(
                    grant[field].replace(text_null_value, "")
                    for field in grant_text_fields
                )
                # Removes consecutive white spaces which are uninformative
                # and may cause error #30
                text = " ".join(raw_text.split())
                if not text:
                    # Removes empty text
                    continue
                grants_with_text.append(grant)
                grants_text.append(text)

            if not grants_text:
                # Nothing to tag in this batch; avoid calling the model
                # with an empty input.
                continue

            grants_tags = predict_tags(
                grants_text,
                model_path,
                label_binarizer_path,
                approach,
                probabilities=True,
                threshold=threshold,
            )

            for grant, tags in zip(grants_with_text, grants_tags):
                for tag, prob in tags.items():
                    csv_writer.writerow({
                        'Grant id': grant[grant_id_field],
                        'Tag': tag,
                        'Prob': prob,
                    })

            # Flush after every batch so partial results reach the file
            # while a long run is still in progress.
            tagged_grants_tf.flush()
}, } model_option = st.sidebar.selectbox("Model", options=list(models.keys())) model = models[model_option] probabilities = st.sidebar.checkbox("Display probabilities") if text == DEFAULT_TEXT: st.stop() with st.spinner("Calculating tags..."): tags = predict_tags( [text], model["model_path"], model["label_binarizer_path"], model["approach"], probabilities=probabilities, threshold=threshold, ) tags = tags[0] st.success("Done!") if probabilities: tag_probs = [ {"Tag": tag, "Prob": prob} for tag, prob in tags.items() if prob > threshold ] st.table(pd.DataFrame(tag_probs)) tags = [tag_prob["Tag"] for tag_prob in tag_probs] else: for tag in tags: st.button(tag)