def test_compute_prominence_multiple():
    """Prominence scoring of several name variants yields a non-empty list."""
    names = ['biden', 'joe biden', 'donald trump', 'D. Trump']
    scored = compute_prominence(fuzzy_cluster(names))

    assert isinstance(scored, list)
    assert len(scored) > 0
def test_compute_prominence_weight_multipliers():
    """Per-cluster weight multipliers still produce float prominence scores."""
    grouped = fuzzy_cluster(simulate_ner_data())

    # One random multiplier per cluster returned by fuzzy_cluster.
    multipliers = np.random.rand(len(grouped))
    scored = compute_prominence(
        grouped,
        weight_position=0.5,
        weight_multipliers=multipliers,
    )

    frame = pd.DataFrame.from_dict(scored)
    first_score = frame.prominence_score.tolist()[0]
    assert isinstance(first_score, float)
def run_random(articles, entitites, id=None, scorer=partial_token_set_ratio, cutoff=75):
    """Cluster, rank and whitelist-match the body entities of one article.

    Runs ``fuzzy_cluster`` -> ``compute_prominence`` -> ``match_whitelist``
    on the entities of a single article, prints the article body and the
    resulting clusters, and reports how long the pipeline took.

    Args:
        articles: Table with at least the columns ``content_id``, ``title``,
            ``lead`` and ``body`` (indexed like a pandas DataFrame).
        entitites: Table of entity rows with at least ``content_id`` and
            ``placement`` columns. The misspelled name is kept so existing
            keyword callers keep working.
        id: ``content_id`` of the article to process. When ``None``, one is
            drawn at random from ``articles.content_id``.
        scorer: Scoring function forwarded to ``fuzzy_cluster``.
        cutoff: Cutoff forwarded to ``fuzzy_cluster``.

    Returns:
        Elapsed wall-clock seconds (``time.time()`` difference) spent in the
        clustering, prominence and whitelist-matching steps.
    """
    # The body previously read an undefined/global ``entities`` instead of the
    # (misspelled) parameter, so the argument passed by the caller was ignored.
    entities = entitites

    if id is None:
        id = np.random.choice(articles.content_id.tolist())

    article = articles[articles.content_id == id]
    article = article[['content_id', 'title', 'lead', 'body']]

    # Only entities found in the article body are clustered.
    article_ents = entities[entities.content_id == id]
    article_ents = article_ents[article_ents.placement == "body"]
    preds = article_ents.to_dict(orient="records")

    t1 = time.time()

    clusters = fuzzy_cluster(preds, scorer=scorer, workers=4,
                             cutoff=cutoff, merge_output=True)
    clusters = compute_prominence(clusters, merge_output=True,
                                  weight_position=.5)

    # Subset location entities (for matching with cities).
    clusters = [c for c in clusters if c["entity_group"] == "LOC"]

    # ``whitelist`` and ``ratio`` are resolved from the enclosing module.
    clusters = match_whitelist(clusters, whitelist=whitelist, scorer=ratio,
                               score_cutoff=95, merge_output=True,
                               aggregate_cluster=True, workers=1)

    t2 = time.time()

    if len(clusters) > 0:
        clusters = pd.DataFrame.from_dict(clusters).sort_values(
            by="prominence_rank")

    print(id)
    print(article.body.tolist()[0])
    print(clusters)

    return t2 - t1
def test_compute_prominence_none():
    """Empty input flows through clustering and scoring as an empty list."""
    no_clusters = fuzzy_cluster([])
    scored = compute_prominence(no_clusters)

    assert isinstance(scored, list)
    assert len(scored) == 0
def test_compute_prominence_single():
    """A single input string results in exactly one scored cluster."""
    lone_cluster = fuzzy_cluster(["Biden"])
    scored = compute_prominence(lone_cluster)

    assert isinstance(scored, list)
    assert len(scored) == 1