def test_compute_prominence_multiple():
    """Prominence scoring over several clustered name variants returns a non-empty list."""
    names = ['biden', 'joe biden', 'donald trump', 'D. Trump']
    result = compute_prominence(fuzzy_cluster(names))
    assert isinstance(result, list)
    assert len(result) > 0
def test_whitelist_formatting():
    """Applying several whitelists and formatting yields a non-empty DataFrame."""
    # simulated NER predictions
    records = [
        {'word': 'Viborg', 'entity_group': 'LOC', 'cluster_id': 'A'},
        {'word': 'Uldum', 'entity_group': 'ORG', 'cluster_id': 'B'},
        {'word': 'Solgårde', 'entity_group': 'LOC', 'cluster_id': 'C'},
    ]
    clustered = fuzzy_cluster(records)

    # apply the module-level Cities (c) and Municipalities (m) whitelists
    matched = apply_whitelists([c, m], clustered, score_cutoff=90)

    # format the output, keeping only the desired columns
    # NOTE(review): 'eblocal_code' looks unusual next to the 'city_code'
    # column used elsewhere in this file — confirm it is intended.
    wanted_columns = ['eblocal_code', 'municipality_code']
    matched = format_output(matched, columns=wanted_columns, drop_duplicates=True)

    assert isinstance(matched, pd.DataFrame)
    assert len(matched) > 0
def test_compute_prominence_weight_multipliers():
    """Per-cluster weight multipliers still produce float prominence scores."""
    clustered = fuzzy_cluster(simulate_ner_data())
    multipliers = np.random.rand(len(clustered))
    clustered = compute_prominence(
        clustered,
        weight_position=0.5,
        weight_multipliers=multipliers,
    )
    frame = pd.DataFrame.from_dict(clustered)
    assert isinstance(frame.prominence_score.tolist()[0], float)
def run_random(articles, entitites, id=None, scorer=partial_token_set_ratio, cutoff=75):
    """Cluster, score and whitelist-match the entities of one article, printing the result.

    Picks a random article when ``id`` is None, clusters its body-placed entity
    predictions, computes prominence, keeps LOC entities, matches them against
    the module-level ``whitelist``, and prints the article body plus the ranked
    matches.

    Parameters
    ----------
    articles : DataFrame-like with columns content_id, title, lead, body.
    entitites : DataFrame-like of entity predictions with columns content_id
        and placement. (Name kept — including its spelling — for backward
        compatibility with existing callers.)
    id : article content_id to run on; a random one is chosen when None.
        (Shadows the ``id`` builtin; kept for interface compatibility.)
    scorer : rapidfuzz scorer passed to fuzzy_cluster.
    cutoff : clustering score cutoff.

    Returns
    -------
    float : wall-clock seconds spent in clustering/prominence/matching.
    """
    # Bug fix: the body previously read an undefined global ``entities``
    # instead of the ``entitites`` parameter it was given.
    entities = entitites
    if id is None:
        id = np.random.choice(articles.content_id.tolist())
    article = articles[articles.content_id == id]
    article = article[['content_id', 'title', 'lead', 'body']]
    article_ents = entities[entities.content_id == id]
    article_ents = article_ents[article_ents.placement == "body"]
    preds = article_ents.to_dict(orient="records")

    t1 = time.time()
    clusters = fuzzy_cluster(preds, scorer=scorer, workers=4,
                             cutoff=cutoff, merge_output=True)
    clusters = compute_prominence(clusters, merge_output=True, weight_position=.5)
    # keep only location entities (for matching with cities)
    clusters = [x for x in clusters if x["entity_group"] == "LOC"]
    # NOTE(review): ``whitelist`` and ``ratio`` are read from module scope —
    # confirm both are defined where this function is used.
    clusters = match_whitelist(clusters, whitelist=whitelist, scorer=ratio,
                               score_cutoff=95, merge_output=True,
                               aggregate_cluster=True, workers=1)
    t2 = time.time()

    if len(clusters) > 0:
        clusters = pd.DataFrame.from_dict(clusters).sort_values(by="prominence_rank")
    print(id)
    print(article.body.tolist()[0])
    print(clusters)
    return t2 - t1
def test_municipalities_whitelist():
    """The module-level Municipalities whitelist matches a municipality name."""
    records = [
        {'word': 'Viborg Kommune', 'entity_group': 'LOC', 'cluster_id': 'ABE'},
        {'word': 'Uldum', 'entity_group': 'ORG', 'cluster_id': 'bambolino'},
    ]
    matched = m(fuzzy_cluster(records), score_cutoff=95)
    assert len(matched) > 0
def test_whitelist_aggregate_cluster():
    """aggregate_cluster=True collapses the match output to one entry."""
    records = [
        {'word': 'Viborg', 'entity_group': 'LOC', 'cluster_id': 'ABE'},
        {'word': 'Uldum', 'entity_group': 'ORG', 'cluster_id': 'bambolino'},
    ]
    result = c(fuzzy_cluster(records), aggregate_cluster=True)
    assert len(result) == 1
def test_whitelist_list_input():
    """match_whitelist accepts a plain list of strings as the whitelist."""
    records = [
        {'word': 'Viborg', 'entity_group': 'LO', 'cluster_id': 'ABE'},
        {'word': 'Uldum', 'entity_group': 'ORG', 'cluster_id': 'bambolino'},
    ]
    hits = match_whitelist(fuzzy_cluster(records), whitelist=['Viborg'])
    assert len(hits) == 1
def test_whitelist_no_match():
    """Entities that do not match the Cities whitelist yield an empty result."""
    records = [
        {'word': 'Viborg', 'entity_group': 'LO', 'cluster_id': 'ABE'},
        {'word': 'Uldum', 'entity_group': 'ORG', 'cluster_id': 'bambolino'},
    ]
    unmatched = c(fuzzy_cluster(records))
    assert len(unmatched) == 0
def test_whitelist_no_input():
    """Empty input passes through clustering and whitelisting as empty."""
    empty_clusters = fuzzy_cluster([])
    assert len(c(empty_clusters)) == 0
# Demo: cluster toy NER data and match it against the geographic whitelists.
test_data = [
    {'word': 'Viborg', 'entity_group': 'LOC', 'cluster_id': 'A'},
    {'word': 'Uldum', 'entity_group': 'ORG', 'cluster_id': 'B'},
    {'word': 'Solgårde', 'entity_group': 'LOC', 'cluster_id': 'C'},
]

# cluster the simulated predictions
clusters = fuzzy_cluster(test_data)

# instantiate the relevant whitelists (also used by the tests above)
c = Cities()
m = Municipalities()
n = Neighborhoods()

# apply all three whitelists at once
out = apply_whitelists([c, m, n], clusters, score_cutoff=90)

# format the output down to the desired columns
cols = ['neighborhood_code', 'city_code', 'municipality_code']
out = format_output(out, columns=cols, drop_duplicates=True)
def test_fuzzy_cluster_none():
    """Clustering an empty input returns an empty list."""
    result = fuzzy_cluster([])
    assert isinstance(result, list)
    assert len(result) == 0
def test_fuzzy_cluster_single():
    """A single string clusters to exactly one entry."""
    result = fuzzy_cluster(["smokie"])
    assert isinstance(result, list)
    assert len(result) == 1
def test_fuzzy_cluster():
    """Clustering several name variants returns a list."""
    names = ['biden', 'joe biden', 'donald trump', 'D. Trump']
    assert isinstance(fuzzy_cluster(names), list)
def test_compute_prominence_none():
    """Prominence over an empty clustering stays an empty list."""
    result = compute_prominence(fuzzy_cluster([]))
    assert isinstance(result, list)
    assert len(result) == 0
def test_compute_prominence_single():
    """Prominence over a single-entity clustering keeps one entry."""
    result = compute_prominence(fuzzy_cluster(["Biden"]))
    assert isinstance(result, list)
    assert len(result) == 1