示例#1
0
def test_compute_prominence_multiple():
    """Prominence computed on several clustered names is a non-empty list."""
    names = ['biden', 'joe biden', 'donald trump', 'D. Trump']

    # cluster first, then score the resulting clusters
    scored = compute_prominence(fuzzy_cluster(names))

    assert isinstance(scored, list)
    assert len(scored) > 0
示例#2
0
def test_whitelist_formatting():
    """Formatted output of several whitelists is a non-empty DataFrame."""
    # simulated entities as (word, entity_group, cluster_id) rows
    rows = (
        ('Viborg', 'LOC', 'A'),
        ('Uldum', 'ORG', 'B'),
        ('Solgårde', 'LOC', 'C'),
    )
    records = [
        {'word': word, 'entity_group': group, 'cluster_id': cluster_id}
        for word, group, cluster_id in rows
    ]
    grouped = fuzzy_cluster(records)

    # run the clusters through both whitelists in one call
    matched = apply_whitelists([c, m], grouped, score_cutoff=90)

    # keep only the code columns and drop duplicated rows
    wanted_columns = ['eblocal_code', 'municipality_code']
    formatted = format_output(matched,
                              columns=wanted_columns,
                              drop_duplicates=True)

    assert isinstance(formatted, pd.DataFrame)
    assert len(formatted) > 0
示例#3
0
def test_compute_prominence_weight_multipliers():
    """Prominence scores stay floats when per-cluster multipliers are given."""
    grouped = fuzzy_cluster(simulate_ner_data())

    # one random multiplier per element of the clustering result
    multipliers = np.random.rand(len(grouped))
    scored = compute_prominence(grouped,
                                weight_position=0.5,
                                weight_multipliers=multipliers)

    frame = pd.DataFrame.from_dict(scored)
    first_score = frame.prominence_score.tolist()[0]
    assert isinstance(first_score, float)
示例#4
0
def run_random(articles,
               entitites,
               id=None,
               scorer=partial_token_set_ratio,
               cutoff=75):
    """Run the cluster/prominence/whitelist pipeline on one article and time it.

    The article's body entities are clustered, scored for prominence,
    reduced to ``LOC`` entities and matched against the whitelist. The
    article id, its body text and the resulting clusters are printed.

    Args:
        articles: DataFrame with ``content_id``, ``title``, ``lead`` and
            ``body`` columns.
        entitites: DataFrame of entity predictions with ``content_id`` and
            ``placement`` columns. The misspelled parameter name is kept on
            purpose so existing keyword callers keep working.
        id: ``content_id`` of the article to process. When ``None`` a random
            id is drawn from ``articles.content_id``.
        scorer: Scorer forwarded to ``fuzzy_cluster``.
        cutoff: Cutoff forwarded to ``fuzzy_cluster``.

    Returns:
        Elapsed time in seconds spent on clustering, prominence computation
        and whitelist matching (printing is not included).
    """
    if id is None:
        id = np.random.choice(articles.content_id.tolist())

    article = articles[articles.content_id == id]
    article = article[['content_id', 'title', 'lead', 'body']]

    # Filter the caller-supplied frame instead of relying on a module-level
    # `entities` name that this function never declared.
    article_ents = entitites[entitites.content_id == id]
    article_ents = article_ents[article_ents.placement == "body"]
    preds = article_ents.to_dict(orient="records")

    # perf_counter is monotonic, so the interval cannot be skewed by
    # system clock adjustments.
    t1 = time.perf_counter()

    clusters = fuzzy_cluster(preds,
                             scorer=scorer,
                             workers=4,
                             cutoff=cutoff,
                             merge_output=True)

    clusters = compute_prominence(clusters,
                                  merge_output=True,
                                  weight_position=.5)

    # subset location entities (for matching with cities)
    clusters = [x for x in clusters if x["entity_group"] == "LOC"]

    # NOTE(review): `whitelist` is not a parameter of this function; it is
    # expected to exist at module level - confirm where it is defined.
    clusters = match_whitelist(clusters,
                               whitelist=whitelist,
                               scorer=ratio,
                               score_cutoff=95,
                               merge_output=True,
                               aggregate_cluster=True,
                               workers=1)

    t2 = time.perf_counter()

    # Only build/sort a frame when something matched; an empty result is
    # printed as returned by match_whitelist.
    if len(clusters) > 0:
        clusters = pd.DataFrame.from_dict(clusters).sort_values(
            by="prominence_rank")

    print(id)
    print(article.body.tolist()[0])
    print(clusters)

    return t2 - t1
示例#5
0
def test_municipalities_whitelist():
    """The municipalities whitelist returns at least one match."""
    records = [
        dict(word='Viborg Kommune', entity_group='LOC', cluster_id='ABE'),
        dict(word='Uldum', entity_group='ORG', cluster_id='bambolino'),
    ]
    grouped = fuzzy_cluster(records)

    matches = m(grouped, score_cutoff=95)

    assert len(matches) > 0
示例#6
0
def test_whitelist_aggregate_cluster():
    """Whitelist `c` with cluster aggregation returns exactly one result."""
    records = [
        dict(word='Viborg', entity_group='LOC', cluster_id='ABE'),
        dict(word='Uldum', entity_group='ORG', cluster_id='bambolino'),
    ]
    grouped = fuzzy_cluster(records)

    matches = c(grouped, aggregate_cluster=True)

    assert len(matches) == 1
示例#7
0
def test_whitelist_list_input():
    """`match_whitelist` accepts a plain list of strings as the whitelist."""
    records = [
        dict(word='Viborg', entity_group='LO', cluster_id='ABE'),
        dict(word='Uldum', entity_group='ORG', cluster_id='bambolino'),
    ]
    grouped = fuzzy_cluster(records)

    found = match_whitelist(grouped, whitelist=['Viborg'])

    assert len(found) == 1
示例#8
0
def test_whitelist_no_match():
    """Whitelist `c` returns nothing for these 'LO'/'ORG' entities."""
    records = [
        dict(word='Viborg', entity_group='LO', cluster_id='ABE'),
        dict(word='Uldum', entity_group='ORG', cluster_id='bambolino'),
    ]
    grouped = fuzzy_cluster(records)

    matches = c(grouped)

    assert len(matches) == 0
示例#9
0
def test_whitelist_no_input():
    """Whitelist `c` returns an empty result for clusters of empty input."""
    grouped = fuzzy_cluster([])

    matches = c(grouped)

    assert len(matches) == 0
示例#10
0
# Example script: cluster three simulated entities, apply the city,
# municipality and neighborhood whitelists, then format the matches.

# simulated entity records: word, entity group and cluster id
test_data = [{
    'word': 'Viborg',
    'entity_group': 'LOC',
    'cluster_id': 'A'
}, {
    'word': 'Uldum',
    'entity_group': 'ORG',
    'cluster_id': 'B'
}, {
    'word': 'Solgårde',
    'entity_group': 'LOC',
    'cluster_id': 'C'
}]

# cluster the simulated records
clusters = fuzzy_cluster(test_data)

# instantiate the whitelists to apply
c = Cities()
m = Municipalities()
n = Neighborhoods()

# apply all three whitelists to the clusters in a single call
out = apply_whitelists([c, m, n], clusters, score_cutoff=90)

# Format output
# columns requested from format_output
cols = ['neighborhood_code', 'city_code', 'municipality_code']

# restrict the result to `cols` and drop duplicates
out = format_output(out, columns=cols, drop_duplicates=True)
示例#11
0
def test_fuzzy_cluster_none():
    """Clustering an empty list gives back an empty list."""
    result = fuzzy_cluster([])

    assert len(result) == 0
    assert isinstance(result, list)
示例#12
0
def test_fuzzy_cluster_single():
    """Clustering one string gives back a list with a single element."""
    single_input = ["smokie"]
    result = fuzzy_cluster(single_input)

    assert len(result) == 1
    assert isinstance(result, list)
示例#13
0
def test_fuzzy_cluster():
    """Clustering a list of name strings returns a list."""
    names = ['biden', 'joe biden', 'donald trump', 'D. Trump']

    result = fuzzy_cluster(names)

    assert isinstance(result, list)
示例#14
0
def test_compute_prominence_none():
    """Prominence computed on clusters of empty input is an empty list."""
    scored = compute_prominence(fuzzy_cluster([]))

    assert isinstance(scored, list)
    assert len(scored) == 0
示例#15
0
def test_compute_prominence_single():
    """Prominence computed for a single clustered name has one entry."""
    single_input = ["Biden"]
    scored = compute_prominence(fuzzy_cluster(single_input))

    assert isinstance(scored, list)
    assert len(scored) == 1