Exemplo n.º 1
0
def main(data, output, index, col_headers, prep, threshold):
    """Console script for disamby.

    Reads a CSV file, builds a ``Disamby`` instance over the requested
    columns with the requested preprocessors, and dumps the resulting
    components to ``output`` as a JSON object.

    Args:
        data: CSV source handed to ``pd.read_csv``.
        output: Path of the JSON file to write.
        index: Column used as the DataFrame index (``index_col``).
        col_headers: Comma-separated names of the columns to use.
        prep: String of one-character preprocessing codes; the pipeline is
            built in the order the codes appear. Valid codes are ``A``
            (compact abbreviations), ``P`` (remove punctuation), ``W``
            (normalize whitespace), ``3``/``4``/``5`` (n-grams of that
            size), ``S`` (split words) and ``X`` (4-grams of the first 33
            characters plus the split words).
        threshold: First argument forwarded to
            ``Disamby.disambiguated_sets``.

    Raises:
        KeyError: If ``prep`` contains a code that is not listed above.
    """
    names_df = pd.read_csv(data, index_col=index)
    prep_dict = {
        'A': pre.compact_abbreviations,
        'P': pre.remove_punctuation,
        'W': pre.normalize_whitespace,
        '3': lambda x: pre.ngram(x, 3),
        '4': lambda x: pre.ngram(x, 4),
        '5': lambda x: pre.ngram(x, 5),
        'S': pre.split_words,
        'X': lambda x: pre.ngram(x[:33], 4) + pre.split_words(x)
    }
    columns = col_headers.split(',')

    # Report every unsupported code at once, together with the valid ones,
    # instead of failing with a bare ``KeyError: 'Z'``.
    unknown = [action for action in prep if action not in prep_dict]
    if unknown:
        raise KeyError(
            'unknown preprocessing code(s) {}; valid codes are {}'.format(
                ', '.join(repr(action) for action in unknown),
                ', '.join(sorted(prep_dict))))
    pipeline = [prep_dict[action] for action in prep]

    dis = Disamby(data=names_df[columns], preprocessors=pipeline)
    components = dis.disambiguated_sets(threshold,
                                        smoother='offset',
                                        offset=100)

    comp_to_id = {}
    for comp in components:
        members = list(comp)
        # The representative is simply the first member in iteration order.
        representative = members[0]
        name = names_df.loc[representative, columns[0]]
        # NOTE(review): two components whose representatives share the same
        # name overwrite one another here -- confirm this is acceptable.
        comp_to_id[name] = members

    with open(output, 'w') as f:
        json.dump(comp_to_id, f)
Exemplo n.º 2
0
import disamby.preprocessors as prep
from disamby import Disamby
import pytest


# Preprocessor callables handed to ``Disamby`` by the benchmarks below.
pipeline = [
    prep.normalize_whitespace,
    prep.remove_punctuation,
    prep.compact_abbreviations,
    lambda x: prep.ngram(x, 4),
]


@pytest.mark.parametrize('size', [20, 1000, 2000])
def test_fitting(size, company_df, benchmark):
    """Benchmark constructing ``Disamby`` from ``company_df(size)``."""
    companies = company_df(size)
    benchmark(Disamby, companies, pipeline)


@pytest.mark.parametrize('size', [20, 1000, 2000])
def test_sparse_find(size, company_df, benchmark):
    """Benchmark ``Disamby.find`` and check the best score is about 1.

    The lookup uses the first key of ``records['name']`` with a threshold
    of 0.7; the highest score among the returned results must be
    approximately 1.
    """
    fitted = Disamby(company_df(size), pipeline)
    name_keys = list(fitted.records['name'].keys())
    first_key = name_keys[0]
    matches = benchmark(fitted.find, first_key, .7)
    best_score = max(match.score for match in matches)
    assert best_score == pytest.approx(1)

Exemplo n.º 3
0
def test_combined_preprocessors(raw, expected):
    """Chain whitespace normalization, abbreviation compaction and 4-grams.

    The output of each preprocessor feeds the next; the final n-gram
    result (n = 4) must equal ``expected``.
    """
    compacted = compact_abbreviations(normalize_whitespace(raw))
    fourgrams = ngram(compacted, 4)
    assert fourgrams == expected
Exemplo n.º 4
0
def test_ngram(raw, n, expected):
    """Check ``ngram(raw, n)`` against ``expected``.

    When ``expected`` is the ``ValueError`` class itself, the call must
    raise it; otherwise the return value must equal ``expected``.
    """
    if expected is not ValueError:
        assert ngram(raw, n) == expected
        return

    with pytest.raises(ValueError):
        ngram(raw, n)
Exemplo n.º 5
0
#!/usr/bin/env python
# -*- coding: utf-8 -*-
"""Tests for `disamby` package."""
import pytest
import pandas as pd
from disamby import Disamby
import disamby.preprocessors as prep

# Preprocessor callables passed to ``Disamby.fit`` by the fixture below.
pipeline = [
    prep.normalize_whitespace,
    prep.remove_punctuation,
    prep.compact_abbreviations,
    lambda x: prep.ngram(x, 4),
]


@pytest.fixture
def disamby_fitted_instance(fake_names):
    """Return a ``Disamby`` fitted on a Series of 90 fake names.

    ``fit`` receives the Series, the module-level ``pipeline`` and
    ``'streets'`` as its third argument.
    """
    street_series = pd.Series(fake_names(90))
    fitted = Disamby()
    fitted.fit(street_series, pipeline, 'streets')
    return fitted


def test_frequency_counter(disamby_fitted_instance):
    """The fitted instance lists 'streets' and its top count is >= 1."""
    fitted = disamby_fitted_instance
    assert 'streets' in fitted.fields
    top_entry = fitted.field_freq['streets'].most_common(1)[0]
    # Index 1 of the most-common entry is compared against 1.
    assert top_entry[1] >= 1


def test_identification_potential(disamby_fitted_instance):