예제 #1
0
def load_entities_dict():
    """
    Build geo entity configs from the 'geoentities.csv' and 'geoaliases.csv' test data files.

    Both files are looked up in the directory that contains the path returned by
    lexnlp_tests.this_test_data_path().

    Every row of 'geoentities.csv' (columns 'id', 'name', 'priority') becomes one
    entity created with entity_config(); an empty 'priority' is treated as 0.
    Every row of 'geoaliases.csv' (columns 'entity_id', 'alias', 'locale', 'type')
    whose 'entity_id' maps to a loaded entity is attached to that entity via
    add_aliases_to_entity(); rows without a matching entity are skipped.

    :return: dict values view over the loaded entities
    """
    # NOTE(review): this_test_data_path() presumably derives the path from the calling
    # frame, so it is called directly from this function body - confirm before moving
    # the call into a helper.
    data_dir = os.path.dirname(lexnlp_tests.this_test_data_path())
    entities_fn = os.path.join(data_dir, 'geoentities.csv')
    aliases_fn = os.path.join(data_dir, 'geoaliases.csv')

    entities = {}

    # newline='' lets the csv module do its own line-ending handling, as the csv
    # documentation requires for files passed to a reader.
    with open(entities_fn, 'r', encoding='utf8', newline='') as f:
        for row in csv.DictReader(f):
            entities[row['id']] = entity_config(
                row['id'],
                row['name'],
                int(row['priority']) if row['priority'] else 0,
                name_is_alias=True)

    with open(aliases_fn, 'r', encoding='utf8', newline='') as f:
        for row in csv.DictReader(f):
            entity = entities.get(row['entity_id'])
            if not entity:
                # Alias refers to an entity that was not loaded.
                continue
            # The last argument is true for alias types starting with 'iso' and for
            # 'abbreviation'. NOTE(review): presumably an "is abbreviation" flag -
            # verify against add_aliases_to_entity().
            alias_type = row['type']
            add_aliases_to_entity(
                entity, row['alias'], row['locale'],
                alias_type.startswith('iso') or alias_type == 'abbreviation')
    return entities.values()
예제 #2
0
def test_courts_longest_match():
    """
    Test courts whose names/aliases are substrings of one another.

    When several aliases conflict on the same match, the court with the longest
    alias should be returned for that match. If the court with the shorter alias
    is also matched elsewhere, outside of the conflict, both courts should be
    returned.

    Court configs are read from 'us_courts.csv' (columns 'Court ID', 'Court Type',
    'Court Name', 'Alias') located in the directory that contains the path returned
    by lexnlp_tests.this_test_data_path().
    """
    courts_config_fn = os.path.join(
        os.path.dirname(lexnlp_tests.this_test_data_path()), 'us_courts.csv')
    courts_config_list = []
    # newline='' lets the csv module do its own line-ending handling, as the csv
    # documentation requires for files passed to a reader.
    with open(courts_config_fn, 'r', encoding='utf8', newline='') as f:
        for row in csv.DictReader(f):
            # 'Alias' holds a ';'-separated list; an empty cell means no aliases.
            aliases = row['Alias'].split(';') if row['Alias'] else []
            # The entity name packs type and name as 'type|name'; it is split back
            # into a tuple by actual_data_converter below.
            cc = entity_config(row['Court ID'],
                               row['Court Type'] + '|' + row['Court Name'],
                               0,
                               aliases,
                               name_is_alias=False)
            # The packed name is not used as an alias (name_is_alias=False), so the
            # plain court name is added as an alias explicitly.
            add_alias_to_entity(cc, row['Court Name'])

            courts_config_list.append(cc)
    lexnlp_tests.test_extraction_func_on_test_data(
        get_courts,
        court_config_list=courts_config_list,
        actual_data_converter=lambda actual:
        [tuple(c[0][1].split('|')) for c in actual],
        debug_print=True)
예제 #3
0
def load_entities_dict():
    """
    Load geo entities from the 'geoentities.csv' and 'geoaliases.csv' test data files.

    Both files are looked up in the directory that contains the path returned by
    lexnlp_tests.this_test_data_path(); the actual loading is delegated to
    load_entities_dict_by_path().

    :return: whatever load_entities_dict_by_path() returns for the two paths
    """
    # NOTE(review): this_test_data_path() presumably derives the path from the calling
    # frame, so it is called directly from this function body - confirm before moving
    # the call into a helper.
    data_dir = os.path.dirname(lexnlp_tests.this_test_data_path())
    return load_entities_dict_by_path(
        os.path.join(data_dir, 'geoentities.csv'),
        os.path.join(data_dir, 'geoaliases.csv'))