def test_equal_aliases_in_dif_languages(self):
    """Entities sharing the same alias text ('MS') in one language must
    both be reported: alias texts are not required to be unique."""
    mississippi = DictionaryEntry(
        1, 'Mississippi',
        aliases=[DictionaryEntryAlias('MS', is_abbreviation=True, language='en'),
                 DictionaryEntryAlias('Mississippi', language='de'),
                 DictionaryEntryAlias('Mississippi', language='en')])
    montserrat = DictionaryEntry(
        2, 'Montserrat',
        aliases=[DictionaryEntryAlias('MS', is_abbreviation=True, language='en'),
                 DictionaryEntryAlias('Montserrat', language='de'),
                 DictionaryEntryAlias('Montserrat', language='en')])
    canada = DictionaryEntry(
        3, 'Canada',
        aliases=[DictionaryEntryAlias('CAN', is_abbreviation=True, language='en'),
                 DictionaryEntryAlias('Kanada', language='de'),
                 DictionaryEntryAlias('Canada', language='en')])

    text = '"MS" here can mean either "MMMississippi" or "MMMontserrat" because ' \
           'they have equal aliases in English. ' \
           'This test is here because in one version of the code alias texts were required to be unique. ' \
           '"CCCanada" (can) should not be detected because word "can" is in lowercase here.'

    parsed_entities = list(find_dict_entities(
        text, default_language=LANG_EN.code,
        all_possible_entities=[mississippi, montserrat, canada],
        text_languages=['en']))
    # One 'MS' match per entity that owns the alias.
    self.assertEqual(2, len(parsed_entities))
    for parsed in parsed_entities:
        _entity, alias = parsed.entity
        self.assertEqual('MS', alias.alias)
    def test_abbreviations_simple(self):
        """Abbreviation detection: "IT's" is matched (the tokenizer strips
        the "'s"), while lowercase "is" is not treated as the "IS"
        abbreviation. Also checks the simplified-normalization path agrees
        with the full one.
        """
        some_entity = DictionaryEntry(1, 'ITAbbrev', aliases=[DictionaryEntryAlias('IT', is_abbreviation=True)])
        some_entity1 = DictionaryEntry(2, 'ISAbbrev', aliases=[DictionaryEntryAlias('IS', is_abbreviation=True)])
        entities = [some_entity, some_entity1]

        text = '"IT\'s" entity should be detected even with "\'s" because ' \
               'tokenizer takes care of this kind of things. ' \
               '"ISS" entity should not be detected - bacause "is" word' \
               ' is in lowercase here and probably does not mean an abbreviation.'

        # NOTE(review): text_languages=['ge'] looks unusual (German is
        # normally 'de') - confirm this code is intentional.
        parsed_enitities = list(find_dict_entities(
            text, default_language=LANG_EN.code,
            all_possible_entities=entities, text_languages=['ge'],
            simplified_normalization=False))
        self.assertEqual(1, len(parsed_enitities))
        _ent, alias = parsed_enitities[0].entity
        self.assertEqual('IT', alias.alias)

        simply_parsed_enitities = list(find_dict_entities(
            text, default_language=LANG_EN.code,
            all_possible_entities=entities, text_languages=['ge'],
            simplified_normalization=True))
        self.assertEqual(len(parsed_enitities), len(simply_parsed_enitities))
        # Bug fix: read from the SIMPLIFIED result. The original unpacked
        # parsed_enitities[0] again, so the assertion below compared the
        # alias with itself and could never fail.
        _ent, simply_alias = simply_parsed_enitities[0].entity
        self.assertEqual(alias.alias, simply_alias.alias)
    def test_plural_case_matching(self):
        """Stemming maps plural surface forms back to singular aliases."""
        entities = [
            DictionaryEntry(
                1,
                'Table',
                aliases=[DictionaryEntryAlias('tbl.', is_abbreviation=True)],
                name_is_alias=True),
            DictionaryEntry(2, 'man', name_is_alias=True),
            DictionaryEntry(3, 'masloboyka', name_is_alias=True),
        ]

        text = 'We should detect the singular number of word "tables" here - the stemmer takes care of plural case. ' \
               'Unfortunately our stemmer is not able to convert word "men" to singular number yet :(. ' \
               'But it works for word "masloboykas" - a non existing word in English in plural case.'

        parsed_entities = list(find_dict_entities(
            text,
            all_possible_entities=entities,
            use_stemmer=True))
        # "tables" and "masloboykas" are found; "men" does not stem to "man".
        self.assertEqual(2, len(parsed_entities))

        _entity, first_alias = parsed_entities[0].entity
        self.assertEqual('Table', first_alias.alias)
        _entity, second_alias = parsed_entities[1].entity
        self.assertEqual('masloboyka', second_alias.alias)
    def test_am_pm_none_simplified_modes(self):
        """AM/PM abbreviations adjacent to clock times must not match, in
        both normalization modes.

        Renamed from ``test_am_pm_none``: a second method with that exact
        name is defined later in this class, so under the original name this
        test was shadowed and never executed by the test runner.
        """
        for parse_mode in (False, True):  # simplified_normalization off / on
            am = DictionaryEntry(1, 'America',
                                 aliases=[DictionaryEntryAlias('AM', is_abbreviation=True)],
                                 name_is_alias=False)
            pm = DictionaryEntry(2, 'Postmodernism',
                                 aliases=[DictionaryEntryAlias('PM', is_abbreviation=True)],
                                 name_is_alias=False)
            entities = [am, pm]

            # Clock times only: nothing should match.
            ents = list(find_dict_entities(
                'It is 11:00 AM or 11:00 PM now.',
                default_language=LANG_EN.code,
                all_possible_entities=entities, simplified_normalization=parse_mode))
            self.assertEqual(0, len(ents))

            # "(AM)" outside a time expression is a genuine abbreviation match.
            ents = list(find_dict_entities('It is 11:00am now in (AM). Hello!',
                                           default_language=LANG_EN.code,
                                           all_possible_entities=entities,
                                           simplified_normalization=parse_mode))
            self.assertEqual(1, len(ents))
            self.assertEqual('America', ents[0].entity[0].name)

            # Lowercase "11:00am" alone: still no match.
            ents = list(find_dict_entities('It is 11:00am now.',
                                           default_language=LANG_EN.code,
                                           all_possible_entities=entities,
                                           simplified_normalization=parse_mode))
            self.assertEqual(0, len(ents))
    def test_am_pm_none(self):
        """AM/PM abbreviations next to clock times must not be picked up."""
        america = DictionaryEntry(
            1,
            'America',
            aliases=[DictionaryEntryAlias('AM', is_abbreviation=True)],
            name_is_alias=False)
        postmodernism = DictionaryEntry(
            2,
            'Postmodernism',
            aliases=[DictionaryEntryAlias('PM', is_abbreviation=True)],
            name_is_alias=False)
        entities = [america, postmodernism]

        # Clock times only: no matches expected.
        found = list(
            find_dict_entities('It is 11:00 AM or 11:00 PM now.',
                               all_possible_entities=entities))
        self.assertEqual(0, len(found))

        # Parenthesized "(AM)" is a real abbreviation match.
        found = list(
            find_dict_entities('It is 11:00am now in (AM). Hello!',
                               all_possible_entities=entities))
        self.assertEqual(1, len(found))
        self.assertEqual('America', found[0].entity[0].name)

        # Lowercase "11:00am" alone: no match.
        found = list(
            find_dict_entities('It is 11:00am now.',
                               all_possible_entities=entities))
        self.assertEqual(0, len(found))
    def test_conflicts_take_longest_match(self):
        """Overlapping matches are resolved in favor of the longest one."""
        entities = [
            DictionaryEntry(1, 'Some Entity', aliases=[DictionaryEntryAlias('Something')]),
            DictionaryEntry(2, 'Some Entity One', aliases=[DictionaryEntryAlias('Something One')]),
            DictionaryEntry(3, 'Some Entity Two', aliases=[DictionaryEntryAlias('Something Two')]),
        ]

        text = '"Some Entity One" should be found in this text and "Someee Entityyy" should be ignored.'

        parsed_entities = list(find_dict_entities(
            text,
            all_possible_entities=entities,
            default_language=LANG_EN.code))
        # Only the longest match survives the conflict resolution.
        self.assertEqual(1, len(parsed_entities))
        _entity, alias = parsed_entities[0].entity
        self.assertEqual('Some Entity One', alias.alias)
    def test_conflicts_equal_length_take_same_language(self):
        """A longer alias in the wrong language loses to a shorter one that
        matches the requested text language."""
        entities = [
            DictionaryEntry(1, 'Some Entity', aliases=[DictionaryEntryAlias('Something')]),
            DictionaryEntry(2, 'Some Entity1',
                            aliases=[DictionaryEntryAlias('Some Entity One', language='fr')]),
            DictionaryEntry(3, 'Some Entity2', aliases=[DictionaryEntryAlias('Something Two')]),
        ]

        text = '"Some Entity One" should not be found in this text because it is not in German language.' \
               'Shorter match - "Someeee Entityyy" should be taken instead.'

        parsed_entities = list(find_dict_entities(
            text, all_possible_entities=entities, text_languages=['de'],
            default_language=LANG_EN.code))
        self.assertEqual(1, len(parsed_entities))
        _entity, alias = parsed_entities[0].entity
        self.assertEqual('Some Entity', alias.alias)
    def test_find_dict_entities_empty_text(self):
        """An empty input text yields no entity matches at all."""
        america = DictionaryEntry(
            1, 'America',
            aliases=[DictionaryEntryAlias('AM', is_abbreviation=True)],
            name_is_alias=False)

        # NOTE: the entity list is passed positionally here, unlike the
        # keyword style used in the other tests.
        found = list(find_dict_entities('', [america], default_language=LANG_EN.code))
        self.assertFalse(found)
def make_geoconfig():
    """Load the geo-entity dictionary used by the geo-entity tests.

    Reads the entity and alias CSV fixtures shipped under ``test_data`` and
    returns them as a list of ``DictionaryEntry`` objects.
    """
    dir_path = os.path.dirname(os.path.realpath(__file__))
    # Use os.path.join instead of string concatenation with hard-coded '/'
    # separators so the fixture path is built portably.
    ge_path = os.path.join(
        dir_path, '..', '..', '..', '..',
        'test_data', 'lexnlp', 'extract', 'en', 'tests', 'test_geoentities')
    entities_fn = os.path.join(ge_path, 'geoentities.csv')
    aliases_fn = os.path.join(ge_path, 'geoaliases.csv')
    return list(
        DictionaryEntry.load_entities_from_files(entities_fn, aliases_fn))
예제 #10
0
def make_records_typed(_apps, _schema_editor):
    """Data-migration helper: convert legacy cache records into
    ``DictionaryEntry`` objects and write them back to the DB cache.

    :param _apps: unused (Django data-migration signature).
    :param _schema_editor: unused (Django data-migration signature).
    """
    for cache_key in CACHE_KEYS:
        typed = []  # type: List[DictionaryEntry]
        records = DbCache.get(cache_key)
        for record in records:
            # Records already migrated to DictionaryEntry are kept as-is.
            if record.__class__.__name__ == 'DictionaryEntry':
                typed.append(record)
                continue
            try:
                # Legacy record layout appears to be
                # (id, name, priority, [(alias, lang, is_abbr, alias_id, norm_als), ...])
                # -- TODO confirm against the code that originally wrote the cache.
                aliases = [
                    DictionaryEntryAlias(alias, lang, is_abbr, alias_id,
                                         norm_als)
                    for alias, lang, is_abbr, alias_id, norm_als in record[3]
                ]
                rec = DictionaryEntry(record[0],
                                      record[1],
                                      priority=record[2],
                                      aliases=aliases)
                typed.append(rec)
            except Exception as e:
                # Best-effort migration: malformed records are reported and skipped.
                print(
                    f'Unable to cast a record in "{cache_key}" to DictionaryEntry: {e}'
                )

        DbCache.put_to_db(cache_key, typed)
    # Stop the cache watcher thread so the migration process can exit cleanly.
    if DbCache.INSTANCE:
        DbCache.INSTANCE.stop_watching()
예제 #11
0
def load_entities_dict_by_path(entities_fn: str, aliases_fn: str):
    """Read dictionary entities plus their aliases from two CSV files.

    :param entities_fn: CSV with at least ``id``, ``name``, ``priority`` columns.
    :param aliases_fn: CSV with ``entity_id``, ``alias``, ``locale``, ``type`` columns.
    :return: the loaded ``DictionaryEntry`` values.
    """
    import csv

    entities = {}
    with open(entities_fn, 'r', encoding='utf8') as entities_file:
        for row in csv.DictReader(entities_file):
            entities[row['id']] = DictionaryEntry(
                id=int(row['id']),
                name=row['name'],
                priority=int(row['priority']) if row['priority'] else 0,
                name_is_alias=True)

    with open(aliases_fn, 'r', encoding='utf8') as aliases_file:
        for row in csv.DictReader(aliases_file):
            entity = entities.get(row['entity_id'])  # type: DictionaryEntry
            if not entity:
                # Alias rows pointing at unknown entities are skipped.
                continue
            is_abbr = row['type'].startswith('iso') or row['type'] == 'abbreviation'
            entity.aliases.append(DictionaryEntryAlias(
                alias=row['alias'],
                language=row['locale'],
                is_abbreviation=is_abbr))

    return entities.values()
예제 #12
0
    def en_parsers_speed(self):
        """Time every English parser on one long document and record the
        results in a name -> seconds mapping via ``self.check_time``."""
        file_path = os.path.join(lexnlp_test_path, 'long_parsed_text.txt')
        with codecs.open(file_path, 'r', encoding='utf-8') as fr:
            text = fr.read()

        ge_path = os.path.join(lexnlp_test_path, 'lexnlp/extract/en/tests/test_geoentities/')
        entities_fn = ge_path + 'geoentities.csv'
        aliases_fn = ge_path + 'geoaliases.csv'
        geo_config = list(DictionaryEntry.load_entities_from_files(entities_fn, aliases_fn))

        times = {}  # type: Dict[str, float]
        # Table of (reported name, parser callable); order matters only for
        # reporting and mirrors the original explicit call sequence.
        named_parsers = [
            ('get_amounts', get_amounts),
            ('get_acts', get_acts),
            ('get_citations', get_citations),
            ('get_conditions', get_conditions),
            ('get_constraints', get_constraints),
            ('get_copyright', get_copyright),
            ('get_courts', _get_courts),
            ('get_cusip', get_cusip),
            ('get_dates', get_dates),
            ('get_definitions', get_definitions),
            ('get_distances', get_distances),
            ('get_durations', get_durations),
            ('get_geoentities', lambda s: get_geoentities(s, geo_config)),
            ('get_money', get_money),
            ('get_percents', get_percents),
            ('get_pii', get_pii),
            ('get_ratios', get_ratios),
            ('get_regulations', get_regulations),
            ('get_trademarks', get_trademarks),
            ('get_urls', get_urls),
        ]
        for name, parser in named_parsers:
            # Default arg binds the current parser (avoids the late-binding
            # closure pitfall); list() forces full evaluation while timing.
            self.check_time(text, lambda s, p=parser: list(p(s)), name, times)

        self.assertTrue('get_amounts' in times)
예제 #13
0
def test_courts_longest_match():
    """
    Tests the case when there are courts having names/aliases being one a substring of another.
    In such a case the court with the longest alias should be returned for each conflicting match;
    when the shorter-alias court also matches independently inside the conflict, both are returned.
    :return:
    """
    courts_config_fn = os.path.join(os.path.dirname(lexnlp_tests.this_test_data_path()), 'us_courts.csv')
    courts_config_list = []
    with open(courts_config_fn, 'r', encoding='utf8') as f:
        for row in csv.DictReader(f):
            alias_cell = row['Alias']
            aliases = ([DictionaryEntryAlias(part) for part in alias_cell.split(';')]
                       if alias_cell else [])
            court = DictionaryEntry(id=int(row['Court ID']),
                                    name=row['Court Type'] + '|' + row['Court Name'],
                                    priority=0,
                                    name_is_alias=False,
                                    aliases=aliases)
            # The full court name is always matchable, alias column or not.
            court.aliases.append(DictionaryEntryAlias(row['Court Name']))
            courts_config_list.append(court)

    lexnlp_tests.test_extraction_func_on_test_data(
        get_courts,
        court_config_list=courts_config_list,
        actual_data_converter=lambda actual: [tuple(c[0].name.split('|')) for c in actual],
        debug_print=True)
예제 #14
0
def get_geoentity_annotations_custom_settings(
    text: str,
    config: pd.DataFrame,
    alias_columns: Optional[List[DictionaryEntryAlias]] = None,
    priority_sort_column: Optional[str] = 'Entity Priority',
    conflict_resolving_field: str = 'none',
    priority_direction: str = 'asc',
    text_languages: List[str] = None,
    min_alias_len: Optional[int] = None,
    prepared_alias_ban_list: Optional[Dict[str, Tuple[List[str],
                                                      List[str]]]] = None,
    simplified_normalization: bool = False,
    local_name_column: Optional[str] = None,
    extra_columns: Optional[Dict[str, str]] = None
) -> Generator[GeoAnnotation, None, None]:
    """Yield geo-entity annotations for *text* using a caller-tuned dictionary.

    Dictionary entries are first loaded from the *config* dataframe (German
    locale code) and then handed to ``get_geoentity_annotations`` together
    with the conflict-resolution / normalization settings.
    """
    load_kwargs = dict(
        alias_columns=alias_columns,
        priority_column=priority_sort_column,
        local_name_column=local_name_column,
        extra_columns=extra_columns)
    entries = DictionaryEntry.load_entities_from_single_df(
        config, LANG_DE.code, **load_kwargs)

    annotate_kwargs = dict(
        conflict_resolving_field=conflict_resolving_field,
        priority_direction=priority_direction,
        text_languages=text_languages,
        min_alias_len=min_alias_len,
        prepared_alias_ban_list=prepared_alias_ban_list,
        simplified_normalization=simplified_normalization)
    yield from get_geoentity_annotations(text, entries, **annotate_kwargs)
예제 #15
0
def build_dictionary_entry(row):
    """Build a ``DictionaryEntry`` from one court CSV/dataframe row.

    The 'Alias' cell may hold several aliases separated by ';' or be NaN.
    """
    alias_cell = row['Alias']
    if pandas.isnull(alias_cell):
        aliases = []
    else:
        aliases = [DictionaryEntryAlias(part) for part in alias_cell.split(';')]
    return DictionaryEntry(int(row['Court ID']),
                           row['Court Name'],
                           0,
                           aliases=aliases)
예제 #16
0
    def test_common_search_all_languages(self):
        """With no language restriction the entity is found by its name."""
        some_entity = DictionaryEntry(
            1, 'Some Entity', aliases=[DictionaryEntryAlias('Something')])
        text = 'Some Entity should be found in this text.'

        found = list(
            find_dict_entities(text, all_possible_entities=[some_entity]))
        self.assertEqual(1, len(found))
        _entity, alias = found[0].entity
        self.assertEqual('Some Entity', alias.alias)
예제 #17
0
    def test_alias_punktuation(self):
        """Abbreviation aliases containing dots and internal spaces still
        match their punctuated occurrence in text."""
        kaban = DictionaryEntry(1, 'Kaban',
                                aliases=[DictionaryEntryAlias('K.A.B.A. N.', is_abbreviation=True)],
                                name_is_alias=False)
        text = 'Can we catch some K.A.B.A.N.s?'

        parsed_entities = list(find_dict_entities(
            text,
            default_language=LANG_EN.code,
            all_possible_entities=[kaban], use_stemmer=True,
            simplified_normalization=False))
        self.assertEqual(1, len(parsed_entities))

        _entity, alias = parsed_entities[0].entity
        self.assertEqual('K.A.B.A. N.', alias.alias)
예제 #18
0
 def load_en_courts(self):
     """Download the US courts dictionary (LexPredict legal dictionary
     release 1.0.2) and convert each row to a ``DictionaryEntry``."""
     court_df = pandas \
         .read_csv(
         "https://raw.githubusercontent.com/LexPredict/lexpredict-legal-dictionary/1.0.2/en/legal/us_courts"
         ".csv")
     court_config_list = []
     for _, row in court_df.iterrows():
         alias_cell = row['Alias']
         if pandas.isnull(alias_cell):
             aliases = []
         else:
             # ';'-separated list of alternative court names.
             aliases = [DictionaryEntryAlias(part) for part in alias_cell.split(';')]
         court_config_list.append(
             DictionaryEntry(id=int(row['Court ID']),
                             name=row['Court Name'],
                             priority=0,
                             name_is_alias=True,
                             aliases=aliases))
     return court_config_list
예제 #19
0
 def build_courts_config(self) -> List[DictionaryEntry]:
     """Load the US courts test dictionary from the repository fixture CSV."""
     courts_config_fn = os.path.join(
         DIR_ROOT,
         'test_data/lexnlp/extract/en/tests/test_courts/us_courts.csv')
     courts_config_list = []
     with open(courts_config_fn, 'r', encoding='utf8') as f:
         for row in csv.DictReader(f):
             alias_cell = row['Alias']
             aliases = ([DictionaryEntryAlias(part)
                         for part in alias_cell.split(';')]
                        if alias_cell else [])
             court = DictionaryEntry(
                 id=int(row['Court ID']),
                 name=row['Court Type'] + '|' + row['Court Name'],
                 priority=0,
                 name_is_alias=False,
                 aliases=aliases,
             )
             # The plain court name is always a valid alias as well.
             court.aliases.append(DictionaryEntryAlias(row['Court Name']))
             courts_config_list.append(court)
     return courts_config_list
예제 #20
0
def load_entities_dict():
    """Load the geo-entities fixture dictionary (entities + aliases CSVs)."""
    base_path = os.path.join(lexnlp_test_path,
                             'lexnlp/extract/en/tests/test_geoentities')
    return DictionaryEntry.load_entities_from_files(
        os.path.join(base_path, 'geoentities.csv'),
        os.path.join(base_path, 'geoaliases.csv'))