def test_am_pm_abbreviations():
    """AM/PM abbreviation aliases must not fire inside time expressions."""
    def _names_and_aliases(matches):
        # Reduce raw matches to (entity name, first alias text) pairs for comparison.
        return [(get_entity_name(m[0]), m[1][0]) for m in matches]

    america = entity_config(1, 'America', aliases=[entity_alias('AM', is_abbreviation=True)], name_is_alias=False)
    postmodernism = entity_config(2, 'Postmodernism', aliases=[entity_alias('PM', is_abbreviation=True)], name_is_alias=False)
    configs = [america, postmodernism]

    # "11:00 AM" / "11:00 PM" are times — expect no abbreviation matches.
    lexnlp_tests.test_extraction_func(
        [],
        find_dict_entities,
        'It is 11:00 AM or 11:00 PM now.',
        all_possible_entities=configs,
        actual_data_converter=_names_and_aliases,
        debug_print=True)

    # A standalone uppercase "(AM)" is a legitimate abbreviation match.
    lexnlp_tests.test_extraction_func(
        [(america[1], 'AM')],
        find_dict_entities,
        'It is 11:00am now in (AM). Hello!',
        all_possible_entities=configs,
        actual_data_converter=_names_and_aliases,
        debug_print=True)

    # Lowercase "11:00am" alone yields no matches at all.
    lexnlp_tests.test_extraction_func(
        [],
        find_dict_entities,
        'It is 11:00am now.',
        all_possible_entities=configs,
        actual_data_converter=_names_and_aliases,
        debug_print=True)
def test_geoentities():
    """Run the geoentity extractor against the shared CSV-driven test data."""
    def _names_only(matches):
        # Keep just the entity names; alias details are irrelevant here.
        return [get_entity_name(m[0]) for m in matches]

    lexnlp_tests.test_extraction_func_on_test_data(
        get_geoentities,
        geo_config_list=_CONFIG,
        actual_data_converter=_names_only,
        debug_print=True)
# Exemplo n.º 3
# 0
def test_geoentities_en_equal_match_take_top_prio():
    """With priority enabled, equally-long English matches resolve to the top-priority entity."""
    def _to_pairs(matches):
        return [(get_entity_name(m[0]), m[1][0]) for m in matches]

    lexnlp_tests.test_extraction_func_on_test_data(
        get_geoentities,
        geo_config_list=_CONFIG,
        priority=True,
        text_languages='en',
        actual_data_converter=_to_pairs,
        debug_print=True)
def test_plural_case_matching():
    """Stemming should map plural words in the text onto singular dictionary entries."""
    tbl_entity = entity_config(
        1, 'Table',
        aliases=[entity_alias('tbl.', is_abbreviation=True)],
        name_is_alias=True)
    man_entity = entity_config(2, 'man', name_is_alias=True)
    masloboyka_entity = entity_config(3, 'masloboyka', name_is_alias=True)
    configs = [tbl_entity, man_entity, masloboyka_entity]

    sample = ('We should detect the singular number of word "tables" here - the stemmer takes care of plural case. '
              'Unfortunately our stemmer is not able to convert word "men" to singular number yet :(. '
              'But it works for word "masloboykas" - a non existing word in English in plural case.')

    # "men" is an irregular plural the stemmer misses, so only two matches are expected.
    expected = (
        (tbl_entity[1], 'Table'),
        (masloboyka_entity[1], 'masloboyka'),
    )

    def _to_pairs(matches):
        return [(get_entity_name(m[0]), m[1][0]) for m in matches]

    lexnlp_tests.test_extraction_func(
        expected,
        find_dict_entities,
        sample,
        all_possible_entities=configs,
        use_stemmer=True,
        actual_data_converter=_to_pairs,
        debug_print=True)
    def test_conflicts_equal_length_take_same_language(self):
        """An alias in a non-requested language is skipped even when it is the longer match."""
        plain = entity_config(1, 'Some Entity', aliases=['Something'])
        fr_only = entity_config(
            2,
            'Some Entity1',
            aliases=[entity_alias('Some Entity One', language='fr')])
        other = entity_config(3, 'Some Entity2', aliases=['Something Two'])
        configs = [plain, fr_only, other]

        text = ('"Some Entity One" should not be found in this text because it is not in German language.'
                'Shorter match - "Someeee Entityyy" should be taken instead.')

        expected = ((plain[1], 'Some Entity'), )

        def _to_pairs(matches):
            return [(get_entity_name(m.entity[0]), m.entity[1][0]) for m in matches]

        lexnlp_tests.test_extraction_func(
            expected,
            find_dict_entities,
            text,
            all_possible_entities=configs,
            text_languages=['de'],
            actual_data_converter=_to_pairs,
            debug_print=True)
# Exemplo n.º 6
# 0
def test_geoentities_alias_filtering():
    """Blacklisted aliases (globally, per-language, or case-sensitive) are excluded from results."""
    # Entries are (alias, language or None for all, case_sensitive).
    blacklist = prepare_alias_blacklist_dict([
        ('Afghanistan', None, False),
        ('Mississippi', 'en', False),
        ('AL', 'en', True),
    ])

    def _names_only(matches):
        return [get_entity_name(m[0]) for m in matches]

    lexnlp_tests.test_extraction_func_on_test_data(
        get_geoentities,
        geo_config_list=_CONFIG,
        prepared_alias_black_list=blacklist,
        actual_data_converter=_names_only,
        debug_print=True,
        start_from_csv_line=6)
def test_common_search_all_languages():
    """With no language filter an entity name matches regardless of language."""
    entity = entity_config(1, 'Some Entity', aliases=['Something'])
    text = 'Some Entity should be found in this text.'

    expected = ((entity[1], 'Some Entity'),)

    def _to_pairs(matches):
        return [(get_entity_name(m[0]), m[1][0]) for m in matches]

    lexnlp_tests.test_extraction_func(
        expected,
        find_dict_entities,
        text,
        all_possible_entities=[entity],
        actual_data_converter=_to_pairs,
        debug_print=True)
def test_conflicts_take_longest_match():
    """When candidate matches overlap, the longest one wins."""
    short_name = entity_config(1, 'Some Entity', aliases=['Something'])
    long_one = entity_config(2, 'Some Entity One', aliases=['Something One'])
    long_two = entity_config(3, 'Some Entity Two', aliases=['Something Two'])
    configs = [short_name, long_one, long_two]

    text = '"Some Entity One" should be found in this text and "Someee Entityyy" should be ignored.'

    expected = ((long_one[1], 'Some Entity One'),)

    def _to_pairs(matches):
        return [(get_entity_name(m[0]), m[1][0]) for m in matches]

    lexnlp_tests.test_extraction_func(
        expected,
        find_dict_entities,
        text,
        all_possible_entities=configs,
        actual_data_converter=_to_pairs,
        debug_print=True)
    def test_equal_aliases_in_dif_languages(self):
        """Identical alias texts on different entities must each produce a match."""
        mississippi = entity_config(1, 'Mississippi', aliases=[
            entity_alias('MS', is_abbreviation=True, language='en'),
            entity_alias('Mississippi', language='de'),
            entity_alias('Mississippi', language='en'),
        ])
        montserrat = entity_config(2, 'Montserrat', aliases=[
            entity_alias('MS', is_abbreviation=True, language='en'),
            entity_alias('Montserrat', language='de'),
            entity_alias('Montserrat', language='en'),
        ])
        canada = entity_config(3, 'Canada', aliases=[
            entity_alias('CAN', is_abbreviation=True, language='en'),
            entity_alias('Kanada', language='de'),
            entity_alias('Canada', language='en'),
        ])
        configs = [mississippi, montserrat, canada]

        text = ('"MS" here can mean either "MMMississippi" or "MMMontserrat" because they have equal aliases in English. '
                'This test is here because in one version of the code alias texts were required to be unique. '
                '"CCCanada" (can) should not be detected because word "can" is in lowercase here.')

        # Both entities sharing the "MS" alias are reported; lowercase "can" is not.
        expected = ((mississippi[1], 'MS'), (montserrat[1], 'MS'))

        def _to_pairs(matches):
            return [(get_entity_name(m.entity[0]), m.entity[1][0]) for m in matches]

        lexnlp_tests.test_extraction_func(
            expected,
            find_dict_entities,
            text,
            all_possible_entities=configs,
            text_languages=['en'],
            actual_data_converter=_to_pairs,
            debug_print=True)
def test_abbreviations_simple():
    """Uppercase abbreviations are detected; lowercase look-alike words are not."""
    it_entity = entity_config(1, 'ITAbbrev', aliases=[entity_alias('IT', is_abbreviation=True)])
    is_entity = entity_config(2, 'ISAbbrev', aliases=[entity_alias('IS', is_abbreviation=True)])
    configs = [it_entity, is_entity]

    text = ('"IT\'s" entity should be detected even with "\'s" because tokenizer takes care of this kind of things. '
            '"ISS" entity should not be detected - bacause "is" word'
            ' is in lowercase here and probably does not mean an abbreviation.')

    expected = ((it_entity[1], 'IT'),)

    def _to_pairs(matches):
        return [(get_entity_name(m[0]), m[1][0]) for m in matches]

    # NOTE(review): 'ge' is not the ISO 639-1 code for German ('de') — confirm this
    # language code is intentional before changing it.
    lexnlp_tests.test_extraction_func(
        expected,
        find_dict_entities,
        text,
        all_possible_entities=configs,
        text_languages=['ge'],
        actual_data_converter=_to_pairs,
        debug_print=True)