def test_abbreviations_simple(self):
    """
    Abbreviation aliases should match only uppercase tokens: "IT's" is
    detected (the tokenizer strips the "'s"), while lowercase "is" is not.
    Simplified and full normalization must agree on the result.
    """
    it_entity = DictionaryEntry(1, 'ITAbbrev', aliases=[DictionaryEntryAlias('IT', is_abbreviation=True)])
    is_entity = DictionaryEntry(2, 'ISAbbrev', aliases=[DictionaryEntryAlias('IS', is_abbreviation=True)])
    entities = [it_entity, is_entity]

    text = '"IT\'s" entity should be detected even with "\'s" because ' \
           'tokenizer takes care of this kind of things. ' \
           '"ISS" entity should not be detected - bacause "is" word' \
           ' is in lowercase here and probably does not mean an abbreviation.'

    # Full (NLTK-based) normalization: only the uppercase "IT" should match.
    parsed_entities = list(find_dict_entities(
        text, default_language=LANG_EN.code,
        all_possible_entities=entities, text_languages=['ge'],
        simplified_normalization=False))
    self.assertEqual(1, len(parsed_entities))
    _ent, alias = parsed_entities[0].entity
    self.assertEqual('IT', alias.alias)

    simply_parsed_entities = list(find_dict_entities(
        text, default_language=LANG_EN.code,
        all_possible_entities=entities, text_languages=['ge'],
        simplified_normalization=True))
    self.assertEqual(len(parsed_entities), len(simply_parsed_entities))
    # BUG FIX: the original read parsed_enitities[0] here, so the last
    # assertion compared the full-normalization alias with itself.
    _ent, simply_alias = simply_parsed_entities[0].entity
    self.assertEqual(alias.alias, simply_alias.alias)
    def test_am_pm_none(self):
        """AM/PM tokens adjacent to clock times must not be treated as entity abbreviations."""
        america = DictionaryEntry(
            1,
            'America',
            aliases=[DictionaryEntryAlias('AM', is_abbreviation=True)],
            name_is_alias=False)
        postmodernism = DictionaryEntry(
            2,
            'Postmodernism',
            aliases=[DictionaryEntryAlias('PM', is_abbreviation=True)],
            name_is_alias=False)
        candidates = [america, postmodernism]

        # "11:00 AM" / "11:00 PM" are times, not abbreviations.
        found = list(
            find_dict_entities('It is 11:00 AM or 11:00 PM now.',
                               all_possible_entities=candidates))
        self.assertEqual(0, len(found))

        # A standalone parenthesized "(AM)" is a legitimate abbreviation hit.
        found = list(
            find_dict_entities('It is 11:00am now in (AM). Hello!',
                               all_possible_entities=candidates))
        self.assertEqual(1, len(found))
        self.assertEqual('America', found[0].entity[0].name)

        # Lowercase "11:00am" alone yields nothing.
        found = list(
            find_dict_entities('It is 11:00am now.',
                               all_possible_entities=candidates))
        self.assertEqual(0, len(found))
    def test_am_pm_none(self):
        """
        Same AM/PM scenario exercised under both normalization modes.

        NOTE(review): this method name duplicates the previous test, so the
        earlier definition is shadowed and never runs — consider renaming one.
        """
        for parse_mode in (False, True):
            america = DictionaryEntry(1, 'America',
                                      aliases=[DictionaryEntryAlias('AM', is_abbreviation=True)],
                                      name_is_alias=False)
            postmodernism = DictionaryEntry(2, 'Postmodernism',
                                            aliases=[DictionaryEntryAlias('PM', is_abbreviation=True)],
                                            name_is_alias=False)
            candidates = [america, postmodernism]

            # Clock times must not produce abbreviation matches.
            found = list(find_dict_entities(
                'It is 11:00 AM or 11:00 PM now.',
                default_language=LANG_EN.code,
                all_possible_entities=candidates, simplified_normalization=parse_mode))
            self.assertEqual(0, len(found))

            # A parenthesized standalone "(AM)" is a real abbreviation hit.
            found = list(find_dict_entities('It is 11:00am now in (AM). Hello!',
                                            default_language=LANG_EN.code,
                                            all_possible_entities=candidates,
                                            simplified_normalization=parse_mode))
            self.assertEqual(1, len(found))
            self.assertEqual('America', found[0].entity[0].name)

            # Lowercase "11:00am" alone yields nothing.
            found = list(find_dict_entities('It is 11:00am now.',
                                            default_language=LANG_EN.code,
                                            all_possible_entities=candidates,
                                            simplified_normalization=parse_mode))
            self.assertEqual(0, len(found))
    def test_find_dict_entities_empty_text(self):
        """Empty input text must yield no matches at all."""
        america = DictionaryEntry(
            1, 'America',
            aliases=[DictionaryEntryAlias('AM', is_abbreviation=True)],
            name_is_alias=False)

        found = list(find_dict_entities('', [america], default_language=LANG_EN.code))
        self.assertFalse(found)
# ---- Example #5 (vote count: 0) ----
def get_courts(text: str,
               court_config_list: List[Tuple[int, str, int, List[Tuple[str, str, bool, int]]]],
               priority: bool = False,
               text_languages: List[str] = None) -> Generator[Tuple[Tuple, Tuple], Any, Any]:
    """
    Searches for courts from the provided config list and yields tuples of (court_config, court_alias).
    Court config is: (court_id, court_name, [list of aliases])
    Alias is: (alias_text, language, is_abbrev, alias_id)

    This method uses general searching routines for dictionary entities from dict_entities.py module.
    Methods of dict_entities module can be used for comfortable creating the config: entity_config(),
    entity_alias(), add_aliases_to_entity().
    :param text: Source text to search in.
    :param court_config_list: List of all possible known courts in the form of tuples:
     (id, name, [(alias, lang, is_abbrev, alias_id), ...]).
    :param priority: If two courts found with the totally equal matching aliases - then use the one with the lowest id.
    :param text_languages: Language(s) of the source text. If a language is specified then only aliases of this
    language will be searched for. For example: this allows ignoring "Island" - a German language
     alias of Iceland for English texts.
    :return: Generates tuples: (court entity, court alias)
    """
    yield from find_dict_entities(text, court_config_list,
                                  conflict_resolving_func=conflicts_take_first_by_id if priority else None,
                                  text_languages=text_languages)
    def test_plural_case_matching(self):
        """Stemming maps plural surface forms ("tables", "masloboykas") back to singular aliases."""
        tbl = DictionaryEntry(
            1,
            'Table',
            aliases=[DictionaryEntryAlias('tbl.', is_abbreviation=True)],
            name_is_alias=True)
        guy = DictionaryEntry(2, 'man', name_is_alias=True)
        churn = DictionaryEntry(3, 'masloboyka', name_is_alias=True)

        text = 'We should detect the singular number of word "tables" here - the stemmer takes care of plural case. ' \
               'Unfortunately our stemmer is not able to convert word "men" to singular number yet :(. ' \
               'But it works for word "masloboykas" - a non existing word in English in plural case.'

        # "men" is an irregular plural the stemmer cannot handle, so only 2 hits.
        hits = list(
            find_dict_entities(text,
                               all_possible_entities=[tbl, guy, churn],
                               use_stemmer=True))
        self.assertEqual(2, len(hits))

        _ent, first_alias = hits[0].entity
        self.assertEqual('Table', first_alias.alias)
        _ent, second_alias = hits[1].entity
        self.assertEqual('masloboyka', second_alias.alias)
    def test_equal_aliases_in_dif_languages(self):
        """Two entities may legitimately share the same alias text ("MS") — both must be found."""
        mississippi = DictionaryEntry(
            1, 'Mississippi',
            aliases=[DictionaryEntryAlias('MS', is_abbreviation=True, language='en'),
                     DictionaryEntryAlias('Mississippi', language='de'),
                     DictionaryEntryAlias('Mississippi', language='en')])
        montserrat = DictionaryEntry(
            2, 'Montserrat',
            aliases=[DictionaryEntryAlias('MS', is_abbreviation=True, language='en'),
                     DictionaryEntryAlias('Montserrat', language='de'),
                     DictionaryEntryAlias('Montserrat', language='en')])
        canada = DictionaryEntry(
            3, 'Canada',
            aliases=[DictionaryEntryAlias('CAN', is_abbreviation=True, language='en'),
                     DictionaryEntryAlias('Kanada', language='de'),
                     DictionaryEntryAlias('Canada', language='en')])

        text = '"MS" here can mean either "MMMississippi" or "MMMontserrat" because ' \
               'they have equal aliases in English. ' \
               'This test is here because in one version of the code alias texts were required to be unique. ' \
               '"CCCanada" (can) should not be detected because word "can" is in lowercase here.'

        hits = list(find_dict_entities(
            text, default_language=LANG_EN.code,
            all_possible_entities=[mississippi, montserrat, canada],
            text_languages=['en']))
        self.assertEqual(2, len(hits))

        # Both matches are for the shared "MS" alias; lowercase "can" is ignored.
        for hit in hits:
            _ent, alias = hit.entity
            self.assertEqual('MS', alias.alias)
def test_find_dict_entities_empty_text():
    """Empty text must produce an empty result list."""
    america = entity_config(
        1,
        'America',
        aliases=[entity_alias('AM', is_abbreviation=True)],
        name_is_alias=False)

    found = list(find_dict_entities('', [america]))
    assert_false(found)
    def test_common_search_all_languages(self):
        """Without a language filter, an entity name is matched directly in the text."""
        entry = DictionaryEntry(
            1, 'Some Entity', aliases=[DictionaryEntryAlias('Something')])
        text = 'Some Entity should be found in this text.'

        hits = list(
            find_dict_entities(text, all_possible_entities=[entry]))
        self.assertEqual(1, len(hits))
        _ent, alias = hits[0].entity
        self.assertEqual('Some Entity', alias.alias)
# ---- Example #10 (vote count: 0) ----
def get_courts(text: str,
               court_config_list: List[Tuple[int, str, int, List[Tuple[str, str, bool, int]]]],
               priority: bool = False,
               text_languages: List[str] = None) -> Generator[Tuple[Tuple, Tuple], Any, Any]:
    """
    Yield (court_config, court_alias) tuples found in ``text``.

    See lexnlp/extract/en/tests/test_courts.py
    """
    # With priority set, ties between equal aliases go to the lowest court id.
    resolver = conflicts_take_first_by_id if priority else None
    yield from find_dict_entities(text, court_config_list,
                                  conflict_resolving_func=resolver,
                                  text_languages=text_languages)
# ---- Example #11 (vote count: 0) ----
def get_courts(text: str,
               court_config_list: List[DictionaryEntry],
               priority: bool = False,
               text_languages: List[str] = None) -> Generator[Tuple[DictionaryEntry, DictionaryEntryAlias], Any, Any]:
    """
    Yield (entity, alias) pairs for each court found in ``text``.

    See lexnlp/extract/en/tests/test_courts.py
    """
    # With priority set, ties between equal aliases go to the lowest court id.
    resolver = conflicts_take_first_by_id if priority else None
    matches = find_dict_entities(text, court_config_list,
                                 conflict_resolving_func=resolver,
                                 text_languages=text_languages)
    for match in matches:
        yield match.entity
# ---- Example #12 (vote count: 0) ----
def get_geoentities(
    text: str,
    geo_config_list: List[DictionaryEntry],
    priority: bool = False,
    priority_by_id: bool = False,
    text_languages: List[str] = None,
    min_alias_len: int = geoentities_config.MIN_ALIAS_LEN,
    prepared_alias_ban_list: Union[None, Dict[str, Tuple[
        List[str], List[str]]]] = _ALIAS_BLACK_LIST_PREPARED,
    simplified_normalization: bool = False
) -> Generator[Tuple[DictionaryEntry, DictionaryEntryAlias], Any, Any]:
    """
    Searches for geo entities from the provided config list and yields pairs of (entity, alias).
    Entity is: (entity_id, name, [list of aliases])
    Alias is: (alias_text, lang, is_abbrev, alias_id)

    This method uses general searching routines for dictionary entities from dict_entities.py module.
    Methods of dict_entities module can be used for comfortable creating the config: entity_config(),
    entity_alias(), add_aliases_to_entity().
    :param text: Source text to search in.
    :param geo_config_list: List of all possible known geo entities in the form of tuples
    (id, name, [(alias, lang, is_abbrev, alias_id), ...]).
    :param priority: If two entities found with the totally equal matching aliases -
    then use the one with the greatest priority field.
    :param priority_by_id: If two entities found with the totally equal matching aliases -
    then use the one with the lowest id.
    :param text_languages: Language(s) of the source text. If a language is specified then only aliases of this
    language will be searched for. For example: this allows ignoring "Island" - a German language
     alias of Iceland for English texts.
    :param min_alias_len: Minimal length of geo entity aliases to search for.
    :param prepared_alias_ban_list: List of aliases to exclude from searching in the form:
     dict of lang -> (list of normalized non-abbreviation aliases, list of normalized abbreviation aliases).
     Use dict_entities.prepare_alias_banlist_dict() for preparing this dict.
    :param simplified_normalization: don't use NLTK for "normalizing" text
    :return: Generates tuples: (entity, alias)
    """
    # `priority` takes precedence over `priority_by_id` when both are set.
    if priority:
        resolver = conflicts_top_by_priority
    elif priority_by_id:
        resolver = conflicts_take_first_by_id
    else:
        resolver = None

    matches = find_dict_entities(
        text,
        geo_config_list,
        conflict_resolving_func=resolver,
        text_languages=text_languages,
        min_alias_len=min_alias_len,
        prepared_alias_ban_list=prepared_alias_ban_list,
        simplified_normalization=simplified_normalization)
    for match in matches:
        yield match.entity
    def test_conflicts_take_longest_match(self):
        """When matches overlap, the longest alias ("Some Entity One") wins."""
        short_entity = DictionaryEntry(1, 'Some Entity', aliases=[DictionaryEntryAlias('Something')])
        long_entity = DictionaryEntry(2, 'Some Entity One', aliases=[DictionaryEntryAlias('Something One')])
        other_entity = DictionaryEntry(3, 'Some Entity Two', aliases=[DictionaryEntryAlias('Something Two')])

        text = '"Some Entity One" should be found in this text and "Someee Entityyy" should be ignored.'

        hits = list(find_dict_entities(text,
                                       all_possible_entities=[short_entity, long_entity, other_entity],
                                       default_language=LANG_EN.code))
        self.assertEqual(1, len(hits))
        _ent, alias = hits[0].entity
        self.assertEqual('Some Entity One', alias.alias)
    def test_alias_punktuation(self):
        """Dotted abbreviation aliases match even when the text spaces the dots differently."""
        kaban = DictionaryEntry(1, 'Kaban',
                                aliases=[DictionaryEntryAlias('K.A.B.A. N.', is_abbreviation=True)],
                                name_is_alias=False)
        text = 'Can we catch some K.A.B.A.N.s?'

        hits = list(find_dict_entities(
            text,
            default_language=LANG_EN.code,
            all_possible_entities=[kaban], use_stemmer=True,
            simplified_normalization=False))
        self.assertEqual(1, len(hits))

        _ent, alias = hits[0].entity
        self.assertEqual('K.A.B.A. N.', alias.alias)
    def test_conflicts_equal_length_take_same_language(self):
        """A longer match in the wrong language loses to a shorter match with no language restriction."""
        plain_entity = DictionaryEntry(1, 'Some Entity', aliases=[DictionaryEntryAlias('Something')])
        french_entity = DictionaryEntry(2, 'Some Entity1',
                                        aliases=[DictionaryEntryAlias('Some Entity One', language='fr')])
        other_entity = DictionaryEntry(3, 'Some Entity2', aliases=[DictionaryEntryAlias('Something Two')])

        text = '"Some Entity One" should not be found in this text because it is not in German language.' \
               'Shorter match - "Someeee Entityyy" should be taken instead.'

        hits = list(find_dict_entities(
            text, all_possible_entities=[plain_entity, french_entity, other_entity],
            text_languages=['de'],
            default_language=LANG_EN.code))
        self.assertEqual(1, len(hits))
        _ent, alias = hits[0].entity
        self.assertEqual('Some Entity', alias.alias)
# ---- Example #16 (vote count: 0) ----
 def get_geoentity_entries(
     self, text: str
 ) -> Generator[Tuple[DictionaryEntry, DictionaryEntryAlias], Any, Any]:
     """
     Yield (entity, alias) pairs for each geo entity found in ``text``,
     using this parser's configured languages, conflict resolution and
     alias ban list via the general dict_entities search routines.
     """
     matches = find_dict_entities(
         text,
         self.geo_config_list,
         conflict_resolving_func=self.conflict_resolving_func,
         priority_direction=self.priority_direction,
         default_language=self.language,
         text_languages=self.text_languages,
         min_alias_len=self.min_alias_len,
         prepared_alias_ban_list=self.prepared_alias_ban_list,
         simplified_normalization=self.simplified_normalization)
     for match in matches:
         yield match.entity
# ---- Example #17 (vote count: 0) ----
    def get_geoentity_annotations(
            self, text: str) -> Generator[GeoAnnotation, None, None]:
        """
        Yield a GeoAnnotation for every geo entity found in ``text``.

        Uses the general dictionary-entity search routines from the
        dict_entities.py module with this parser's configured conflict
        resolution, languages, minimal alias length and alias ban list.
        Each annotation carries the matched entity's metadata (id, category,
        priority, names, extra columns) plus the alias and its locale; the
        locale falls back to the parser's default language when the matched
        alias has none.
        """
        dic_entries = find_dict_entities(
            text,
            self.geo_config_list,
            self.language,
            conflict_resolving_func=self.conflict_resolving_func,
            priority_direction=self.priority_direction,
            text_languages=self.text_languages,
            min_alias_len=self.min_alias_len,
            prepared_alias_ban_list=self.prepared_alias_ban_list,
            simplified_normalization=self.simplified_normalization)

        for ent in dic_entries:
            ant = GeoAnnotation(coords=ent.coords)
            if ent.entity[0]:
                toponim = ent.entity[0]  # type: DictionaryEntry
                ant.entity_id = toponim.id
                ant.entity_category = toponim.category
                ant.entity_priority = toponim.priority
                ant.name_en = toponim.entity_name
                ant.name = toponim.name
                # Copy any dictionary-defined extra columns onto the annotation.
                if toponim.extra_columns:
                    for extr_col in toponim.extra_columns:
                        setattr(ant, extr_col, toponim.extra_columns[extr_col])

            if ent.entity[1]:  # alias
                ant.alias = ent.entity[1].alias
                ant.locale = ent.entity[1].language
            if not ant.locale:
                # Fall back to the parser's default language.
                ant.locale = self.language
            yield ant
# ---- Example #18 (vote count: 0) ----
def get_geoentity_annotations(
    text: str,
    geo_config_list: List[DictionaryEntry],
    priority: bool = False,
    priority_by_id: bool = False,
    text_languages: List[str] = None,
    min_alias_len: int = geoentities_config.MIN_ALIAS_LEN,
    prepared_alias_ban_list: Union[None, Dict[str, Tuple[
        List[str], List[str]]]] = _ALIAS_BLACK_LIST_PREPARED,
    simplified_normalization: bool = False
) -> Generator[GeoAnnotation, None, None]:
    """Yield a GeoAnnotation per matched geo entity. See get_geoentities."""
    # `priority` takes precedence over `priority_by_id` when both are set.
    if priority:
        resolver = conflicts_top_by_priority
    elif priority_by_id:
        resolver = conflicts_take_first_by_id
    else:
        resolver = None

    matches = find_dict_entities(
        text,
        geo_config_list,
        conflict_resolving_func=resolver,
        text_languages=text_languages,
        min_alias_len=min_alias_len,
        prepared_alias_ban_list=prepared_alias_ban_list,
        simplified_normalization=simplified_normalization)

    for match in matches:
        annotation = GeoAnnotation(coords=match.coords)
        entry = match.entity[0]  # type: DictionaryEntry
        if entry:
            year = TextAnnotation.get_int_value(entry.id)
            if year:
                annotation.year = year
            annotation.name = entry.name
        yield annotation
# ---- Example #19 (vote count: 0) ----
def get_court_annotations(
    locale: str,
    text: str,
    court_config_list: List[DictionaryEntry],
    priority: bool = False,
    text_locales: List[str] = (),
    simplified_normalization: bool = False
) -> Generator[CourtAnnotation, None, None]:
    """
    Yield a CourtAnnotation for each court found in ``text``.

    The annotation's locale comes from the matched alias when available,
    otherwise from the ``locale`` argument's language.
    """
    locale_obj = Locale(locale)
    # With priority set, ties between equal aliases go to the lowest court id.
    resolver = conflicts_take_first_by_id if priority else None
    search_languages = [Locale(item).language for item in text_locales]

    matches = find_dict_entities(
        text,
        court_config_list,
        default_language=locale_obj.language,
        conflict_resolving_func=resolver,
        text_languages=search_languages,
        simplified_normalization=simplified_normalization)

    for match in matches:
        annotation = CourtAnnotation(coords=match.coords)
        entry = match.entity[0]  # type: DictionaryEntry
        if entry:
            annotation.entity_id = entry.id
            annotation.entity_category = entry.category
            annotation.entity_priority = entry.priority
            annotation.name_en = entry.entity_name
            annotation.name = entry.name
            # Copy any dictionary-defined extra columns onto the annotation.
            if entry.extra_columns:
                for extra_column in entry.extra_columns:
                    setattr(annotation, extra_column, entry.extra_columns[extra_column])

        matched_alias = match.entity[1]
        if matched_alias:
            annotation.alias = matched_alias.alias
            annotation.locale = matched_alias.language
        if not annotation.locale:
            annotation.locale = locale_obj.language
        yield annotation