def test_abbreviations_simple(self):
    """Abbreviation aliases match case-sensitively; tokenization strips a trailing "'s".

    Also checks that simplified normalization finds the same match as the
    full (NLTK-based) normalization for this text.
    """
    some_entity = DictionaryEntry(
        1, 'ITAbbrev',
        aliases=[DictionaryEntryAlias('IT', is_abbreviation=True)])
    some_entity1 = DictionaryEntry(
        2, 'ISAbbrev',
        aliases=[DictionaryEntryAlias('IS', is_abbreviation=True)])
    entities = [some_entity, some_entity1]
    text = '"IT\'s" entity should be detected even with "\'s" because ' \
           'tokenizer takes care of this kind of things. ' \
           '"ISS" entity should not be detected - bacause "is" word' \
           ' is in lowercase here and probably does not mean an abbreviation.'

    parsed_enitities = list(find_dict_entities(
        text,
        default_language=LANG_EN.code,
        all_possible_entities=entities,
        text_languages=['ge'],
        simplified_normalization=False))
    self.assertEqual(1, len(parsed_enitities))
    _ent, alias = parsed_enitities[0].entity
    self.assertEqual('IT', alias.alias)

    simply_parsed_enitities = list(find_dict_entities(
        text,
        default_language=LANG_EN.code,
        all_possible_entities=entities,
        text_languages=['ge'],
        simplified_normalization=True))
    self.assertEqual(len(parsed_enitities), len(simply_parsed_enitities))
    # BUG FIX: read the alias from the simplified-normalization result
    # (simply_parsed_enitities). The original unpacked parsed_enitities[0]
    # again, so the final assertion compared the alias with itself and
    # could never fail.
    _ent, simply_alias = simply_parsed_enitities[0].entity
    self.assertEqual(alias.alias, simply_alias.alias)
def test_am_pm_none(self):
    """AM/PM tokens adjacent to clock times must not be detected as entities."""
    entities = [
        DictionaryEntry(1, 'America',
                        aliases=[DictionaryEntryAlias('AM', is_abbreviation=True)],
                        name_is_alias=False),
        DictionaryEntry(2, 'Postmodernism',
                        aliases=[DictionaryEntryAlias('PM', is_abbreviation=True)],
                        name_is_alias=False),
    ]

    # Both AM and PM follow times - nothing should match.
    found = list(find_dict_entities('It is 11:00 AM or 11:00 PM now.',
                                    all_possible_entities=entities))
    self.assertEqual(0, len(found))

    # The parenthesized "(AM)" is not part of a time and should match.
    found = list(find_dict_entities('It is 11:00am now in (AM). Hello!',
                                    all_possible_entities=entities))
    self.assertEqual(1, len(found))
    self.assertEqual('America', found[0].entity[0].name)

    # "11:00am" alone contains no standalone abbreviation.
    found = list(find_dict_entities('It is 11:00am now.',
                                    all_possible_entities=entities))
    self.assertEqual(0, len(found))
def test_am_pm_none(self):
    """AM/PM next to times are ignored in both full and simplified normalization modes."""
    for simplified in (False, True):
        entities = [
            DictionaryEntry(1, 'America',
                            aliases=[DictionaryEntryAlias('AM', is_abbreviation=True)],
                            name_is_alias=False),
            DictionaryEntry(2, 'Postmodernism',
                            aliases=[DictionaryEntryAlias('PM', is_abbreviation=True)],
                            name_is_alias=False),
        ]

        found = list(find_dict_entities(
            'It is 11:00 AM or 11:00 PM now.',
            default_language=LANG_EN.code,
            all_possible_entities=entities,
            simplified_normalization=simplified))
        self.assertEqual(0, len(found))

        found = list(find_dict_entities(
            'It is 11:00am now in (AM). Hello!',
            default_language=LANG_EN.code,
            all_possible_entities=entities,
            simplified_normalization=simplified))
        self.assertEqual(1, len(found))
        self.assertEqual('America', found[0].entity[0].name)

        found = list(find_dict_entities(
            'It is 11:00am now.',
            default_language=LANG_EN.code,
            all_possible_entities=entities,
            simplified_normalization=simplified))
        self.assertEqual(0, len(found))
def test_find_dict_entities_empty_text(self):
    """An empty input text yields no dictionary entities."""
    america = DictionaryEntry(
        1, 'America',
        aliases=[DictionaryEntryAlias('AM', is_abbreviation=True)],
        name_is_alias=False)
    found = list(find_dict_entities('', [america], default_language=LANG_EN.code))
    self.assertFalse(found)
def get_courts(text: str,
               court_config_list: List[Tuple[int, str, int, List[Tuple[str, str, bool, int]]]],
               priority: bool = False,
               text_languages: List[str] = None) -> Generator[Tuple[Tuple, Tuple], Any, Any]:
    """
    Searches for courts from the provided config list and yields tuples of (court_config, court_alias).
    Court config is: (court_id, court_name, [list of aliases])
    Alias is: (alias_text, language, is_abbrev, alias_id)

    This method uses general searching routines for dictionary entities from dict_entities.py module.
    Methods of dict_entities module can be used for comfortable creating the config:
    entity_config(), entity_alias(), add_aliases_to_entity().

    :param text: Text to search for court mentions.
    :param court_config_list: List of all possible known courts in the form of tuples:
        (id, name, [(alias, lang, is_abbrev), ...]).
    :param priority: If two courts found with the totally equal matching aliases -
        then use the one with the lowest id.
    :param text_languages: Language(s) of the source text. If a language is specified then
        only aliases of this language will be searched for.
        For example: this allows ignoring "Island" - a German language alias of Iceland
        for English texts.
    :return: Generates tuples: (court entity, court alias)
    """
    yield from find_dict_entities(text, court_config_list,
                                  conflict_resolving_func=conflicts_take_first_by_id if priority else None,
                                  text_languages=text_languages)
def test_plural_case_matching(self):
    """The stemmer maps plural word forms back to singular dictionary aliases."""
    entities = [
        DictionaryEntry(1, 'Table',
                        aliases=[DictionaryEntryAlias('tbl.', is_abbreviation=True)],
                        name_is_alias=True),
        DictionaryEntry(2, 'man', name_is_alias=True),
        DictionaryEntry(3, 'masloboyka', name_is_alias=True),
    ]
    text = 'We should detect the singular number of word "tables" here - the stemmer takes care of plural case. ' \
           'Unfortunately our stemmer is not able to convert word "men" to singular number yet :(. ' \
           'But it works for word "masloboykas" - a non existing word in English in plural case.'
    found = list(find_dict_entities(text,
                                    all_possible_entities=entities,
                                    use_stemmer=True))
    self.assertEqual(2, len(found))
    # "tables" -> Table; "men" is not stemmed to "man"; "masloboykas" -> masloboyka.
    _entity, matched_alias = found[0].entity
    self.assertEqual('Table', matched_alias.alias)
    _entity, matched_alias = found[1].entity
    self.assertEqual('masloboyka', matched_alias.alias)
def test_equal_aliases_in_dif_languages(self):
    """Identical aliases on different entities may both match; lowercase abbreviations are skipped."""
    mississippi = DictionaryEntry(
        1, 'Mississippi',
        aliases=[DictionaryEntryAlias('MS', is_abbreviation=True, language='en'),
                 DictionaryEntryAlias('Mississippi', language='de'),
                 DictionaryEntryAlias('Mississippi', language='en')])
    montserrat = DictionaryEntry(
        2, 'Montserrat',
        aliases=[DictionaryEntryAlias('MS', is_abbreviation=True, language='en'),
                 DictionaryEntryAlias('Montserrat', language='de'),
                 DictionaryEntryAlias('Montserrat', language='en')])
    canada = DictionaryEntry(
        3, 'Canada',
        aliases=[DictionaryEntryAlias('CAN', is_abbreviation=True, language='en'),
                 DictionaryEntryAlias('Kanada', language='de'),
                 DictionaryEntryAlias('Canada', language='en')])
    text = '"MS" here can mean either "MMMississippi" or "MMMontserrat" because ' \
           'they have equal aliases in English. ' \
           'This test is here because in one version of the code alias texts were required to be unique. ' \
           '"CCCanada" (can) should not be detected because word "can" is in lowercase here.'
    found = list(find_dict_entities(
        text,
        default_language=LANG_EN.code,
        all_possible_entities=[mississippi, montserrat, canada],
        text_languages=['en']))
    self.assertEqual(2, len(found))
    # Both matches are the shared "MS" alias.
    for match in found:
        _entity, matched_alias = match.entity
        self.assertEqual('MS', matched_alias.alias)
def test_find_dict_entities_empty_text():
    """Empty input text produces no matches."""
    america = entity_config(1, 'America',
                            aliases=[entity_alias('AM', is_abbreviation=True)],
                            name_is_alias=False)
    found = list(find_dict_entities('', [america]))
    assert_false(found)
def test_common_search_all_languages(self):
    """An entity's name is matched when no language filter is given."""
    entity = DictionaryEntry(1, 'Some Entity',
                             aliases=[DictionaryEntryAlias('Something')])
    found = list(find_dict_entities('Some Entity should be found in this text.',
                                    all_possible_entities=[entity]))
    self.assertEqual(1, len(found))
    _entity, matched_alias = found[0].entity
    self.assertEqual('Some Entity', matched_alias.alias)
def get_courts(text: str,
               court_config_list: List[Tuple[int, str, int, List[Tuple[str, str, bool, int]]]],
               priority: bool = False,
               text_languages: List[str] = None) -> Generator[Tuple[Tuple, Tuple], Any, Any]:
    """
    See lexnlp/extract/en/tests/test_courts.py
    """
    # Lowest-id entity wins alias conflicts only when priority is requested.
    resolver = conflicts_take_first_by_id if priority else None
    yield from find_dict_entities(text,
                                  court_config_list,
                                  conflict_resolving_func=resolver,
                                  text_languages=text_languages)
def get_courts(text: str,
               court_config_list: List[DictionaryEntry],
               priority: bool = False,
               text_languages: List[str] = None) \
        -> Generator[Tuple[DictionaryEntry, DictionaryEntryAlias], Any, Any]:
    """
    See lexnlp/extract/en/tests/test_courts.py
    """
    # Lowest-id entity wins alias conflicts only when priority is requested.
    resolver = conflicts_take_first_by_id if priority else None
    for match in find_dict_entities(text,
                                    court_config_list,
                                    conflict_resolving_func=resolver,
                                    text_languages=text_languages):
        yield match.entity
def get_geoentities(
        text: str,
        geo_config_list: List[DictionaryEntry],
        priority: bool = False,
        priority_by_id: bool = False,
        text_languages: List[str] = None,
        min_alias_len: int = geoentities_config.MIN_ALIAS_LEN,
        prepared_alias_ban_list: Union[None, Dict[str, Tuple[
            List[str], List[str]]]] = _ALIAS_BLACK_LIST_PREPARED,
        simplified_normalization: bool = False
) -> Generator[Tuple[DictionaryEntry, DictionaryEntryAlias], Any, Any]:
    """
    Search the text for geo entities from the provided config list and yield
    (entity, alias) pairs.

    Entity is: (entity_id, name, [list of aliases]);
    alias is: (alias_text, lang, is_abbrev, alias_id).

    This method uses general searching routines for dictionary entities from the
    dict_entities.py module; its helpers entity_config(), entity_alias() and
    add_aliases_to_entity() can be used to build the config.

    :param text: Text to search for geo entities.
    :param geo_config_list: List of all possible known geo entities in the form of tuples
        (id, name, [(alias, lang, is_abbrev, alias_id), ...]).
    :param priority: If two entities found with the totally equal matching aliases -
        then use the one with the greatest priority field.
    :param priority_by_id: If two entities found with the totally equal matching aliases -
        then use the one with the lowest id.
    :param text_languages: Language(s) of the source text. If a language is specified then
        only aliases of this language will be searched for. For example: this allows
        ignoring "Island" - a German language alias of Iceland for English texts.
    :param min_alias_len: Minimal length of geo entity aliases to search for.
    :param prepared_alias_ban_list: Aliases to exclude from searching, as a dict of
        lang -> (normalized non-abbreviation aliases, normalized abbreviation aliases).
        Use dict_entities.prepare_alias_banlist_dict() for preparing this dict.
    :param simplified_normalization: Don't use NLTK for "normalizing" text.
    :return: Generates tuples: (entity, alias)
    """
    # In the original ordering a true `priority` overrides `priority_by_id`,
    # so check it first here.
    if priority:
        resolver = conflicts_top_by_priority
    elif priority_by_id:
        resolver = conflicts_take_first_by_id
    else:
        resolver = None
    for match in find_dict_entities(
            text,
            geo_config_list,
            conflict_resolving_func=resolver,
            text_languages=text_languages,
            min_alias_len=min_alias_len,
            prepared_alias_ban_list=prepared_alias_ban_list,
            simplified_normalization=simplified_normalization):
        yield match.entity
def test_conflicts_take_longest_match(self):
    """When several entities could match, the longest matching alias wins."""
    entities = [
        DictionaryEntry(1, 'Some Entity', aliases=[DictionaryEntryAlias('Something')]),
        DictionaryEntry(2, 'Some Entity One', aliases=[DictionaryEntryAlias('Something One')]),
        DictionaryEntry(3, 'Some Entity Two', aliases=[DictionaryEntryAlias('Something Two')]),
    ]
    text = '"Some Entity One" should be found in this text and "Someee Entityyy" should be ignored.'
    found = list(find_dict_entities(text,
                                    all_possible_entities=entities,
                                    default_language=LANG_EN.code))
    self.assertEqual(1, len(found))
    _entity, matched_alias = found[0].entity
    self.assertEqual('Some Entity One', matched_alias.alias)
def test_alias_punktuation(self):
    """A punctuated abbreviation alias matches despite spacing differences in the text."""
    kaban = DictionaryEntry(1, 'Kaban',
                            aliases=[DictionaryEntryAlias('K.A.B.A. N.', is_abbreviation=True)],
                            name_is_alias=False)
    found = list(find_dict_entities(
        'Can we catch some K.A.B.A.N.s?',
        default_language=LANG_EN.code,
        all_possible_entities=[kaban],
        use_stemmer=True,
        simplified_normalization=False))
    self.assertEqual(1, len(found))
    _entity, matched_alias = found[0].entity
    self.assertEqual('K.A.B.A. N.', matched_alias.alias)
def test_conflicts_equal_length_take_same_language(self):
    """Aliases in a non-requested language are excluded even when they would match longer text."""
    entities = [
        DictionaryEntry(1, 'Some Entity', aliases=[DictionaryEntryAlias('Something')]),
        DictionaryEntry(2, 'Some Entity1',
                        aliases=[DictionaryEntryAlias('Some Entity One', language='fr')]),
        DictionaryEntry(3, 'Some Entity2', aliases=[DictionaryEntryAlias('Something Two')]),
    ]
    text = '"Some Entity One" should not be found in this text because it is not in German language.' \
           'Shorter match - "Someeee Entityyy" should be taken instead.'
    found = list(find_dict_entities(
        text,
        all_possible_entities=entities,
        text_languages=['de'],
        default_language=LANG_EN.code))
    self.assertEqual(1, len(found))
    _entity, matched_alias = found[0].entity
    self.assertEqual('Some Entity', matched_alias.alias)
def get_geoentity_entries(
        self, text: str
) -> Generator[Tuple[DictionaryEntry, DictionaryEntryAlias], Any, Any]:
    """
    Yield (entity, alias) pairs for geo entities found in the text, using this
    object's stored search configuration.

    General searching routines for dictionary entities from the dict_entities.py
    module are used; its helpers entity_config(), entity_alias() and
    add_aliases_to_entity() can be used to build the config.
    """
    matches = find_dict_entities(
        text,
        self.geo_config_list,
        conflict_resolving_func=self.conflict_resolving_func,
        priority_direction=self.priority_direction,
        default_language=self.language,
        text_languages=self.text_languages,
        min_alias_len=self.min_alias_len,
        prepared_alias_ban_list=self.prepared_alias_ban_list,
        simplified_normalization=self.simplified_normalization)
    for match in matches:
        yield match.entity
def get_geoentity_annotations(
        self, text: str) -> Generator[GeoAnnotation, None, None]:
    """
    Yield GeoAnnotation objects for geo entities found in the text, using this
    object's stored search configuration.

    General searching routines for dictionary entities from the dict_entities.py
    module are used; its helpers entity_config(), entity_alias() and
    add_aliases_to_entity() can be used to build the config.
    """
    matches = find_dict_entities(
        text,
        self.geo_config_list,
        self.language,
        conflict_resolving_func=self.conflict_resolving_func,
        priority_direction=self.priority_direction,
        text_languages=self.text_languages,
        min_alias_len=self.min_alias_len,
        prepared_alias_ban_list=self.prepared_alias_ban_list,
        simplified_normalization=self.simplified_normalization)
    for match in matches:
        annotation = GeoAnnotation(coords=match.coords)
        toponim = match.entity[0]  # type: DictionaryEntry
        if toponim:
            annotation.entity_id = toponim.id
            annotation.entity_category = toponim.category
            annotation.entity_priority = toponim.priority
            annotation.name_en = toponim.entity_name
            annotation.name = toponim.name
            # Copy any extra dictionary columns onto the annotation verbatim.
            if toponim.extra_columns:
                for column_name, column_value in toponim.extra_columns.items():
                    setattr(annotation, column_name, column_value)
        matched_alias = match.entity[1]
        if matched_alias:
            # Fall back to the configured language when the alias has none.
            annotation.alias = matched_alias.alias
            annotation.locale = matched_alias.language or self.language
        yield annotation
def get_geoentity_annotations(
        text: str,
        geo_config_list: List[DictionaryEntry],
        priority: bool = False,
        priority_by_id: bool = False,
        text_languages: List[str] = None,
        min_alias_len: int = geoentities_config.MIN_ALIAS_LEN,
        prepared_alias_ban_list: Union[None, Dict[str, Tuple[
            List[str], List[str]]]] = _ALIAS_BLACK_LIST_PREPARED,
        simplified_normalization: bool = False
) -> Generator[GeoAnnotation, None, None]:
    """See get_geoentities"""
    # In the original ordering a true `priority` overrides `priority_by_id`,
    # so check it first here.
    if priority:
        resolver = conflicts_top_by_priority
    elif priority_by_id:
        resolver = conflicts_take_first_by_id
    else:
        resolver = None
    matches = find_dict_entities(
        text,
        geo_config_list,
        conflict_resolving_func=resolver,
        text_languages=text_languages,
        min_alias_len=min_alias_len,
        prepared_alias_ban_list=prepared_alias_ban_list,
        simplified_normalization=simplified_normalization)
    for match in matches:
        annotation = GeoAnnotation(coords=match.coords)
        toponim = match.entity[0]  # type: DictionaryEntry
        if toponim:
            year = TextAnnotation.get_int_value(toponim.id)
            if year:
                annotation.year = year
            annotation.name = toponim.name
        yield annotation
def get_court_annotations(
        locale: str,
        text: str,
        court_config_list: List[DictionaryEntry],
        priority: bool = False,
        text_locales: List[str] = (),
        simplified_normalization: bool = False
) -> Generator[CourtAnnotation, None, None]:
    """Yield CourtAnnotation objects for courts from court_config_list found in the text."""
    locale_obj = Locale(locale)
    resolver = conflicts_take_first_by_id if priority else None
    # Search only aliases whose language matches one of the requested locales.
    search_languages = [Locale(item).language for item in text_locales]
    matches = find_dict_entities(
        text,
        court_config_list,
        default_language=locale_obj.language,
        conflict_resolving_func=resolver,
        text_languages=search_languages,
        simplified_normalization=simplified_normalization)
    for match in matches:
        annotation = CourtAnnotation(coords=match.coords)
        toponim = match.entity[0]  # type: DictionaryEntry
        if toponim:
            annotation.entity_id = toponim.id
            annotation.entity_category = toponim.category
            annotation.entity_priority = toponim.priority
            annotation.name_en = toponim.entity_name
            annotation.name = toponim.name
            # Copy any extra dictionary columns onto the annotation verbatim.
            if toponim.extra_columns:
                for column_name, column_value in toponim.extra_columns.items():
                    setattr(annotation, column_name, column_value)
        matched_alias = match.entity[1]
        if matched_alias:
            # Fall back to the requested locale's language when the alias has none.
            annotation.alias = matched_alias.alias
            annotation.locale = matched_alias.language or locale_obj.language
        yield annotation