def _extract_variants_from_text(self, field, text: str, **kwargs):
    """Resolve the geo-entity names mentioned in ``text``.

    Tries a fast path first: GeoEntityUsage rows already extracted for the
    given document (quick, but the substring filter may pick up extra
    entities from other sentences containing the same text).  Falls back to
    running the geo-entity locator directly on the text.

    :param field: field being detected (unused here, part of the interface).
    :param text: text fragment to resolve entities from.
    :param kwargs: may carry ``document`` to enable the fast path.
    :return: list of entity names, or None when nothing was found.
    """
    document = kwargs.get('document')
    found_names = None

    if document is not None:
        # Fast path: reuse usages already extracted for this document.
        # NOTE(review): plain substring match, so extra entities may appear.
        found_names = extract_models.GeoEntityUsage.objects \
            .filter(text_unit__document=document,
                    text_unit__unit_type='sentence',
                    text_unit__text__contains=text) \
            .values_list('entity__name', flat=True)

    if not found_names:
        # Slow path: run the locator on the raw text.
        from apps.extract import dict_data_cache
        geo_config = dict_data_cache.get_geo_config()

        languages = None
        if document:
            languages = models.TextUnit.objects.filter(
                document=document,
                text__contains=text).values_list('language', flat=True)
            if document.language and not languages:
                languages = [document.language]

        found_names = [match[0][1]
                       for match in get_geoentities(text,
                                                    geo_config_list=geo_config,
                                                    text_languages=languages,
                                                    priority=True)]

    return list(found_names) or None
def parse(self, log: ProcessLogger, text, text_unit_id, text_unit_lang,
          document_initial_load: bool = False, **kwargs) -> ParseResults:
    """Locate geo entities and aliases in ``text`` and build usage rows.

    :param log: process logger (part of the common locator interface).
    :param text: text of the unit to scan.
    :param text_unit_id: id of the text unit the usages refer to.
    :param text_unit_lang: language code used to narrow the entity search.
    :param document_initial_load: part of the common locator signature.
    :param kwargs: ``priority`` (bool, default True) is forwarded to the
        locator.
    :return: ParseResults with GeoEntityUsage/GeoAliasUsage rows, or None
        (implicitly) when no entity was found.
    """
    from collections import Counter

    priority = kwargs.get('priority', True)
    geo_config = dict_data_cache.get_geo_config()

    from apps.extract.app_vars import SIMPLE_LOCATOR_TOKENIZATION
    simple_norm = SIMPLE_LOCATOR_TOKENIZATION.val

    entity_alias_pairs = list(geoentities.get_geoentities(
        text,
        geo_config,
        text_languages=[text_unit_lang],
        priority=priority,
        simplified_normalization=simple_norm))

    # Counter gives O(n) frequencies; the previous list.count() per unique
    # id was O(n^2) over the matched pairs.
    entity_counts = Counter(entity.id for entity, _alias in entity_alias_pairs)
    if entity_counts:
        alias_counts = Counter(alias.alias_id
                               for _entity, alias in entity_alias_pairs)
        return ParseResults({
            GeoEntityUsage: [GeoEntityUsage(text_unit_id=text_unit_id,
                                            entity_id=entity_id,
                                            count=count)
                             for entity_id, count in entity_counts.items()],
            GeoAliasUsage: [GeoAliasUsage(text_unit_id=text_unit_id,
                                          alias_id=alias_id,
                                          count=count)
                            for alias_id, count in alias_counts.items()
                            if alias_id]})
def parse(self, log: ProcessLogger, text, text_unit_id, text_unit_lang,
          document_initial_load: bool = False, **kwargs) -> ParseResults:
    """Locate geo entities and aliases in ``text`` and build usage rows.

    :param log: process logger (part of the common locator interface).
    :param text: text of the unit to scan.
    :param text_unit_id: id of the text unit the usages refer to.
    :param text_unit_lang: language code used to narrow the entity search.
    :param document_initial_load: part of the common locator signature.
    :param kwargs: ``priority`` (bool, default True) is forwarded to the
        locator.
    :return: ParseResults with GeoEntityUsage/GeoAliasUsage rows, or None
        (implicitly) when no entity was found.
    """
    from collections import Counter

    priority = kwargs.get('priority', True)
    geo_config = dict_data_cache.get_geo_config()

    entity_alias_pairs = list(
        geoentities.get_geoentities(text,
                                    geo_config,
                                    text_languages=[text_unit_lang],
                                    priority=priority))

    # Counter gives O(n) frequencies; the previous list.count() per unique
    # id was O(n^2) over the matched pairs.
    entity_counts = Counter(dict_entities.get_entity_id(entity)
                            for entity, _alias in entity_alias_pairs)
    if entity_counts:
        alias_counts = Counter(dict_entities.get_alias_id(alias)
                               for _entity, alias in entity_alias_pairs)
        return ParseResults({
            GeoEntityUsage: [
                GeoEntityUsage(text_unit_id=text_unit_id,
                               entity_id=entity_id,
                               count=count)
                for entity_id, count in entity_counts.items()
            ],
            GeoAliasUsage: [
                GeoAliasUsage(text_unit_id=text_unit_id,
                              alias_id=alias_id,
                              count=count)
                for alias_id, count in alias_counts.items() if alias_id
            ]
        })