def _extract_variants_from_text(self, field, text: str, **kwargs):
        """Resolve geo entity names mentioned in ``text``.

        Fast path: reuse GeoEntityUsage rows already extracted for the
        given document (may over-match, since it only requires the
        sentence to contain ``text``).  Slow path: run the lexnlp geo
        extractor directly on the raw text.

        Returns a list of entity names, or None when nothing matched.
        """
        document = kwargs.get('document')

        entity_names = None
        if document is not None:
            # Fast path: pre-extracted usages for sentences containing the text.
            entity_names = extract_models.GeoEntityUsage.objects \
                .filter(text_unit__document=document,
                        text_unit__unit_type='sentence',
                        text_unit__text__contains=text) \
                .values_list('entity__name', flat=True)

        if not entity_names:
            # Slow path: extract from scratch using the cached geo dictionary.
            from apps.extract import dict_data_cache
            geo_config = dict_data_cache.get_geo_config()

            languages = None
            if document:
                languages = models.TextUnit.objects \
                    .filter(document=document, text__contains=text) \
                    .values_list('language', flat=True)
                # Fall back to the document-level language when no unit matched.
                if not languages and document.language:
                    languages = [document.language]

            # get_geoentities yields (entity, alias) pairs; entity[1] is the name.
            entity_names = [
                pair[0][1]
                for pair in get_geoentities(text,
                                            geo_config_list=geo_config,
                                            text_languages=languages,
                                            priority=True)
            ]

        return list(entity_names) or None
Example #2
0
    def parse(self, log: ProcessLogger, text, text_unit_id, text_unit_lang,
              document_initial_load: bool = False, **kwargs) -> ParseResults:
        """Locate geo entities/aliases in a text unit.

        :param log: process logger (unused here, part of the locator API)
        :param text: text unit content to scan
        :param text_unit_id: id stored on the produced usage rows
        :param text_unit_lang: language passed to the extractor
        :param document_initial_load: part of the locator API, unused here
        :return: ParseResults with GeoEntityUsage/GeoAliasUsage rows,
                 or None (implicitly) when no entity was found.
        """
        from collections import Counter

        priority = kwargs.get('priority', True)
        geo_config = dict_data_cache.get_geo_config()
        from apps.extract.app_vars import SIMPLE_LOCATOR_TOKENIZATION
        simple_norm = SIMPLE_LOCATOR_TOKENIZATION.val
        entity_alias_pairs = list(geoentities.get_geoentities(text,
                                                              geo_config,
                                                              text_languages=[text_unit_lang],
                                                              priority=priority,
                                                              simplified_normalization=simple_norm))

        # Counter replaces the original list.count() per unique id,
        # which was O(n^2) over the number of matches.
        entity_counts = Counter(entity.id for entity, _alias in entity_alias_pairs)
        if entity_counts:
            alias_counts = Counter(alias.alias_id for _entity, alias in entity_alias_pairs)

            return ParseResults({
                GeoEntityUsage: [GeoEntityUsage(text_unit_id=text_unit_id,
                                                entity_id=entity_id,
                                                count=count)
                                 for entity_id, count in entity_counts.items()],
                GeoAliasUsage: [GeoAliasUsage(text_unit_id=text_unit_id,
                                              alias_id=alias_id,
                                              count=count)
                                # keep the original filter: skip falsy alias ids
                                for alias_id, count in alias_counts.items() if alias_id]})
    def _extract_variants_from_text(self, field, text: str, **kwargs):
        """Resolve geo entities mentioned in ``text``.

        Returns a list of ``{'entity_id': ..., 'entity__name': ...}`` dicts,
        or None when nothing matched.  Tries pre-extracted usages for the
        document first, then falls back to running the extractor.
        """
        geo_entities = None
        document = kwargs.get('document')
        if document is not None:
            # try to extract from GeoEntityUsage
            # pros: faster extraction
            # cons: we may extract extra entities (only requires the sentence
            # to contain the text, not an exact span match)
            geo_entities = extract_models.GeoEntityUsage.objects.filter(
                text_unit__document=document,
                text_unit__unit_type='sentence',
                text_unit__text__contains=text).values('entity_id', 'entity__name')

        if not geo_entities:
            # Slow path: run the extractor with the geo config cached in the DB.
            from apps.task.tasks import CACHE_KEY_GEO_CONFIG
            from apps.common.advancedcelery.db_cache import DbCache
            geo_config = DbCache.get(CACHE_KEY_GEO_CONFIG)

            text_languages = None
            if document:
                text_languages = models.TextUnit.objects.filter(
                    document=document,
                    text__contains=text).values_list('language', flat=True)
                # Fall back to the document-level language when no unit matched.
                if document.language and not text_languages:
                    text_languages = [document.language]

            # get_geoentities yields (entity, alias) pairs;
            # entity[0] is the id, entity[1] the name.
            geo_entities = [{'entity_id': i[0][0], 'entity__name': i[0][1]} for i in
                            get_geoentities(text,
                                            geo_config_list=geo_config,
                                            text_languages=text_languages,
                                            priority=True)]

        return list(geo_entities) or None
Example #4
0
    def en_parsers_speed(self):
        """Benchmark every English lexnlp parser against one long sample text."""
        file_path = os.path.join(lexnlp_test_path, 'long_parsed_text.txt')
        with codecs.open(file_path, 'r', encoding='utf-8') as fr:
            text = fr.read()

        ge_path = os.path.join(lexnlp_test_path, 'lexnlp/extract/en/tests/test_geoentities/')
        entities_fn = ge_path + 'geoentities.csv'
        aliases_fn = ge_path + 'geoaliases.csv'
        geo_config = list(DictionaryEntry.load_entities_from_files(entities_fn, aliases_fn))

        times = {}  # type: Dict[str, float]

        # (routine, label) pairs, timed in order; each routine drains
        # the parser's generator so the full extraction cost is measured.
        routines = [
            (lambda s: list(get_amounts(s)), 'get_amounts'),
            (lambda s: list(get_acts(s)), 'get_acts'),
            (lambda s: list(get_citations(s)), 'get_citations'),
            (lambda s: list(get_conditions(s)), 'get_conditions'),
            (lambda s: list(get_constraints(s)), 'get_constraints'),
            (lambda s: list(get_copyright(s)), 'get_copyright'),
            (lambda s: list(_get_courts(s)), 'get_courts'),
            (lambda s: list(get_cusip(s)), 'get_cusip'),
            (lambda s: list(get_dates(s)), 'get_dates'),
            (lambda s: list(get_definitions(s)), 'get_definitions'),
            (lambda s: list(get_distances(s)), 'get_distances'),
            (lambda s: list(get_durations(s)), 'get_durations'),
            (lambda s: list(get_geoentities(s, geo_config)), 'get_geoentities'),
            (lambda s: list(get_money(s)), 'get_money'),
            (lambda s: list(get_percents(s)), 'get_percents'),
            (lambda s: list(get_pii(s)), 'get_pii'),
            (lambda s: list(get_ratios(s)), 'get_ratios'),
            (lambda s: list(get_regulations(s)), 'get_regulations'),
            (lambda s: list(get_trademarks(s)), 'get_trademarks'),
            (lambda s: list(get_urls(s)), 'get_urls'),
        ]
        for routine, label in routines:
            self.check_time(text, routine, label, times)

        self.assertTrue('get_amounts' in times)
Example #5
0
def get_geoentities_routine(
        text: str,
        geo_config_list: List[DictionaryEntry],
        conflict_resolving_field: str = 'none',
        priority_direction: str = 'asc',
        text_languages: Optional[str] = None,
        min_alias_len: Optional[int] = None,
        prepared_alias_ban_list: Optional[
            Dict[str, Tuple[List[str], List[str]]]] = None,
        simplified_normalization: bool = False) -> \
        Generator[Tuple[DictionaryEntry, DictionaryEntryAlias], Any, Any]:
    """Adapter over get_geoentities accepting a single language string.

    Wraps the optional ``text_languages`` value into the one-element list
    expected by ``get_geoentities`` and forwards every other argument
    unchanged, yielding (entry, alias) pairs lazily.
    """
    languages = [text_languages] if text_languages else None
    pairs = get_geoentities(text, geo_config_list, conflict_resolving_field,
                            priority_direction, languages, min_alias_len,
                            prepared_alias_ban_list, simplified_normalization)
    for entry_alias in pairs:
        yield entry_alias
    def parse(self,
              log: ProcessLogger,
              text,
              text_unit_id,
              text_unit_lang,
              document_initial_load: bool = False,
              **kwargs) -> ParseResults:
        """Locate geo entities/aliases in a text unit.

        :param log: process logger (unused here, part of the locator API)
        :param text: text unit content to scan
        :param text_unit_id: id stored on the produced usage rows
        :param text_unit_lang: language passed to the extractor
        :param document_initial_load: part of the locator API, unused here
        :return: ParseResults with GeoEntityUsage/GeoAliasUsage rows,
                 or None (implicitly) when no entity was found.
        """
        from collections import Counter

        priority = kwargs.get('priority', True)
        geo_config = dict_data_cache.get_geo_config()
        entity_alias_pairs = list(
            geoentities.get_geoentities(text,
                                        geo_config,
                                        text_languages=[text_unit_lang],
                                        priority=priority))

        # Counter replaces the original list.count() per unique id,
        # which was O(n^2) over the number of matches.
        entity_counts = Counter(
            dict_entities.get_entity_id(entity)
            for entity, _alias in entity_alias_pairs)
        if entity_counts:
            alias_counts = Counter(
                dict_entities.get_alias_id(alias)
                for _entity, alias in entity_alias_pairs)

            return ParseResults({
                GeoEntityUsage: [
                    GeoEntityUsage(text_unit_id=text_unit_id,
                                   entity_id=entity_id,
                                   count=count)
                    for entity_id, count in entity_counts.items()
                ],
                GeoAliasUsage: [
                    GeoAliasUsage(text_unit_id=text_unit_id,
                                  alias_id=alias_id,
                                  count=count)
                    # keep the original filter: skip falsy alias ids
                    for alias_id, count in alias_counts.items() if alias_id
                ]
            })
    def test_multiline_address(self):
        """A multi-line US address block should resolve to a single geo entity.

        NOTE(review): the trailing '# how come?' suggests the expected count
        of 1 was observed rather than designed — verify whether 'FL'/'Weston'
        should each match.
        """
        text = """
        Sincerely,
        DUKE REALTY CORPORATION
        Ana M. Hernandez
        Property Administrator
        
        2400 North Commerce  Parkway
        Suite 405
        Weston, FL 33326
        Main: 954-453-5660
        P: 954-453-5265
        F: 954.453.5695 
        [email protected] 
        www.dukerealty.com
        
        
        Cc: File
        LEASE
        """

        ds = list(get_geoentities(text, GEO_CONFIG))
        self.assertEqual(1, len(ds))  # how come?
def test_geoentities_counting():
    """Repeated 'And'/'AND' tokens in the sample yield exactly 3 matches."""
    sample_text = 'And AND AND AND And'
    found = list(get_geoentities(sample_text, geo_config_list=_CONFIG))
    assert len(found) == 3