def test_no_locations(self):
    '''
    Test that the extractor works with an empty list of locations.
    '''
    extractor = geoextract.NameExtractor()
    pipeline = geoextract.Pipeline([], extractors=[extractor])
    assert pipeline.extract('foobar') == []
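# For contrast, a minimal sketch of the non-empty case. The location name
# 'hauptstrasse' and the document text are made-up fixtures, and the sketch
# assumes locations are dicts with at least a 'name' key; the assertion only
# relies on ``Pipeline.extract`` returning a list of location dicts, so treat
# this as an illustration of the API shape rather than an exact expected result.
def test_single_location(self):
    '''
    Sketch: a pipeline with one known location name finds it in free text.
    '''
    extractor = geoextract.NameExtractor()
    locations = [{'name': 'hauptstrasse'}]
    pipeline = geoextract.Pipeline(locations, extractors=[extractor])
    results = pipeline.extract('treffen in der hauptstrasse')
    assert results  # at least one location dict should be returned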
def __init__(self, locations, subs=None, stem="german"):
    if subs is None:
        if stem == "german":
            subs = [(r"str\b", "strasse")]
        else:
            subs = []
    normalizer = geoextract.BasicNormalizer(subs=subs, stem=stem)
    name_extractor = geoextract.NameExtractor()
    address_pattern = re.compile(
        r"""
        (?P<street>[^\W\d_](?:[^\W\d_]|\s)*[^\W\d_])
        \s+
        (?P<house_number>([1-9]\d*)[\w-]*)
        (
            \s+
            (
                (?P<postcode>\d{5})
                \s+
            )?
            (?P<city>([^\W\d_]|-)+)
        )?
        """,
        flags=re.UNICODE | re.VERBOSE,
    )
    pattern_extractor = geoextract.PatternExtractor([address_pattern])
    extractors = [pattern_extractor, name_extractor]
    keys_to_keep = ["name", "street", "house_number", "postcode", "city"]
    postprocessors = [geoextract.KeyFilterPostprocessor(keys_to_keep)]
    super().__init__(
        locations,
        extractors=extractors,
        normalizer=normalizer,
        postprocessors=postprocessors,
    )
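# A minimal usage sketch for the ``__init__`` above, assuming it belongs to a
# ``geoextract.Pipeline`` subclass (here hypothetically named
# ``AddressPipeline``); the location fixture and document text are made up.
class AddressPipeline(geoextract.Pipeline):
    # ... __init__ as defined above ...
    pass


pipeline = AddressPipeline([{'name': 'rathaus'}])
found = pipeline.extract('Sitzung im Rathaus, Hauptstrasse 5, 76133 Karlsruhe')
# ``found`` is a list of dicts limited to the keys kept by the
# KeyFilterPostprocessor: name, street, house_number, postcode, city.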
def setup(self):  # noqa: D102
    self.ex = geoextract.NameExtractor()
    self.names = ['foo', 'foobar', 'a space', 'öüä']
    pipeline = mock.Mock(normalized_names=self.names)
    self.ex.setup(pipeline)
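# A companion sketch exercising the extractor configured in ``setup``. It
# assumes geoextract's extractor interface, in which ``extract(document)``
# yields ``(start, length, data)`` tuples for every match; the test text is a
# made-up fixture.
def test_finds_known_names(self):  # noqa: D102
    matches = list(self.ex.extract('foo and foobar, with a space'))
    names = [data['name'] for start, length, data in matches]
    assert 'foobar' in names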
def extract_found_locations(text, bodies=None):
    """
    :type text: str
    :type bodies: list of Body
    :return: list
    """
    search_for = create_geoextract_data(bodies)

    #
    # STRING NORMALIZATION
    #
    # Strings must be normalized before searching and matching them. This
    # includes technical normalization (e.g. Unicode normalization),
    # linguistic normalization (e.g. stemming) and content normalization
    # (e.g. synonym handling).

    normalizer = geoextract.BasicNormalizer(subs=[(r'str\b', 'strasse')],
                                            stem='german')

    #
    # NAMES
    #
    # Many places can be referred to using just their name, for example
    # specific buildings (e.g. the Brandenburger Tor), streets (Hauptstraße)
    # or other points of interest. These can be extracted using the
    # ``NameExtractor``.
    #
    # Note that the extractor will automatically receive the (normalized)
    # location names from the pipeline we construct later, so there's no need
    # to explicitly pass them to the constructor.

    name_extractor = geoextract.NameExtractor()

    #
    # PATTERNS
    #
    # For locations that are notated using a semi-structured format
    # (addresses) the ``PatternExtractor`` is a good choice. It looks for
    # matches of regular expressions.
    #
    # The patterns should have named groups; their sub-matches will be
    # returned in the extracted locations.

    address_pattern = re.compile(r'''
        (?P<street>[^\W\d_](?:[^\W\d_]|\s)*[^\W\d_])
        \s+
        (?P<house_number>([1-9]\d*)[\w-]*)
        (
            \s+
            (
                (?P<postcode>\d{5})
                \s+
            )?
            (?P<city>([^\W\d_]|-)+)
        )?
    ''', flags=re.UNICODE | re.VERBOSE)

    pattern_extractor = geoextract.PatternExtractor([address_pattern])

    #
    # POSTPROCESSING
    #
    # Once locations are extracted you might want to postprocess them, for
    # example to remove certain attributes that are useful for validation but
    # not intended for publication. Or you may want to remove a certain
    # address that's printed in the footer of all the documents you're
    # processing.
    #
    # GeoExtract allows you to do this by using one or more postprocessors.
    # In this example we will remove all but a few keys from our location
    # dicts.

    keys_to_keep = ['name', 'street', 'house_number', 'postcode', 'city']
    key_filter_postprocessor = geoextract.KeyFilterPostprocessor(keys_to_keep)

    #
    # PIPELINE CONSTRUCTION
    #
    # A pipeline connects all the different components.
    #
    # Here we're using custom extractors and a custom normalizer. We could
    # also provide our own code for splitting a document into chunks and for
    # validation, but for simplicity we'll use the default implementations in
    # these cases.

    pipeline = geoextract.Pipeline(
        search_for,
        extractors=[pattern_extractor, name_extractor],
        normalizer=normalizer,
        postprocessors=[key_filter_postprocessor],
    )

    return pipeline.extract(text)
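# A hypothetical call sketch: ``bodies`` defaults to ``None``, so this assumes
# ``create_geoextract_data`` can build the location list without an explicit
# list of ``Body`` instances. The document text is a made-up fixture.
found = extract_found_locations(
    'Die Sitzung findet im Rathaus, Hauptstrasse 5, 76133 Karlsruhe statt.'
)
for location in found:
    print(location)  # e.g. {'street': 'hauptstrasse', 'house_number': '5', ...}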