예제 #1
0
 def test_no_locations(self):
     '''
     Check that name extraction copes with a pipeline that has no locations.
     '''
     # The pipeline is built with an empty location list and a single
     # ``NameExtractor``; extracting from any text must then yield nothing.
     name_extractor = geoextract.NameExtractor()
     empty_pipeline = geoextract.Pipeline([], extractors=[name_extractor])
     found = empty_pipeline.extract('foobar')
     assert found == []
예제 #2
0
    def __init__(self, locations, subs=None, stem="german"):
        """
        Set up the parent pipeline with name- and address-based extraction.

        :param locations: forwarded unchanged to the parent constructor.
        :param subs: substitution rules given to
            ``geoextract.BasicNormalizer``. When ``None``, a single rule
            rewriting ``str`` (at a word boundary) to ``strasse`` is used
            for the ``"german"`` stemmer and no rules otherwise.
        :param stem: stemmer setting given to ``geoextract.BasicNormalizer``.
        """
        if subs is None:
            # Only the German setup gets the abbreviation rule by default.
            subs = [(r"str\b", "strasse")] if stem == "german" else []
        text_normalizer = geoextract.BasicNormalizer(subs=subs, stem=stem)

        by_name = geoextract.NameExtractor()

        # Verbose regex: street, house number, then an optional trailing
        # part made of an optional 5-digit postcode and a city.
        address_regex_source = r"""
            (?P<street>[^\W\d_](?:[^\W\d_]|\s)*[^\W\d_])
            \s+
            (?P<house_number>([1-9]\d*)[\w-]*)
            (
                \s+
                (
                    (?P<postcode>\d{5})
                    \s+
                )?
                (?P<city>([^\W\d_]|-)+)
            )?
        """
        address_regex = re.compile(
            address_regex_source, flags=re.VERBOSE | re.UNICODE
        )
        by_pattern = geoextract.PatternExtractor([address_regex])

        # Restrict the resulting location dicts to these keys.
        key_filter = geoextract.KeyFilterPostprocessor(
            ["name", "street", "house_number", "postcode", "city"]
        )

        super().__init__(
            locations,
            extractors=[by_pattern, by_name],
            normalizer=text_normalizer,
            postprocessors=[key_filter],
        )
    def __init__(self, locations, subs=None, stem='german'):
        '''
        Configure the parent pipeline for name and address extraction.

        ``locations`` is handed to the parent constructor as-is. ``subs``
        and ``stem`` are handed to ``geoextract.BasicNormalizer``; if
        ``subs`` is ``None`` it defaults to one ``str`` -> ``strasse``
        rule when ``stem`` is ``'german'`` and to an empty list otherwise.
        '''
        default_subs_needed = subs is None
        if default_subs_needed and stem == 'german':
            subs = [(r'str\b', 'strasse')]
        elif default_subs_needed:
            subs = []
        normalizer = geoextract.BasicNormalizer(stem=stem, subs=subs)

        names = geoextract.NameExtractor()

        # Address layout matched below (verbose mode, whitespace ignored):
        # street, house number, optionally followed by [postcode] city.
        pattern_text = r'''
            (?P<street>[^\W\d_](?:[^\W\d_]|\s)*[^\W\d_])
            \s+
            (?P<house_number>([1-9]\d*)[\w-]*)
            (
                \s+
                (
                    (?P<postcode>\d{5})
                    \s+
                )?
                (?P<city>([^\W\d_]|-)+)
            )?
        '''
        regex_flags = re.UNICODE | re.VERBOSE
        addresses = geoextract.PatternExtractor(
            [re.compile(pattern_text, flags=regex_flags)]
        )

        # Only these keys are kept in the resulting location dicts.
        wanted_keys = ['name', 'street', 'house_number', 'postcode', 'city']
        drop_other_keys = geoextract.KeyFilterPostprocessor(wanted_keys)

        super().__init__(locations,
                         postprocessors=[drop_other_keys],
                         normalizer=normalizer,
                         extractors=[addresses, names])
예제 #4
0
                                        stem='german')


#
# NAMES
#

# Many places can be referred to using just their name, for example specific
# buildings (e.g. the Brandenburger Tor), streets (Hauptstraße) or other
# points of interest. These can be extracted using the ``NameExtractor``.
#
# Note that the extractor will automatically receive the (normalized) location
# names from the pipeline we construct later, so there's no need to explicitly
# pass them to the constructor.

name_extractor = geoextract.NameExtractor()


#
# PATTERNS
#

# For locations that are notated using a semi-structured format (addresses)
# the ``PatternExtractor`` is a good choice. It looks for matches of regular
# expressions.
#
# The patterns should have named groups, their sub-matches will be
# returned in the extracted locations.

address_pattern = re.compile(r'''
    (?P<street>[^\W\d_](?:[^\W\d_]|\s)*[^\W\d_])
예제 #5
0
 def setup(self):
     """Create a ``NameExtractor`` set up with a mocked pipeline."""
     self.ex = geoextract.NameExtractor()
     self.names = ['foo', 'foobar', 'a space', 'öüä']
     # The extractor is given a stand-in pipeline whose ``normalized_names``
     # attribute is the fixed list above.
     fake_pipeline = mock.Mock(normalized_names=self.names)
     self.ex.setup(fake_pipeline)
예제 #6
0
def extract_found_locations(text, bodies=None):
    """
    Run a GeoExtract pipeline over ``text`` and return what it extracts.

    The locations to search for come from ``create_geoextract_data(bodies)``.
    The pipeline combines a pattern-based address extractor, a name
    extractor, a German-stemming normalizer and a key-filter postprocessor;
    chunk splitting and validation are left at the pipeline defaults.

    :type text: str
    :type bodies: list of Body
    :return: list
    """
    known_locations = create_geoextract_data(bodies)

    # --- Normalization -----------------------------------------------------
    # Strings are normalized before they are searched and matched: here via
    # German stemming plus one substitution that expands the abbreviation
    # "str" (at a word boundary) to "strasse".
    street_abbreviation = (r'str\b', 'strasse')
    text_normalizer = geoextract.BasicNormalizer(
        subs=[street_abbreviation],
        stem='german',
    )

    # --- Extraction by name ------------------------------------------------
    # Places referred to by name alone (buildings, streets, points of
    # interest) are handled by the ``NameExtractor``. It is created without
    # arguments; the names are not passed to its constructor.
    by_name = geoextract.NameExtractor()

    # --- Extraction by pattern ---------------------------------------------
    # Semi-structured notations such as addresses are matched with a regular
    # expression. The named groups (street, house_number, postcode, city)
    # identify the parts of each match.
    address_regex_source = r'''
        (?P<street>[^\W\d_](?:[^\W\d_]|\s)*[^\W\d_])
        \s+
        (?P<house_number>([1-9]\d*)[\w-]*)
        (
            \s+
            (
                (?P<postcode>\d{5})
                \s+
            )?
            (?P<city>([^\W\d_]|-)+)
        )?
    '''
    address_regex = re.compile(
        address_regex_source,
        flags=re.VERBOSE | re.UNICODE,
    )
    by_pattern = geoextract.PatternExtractor([address_regex])

    # --- Postprocessing ----------------------------------------------------
    # Extracted location dicts are reduced to the keys listed here.
    published_keys = ['name', 'street', 'house_number', 'postcode', 'city']
    keep_published_keys = geoextract.KeyFilterPostprocessor(published_keys)

    # --- Pipeline ----------------------------------------------------------
    # The pipeline ties the components together; the pattern extractor is
    # listed before the name extractor.
    components = {
        'extractors': [by_pattern, by_name],
        'normalizer': text_normalizer,
        'postprocessors': [keep_published_keys],
    }
    location_pipeline = geoextract.Pipeline(known_locations, **components)

    return location_pipeline.extract(text)