Exemplo n.º 1
0
def _coordinate_to_float(value, dms_regex, direction_regex):
    """Parse one cleaned-up coordinate string into a signed float.

    Three notations are tried in order:

    1. degrees/minutes/seconds matched by ``dms_regex``
    2. decimal degrees followed by a hemisphere letter, matched by
       ``direction_regex``
    3. anything else: leading and trailing junk is stripped with
       ``beginning_re``/``end_re`` and the remainder is handed to ``float``

    Raises ValueError (from ``float``) when the text is not numeric.
    """
    dms = dms_regex.match(value)
    if dms:
        degrees, minutes, seconds, direction = dms.groups()
        magnitude = degrees_to_decimal(degrees or 0, minutes or 0, seconds or 0)
        # The hemisphere letter must flip the sign here exactly as it does
        # for the decimal-with-direction notation below; previously the sign
        # was computed but dropped, so S/W DMS values came back positive.
        return float(magnitude) * direction_sign(direction)

    with_direction = direction_regex.match(value)
    if with_direction:
        degrees, direction = with_direction.groups()
        return float(degrees) * direction_sign(direction)

    # Fallback: keep only the numeric core of the string.
    value = beginning_re.sub(u'', value)
    value = end_re.sub(u'', value)
    return float(value)


def latlon_to_decimal(latitude, longitude):
    """Convert a latitude/longitude pair to decimal degrees.

    Each argument is passed through ``safe_decode``, stripped of surrounding
    `` ,;|`` characters, and has decimal commas replaced by dots before
    being parsed (see ``_coordinate_to_float``).

    Returns a ``(latitude, longitude)`` tuple of the values produced by
    ``to_valid_latitude``/``to_valid_longitude``, or ``(None, None)`` when
    either input is None.

    Raises ValueError when a value cannot be parsed as a number or fails
    ``is_valid_latitude``/``is_valid_longitude``.
    """
    if latitude is None or longitude is None:
        return None, None

    latitude = safe_decode(latitude).strip(u' ,;|')
    longitude = safe_decode(longitude).strip(u' ,;|')

    # Accept a decimal comma ("40,74") as well as a decimal point.
    latitude = latitude.replace(u',', u'.')
    longitude = longitude.replace(u',', u'.')

    latitude = _coordinate_to_float(
        latitude, latitude_dms_regex, latitude_decimal_with_direction_regex)
    longitude = _coordinate_to_float(
        longitude, longitude_dms_regex, longitude_decimal_with_direction_regex)

    if not is_valid_latitude(latitude):
        raise ValueError('Invalid latitude: {}'.format(latitude))

    if not is_valid_longitude(longitude):
        raise ValueError('Invalid longitude: {}'.format(longitude))

    latitude = to_valid_latitude(latitude)
    longitude = to_valid_longitude(longitude)

    return latitude, longitude
Exemplo n.º 2
0
 def write(self, f):
     """Dump the IDF counts to ``f`` as tab-separated rows.

     One row is written per entry of ``self.idf_counts`` with the key
     prefixed by ``self.term_key_prefix``, followed by a final row holding
     ``self.doc_count_key`` and ``self.N``.
     """
     out = csv.writer(f, delimiter='\t')
     emit = out.writerow
     for term, count in six.iteritems(self.idf_counts):
         prefixed = '{}{}'.format(self.term_key_prefix, safe_decode(term))
         emit([safe_decode(prefixed), safe_decode(count)])
     # The document count goes last, under its own reserved key.
     emit([self.doc_count_key, safe_decode(self.N)])
Exemplo n.º 3
0
    def near_dupe_hashes(cls,
                         address,
                         geohash_precision=DEFAULT_GEOHASH_PRECISION,
                         use_latlon=True,
                         use_city=False,
                         use_postal_code=False):
        """Yield near-duplicate blocking keys for ``address``.

        Every key has the form ``<prefix>|<lead>|<expansion>...`` where the
        trailing parts come from the cartesian product of the values returned
        by ``cls.component_expansions(address)`` and ``<lead>`` is one of:

        * a geohash cell or one of its neighbours (``use_latlon``),
        * an expansion of the postal code (``use_postal_code``),
        * an expansion of the city name (``use_city``).

        Nothing is yielded when ``any(address_expansions)`` is false.
        """
        address_expansions = cls.component_expansions(address)

        lat = address.get(Coordinates.LATITUDE)
        lon = address.get(Coordinates.LONGITUDE)
        raw_postcode = address.get(AddressComponents.POSTAL_CODE, u'')
        postcode = safe_decode(raw_postcode).strip()
        raw_city = address.get(AddressComponents.CITY, u'')
        city = safe_decode(raw_city).strip()

        if not any(address_expansions):
            return

        def prefixed_keys(prefix, leading_values):
            # Combine each leading value with every combination of the
            # address component expansions.
            for parts in six.itertools.product(leading_values,
                                               *address_expansions):
                yield u'{}|{}'.format(prefix, u'|'.join(parts))

        if lat and lon and use_latlon:
            at_origin = isclose(lat, 0.0) and isclose(lon, 0.0)
            # Skip (0, 0) and latitudes at or beyond +/-90.
            if not (at_origin or lat >= 90.0 or lat <= -90.0):
                geo = geohash.encode(lat, lon)[:geohash_precision]
                cells = [geo] + geohash.neighbors(geo)
                for key in prefixed_keys(cls.GEOHASH_KEY_PREFIX, cells):
                    yield key

        if postcode and use_postal_code:
            postcode_variants = expand_address(
                postcode, address_components=ADDRESS_POSTAL_CODE)
            for key in prefixed_keys(cls.POSTCODE_KEY_PREFIX,
                                     postcode_variants):
                yield key

        if city and use_city:
            city_variants = expand_address(
                city, address_components=ADDRESS_TOPONYM)
            for key in prefixed_keys(cls.CITY_KEY_PREFIX, city_variants):
                yield key
Exemplo n.º 4
0
    def read(cls, f):
        """Load a tab-separated ``key<TAB>value`` file into a dict.

        Each row must have exactly two columns; the value is converted with
        ``float`` and the key is passed through ``safe_decode``. Returns the
        resulting ``{key: float}`` mapping (later rows overwrite earlier ones
        with the same key).
        """
        gains = {}
        for row in csv.reader(f, delimiter='\t'):
            term, score = row
            score = float(score)
            gains[safe_decode(term)] = score
        return gains
Exemplo n.º 5
0
    def read(self, f):
        """Accumulate counts from a tab-separated file into this object.

        Rows whose key starts with ``self.term_key_prefix`` add their integer
        value to ``self.idf_counts`` under the un-prefixed key; a row whose
        key equals ``self.doc_count_key`` adds its value to ``self.N``. Any
        other row is ignored.
        """
        rows = csv.reader(f, delimiter='\t')
        prefix = self.term_key_prefix
        skip = len(prefix)
        for row in rows:
            raw_key, raw_count = row
            count = int(raw_count)
            name = safe_decode(raw_key)
            if name.startswith(prefix):
                self.idf_counts[name[skip:]] += count
                continue
            if name == self.doc_count_key:
                self.N += count
Exemplo n.º 6
0
    def component_equals(cls, c1, c2, component, no_whitespace=True):
        """Tell whether two address component strings are equivalent.

        Returns False if either value is falsy. Otherwise the values are
        equal when, with ``no_whitespace`` set, their lower-cased forms match
        once whitespace is removed, or when the ``expand_address`` results
        for the two values (for ``component``) share at least one expansion.
        With ``no_whitespace`` set, expansions are compared with whitespace
        removed as well.
        """
        if not (c1 and c2):
            return False

        first = safe_decode(c1)
        second = safe_decode(c2)

        def squeeze(text):
            return whitespace_regex.sub(u'', text)

        if no_whitespace and squeeze(first.lower()) == squeeze(second.lower()):
            return True

        variants1 = expand_address(first, address_components=component)
        variants2 = expand_address(second, address_components=component)

        if no_whitespace:
            left = {squeeze(v) for v in variants1}
            right = {squeeze(v) for v in variants2}
        else:
            left = set(variants1)
            right = set(variants2)

        return len(left & right) > 0
Exemplo n.º 7
0
    def from_geojson(cls, data):
        """Build a field dict from a GeoJSON feature-like mapping.

        Property values whose key is in ``cls.field_map.aliases`` are passed
        through ``safe_decode``; the properties are then run through
        ``cls.field_map.replace``. If the geometry carries coordinates that
        ``latlon_to_decimal`` accepts, they are stored under
        ``Coordinates.LATITUDE`` / ``Coordinates.LONGITUDE``.

        A missing or null ``properties`` / ``geometry`` member is treated as
        empty rather than raising, since GeoJSON allows both to be null.
        """
        # `or {}` also covers an explicit JSON null, not just a missing key.
        properties = data.get('properties') or {}
        properties = {k: safe_decode(v) if k in cls.field_map.aliases else v for k, v in six.iteritems(properties)}
        fields = cls.field_map.replace(properties)

        geometry = data.get('geometry') or {}
        # GeoJSON positions are ordered (longitude, latitude).
        lon, lat = geometry.get('coordinates') or (None, None)
        try:
            lat, lon = latlon_to_decimal(lat, lon)
        except ValueError:
            # Unparseable or out-of-range coordinates are dropped.
            lat = lon = None

        if lat is not None:
            fields[Coordinates.LATITUDE] = lat
        if lon is not None:
            fields[Coordinates.LONGITUDE] = lon

        return fields
Exemplo n.º 8
0
    >>> latlon_to_decimal('40°42′46″N', '74°00′21″W') # returns (40.71277777777778, 74.00583333333333)
    >>> latlon_to_decimal('40,74 N', '74,001 W') # returns (40.74, -74.001)
    >>> to_valid_longitude(360.0)
    >>> is_valid_latitude(90.0)
'''

import math
import re

from lieu.encoding import safe_decode
from lieu.floats import isclose

# Leading run of characters that cannot start a number (anything except a
# digit or a minus sign). Raw string: `\-` is not a valid Python string
# escape and triggers a SyntaxWarning on recent interpreters otherwise.
beginning_re = re.compile(r'^[^0-9\-]+', re.UNICODE)
# Trailing run of non-digit characters.
end_re = re.compile(r'[^0-9]+$', re.UNICODE)

# Degrees/minutes/seconds notation. Groups: (degrees, minutes, seconds,
# hemisphere letter); minutes, seconds and the hemisphere letter are optional,
# a separator after the degrees (space, ':', '°', 'º' or 'd') is required.
latitude_dms_regex = re.compile(safe_decode(r'^(-?[0-9]{1,2})[ ]*[ :°ºd][ ]*([0-5]?[0-9])?[ ]*[:\'\u2032m]?[ ]*([0-5]?[0-9](?:\.\d+)?)?[ ]*[:\?\"\u2033s]?[ ]*(N|n|S|s)?$'), re.I | re.UNICODE)
# NOTE(review): in the degrees group the alternation is not wrapped, so the
# optional minus sign only applies to the `1[0-8][0-9]` branch; a negative
# one- or two-digit degree value does not match. Confirm whether intended.
longitude_dms_regex = re.compile(safe_decode(r'^(-?1[0-8][0-9]|0?[0-9]{1,2})[ ]*[ :°ºd][ ]*([0-5]?[0-9])?[ ]*[:\'\u2032m]?[ ]*([0-5]?[0-9](?:\.\d+)?)?[ ]*[:\?\"\u2033s]?[ ]*(E|e|W|w)?$'), re.I | re.UNICODE)

# Decimal degrees followed by a mandatory hemisphere letter, e.g. "40.74 N".
# Groups: (signed decimal degrees, hemisphere letter). One- or two-digit
# degrees are accepted and the fractional part is optional.
latitude_decimal_with_direction_regex = re.compile(r'^(-?[0-9]{1,2}(?:\.[0-9]+)?)[ ]*[ :°ºd]?[ ]*(N|n|S|s)$', re.I)
# The degree alternatives are grouped so that the optional minus sign and the
# fractional part apply to both of them; ungrouped, "120.5 W" never matched
# and its hemisphere was silently dropped by the fallback parsing path.
longitude_decimal_with_direction_regex = re.compile(r'^(-?(?:1[0-8][0-9]|0?[0-9]{1,2})(?:\.[0-9]+)?)[ ]*[ :°ºd]?[ ]*(E|e|W|w)$', re.I)

# Multiplier for each (lower-cased) hemisphere letter.
direction_sign_map = {'n': 1, 's': -1, 'e': 1, 'w': -1}


def direction_sign(d):
    if d is None:
        return 1
    d = d.lower().strip()
    if d in direction_sign_map:
        return direction_sign_map[d]
    else:
Exemplo n.º 9
0
    def address_dupe_status(cls, a1, a2, languages=None, fuzzy_street_name=False):
        """Classify two address records by street and house number.

        ``a1`` and ``a2`` are mappings keyed by ``AddressComponents``
        constants. Streets are compared with
        ``StreetDeduper.street_dupe_status`` and house numbers with
        ``is_house_number_duplicate``; when the full house numbers differ,
        the base house numbers (if either record has one) are compared as a
        fallback and a match there is downgraded to ``LIKELY_DUPLICATE``
        with similarity 0.9.

        Returns:
            ``NULL_DUPE`` when a component is present on only one side or
            neither record has a street or house number; a ``Dupe`` with
            ``NON_DUPLICATE`` when a compared component differs; otherwise a
            ``Dupe`` carrying the weaker of the street / house number
            results (or the only one available).
        """
        a1_street = a1.get(AddressComponents.STREET)
        a2_street = a2.get(AddressComponents.STREET)

        if a1_street:
            a1_street = a1_street.strip()
        if a2_street:
            a2_street = a2_street.strip()

        a1_house_number = a1.get(AddressComponents.HOUSE_NUMBER)
        a2_house_number = a2.get(AddressComponents.HOUSE_NUMBER)

        if a1_house_number:
            a1_house_number = safe_decode(a1_house_number).strip()
        if a2_house_number:
            a2_house_number = safe_decode(a2_house_number).strip()

        a1_base_house_number = a1.get(AddressComponents.HOUSE_NUMBER_BASE)
        a2_base_house_number = a2.get(AddressComponents.HOUSE_NUMBER_BASE)

        if a1_base_house_number:
            a1_base_house_number = safe_decode(a1_base_house_number).strip()
        if a2_base_house_number:
            a2_base_house_number = safe_decode(a2_base_house_number).strip()

        # A component present on only one side cannot be compared.
        if (a1_street and not a2_street) or (a2_street and not a1_street):
            return NULL_DUPE

        if (a1_house_number and not a2_house_number) or (a2_house_number and not a1_house_number):
            return NULL_DUPE

        have_street = a1_street and a2_street
        same_street = False
        street_status = duplicate_status.NON_DUPLICATE

        street_sim = 0.0
        if have_street:
            street_dupe_status = StreetDeduper.street_dupe_status(a1_street, a2_street, languages=languages, fuzzy=fuzzy_street_name)
            same_street = street_dupe_status.status in (duplicate_status.EXACT_DUPLICATE, duplicate_status.LIKELY_DUPLICATE)

            if not same_street:
                return Dupe(status=duplicate_status.NON_DUPLICATE, sim=street_sim)

            # Record the street result so the street-only case below reports
            # it; these were previously left at NON_DUPLICATE / 0.0, which
            # made every street-only match come back as a non-duplicate.
            street_status = street_dupe_status.status
            street_sim = street_dupe_status.sim

        have_house_number = a1_house_number and a2_house_number
        have_base_house_number = a1_base_house_number or a2_base_house_number
        same_house_number = False
        house_number_status = duplicate_status.NON_DUPLICATE
        house_number_sim = 0.0

        if have_house_number:
            house_number_status = is_house_number_duplicate(a1_house_number, a2_house_number, languages=languages)
            same_house_number = house_number_status == duplicate_status.EXACT_DUPLICATE
            if same_house_number:
                house_number_sim = 1.0

            if have_base_house_number and not same_house_number:
                # Fall back to the base house number, using the full house
                # number for whichever side has no base value.
                a1h = a1_base_house_number or a1_house_number
                a2h = a2_base_house_number or a2_house_number

                base_house_number_status = is_house_number_duplicate(a1h, a2h, languages=languages)
                same_house_number = base_house_number_status == duplicate_status.EXACT_DUPLICATE
                if same_house_number:
                    house_number_status = duplicate_status.LIKELY_DUPLICATE
                    house_number_sim = 0.9

            if not same_house_number:
                return Dupe(status=duplicate_status.NON_DUPLICATE, sim=house_number_sim)

        if not have_house_number and not have_street:
            return NULL_DUPE

        if have_street and same_street and have_house_number and same_house_number:
            # Report the weaker of the two component results.
            min_status, min_sim = min((street_status, street_sim), (house_number_status, house_number_sim))
            return Dupe(status=min_status, sim=min_sim)
        elif have_house_number and same_house_number and not have_street:
            return Dupe(status=house_number_status, sim=house_number_sim)
        elif have_street and same_street and not have_house_number:
            return Dupe(status=street_status, sim=street_sim)

        return NULL_DUPE
Exemplo n.º 10
0
    >>> to_valid_longitude(360.0)
    >>> is_valid_latitude(90.0)
'''

import math
import re

from lieu.encoding import safe_decode
from lieu.floats import isclose

# Leading run of characters that cannot start a number (anything except a
# digit or a minus sign). Raw string: `\-` is not a valid Python string
# escape and triggers a SyntaxWarning on recent interpreters otherwise.
beginning_re = re.compile(r'^[^0-9\-]+', re.UNICODE)
# Trailing run of non-digit characters.
end_re = re.compile(r'[^0-9]+$', re.UNICODE)

# Degrees/minutes/seconds notation. Groups: (degrees, minutes, seconds,
# hemisphere letter); minutes, seconds and the hemisphere letter are optional,
# a separator after the degrees (space, ':', '°', 'º' or 'd') is required.
latitude_dms_regex = re.compile(
    safe_decode(
        r'^(-?[0-9]{1,2})[ ]*[ :°ºd][ ]*([0-5]?[0-9])?[ ]*[:\'\u2032m]?[ ]*([0-5]?[0-9](?:\.\d+)?)?[ ]*[:\?\"\u2033s]?[ ]*(N|n|S|s)?$'
    ), re.I | re.UNICODE)
# NOTE(review): in the degrees group the alternation is not wrapped, so the
# optional minus sign only applies to the `1[0-8][0-9]` branch; a negative
# one- or two-digit degree value does not match. Confirm whether intended.
longitude_dms_regex = re.compile(
    safe_decode(
        r'^(-?1[0-8][0-9]|0?[0-9]{1,2})[ ]*[ :°ºd][ ]*([0-5]?[0-9])?[ ]*[:\'\u2032m]?[ ]*([0-5]?[0-9](?:\.\d+)?)?[ ]*[:\?\"\u2033s]?[ ]*(E|e|W|w)?$'
    ), re.I | re.UNICODE)

# Decimal degrees followed by a mandatory hemisphere letter, e.g. "40.74 N".
# Groups: (signed decimal degrees, hemisphere letter). One- or two-digit
# degrees are accepted and the fractional part is optional.
latitude_decimal_with_direction_regex = re.compile(
    r'^(-?[0-9]{1,2}(?:\.[0-9]+)?)[ ]*[ :°ºd]?[ ]*(N|n|S|s)$', re.I)
# The degree alternatives are grouped so that the optional minus sign and the
# fractional part apply to both of them; ungrouped, "120.5 W" never matched
# and its hemisphere was silently dropped by the fallback parsing path.
longitude_decimal_with_direction_regex = re.compile(
    r'^(-?(?:1[0-8][0-9]|0?[0-9]{1,2})(?:\.[0-9]+)?)[ ]*[ :°ºd]?[ ]*(E|e|W|w)$',
    re.I)

# Multiplier for each (lower-cased) hemisphere letter.
direction_sign_map = {'n': 1, 's': -1, 'e': 1, 'w': -1}

Exemplo n.º 11
0
 def write(self, f):
     """Dump ``self.info_gain`` to ``f`` as tab-separated key/value rows.

     Both the key and the value of every entry are passed through
     ``safe_decode`` before being written.
     """
     out = csv.writer(f, delimiter='\t')
     for term, gain in six.iteritems(self.info_gain):
         row = [safe_decode(term), safe_decode(gain)]
         out.writerow(row)