def latlon_to_decimal(latitude, longitude):
    '''Parse latitude/longitude strings into signed decimal degrees.

    Accepts plain decimal strings, decimals with a cardinal direction
    (e.g. u'40.74 N'), and degree/minute/second notation
    (e.g. u'40°42′46″N').  Commas are treated as decimal separators.

    :param latitude: latitude as a string (or None)
    :param longitude: longitude as a string (or None)
    :return: (latitude, longitude) as floats, or (None, None) if either
             input is None
    :raises ValueError: if a value cannot be parsed as a number or fails
                        the is_valid_latitude/is_valid_longitude checks
    '''
    if latitude is None or longitude is None:
        return None, None

    latitude = safe_decode(latitude).strip(u' ,;|')
    longitude = safe_decode(longitude).strip(u' ,;|')

    # Some locales use a comma as the decimal separator
    latitude = latitude.replace(u',', u'.')
    longitude = longitude.replace(u',', u'.')

    lat_dms = latitude_dms_regex.match(latitude)
    lat_dir = latitude_decimal_with_direction_regex.match(latitude)

    if lat_dms:
        d, m, s, c = lat_dms.groups()
        sign = direction_sign(c)
        # Bug fix: the hemisphere sign was computed but never applied in
        # the DMS branch, so e.g. u'40°42′46″S' parsed as +40.71...
        latitude = degrees_to_decimal(d or 0, m or 0, s or 0) * sign
    elif lat_dir:
        d, c = lat_dir.groups()
        sign = direction_sign(c)
        latitude = float(d) * sign
    else:
        # Fallback: strip any non-numeric prefix/suffix and hope the
        # remainder is a plain decimal (float() below raises otherwise)
        latitude = re.sub(beginning_re, u'', latitude)
        latitude = re.sub(end_re, u'', latitude)

    lon_dms = longitude_dms_regex.match(longitude)
    lon_dir = longitude_decimal_with_direction_regex.match(longitude)

    if lon_dms:
        d, m, s, c = lon_dms.groups()
        sign = direction_sign(c)
        # Same hemisphere-sign fix as for latitude (W/S are negative)
        longitude = degrees_to_decimal(d or 0, m or 0, s or 0) * sign
    elif lon_dir:
        d, c = lon_dir.groups()
        sign = direction_sign(c)
        longitude = float(d) * sign
    else:
        longitude = re.sub(beginning_re, u'', longitude)
        longitude = re.sub(end_re, u'', longitude)

    latitude = float(latitude)
    longitude = float(longitude)

    if not is_valid_latitude(latitude):
        raise ValueError('Invalid latitude: {}'.format(latitude))

    if not is_valid_longitude(longitude):
        raise ValueError('Invalid longitude: {}'.format(longitude))

    latitude = to_valid_latitude(latitude)
    longitude = to_valid_longitude(longitude)

    return latitude, longitude
def write(self, f):
    '''Serialize the IDF counts and document count to *f* as TSV.

    Each term is written as a (prefixed key, count) row, followed by a
    final (doc_count_key, N) row.
    '''
    out = csv.writer(f, delimiter='\t')
    for term, count in six.iteritems(self.idf_counts):
        prefixed_key = safe_decode('{}{}'.format(self.term_key_prefix, safe_decode(term)))
        out.writerow([prefixed_key, safe_decode(count)])
    out.writerow([self.doc_count_key, safe_decode(self.N)])
def near_dupe_hashes(cls, address, geohash_precision=DEFAULT_GEOHASH_PRECISION, use_latlon=True, use_city=False, use_postal_code=False):
    '''Yield near-dupe hash keys for an address dict.

    Keys combine the expanded address components with, optionally, a
    geohash (plus its neighbors, to avoid edge effects), the expanded
    postal code, and/or the expanded city name.  Yields nothing if the
    address has no usable component expansions.

    :param address: dict of address components
    :param geohash_precision: number of geohash characters to keep
    :param use_latlon: hash on geohash of lat/lon when available
    :param use_city: hash on city-name expansions
    :param use_postal_code: hash on postal-code expansions
    '''
    # Bug fix: this previously used six.itertools.product, which is not
    # part of six's public API (it only resolves because six happens to
    # import itertools internally); use the stdlib module directly.
    import itertools

    address_expansions = cls.component_expansions(address)

    lat = address.get(Coordinates.LATITUDE)
    lon = address.get(Coordinates.LONGITUDE)

    postcode = safe_decode(address.get(AddressComponents.POSTAL_CODE, u'')).strip()
    city = safe_decode(address.get(AddressComponents.CITY, u'')).strip()

    if not any(address_expansions):
        return

    # Skip null island (0, 0) and out-of-range latitudes
    if lat and lon and use_latlon and not ((isclose(lat, 0.0) and isclose(lon, 0.0)) or lat >= 90.0 or lat <= -90.0):
        geo = geohash.encode(lat, lon)[:geohash_precision]
        geohash_neighbors = [geo] + geohash.neighbors(geo)

        base_key = cls.GEOHASH_KEY_PREFIX
        for keys in itertools.product(geohash_neighbors, *address_expansions):
            yield u'{}|{}'.format(base_key, u'|'.join(keys))

    if postcode and use_postal_code:
        postcode_expansions = expand_address(postcode, address_components=ADDRESS_POSTAL_CODE)

        base_key = cls.POSTCODE_KEY_PREFIX
        for keys in itertools.product(postcode_expansions, *address_expansions):
            yield u'{}|{}'.format(base_key, u'|'.join(keys))

    if city and use_city:
        city_expansions = expand_address(city, address_components=ADDRESS_TOPONYM)

        base_key = cls.CITY_KEY_PREFIX
        for keys in itertools.product(city_expansions, *address_expansions):
            yield u'{}|{}'.format(base_key, u'|'.join(keys))
def read(cls, f):
    '''Load a tab-separated (term, gain) stream into a dict.

    :param f: file-like object of TSV rows
    :return: dict mapping decoded term -> float information gain
    '''
    rows = csv.reader(f, delimiter='\t')
    return {safe_decode(term): float(gain) for term, gain in rows}
def read(self, f):
    '''Accumulate IDF counts and the document count from a TSV stream.

    Rows whose key starts with self.term_key_prefix add to the per-term
    counts (prefix stripped); a row keyed self.doc_count_key adds to N.
    Other rows are ignored.
    '''
    prefix = self.term_key_prefix
    prefix_len = len(prefix)
    for raw_key, raw_val in csv.reader(f, delimiter='\t'):
        count = int(raw_val)
        key = safe_decode(raw_key)
        if key.startswith(prefix):
            self.idf_counts[key[prefix_len:]] += count
        elif key == self.doc_count_key:
            self.N += count
def component_equals(cls, c1, c2, component, no_whitespace=True):
    '''Return True if two address-component strings are equivalent.

    First tries a cheap case-insensitive comparison (whitespace-stripped
    when no_whitespace is True), then falls back to checking whether the
    libpostal expansions of the two strings intersect.

    :param c1: first component string
    :param c2: second component string
    :param component: address component constant passed to expand_address
    :param no_whitespace: compare with all whitespace removed
    '''
    if not c1 or not c2:
        return False

    c1 = safe_decode(c1)
    c2 = safe_decode(c2)

    def strip_ws(s):
        return whitespace_regex.sub(u'', s)

    if no_whitespace and strip_ws(c1.lower()) == strip_ws(c2.lower()):
        return True

    expansions1 = expand_address(c1, address_components=component)
    expansions2 = expand_address(c2, address_components=component)

    if no_whitespace:
        set_expansions1 = set(strip_ws(e) for e in expansions1)
        set_expansions2 = set(strip_ws(e) for e in expansions2)
    else:
        set_expansions1 = set(expansions1)
        set_expansions2 = set(expansions2)

    return bool(set_expansions1 & set_expansions2)
def from_geojson(cls, data):
    '''Build a field dict from a single GeoJSON feature.

    Aliased properties are decoded to text, names are remapped through
    cls.field_map, and the geometry's (lon, lat) coordinates are parsed
    into decimal degrees.  Unparseable coordinates are dropped rather
    than raising.

    :param data: GeoJSON feature as a dict
    :return: dict of normalized fields
    '''
    # Bug fix: a feature with "properties": null (or no properties key)
    # previously crashed in six.iteritems; treat it as empty.
    properties = data.get('properties') or {}
    properties = {k: safe_decode(v) if k in cls.field_map.aliases else v
                  for k, v in six.iteritems(properties)}

    fields = cls.field_map.replace(properties)

    # GeoJSON coordinate order is (longitude, latitude); also guard
    # against "geometry": null
    lon, lat = (data.get('geometry') or {}).get('coordinates', (None, None))
    try:
        lat, lon = latlon_to_decimal(lat, lon)
    except ValueError:
        lat = lon = None

    if lat is not None:
        fields[Coordinates.LATITUDE] = lat
    if lon is not None:
        fields[Coordinates.LONGITUDE] = lon

    return fields
>>> latlon_to_decimal('40°42′46″N', '74°00′21″W') # returns (40.71277777777778, 74.00583333333333) >>> latlon_to_decimal('40,74 N', '74,001 W') # returns (40.74, -74.001) >>> to_valid_longitude(360.0) >>> latitude_is_valid(90.0) ''' import math import re from lieu.encoding import safe_decode from lieu.floats import isclose beginning_re = re.compile('^[^0-9\-]+', re.UNICODE) end_re = re.compile('[^0-9]+$', re.UNICODE) latitude_dms_regex = re.compile(safe_decode(r'^(-?[0-9]{1,2})[ ]*[ :°ºd][ ]*([0-5]?[0-9])?[ ]*[:\'\u2032m]?[ ]*([0-5]?[0-9](?:\.\d+)?)?[ ]*[:\?\"\u2033s]?[ ]*(N|n|S|s)?$'), re.I | re.UNICODE) longitude_dms_regex = re.compile(safe_decode(r'^(-?1[0-8][0-9]|0?[0-9]{1,2})[ ]*[ :°ºd][ ]*([0-5]?[0-9])?[ ]*[:\'\u2032m]?[ ]*([0-5]?[0-9](?:\.\d+)?)?[ ]*[:\?\"\u2033s]?[ ]*(E|e|W|w)?$'), re.I | re.UNICODE) latitude_decimal_with_direction_regex = re.compile('^(-?[0-9][0-9](?:\.[0-9]+))[ ]*[ :°ºd]?[ ]*(N|n|S|s)$', re.I) longitude_decimal_with_direction_regex = re.compile('^(-?1[0-8][0-9]|0?[0-9][0-9](?:\.[0-9]+))[ ]*[ :°ºd]?[ ]*(E|e|W|w)$', re.I) direction_sign_map = {'n': 1, 's': -1, 'e': 1, 'w': -1} def direction_sign(d): if d is None: return 1 d = d.lower().strip() if d in direction_sign_map: return direction_sign_map[d] else:
def address_dupe_status(cls, a1, a2, languages=None, fuzzy_street_name=False):
    '''Compare two parsed addresses on street name and house number.

    Returns a Dupe(status, sim) namedtuple, or NULL_DUPE when the two
    addresses cannot be compared (one side is missing a component the
    other has, or neither has street nor house number).

    :param a1: first address dict
    :param a2: second address dict
    :param languages: optional list of language codes for expansion
    :param fuzzy_street_name: allow fuzzy street-name matching
    '''
    a1_street = a1.get(AddressComponents.STREET)
    a2_street = a2.get(AddressComponents.STREET)

    if a1_street:
        a1_street = a1_street.strip()
    if a2_street:
        a2_street = a2_street.strip()

    a1_house_number = a1.get(AddressComponents.HOUSE_NUMBER)
    a2_house_number = a2.get(AddressComponents.HOUSE_NUMBER)

    if a1_house_number:
        a1_house_number = safe_decode(a1_house_number).strip()
    if a2_house_number:
        a2_house_number = safe_decode(a2_house_number).strip()

    a1_base_house_number = a1.get(AddressComponents.HOUSE_NUMBER_BASE)
    a2_base_house_number = a2.get(AddressComponents.HOUSE_NUMBER_BASE)

    if a1_base_house_number:
        a1_base_house_number = safe_decode(a1_base_house_number).strip()
    if a2_base_house_number:
        a2_base_house_number = safe_decode(a2_base_house_number).strip()

    # If only one side has a street (or a house number), the comparison
    # is undecidable rather than a non-dupe
    if (a1_street and not a2_street) or (a2_street and not a1_street):
        return NULL_DUPE

    if (a1_house_number and not a2_house_number) or (a2_house_number and not a1_house_number):
        return NULL_DUPE

    have_street = a1_street and a2_street
    same_street = False

    if have_street:
        street_dupe_status = StreetDeduper.street_dupe_status(a1_street, a2_street, languages=languages, fuzzy=fuzzy_street_name)
        same_street = street_dupe_status.status in (duplicate_status.EXACT_DUPLICATE, duplicate_status.LIKELY_DUPLICATE)
        if not same_street:
            return Dupe(status=duplicate_status.NON_DUPLICATE, sim=0.0)

    have_house_number = a1_house_number and a2_house_number
    have_base_house_number = a1_base_house_number or a2_base_house_number
    same_house_number = False
    house_number_status = duplicate_status.NON_DUPLICATE
    house_number_sim = 0.0

    if have_house_number:
        house_number_status = is_house_number_duplicate(a1_house_number, a2_house_number, languages=languages)
        same_house_number = house_number_status == duplicate_status.EXACT_DUPLICATE
        if same_house_number:
            house_number_sim = 1.0

        # Fall back to the base house numbers (e.g. "12" for "12a"),
        # counting a base match as a likely (not exact) duplicate
        if have_base_house_number and not same_house_number:
            a1h = a1_base_house_number or a1_house_number
            a2h = a2_base_house_number or a2_house_number
            base_house_number_status = is_house_number_duplicate(a1h, a2h, languages=languages)
            same_house_number = base_house_number_status == duplicate_status.EXACT_DUPLICATE
            if same_house_number:
                house_number_status = duplicate_status.LIKELY_DUPLICATE
                house_number_sim = 0.9

        if not same_house_number:
            return Dupe(status=duplicate_status.NON_DUPLICATE, sim=house_number_sim)

    if not have_house_number and not have_street:
        return NULL_DUPE

    if have_street and same_street and have_house_number and same_house_number:
        # Overall status/sim is the weaker of the street and house-number
        # verdicts
        min_status, min_sim = min((street_dupe_status.status, street_dupe_status.sim), (house_number_status, house_number_sim))
        return Dupe(status=min_status, sim=min_sim)
    elif have_house_number and same_house_number and not have_street:
        return Dupe(status=house_number_status, sim=house_number_sim)
    elif have_street and same_street and not have_house_number:
        # Bug fix: this branch previously returned stale initializer
        # values (NON_DUPLICATE, 0.0) from never-updated street_status/
        # street_sim locals instead of the computed street verdict,
        # inconsistent with the combined branch above.
        return Dupe(status=street_dupe_status.status, sim=street_dupe_status.sim)

    return NULL_DUPE
>>> to_valid_longitude(360.0)
>>> latitude_is_valid(90.0)
'''

import math
import re

from lieu.encoding import safe_decode
from lieu.floats import isclose

# Fallback cleanup patterns: strip any leading characters that cannot
# start a signed number, and any trailing non-digit characters
beginning_re = re.compile('^[^0-9\-]+', re.UNICODE)
end_re = re.compile('[^0-9]+$', re.UNICODE)

# Degrees/minutes/seconds latitude, e.g. u'40°42′46″N'.  Groups are
# (degrees, minutes, seconds, hemisphere letter); minutes, seconds and
# hemisphere are optional.  Separators may be colons, d/m/s letters, or
# the degree/prime/double-prime symbols.
latitude_dms_regex = re.compile(
    safe_decode(
        r'^(-?[0-9]{1,2})[ ]*[ :°ºd][ ]*([0-5]?[0-9])?[ ]*[:\'\u2032m]?[ ]*([0-5]?[0-9](?:\.\d+)?)?[ ]*[:\?\"\u2033s]?[ ]*(N|n|S|s)?$'
    ), re.I | re.UNICODE)
# Degrees/minutes/seconds longitude; same structure as latitude but the
# degrees range allows up to 180 and the hemisphere letters are E/W
longitude_dms_regex = re.compile(
    safe_decode(
        r'^(-?1[0-8][0-9]|0?[0-9]{1,2})[ ]*[ :°ºd][ ]*([0-5]?[0-9])?[ ]*[:\'\u2032m]?[ ]*([0-5]?[0-9](?:\.\d+)?)?[ ]*[:\?\"\u2033s]?[ ]*(E|e|W|w)?$'
    ), re.I | re.UNICODE)

# Plain decimal followed by a hemisphere letter, e.g. u'40.74 N'.
# Groups are (decimal value, hemisphere letter).
latitude_decimal_with_direction_regex = re.compile(
    '^(-?[0-9][0-9](?:\.[0-9]+))[ ]*[ :°ºd]?[ ]*(N|n|S|s)$', re.I)

longitude_decimal_with_direction_regex = re.compile(
    '^(-?1[0-8][0-9]|0?[0-9][0-9](?:\.[0-9]+))[ ]*[ :°ºd]?[ ]*(E|e|W|w)$', re.I)

# Sign multiplier per hemisphere letter: north/east positive,
# south/west negative
direction_sign_map = {'n': 1, 's': -1, 'e': 1, 'w': -1}
def write(self, f):
    '''Serialize the info-gain table to *f* as (term, gain) TSV rows.'''
    out = csv.writer(f, delimiter='\t')
    for term, gain in six.iteritems(self.info_gain):
        out.writerow([safe_decode(term), safe_decode(gain)])