def geocode_batch(self, vals): """ @param vals an iterator of "county name, state name" strings @return iterator of (loctype, latlon or shapeid) """ Q = """ select c.shape_id from __dbtruck_county__ as c, __dbtruck_state__ as s, __dbtruck_countyname__ as cn, __dbtruck_statename__ as sn where c.fips = cn.fips and s.fips = sn.fips and c.state_fips = s.fips and levenshtein(cn.name, %s) < 1 + 0.2*char_length(cn.name) and levenshtein(sn.name, %s) < 1 + 0.2*char_length(sn.name) order by levenshtein(cn.name, %s) + levenshtein(sn.name, %s) asc limit 1 """ for v in vals: try: v = to_utf(v.lower()) idx = v.rindex(',') c, s = v[:idx], v[idx:] # county, state args = (c, s, c, s) shapeids = self.session.bind.execute(Q, args).fetchone() if not shapeids: yield None else: yield SHAPE, shapeids[0] except Exception as e: print e yield None
def geocode(self, address): """ Try to pick the best geocoder for the address. Google has a low daily limit so try to use other geocoders if possible If the address looks like a standard address (NUMBER WORDS+), then use yahoo, bing, geocoder.us Otherwise if it is a place name, try geonames If all else fails, use Google """ address = address.replace('SOUTH BOSTON', 'BOSTON') query = to_utf(address).lower() if not query: return [] if query in self.cache: try: ret = pickle.loads(self.cache[query]) if ret and isinstance(ret[0], basestring): return [ret] return ret except KeyboardInterrupt: raise except: pass geocoder = self.get_initial_geocoder(address) print geocoder, address try: result = geocoder.geocode(query, exactly_one=False) self.ncalls += 1 if not result: raise Exception('no result found for %s' % query) except Exception as e: print e # try: # geocoder = geocoders.Google() # result = geocoder.geocode(query, exactly_one=False) # self.ncalls += 1 # except Exception as e: # print e # result = [] self.cache[query] = pickle.dumps(result) if result and isinstance(result[0], basestring): result = [result] return result
def populate_shadow_cols(db, tablemd, schema):
    """
    Run every registered annotation extractor over the table's rows and
    persist the extracted location attributes into the shadow table.

    @param db database connection/engine used to read rows and save results
    @param tablemd table metadata carrying .tablename and .annotations
           (each annotation has .name, .loctype and an .extractor() factory)
    @param schema table schema whose column names drive the select
    @return False when the annotations already yield coordinates
            ('latlon', or both 'latitude' and 'longitude'); True otherwise
            -- presumably the caller uses this to decide whether further
            geocoding is still needed (verify against caller)
    """
    tablename = tablemd.tablename
    colnames = schema.columns.keys()
    arg = ','.join(colnames)
    # stream all rows in id order so shadow rows align with table rows
    resproxy = db.execute("""select %s from %s order by id asc""" % (arg, tablename))

    # (loctype, extractor) pairs grouped by the column they apply to
    annotations = defaultdict(list)
    for anno in tablemd.annotations:
        annotations[anno.name].append((anno.loctype, anno.extractor()))

    def annotate_shadow(shadow_data, loctype, vals):
        # store one extracted value into each row's shadow dict
        for sd, v in zip(shadow_data, vals):
            sd[loctype] = v

    while True:
        # process in batches to bound memory
        rows = resproxy.fetchmany(2000)
        if not rows:
            break
        coldatas = zip(*rows)  # row-major -> column-major
        ids = None
        shadow_data = [dict() for row in rows]
        for cn, cd in zip(colnames, coldatas):
            # normalize cell text before extraction
            # NOTE(review): the trailing "if v" drops empty/null cells, so cd
            # can be shorter than rows and the zips in annotate_shadow may
            # misalign values with their rows -- confirm columns are non-null
            cd = [ re_badchar.sub(' ', to_utf(v)).lower().strip() for v in cd if v]
            annos = annotations[cn]
            for loctype, extractor in annos:
                extracted = map(extractor, cd)
                if loctype == 'latlon':
                    # one column holding both coordinates: split into two
                    lats, lons = zip(*extracted)
                    annotate_shadow(shadow_data, 'latitude', lats)
                    annotate_shadow(shadow_data, 'longitude', lons)
                else:
                    annotate_shadow(shadow_data, loctype, extracted)
            # remember the id column's (normalized) values for saving
            ids = cd if cn == 'id' else ids
        print 'saving', len(rows)
        save_shadow(db, tablename, ids, shadow_data)

    loctypes = set([anno.loctype for anno in tablemd.annotations])
    if ('latlon' in loctypes or
        ('latitude' in loctypes and 'longitude' in loctypes)):
        return False
    return True
def geocode_batch(self, vals): """ @return iterator of (loctype, latlon or shapeid) """ Q = """select shape_id from __dbtruck_state__ s, __dbtruck_statename__ sn where s.fips = sn.fips and levenshtein(name, %s) < 1 + 0.2*char_length(name) order by levenshtein(name, %s) limit 1""" for v in vals: try: v = to_utf(v.lower()) shapeids = self.session.bind.execute(Q, (v,v)).fetchone() yield SHAPE, shapeids[0] except Exception as e: print e yield None
def _geocode(self, address, restriction=''): """ Try to pick the best geocoder for the address. Google has a low daily limit so try to use other geocoders if possible If the address looks like a standard address (NUMBER WORDS+), then use yahoo, bing, geocoder.us Otherwise if it is a place name, try geonames If all else fails, use Google """ geocoder = None format_string = self.get_format_string(restriction) query = to_utf((format_string % address).lower()) if not query: return [] if query in self.cache: try: ret = pickle.loads(self.cache[query]) if ret and isinstance(ret[0], basestring): return [ret] return ret except KeyboardInterrupt: raise except: pass if re_addr2.search(address) and restriction: rand = random.random() if rand < 0.5: geocoder = geocoders.Yahoo(settings.YAHOO_APPID) #elif rand < 0.8: else: geocoder = geocoders.Bing(settings.BING_APIKEY) #else: # geocoder = geocoders.GeocoderDotUS() else: geocoder = geocoders.GeoNames() try: result = geocoder.geocode(query, exactly_one=False) self.ncalls += 1 if not result: raise Exception('no result found for %s' % query) except Exception as e: print geocoder, e try: geocoder = geocoders.Google() result = geocoder.geocode(query, exactly_one=False) self.ncalls += 1 except Exception as e: print e result = [] self.cache[query] = pickle.dumps(result) if result and isinstance(result[0], basestring): result = [result] return result
r = shapefile.Reader("./tl_2008_us_county") db = create_engine(DBURI) db_session = scoped_session(sessionmaker(autocommit=False, autoflush=True, bind=db)) seen = set() for idx in xrange(r.numRecords): recshape = r.shapeRecord(idx) rec, shape = recshape.record, recshape.shape statefp, countyfp, shortname, name = rec[0], rec[1], rec[4].lower(), rec[5].lower() countyfp = int(statefp + countyfp) statefp = int(statefp) shortname = to_utf(shortname) name = to_utf(name) print countyfp, "\t", name shp = get_wkt(shape) names = [CountyName(fips=statefp, name=name), CountyName(fips=statefp, name=shortname)] shapeobj = Shape(shp) county = County(fips=countyfp, state_fips=statefp, names=names, county_shape=shapeobj) db_session.add_all(names) db_session.add_all([shapeobj, county]) db_session.commit()
def possible_loc(colname, vals):
    """
    Guess which location attributes a column contains, using its name and
    a sample of its values.

    @param colname the column's name
    @param vals a sample of the column's values
    @return dict mapping a loctype ('latitude', 'longitude', 'latlon',
            'zipcode', 'state', 'address') to the name of the parser
            function that extracts it; empty dict when nothing matches
    """
    def is_ok(new_vals, maxposs, thresh=0.65):
        # A parser "works" when it extracted something from more than
        # thresh of the non-empty values.  Tuple/list results count when
        # any component is truthy.
        # (The old signature defaulted maxposs to the enclosing 'vals',
        # which was dead code -- every call site passes it explicitly.)
        n = 0
        for v in new_vals:
            if isinstance(v, (list, tuple)):
                if filter(lambda s: s, v):
                    n += 1
            elif v is not None:  # idiom fix: was 'v != None'
                n += 1
        if float(n) > thresh * len(maxposs):
            return n
        return False

    # normalize the sample, dropping empty cells
    vals = [ re_badchar.sub(' ', to_utf(v)).lower().strip()
             for v in vals if v]
    nonempty = [v for v in vals if v]
    colname = colname.lower().strip()
    ret = {}

    # name-hinted coordinate columns require a high parse rate
    if 'lat' in colname:
        lats = map(parse_lat, vals)
        if is_ok(lats, nonempty, thresh=0.8):
            ret['latitude'] = 'parse_lat'
    if 'lon' in colname:
        lons = map(parse_lon, vals)
        if is_ok(lons, nonempty, thresh=0.8):
            ret['longitude'] = 'parse_lon'
    if 'latitude' in ret and 'longitude' in ret:
        return ret
    # a single column may hold "lat, lon" pairs
    if is_ok(map(parse_coords, vals), nonempty, thresh=0.5):
        ret['latlon'] = 'parse_coords'
        return ret

    if 'zip' in colname:
        zips = map(parse_zip, vals)
        if is_ok(zips, nonempty):
            return {"zipcode" : 'parse_zip'}
    if colname.startswith('st'):
        states = map(parse_state, vals)
        if is_ok(states, nonempty):
            return {'state' : 'parse_state'}

    # no name hints: look for per-word zip codes / state names
    zips = map(parse_per_word_zip, vals)
    if is_ok(zips, nonempty, thresh=0.8):
        ret['zipcode'] = 'parse_per_word_zip'
    states = map(parse_per_word_state, vals)
    if is_ok(states, nonempty, thresh=0.8):
        ret['state'] = 'parse_per_word_state'

    # county codes
    # countries
    # street addresses (number string string suffix)
    # column is not a single attribute, lets look for composite data
    # ok maybe its embedded in the text??
    addrs = map(parse_addrs, vals)
    if is_ok(addrs, nonempty, thresh=0.55):
        ret['address'] = 'parse_addrs'
    return ret