Пример #1
0
    def geocode_batch(self, vals):
        """
        Fuzzy-match "county, state" strings against the county shape tables.

        Each value is lower-cased and split on its last comma; the two halves
        are compared with levenshtein() against the stored county and state
        names.  Candidates must be within 1 + 20% of the stored name's length
        on both names, and the one with the smallest combined distance wins.

        @param vals an iterator of "county name, state name" strings
        @return iterator yielding (SHAPE, shape_id) for a match, or None when
                nothing matches or the value cannot be processed (for
                example, it contains no comma)
        """
        Q = """
        select c.shape_id
        from __dbtruck_county__ as c, __dbtruck_state__ as s,
             __dbtruck_countyname__ as cn, __dbtruck_statename__ as sn
        where c.fips = cn.fips and s.fips = sn.fips and c.state_fips = s.fips and
              levenshtein(cn.name, %s) < 1 + 0.2*char_length(cn.name) and
              levenshtein(sn.name, %s) < 1 + 0.2*char_length(sn.name)
        order by levenshtein(cn.name, %s) + levenshtein(sn.name, %s) asc
        limit 1
        """

        for v in vals:
            try:
                v = to_utf(v.lower())
                # rindex raises ValueError when there is no comma; the
                # handler below turns that into a None result.
                idx = v.rindex(',')
                # Slice past the comma and trim the padding around both
                # halves: a state of ", texas" would otherwise add to the
                # edit distance of every candidate.
                c, s = v[:idx].strip(), v[idx + 1:].strip()  # county, state

                args = (c, s, c, s)
                shapeids = self.session.bind.execute(Q, args).fetchone()
                if not shapeids:
                    yield None
                else:
                    yield SHAPE, shapeids[0]
            except Exception as e:
                # Best effort: one bad value must not abort the whole batch.
                print(e)
                yield None
Пример #2
0
    def geocode(self, address):
        """
        Geocode an address, consulting the pickle-backed cache first.

        The geocoder is whatever self.get_initial_geocoder(address) returns.
        The Google fallback that used to run after a failed lookup is
        disabled, so a lookup that raises now produces an empty list.

        The outcome of every lookup, including a failed one, is stored in
        self.cache under the lower-cased query.

        @param address free-form address string
        @return the geocoder's results.  [] when the query is empty or the
                lookup raised.  A result whose first element is a string is
                treated as a single match and returned wrapped in a list.
        """
        # NOTE(review): dataset-specific normalisation; presumably the
        # geocoders resolve 'BOSTON' better than 'SOUTH BOSTON' -- confirm.
        address = address.replace('SOUTH BOSTON', 'BOSTON')

        query = to_utf(address).lower()

        if not query:
            return []

        if query in self.cache:
            try:
                # NOTE(review): pickle.loads is only safe while the cache
                # contents are written exclusively by this process.
                ret = pickle.loads(self.cache[query])
                if ret and isinstance(ret[0], basestring):
                    return [ret]
                return ret
            except Exception as e:
                # Unreadable cache entry: report it and geocode again.
                # KeyboardInterrupt is not an Exception, so it still
                # propagates as before.
                print('ignoring bad cache entry for %s: %s' % (query, e))

        geocoder = self.get_initial_geocoder(address)
        print('%s %s' % (geocoder, address))

        # Bound up front so a geocoder that raises leaves an empty result
        # instead of an unbound name at the cache write below.
        result = []
        try:
            result = geocoder.geocode(query, exactly_one=False)
            self.ncalls += 1
            if not result:
                raise Exception('no result found for %s' % query)
        except Exception as e:
            print(e)

        self.cache[query] = pickle.dumps(result)

        if result and isinstance(result[0], basestring):
            result = [result]

        return result
Пример #3
0
def populate_shadow_cols(db, tablemd, schema):
    """
    Run each annotation's extractor over its column and save the results.

    Rows are read in batches of 2000, ordered by id.  For every annotated
    column the cleaned cell values are passed through the annotation's
    extractor and collected into one dict per row.  Each batch is then
    handed to save_shadow() together with the batch's id column.

    @param db       object whose execute() runs the select (presumably a
                    SQLAlchemy engine or connection -- confirm)
    @param tablemd  table metadata exposing .tablename and .annotations;
                    each annotation has .name, .loctype and .extractor()
    @param schema   object whose .columns.keys() lists the columns to read
    @return False if the annotations include 'latlon', or both 'latitude'
            and 'longitude'; True otherwise
    """
    tablename = tablemd.tablename
    colnames = list(schema.columns.keys())

    # Identifiers cannot be bound as query parameters.
    # NOTE(review): assumes column and table names come from trusted schema
    # metadata rather than user input -- confirm.
    arg = ','.join(colnames)
    resproxy = db.execute("""select %s from %s order by id asc""" % (arg, tablename))

    annotations = defaultdict(list)
    for anno in tablemd.annotations:
        annotations[anno.name].append((anno.loctype, anno.extractor()))

    def clean(v):
        # Falsy cells become None placeholders rather than being dropped,
        # so position i in a column always refers to row i of the batch.
        if not v:
            return None
        return re_badchar.sub(' ', to_utf(v)).lower().strip()

    def annotate_shadow(shadow_data, loctype, vals):
        for sd, v in zip(shadow_data, vals):
            sd[loctype] = v

    while True:
        rows = resproxy.fetchmany(2000)
        if not rows:
            break

        coldatas = zip(*rows)
        ids = None
        shadow_data = [dict() for row in rows]
        for cn, cd in zip(colnames, coldatas):
            cd = [clean(v) for v in cd]

            for loctype, extractor in annotations[cn]:
                # Missing cells are not passed to the extractor; they
                # produce a None result for that row.
                extracted = [None if v is None else extractor(v) for v in cd]
                if loctype == 'latlon':
                    # An empty result stands for "no coordinates" and is
                    # expanded to a pair so it can be split like the rest.
                    pairs = [e if e else (None, None) for e in extracted]
                    lats, lons = zip(*pairs)

                    annotate_shadow(shadow_data, 'latitude', lats)
                    annotate_shadow(shadow_data, 'longitude', lons)
                else:
                    annotate_shadow(shadow_data, loctype, extracted)

            if cn == 'id':
                ids = cd

        print('saving %d' % len(rows))
        save_shadow(db, tablename, ids, shadow_data)

    loctypes = set(anno.loctype for anno in tablemd.annotations)
    if ('latlon' in loctypes or
        ('latitude' in loctypes and 'longitude' in loctypes)):
        return False
    return True
Пример #4
0
 def geocode_batch(self, vals):
     """
     Fuzzy-match state names against the state shape tables.

     Each value is lower-cased and compared with levenshtein() against the
     stored state names.  The closest name within 1 + 20% of its own length
     wins.

     @param vals an iterator of state name strings
     @return iterator yielding (SHAPE, shape_id) for a match, or None when
             no stored name is close enough or the lookup fails
     """
     Q = """select shape_id
            from __dbtruck_state__ s,  __dbtruck_statename__ sn
            where s.fips = sn.fips and
            levenshtein(name, %s) < 1 + 0.2*char_length(name)
            order by levenshtein(name, %s) limit 1"""
     for v in vals:
         try:
             v = to_utf(v.lower())
             shapeids = self.session.bind.execute(Q, (v, v)).fetchone()
             if not shapeids:
                 # "No match" is an ordinary outcome: report it directly
                 # rather than by indexing None and printing the TypeError.
                 yield None
             else:
                 yield SHAPE, shapeids[0]
         except Exception as e:
             # Best effort: one bad value must not abort the whole batch.
             print(e)
             yield None
Пример #5
0
    def _geocode(self, address, restriction=''):
        """
        Resolve an address through the cache or a third-party geocoder.

        The query is the format string for `restriction` applied to the
        address, then lower-cased.  A cached answer is returned directly.
        Otherwise, an address matching re_addr2 with a non-empty restriction
        goes to Yahoo or Bing, chosen at random with equal odds, and any
        other address goes to GeoNames.  If that geocoder raises or finds
        nothing, Google is tried as a last resort.  The final result is
        pickled into the cache under the query.

        @param address      address or place name to look up
        @param restriction  passed to self.get_format_string() to obtain the
                            %-format that wraps the address
        @return the geocoder's results; [] when the query is empty or both
                lookups raised.  A result whose first element is a string is
                treated as a single match and returned wrapped in a list.
        """
        fmt = self.get_format_string(restriction)
        query = to_utf((fmt % address).lower())
        if not query:
            return []

        if query in self.cache:
            try:
                cached = pickle.loads(self.cache[query])
                is_single = cached and isinstance(cached[0], basestring)
                return [cached] if is_single else cached
            except KeyboardInterrupt:
                raise
            except:
                # unreadable cache entry: fall through and geocode again
                pass

        street_like = re_addr2.search(address) and restriction
        if not street_like:
            geocoder = geocoders.GeoNames()
        elif random.random() < 0.5:
            geocoder = geocoders.Yahoo(settings.YAHOO_APPID)
        else:
            geocoder = geocoders.Bing(settings.BING_APIKEY)

        try:
            result = geocoder.geocode(query, exactly_one=False)
            self.ncalls += 1
            if not result:
                raise Exception('no result found for %s' % query)
        except Exception as first_err:
            print('%s %s' % (geocoder, first_err))

            # last resort: Google (low daily limit)
            try:
                geocoder = geocoders.Google()
                result = geocoder.geocode(query, exactly_one=False)
                self.ncalls += 1
            except Exception as google_err:
                print(google_err)
                result = []

        self.cache[query] = pickle.dumps(result)

        is_single = result and isinstance(result[0], basestring)
        return [result] if is_single else result
Пример #6
0
# Load every record of the county shapefile into the database: one Shape,
# one County and two CountyName rows per record, committed per record.
r = shapefile.Reader("./tl_2008_us_county")

db = create_engine(DBURI)
db_session = scoped_session(sessionmaker(autocommit=False, autoflush=True, bind=db))

# NOTE(review): not used anywhere in this section.
seen = set()

for idx in xrange(r.numRecords):
    recshape = r.shapeRecord(idx)

    rec, shape = recshape.record, recshape.shape

    # NOTE(review): field positions assume the tl_2008_us_county layout
    # (0 = state FIPS, 1 = county FIPS, 4 = short name, 5 = full name) --
    # confirm against the shapefile's field list.
    statefp, countyfp, shortname, name = rec[0], rec[1], rec[4].lower(), rec[5].lower()
    # The county key is the state code followed by the county code.
    countyfp = int(statefp + countyfp)
    statefp = int(statefp)
    shortname = to_utf(shortname)
    name = to_utf(name)
    print('%s \t %s' % (countyfp, name))

    shp = get_wkt(shape)

    # Name rows carry the county FIPS: county lookups join
    # county.fips = countyname.fips, so a state FIPS here would attach the
    # names to the wrong key.
    names = [CountyName(fips=countyfp, name=name), CountyName(fips=countyfp, name=shortname)]

    shapeobj = Shape(shp)
    county = County(fips=countyfp, state_fips=statefp, names=names, county_shape=shapeobj)

    db_session.add_all(names)
    db_session.add_all([shapeobj, county])
    db_session.commit()

Пример #7
0
def possible_loc(colname, vals):
    """
    Guess which kinds of location data a column holds.

    Values are cleaned (bad characters replaced by spaces, lower-cased,
    stripped).  Each candidate parser is run over them and accepted when it
    parses more than a threshold fraction of the non-empty values.

    @param colname column name, used as a hint ('lat', 'lon', 'zip', or a
                   name starting with 'st')
    @param vals    iterable of raw column values; falsy values are ignored
    @return dict mapping a location type ('latitude', 'longitude', 'latlon',
            'zipcode', 'state', 'address') to the name of the parser that
            recognised it; empty when nothing matched
    """
    def is_ok(new_vals, maxposs, thresh=0.65):
        # Count the values that parsed.  A list/tuple result counts when any
        # element is truthy; any other result counts when it is not None.
        # Returns the count if it exceeds thresh * len(maxposs), else False.
        n = 0
        for v in new_vals:
            if isinstance(v, (list, tuple)):
                if any(v):
                    n += 1
            elif v is not None:
                n += 1
        if float(n) > thresh * len(maxposs):
            return n
        return False

    vals = [re_badchar.sub(' ', to_utf(v)).lower().strip() for v in vals if v]
    # Cleaning can leave empty strings; thresholds are measured against the
    # values that still have content.
    nonempty = [v for v in vals if v]
    colname = colname.lower().strip()
    ret = {}

    if 'lat' in colname:
        lats = map(parse_lat, vals)
        if is_ok(lats, nonempty, thresh=0.8):
            ret['latitude'] = 'parse_lat'

    if 'lon' in colname:
        lons = map(parse_lon, vals)
        if is_ok(lons, nonempty, thresh=0.8):
            ret['longitude'] = 'parse_lon'

    if 'latitude' in ret and 'longitude' in ret:
        return ret

    if is_ok(map(parse_coords, vals), nonempty, thresh=0.5):
        ret['latlon'] = 'parse_coords'
        return ret

    # Column-name hints: a hit here is returned on its own, discarding any
    # partial latitude/longitude guess made above.
    if 'zip' in colname:
        zips = map(parse_zip, vals)
        if is_ok(zips, nonempty):
            return {"zipcode" : 'parse_zip'}

    if colname.startswith('st'):
        states = map(parse_state, vals)
        if is_ok(states, nonempty):
            return {'state' : 'parse_state'}

    # No usable hint from the column name: run the per-word parsers over the
    # text itself.  These results accumulate in ret.
    zips = map(parse_per_word_zip, vals)
    if is_ok(zips, nonempty, thresh=0.8):
        ret['zipcode'] = 'parse_per_word_zip'

    states = map(parse_per_word_state, vals)
    if is_ok(states, nonempty, thresh=0.8):
        ret['state'] = 'parse_per_word_state'

    # TODO: county codes, countries

    # street addresses (number string string suffix)
    # The column may be composite, so look for addresses embedded in the
    # text.
    addrs = map(parse_addrs, vals)
    if is_ok(addrs, nonempty, thresh=0.55):
        ret['address'] = 'parse_addrs'
    return ret