import os import string import sys import numpy as np import pandas as pd from utils import misc # this code is platform specific! args = sys.argv[1:] instore = pd.HDFStore(os.path.join(misc.data_dir(), args[0])) outstore = pd.HDFStore(os.path.join(misc.data_dir(), args[1])) for tblname in instore.keys(): tblname = string.replace(tblname, '/', '') print "\n\nCondensing: " + tblname tbl = instore[tblname] newtbl = pd.DataFrame(index=tbl.index) for colname in instore[tblname].columns: cmd = ( 'find | grep "\.json$" | grep -v scripts | xargs grep {}' ).format(colname) if not os.popen(cmd).read(): del tbl[colname] elif colname in ['geom', 'txt_geom']: del tbl[colname] else: if tbl[colname].dtype == np.float64:
"""Write a condensed copy of ``baseyeardata.h5``.

For every table in the input store (except those in ``SKIPPED_TABLES``) a new
frame is built that keeps only the columns whose name appears in some
``*.json`` file below the current working directory, minus the columns in
``EXCLUDED_COLUMNS``.  float64 columns are stored as float32 and int64
columns as int32 when their values fit.  The result is written under the same
table name to ``baseyeardata_condensed.h5``.
"""
import io
import os
import string
import sys

import numpy as np
import pandas as pd

from utils import misc

# Tables that are not copied to the condensed store.
SKIPPED_TABLES = frozenset(['nodes'])

# Columns that are dropped even when a reference file mentions them.
EXCLUDED_COLUMNS = frozenset(['geom', 'txt_geom', 'nodeid'])


def _read_reference_text(suffix, root='.'):
    """Return the joined text of the files that decide which columns stay.

    Walks ``root`` and reads every file whose path ends with ``suffix`` and
    does not contain the substring ``scripts`` anywhere in the path.  Files
    that cannot be read are skipped.  Reading everything once up front
    avoids rescanning the directory tree for each column.
    """
    chunks = []
    for dirpath, _dirnames, filenames in os.walk(root):
        for filename in filenames:
            path = os.path.join(dirpath, filename)
            if not path.endswith(suffix) or 'scripts' in path:
                continue
            try:
                with io.open(path, encoding='utf-8',
                             errors='replace') as handle:
                    chunks.append(handle.read())
            except (IOError, OSError):
                # Best effort: an unreadable file simply contributes nothing.
                continue
    return u'\n'.join(chunks)


def _condense_column(column):
    """Return ``column`` with a narrower dtype where that is safe.

    float64 becomes float32.  int64 becomes int32 only when the column is
    empty or every value fits in int32; otherwise the values would wrap
    around silently, so the column is returned unchanged.
    """
    if column.dtype == np.float64:
        return column.astype('float32')
    if column.dtype == np.int64:
        limits = np.iinfo(np.int32)
        if len(column) == 0 or (column.min() >= limits.min and
                                column.max() <= limits.max):
            return column.astype('int32')
    return column


instore = pd.HDFStore(os.path.join(misc.data_dir(), 'baseyeardata.h5'))
outstore = pd.HDFStore(
    os.path.join(misc.data_dir(), 'baseyeardata_condensed.h5'))
try:
    # Column names are matched as plain substrings of this text.
    reference_text = _read_reference_text('.json')

    for key in instore.keys():
        # HDFStore keys carry a leading '/', the table name does not.
        tblname = key.replace('/', '')
        if tblname in SKIPPED_TABLES:
            continue
        print("\n\nCondensing: " + tblname)

        tbl = instore[tblname]
        newtbl = pd.DataFrame(index=tbl.index)
        for colname in tbl.columns:
            if colname in EXCLUDED_COLUMNS:
                continue
            if ('%s' % (colname,)) not in reference_text:
                # Nothing refers to this column, so it is not carried over.
                continue
            newtbl[colname] = _condense_column(tbl[colname])

        print(newtbl.columns)
        print(newtbl)
        outstore[tblname] = newtbl
finally:
    # Closing flushes the condensed tables to disk and releases both files.
    instore.close()
    outstore.close()
"""Write a condensed copy of ``baseyeardata.h5``.

For every table in the input store (except those in ``SKIPPED_TABLES``) a new
frame is built that keeps only the columns whose name appears in some
``*.py`` file below the current working directory, minus the columns in
``EXCLUDED_COLUMNS``.  float64 columns are stored as float32 and int64
columns as int32 when their values fit.  The result is written under the same
table name to ``baseyeardata_condensed.h5``.
"""
import io
import os
import string
import sys

import numpy as np
import pandas as pd

from utils import misc

# Tables that are not copied to the condensed store.
SKIPPED_TABLES = frozenset(['nodes'])

# Columns that are dropped even when a reference file mentions them.
EXCLUDED_COLUMNS = frozenset(['geom', 'txt_geom', 'nodeid'])

# Only files whose path ends with this suffix are searched for column names.
REFERENCE_SUFFIX = '.py'


def _iter_reference_paths(suffix, root='.'):
    """Yield paths under ``root`` that end with ``suffix``.

    Any path containing the substring ``scripts`` is left out.
    """
    for dirpath, _dirnames, filenames in os.walk(root):
        for filename in filenames:
            path = os.path.join(dirpath, filename)
            if path.endswith(suffix) and 'scripts' not in path:
                yield path


def _load_reference_text(suffix, root='.'):
    """Read every reference file once and return the joined contents.

    Doing this a single time avoids rescanning the directory tree for each
    column.  Files that cannot be read are skipped.
    """
    contents = []
    for path in _iter_reference_paths(suffix, root):
        try:
            with io.open(path, encoding='utf-8', errors='replace') as handle:
                contents.append(handle.read())
        except (IOError, OSError):
            # Best effort: an unreadable file simply contributes nothing.
            continue
    return u'\n'.join(contents)


def _fits_int32(column):
    """Return True when every value of ``column`` is representable as int32."""
    if len(column) == 0:
        return True
    bounds = np.iinfo(np.int32)
    return column.min() >= bounds.min and column.max() <= bounds.max


def _narrow(column):
    """Return ``column`` as float32/int32 where possible, else unchanged.

    An int64 column holding values outside the int32 range is kept as is,
    because casting it would wrap the values around silently.
    """
    if column.dtype == np.float64:
        return column.astype('float32')
    if column.dtype == np.int64 and _fits_int32(column):
        return column.astype('int32')
    return column


instore = pd.HDFStore(os.path.join(misc.data_dir(), 'baseyeardata.h5'))
outstore = pd.HDFStore(
    os.path.join(misc.data_dir(), 'baseyeardata_condensed.h5'))
try:
    # Column names are matched as plain substrings of this text.
    reference_text = _load_reference_text(REFERENCE_SUFFIX)

    for key in instore.keys():
        # HDFStore keys carry a leading '/', the table name does not.
        tblname = key.replace('/', '')
        if tblname in SKIPPED_TABLES:
            continue
        print("\n\nCondensing: " + tblname)

        tbl = instore[tblname]
        newtbl = pd.DataFrame(index=tbl.index)
        for colname in tbl.columns:
            referenced = ('%s' % (colname,)) in reference_text
            if referenced and colname not in EXCLUDED_COLUMNS:
                newtbl[colname] = _narrow(tbl[colname])

        print(newtbl.columns)
        print(newtbl)
        outstore[tblname] = newtbl
finally:
    # Closing flushes the condensed tables to disk and releases both files.
    instore.close()
    outstore.close()
import os import string import sys import numpy as np import pandas as pd from utils import misc # this code is platform specific! args = sys.argv[1:] instore = pd.HDFStore(os.path.join(misc.data_dir(), args[0])) outstore = pd.HDFStore(os.path.join(misc.data_dir(), args[1])) for tblname in instore.keys(): tblname = string.replace(tblname, '/', '') print "\n\nCondensing: " + tblname tbl = instore[tblname] newtbl = pd.DataFrame(index=tbl.index) for colname in instore[tblname].columns: cmd = ('find | grep "\.json$" | grep -v scripts | xargs grep {}' ).format(colname) if not os.popen(cmd).read(): del tbl[colname] elif colname in ['geom', 'txt_geom']: del tbl[colname] else: if tbl[colname].dtype == np.float64: newtbl[colname] = tbl[colname].astype('float32')
import pandas as pd, os from utils import misc def fetch_data(sql, dbname, outname): host, user = '******', 'urbanvision' cmd = """psql -h %s -U %s -w -c "copy (%s) to stdout with csv header" %s > %s""" % \ (host,user,sql,dbname,outname) os.system(cmd) store = pd.HDFStore(os.path.join(misc.data_dir(),'baseyeardata.h5')) for sql, dbname, outname, hdf5name, keyname in [\ ("""select *, ST_X(ST_TRANSFORM(SETSRID(ST_POINT(longitude,latitude),4326),3740)) as x, ST_Y(ST_TRANSFORM(SETSRID(ST_POINT(longitude,latitude),4326),3740)) as y from nets2011_digestformodel""",'california','nets.csv','nets','dunsnumber'), ('select * from zones_pemsbuffers','bayarea','zones_buffers.csv','zones_buffers','zone_id'), ('select *, st_astext(the_geom) as txt_geom from zones','bayarea','zones.csv','zones','zone_id'), ("""select *,st_x(centroid) as x,st_y(centroid) as y,st_astext(st_simplify(the_geom,2)) as txt_geom from parcels2010_withgeography""", 'bayarea','parcels.csv','parcels','parcel_id'), ("select * from zoning_for_parcels(1,'2010-01-01 00:00:00')", 'bayarea','zoning_for_parcels.csv','zoning_for_parcels','parcel'), ("select * from zoning_join",'bayarea','zoning.csv','zoning','id'), ('select * from apts_large','bayarea','apartments.csv','apartments',None), ('select * from households','bayarea','households.csv','households','household_id'), #('select n.*, coalesce(api11,0) as api11, coalesce(api10,0) as api10, coalesce(\\"Violent crime\\",-1) as violent, coalesce(\\"Property crime\\",-1) as property, county.name as county from pemsnodes n join node_geography ng on (n.id=ng.node_id) left join cityandcrime cc on (ng.city_id = cc.gid) left join county on (ng.county_id = county.gid) left join schools on (schools.gid = school_id)','sandbox','pemsnodes.csv','nodes','id'), ('select * from costar c','sandbox','costar.csv','costar','costar_id'), ('select * from buildings2010_base','bayarea','buildings.csv','buildings','building_id'), ('select * from 
home_sales2008_2011','bayarea','homesales.csv','homesales','RecordID'), ('select * from batshh','sandbox','bats_hhfile.csv','batshh','hhid'), ('select * from bats','sandbox','bats_trips.csv','bats','bats_id'),