Example #1

# Load parcels as DataFrames for the imputation
parcels = db_to_df('select * from parcels;')
parcels = parcels.set_index('gid')

# Standardize the res_type field
parcels.res_type[parcels.res_type.isnull()] = 'other'
parcels.res_type[parcels.res_type == ''] = 'other'
parcels.res_type[np.in1d(
    parcels.res_type,
    ['FLATS', 'APTS', 'CONDO', 'SRO', 'LIVEWORK', 'mixed'])] = 'multi'
parcels.res_type[parcels.res_type == 'SINGLE'] = 'single'

# Load TAZ residential unit control totals and other zonal targets.
taz_controls_csv = loader.get_path('hh/taz2010_imputation.csv')
targetunits = pd.read_csv(taz_controls_csv, index_col='taz1454')

taz_controls_csv2 = loader.get_path('hh/tazsumm_redfin.csv')
targetvalues = pd.read_csv(taz_controls_csv2, index_col='taz')

nonres_sqft_zone = pd.DataFrame({
    'observed': parcels.groupby('taz').non_residential_sqft.sum(),
    'target': targetunits.targetnonressqft,
})

# For all employment points, translate to nonres sqft by multiplying employees by 250.
# Keep synthetic job-based buildings only where the parcel has no residential use and
# less than 500 sqft of existing space.
# For each TAZ, calculate the difference needed to match the aggregate target
# (a rough sketch of these first steps follows below).
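# A rough sketch of the first two steps above; `job_points` and its columns
# (`emp`, `residential_units`, `existing_sqft`, `taz`) are hypothetical names,
# and only the 250 sqft-per-employee factor and the 500 sqft threshold come
# from the notes.
job_points['synth_nonres_sqft'] = job_points.emp * 250
candidates = job_points[(job_points.residential_units == 0) &
                        (job_points.existing_sqft < 500)]
synth_sqft_by_taz = candidates.groupby('taz').synth_nonres_sqft.sum()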
Example #2

# Install PostGIS and create staging schema.
loader = TableLoader()
with loader.database.cursor() as cur:
    cur.execute("""
        CREATE EXTENSION IF NOT EXISTS postgis;
        CREATE SCHEMA IF NOT EXISTS staging;
    """)
loader.database.refresh()

# Load shapefiles specified above to the project database.
loader.load_shp_map(shapefiles)

# Fix invalid geometries and reproject.
staging = loader.tables.staging
conform_srids(loader.srid, schema=staging, fix=True)

# Load county land use code mapping.
csv = loader.get_path('built/parcel/2010/rtp13_processing_notes/lucodes.csv')
df = pd.read_csv(csv, dtype=str)
df.dropna(how='any', inplace=True,
          subset=['county_id', 'land_use_type_id', 'development_type_id'])
df.index.name = 'index'
df_to_db(df, 'lucodes', schema=staging)

# Add county land use code mapping unique constraint.
exec_sql("""
ALTER TABLE staging.lucodes ADD CONSTRAINT lucodes_unique
UNIQUE (county_id, land_use_type_id);
""")
    chosen = np.random.choice(
        alternative_ids, size=n_to_choose, replace=False, p=probabilities)

    # if there are fewer available units than choosers we need to pick
    # which choosers get a unit
    if n_to_choose == n_available:
        chooser_ids = np.random.choice(
            chooser_ids, size=n_to_choose, replace=False)

    choices[chooser_ids] = chosen

    return choices
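
# Hypothetical usage of the unit-assignment helper whose tail is excerpted
# above; the `unit_choice` name is an assumption based on the excerpt, not
# confirmed by the source, while the argument names come from the code above.
assignments = unit_choice(chooser_ids, alternative_ids, probabilities)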

# Load TAZ-level synthetic population
hh_path = loader.get_path('hh/synth/hhFile.p2011s3a1.2010.csv')
hh = pd.read_csv(hh_path)
hh = hh[hh['HHT'] > 0]  # Filter out group-quarters (GQ) households
hh = hh.set_index('HHID')
hh.index.name = 'household_id'
hh = hh.rename(columns={'TAZ': 'taz'})
hh['building_id'] = -1

# Get the TAZ-level dwelling unit controls, just for reference. This file
# also contains the employment totals by sector/zone.
taz_controls_csv = loader.get_path('hh/taz2010_imputation.csv')
targetunits = pd.read_csv(taz_controls_csv, index_col='taz1454')

targetunits['hh'] = hh.groupby('taz').size()

df = targetunits[['targetunits', 'hh']]
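
# A small hedged check implied by the comparison above: the per-TAZ gap
# between the dwelling-unit control and the synthesized household count
# (the `deficit` column name is new here).
df['deficit'] = df.targetunits - df.hh.fillna(0)
print df.deficit.describe()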
Example #4
fee_schedule = db_to_df(
    'select fee_schedule_id, development_type_id, development_fee_per_unit_space_initial from staging.fee_schedule'
)
parcel_fee_schedule = db_to_df(
    'select parcelid as parcel_id, fee_schedule_id, portion from staging.parcel_fee_schedule'
)

# Remove unnecessary id columns appended by spandex
for df in [buildings, jobs, households, assessor_transactions, zoning]:
    if 'id' in df.columns:
        del df['id']

zoning = zoning.set_index('zoning_id')

# Get OSM nodes and edges for Pandana
nodes_path = loader.get_path('travel/nodes.csv')
edges_path = loader.get_path('travel/edges.csv')
nodes = pd.read_csv(nodes_path).set_index('node_id')
edges = pd.read_csv(edges_path)
nodes.index.name = 'index'

# Building sqft per job, by development type
sqft_per_job = db_to_df('select * from staging.sqft_per_job_by_devtype;')
sqft_per_job = sqft_per_job[['luz_id', 'development_type_id', 'sqft_per_emp']]

# Get price datasets
costar = db_to_df('select * from public.costar')
if 'id' in costar.columns:
    del costar['id']

# Put tables in HDF5
    """Executes SQL query and returns DataFrame."""
    conn = loader.database._connection
    return sql.read_frame(query, conn)

# Load parcels as DataFrames for the imputation
parcels = db_to_df('select * from parcels;')
parcels = parcels.set_index('gid')

# Standardize the res_type field
parcels.res_type[parcels.res_type.isnull()] = 'other'
parcels.res_type[parcels.res_type == ''] = 'other'
parcels.res_type[np.in1d(
    parcels.res_type,
    ['FLATS', 'APTS', 'CONDO', 'SRO', 'LIVEWORK', 'mixed'])] = 'multi'
parcels.res_type[parcels.res_type == 'SINGLE'] = 'single'

# Load TAZ residential unit control totals and other zonal targets.
taz_controls_csv = loader.get_path('hh/taz2010_imputation.csv')
targetunits = pd.read_csv(taz_controls_csv, index_col='taz1454')

taz_controls_csv2 = loader.get_path('hh/tazsumm_redfin.csv')
targetvalues = pd.read_csv(taz_controls_csv2, index_col='taz')

nonres_sqft_zone = pd.DataFrame({
    'observed': parcels.groupby('taz').non_residential_sqft.sum(),
    'target': targetunits.targetnonressqft,
})

# For all employment points, translate to nonres sqft by multiplying employees by 250.
# Keep synthetic job-based buildings only where the parcel has no residential use and
# less than 500 sqft of existing space.
# For each TAZ, calculate the difference needed to match the aggregate target.
# If nonres sqft needs to be incremented upwards, sort the synthetic buildings by
# sqft and take the top rows that cover the needed difference.
# If there are no valid job points and no existing nonres sqft, introduce a synthetic
# building in the TAZ equal to the target and put it on the biggest parcel.
# Do the same when the TAZ has no parcels (and add a synthetic parcel as well).
# Scale to match the target (a rough sketch of these steps follows below).
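# A rough sketch of the per-TAZ reconciliation described above.
# `synthetic_bldgs` (candidate job-based buildings with `taz` and
# `non_residential_sqft` columns) is a hypothetical intermediate table; only
# the sort-and-take-the-top idea comes from the notes.
diff = (nonres_sqft_zone.target - nonres_sqft_zone.observed).fillna(0)
for taz, gap in diff[diff > 0].iteritems():
    candidates = synthetic_bldgs[synthetic_bldgs.taz == taz].sort(
        'non_residential_sqft', ascending=False)
    cum = candidates.non_residential_sqft.cumsum()
    keep = candidates[cum.shift(1).fillna(0) < gap]  # smallest set covering the gap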
Example #6

loader = TableLoader()

# Download puma 2000 geometry zip files
for i in range(73):
    if i < 10:
        filename = 'p50%s_d00_shp.zip' % i
    else:
        filename = 'p5%s_d00_shp.zip' % i

    try:
        pumageom_file = urllib.URLopener()
        pumageom_file.retrieve(
            "http://www2.census.gov/geo/tiger/PREVGENZ/pu/p500shp/%s" %
            filename, os.path.join(loader.get_path('puma_geom'), filename))
        print 'Downloading %s' % filename
    except IOError:
        # Not every index in the range corresponds to a file; skip missing ones.
        continue

# Unzip and add prj file to puma 2000 geometry
for i in range(73):
    if i < 10:
        filename = 'p50%s_d00_shp.zip' % i
    else:
        filename = 'p5%s_d00_shp.zip' % i
    filepath = os.path.join(loader.get_path('puma_geom'), filename)

    if os.path.exists(filepath):
        print 'Unzipping and adding prj to %s' % filename
Example #7
assessor_transactions = db_to_df('select * from assessor_transactions').set_index('building_id')
zoning = db_to_df('select * from zoning')
zoning_allowed_uses = db_to_df('select zoning_id, development_type_id from zoning_allowed_uses')
fee_schedule = db_to_df('select fee_schedule_id, development_type_id, development_fee_per_unit_space_initial from staging.fee_schedule')
parcel_fee_schedule = db_to_df('select parcelid as parcel_id, fee_schedule_id, portion from staging.parcel_fee_schedule')


# Remove unnecessary id columns appended by spandex
for df in [buildings, jobs, households, assessor_transactions, zoning]:
    if 'id' in df.columns:
        del df['id']
        
zoning = zoning.set_index('zoning_id')

# Get OSM nodes and edges for Pandana
nodes_path = loader.get_path('travel/nodes.csv')
edges_path = loader.get_path('travel/edges.csv')
nodes = pd.read_csv(nodes_path).set_index('node_id')
edges = pd.read_csv(edges_path)
nodes.index.name = 'index'

# Building sqft per job, by development type
sqft_per_job = db_to_df('select * from staging.sqft_per_job_by_devtype;')
sqft_per_job = sqft_per_job[['luz_id', 'development_type_id', 'sqft_per_emp']]
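
# A hedged sketch of how the lookup above could be applied: merge the
# sqft-per-employee factor onto buildings by land-use zone and development
# type to estimate job spaces. The `luz_id` and `non_residential_sqft`
# columns on buildings are assumptions here.
b = buildings.reset_index().merge(
    sqft_per_job, how='left', on=['luz_id', 'development_type_id'])
b['job_spaces'] = b.non_residential_sqft / b.sqft_per_emp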

# Get price datasets
costar = db_to_df('select * from public.costar')
if 'id' in costar.columns:
    del costar['id']

# Put tables in HDF5
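# A minimal sketch of the step named above, mirroring the HDFStore usage in
# Examples #13 and #15; which tables actually go into the store is an
# assumption here.
h5_path = loader.get_path('out/regeneration/summaries/bayarea_v3.h5')
store = pd.HDFStore(h5_path)
store['buildings'] = buildings
store['costar'] = costar
store.close()
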
import pandas as pd, numpy as np
import pandas.io.sql as sql
from pandas.io.excel import read_excel
from spandex.io import exec_sql,  df_to_db
from spandex import TableLoader

loader = TableLoader()

# Read Redfin CSV and load to database
redfin_csv_path = loader.get_path('built/bldg/homeprices/redfin_03feb14.csv')
redfin = pd.read_csv(redfin_csv_path)
redfin.index.name = 'idx'
df_to_db(redfin, 'redfin', schema=loader.tables.staging)

# Lat/long to point geometry, with the right SRID
exec_sql("ALTER TABLE staging.redfin ADD COLUMN geom geometry;")
exec_sql("UPDATE staging.redfin SET geom = ST_GeomFromText('POINT(' || longitude || ' ' || latitude || ')',4326);")
exec_sql("CREATE INDEX redfin_gidx on staging.redfin using gist (geom);")
exec_sql("SELECT UpdateGeometrySRID('staging', 'redfin', 'geom', 2768);")
exec_sql("UPDATE staging.redfin SET geom = ST_TRANSFORM(ST_SetSRID(geom, 4326), 2768);")

# Append the unique parcel identifier to the Redfin records
exec_sql("alter table staging.redfin add gid integer default 0;")
exec_sql("update staging.redfin set gid = a.gid from parcels a where st_within(staging.redfin.geom, a.geom);")

def db_to_df(query):
    """Executes SQL query and returns DataFrame."""
    conn = loader.database._connection
    return sql.read_frame(query, conn)
Example #9
def db_to_df(query):
    """Executes SQL query and returns DataFrame."""
    conn = loader.database._connection
    return sql.read_frame(query, conn)

loader = TableLoader()

# Download puma 2000 geometry zip files
for i in range(73): 
    if i < 10:
        filename = 'p50%s_d00_shp.zip' % i
    else:
        filename = 'p5%s_d00_shp.zip' % i
    
    try:
        pumageom_file = urllib.URLopener()
        pumageom_file.retrieve("http://www2.census.gov/geo/tiger/PREVGENZ/pu/p500shp/%s" % filename, 
                          os.path.join(loader.get_path('puma_geom'), filename))
        print 'Downloading %s' % filename
    except IOError:
        # Not every index in the range corresponds to a file; skip missing ones.
        continue

# Unzip and add prj file to puma 2000 geometry
for i in range(73): 
    if i < 10:
        filename = 'p50%s_d00_shp.zip' % i
    else:
        filename = 'p5%s_d00_shp.zip' % i
    filepath = os.path.join(loader.get_path('puma_geom'), filename)
    
    if os.path.exists(filepath):
        print 'Unzipping and adding prj to %s' % filename
        
Example #10
import pandas.io.sql as sql


def db_to_df(query):
    """Executes SQL query and returns DataFrame."""
    conn = loader.database._connection
    return sql.read_frame(query, conn)


# Build parcels TableFrame.
loader = TableLoader()
table = loader.database.tables.public.parcels
tf = TableFrame(table, index_col='gid')

# Load TAZ residential unit control totals.
taz_controls_csv = loader.get_path('hh/taz2010_imputation.csv')
targetunits = pd.read_csv(taz_controls_csv, index_col='taz1454')['targetunits']

# Get CSV output file directory.
output_dir = loader.get_path('out/regeneration/summaries')

# Generate summary CSV by county and TAZ.
for grouper in ['county_id', 'taz']:
    df = tf[[grouper, 'non_residential_sqft', 'residential_units']]
    df.dropna(subset=[grouper], inplace=True)

    if grouper == 'taz':
        df[grouper] = df[grouper].astype(int)

    df['count'] = 1
    summary = df.groupby(grouper).sum()
    sr_grouped = df.groupby('gid')[attribute]
    if agg_function == 'median':
        var = sr_grouped.median()
    if agg_function == 'max':
        var = sr_grouped.max()
    if agg_function == 'sum':
        var = sr_grouped.sum()
    # Set bounds on valid values to use for imputation.
    var = var[(var > lower_bound) & (var < upper_bound)]
    return var


######## *LOADING* ########

#### REDFIN
# Read Redfin CSV and load to database
csv_to_staging(loader.get_path('built/bldg/homeprices/redfin_03feb14.csv'), 'redfin')
# Lat/long to point geometry, with the right SRID
lat_long_to_point_geometry('redfin', 'staging', 'longitude', 'latitude', 'geom', 2768)
# Append the unique parcel identifier to the Redfin records
append_parcel_identifier('redfin', 'staging', 'geom', 'gid')

#### GOV BUILDINGS
# Read Gov Building CSV and load to database
csv_to_staging(loader.get_path('built/bldg/add_buildings1.csv'), 'public_bldgs')
# Lat/long to point geometry, with the right SRID
lat_long_to_point_geometry('public_bldgs', 'staging', 'x', 'y', 'geom', 2768)
# Append the unique parcel identifier to the Gov Building records
append_parcel_identifier('public_bldgs', 'staging', 'geom', 'gid')

#### COSTAR
costar_xls_path = loader.get_path('built/bldg/costar/2011/costar_allbayarea.xlsx')
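
# A hedged sketch of the next step implied above: read the CoStar workbook and
# load it to the staging schema with df_to_db (used elsewhere in these
# examples); the sheet layout and table name are assumptions.
costar = pd.read_excel(costar_xls_path)
costar.index.name = 'index'
df_to_db(costar, 'costar', schema=loader.tables.staging)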
Example #12
    'luz_controls/pecas_PriceAndSpaceQuantity.csv',

    'assessor_transactions':
    'price/parcelTransactions.csv',
    
    'fee_schedule':
    'proformaInputs/fees/fee_schedule.csv',

    'parcel_fee_schedule':
    'proformaInputs/fees/parcel_fee_schedule.csv',
    
}

for tbl in csvs.iterkeys():
    print tbl
    csv = loader.get_path(csvs[tbl])
    df = pd.read_csv(csv)
    df.index.name = 'index'
    if df.isnull().sum().sum() > 0:
        for col in df.dtypes.iteritems():
            col_name = col[0]
            col_type = col[1]
            firstval = df[col_name].loc[0]
            # Treat genuinely boolean columns (not 0/1 integers) separately.
            if firstval in (True, False):
                if type(firstval) == bool:
                    df[col_name] = df[col_name].fillna(False)
            if col_type == np.int64:
                df[col_name] = df[col_name].fillna(0)
            elif col_type == np.float64:
                df[col_name] = df[col_name].fillna(0.0)
            elif col_type == np.object:
Example #13
import pandas as pd
from spandex import TableLoader
import pandas.io.sql as sql

loader = TableLoader()

def db_to_df(query):
    """Executes SQL query and returns DataFrame."""
    conn = loader.database._connection
    return sql.read_frame(query, conn)

# Export to HDF5: get the path to the output file.
h5_path = loader.get_path('out/regeneration/summaries/bayarea_v3.h5')

# Buildings
buildings = db_to_df('select * from building').set_index('building_id')
if 'id' in buildings.columns:
    del buildings['id']
buildings['building_type_id'] = 0
buildings.building_type_id[buildings.development_type_id == 1] = 1
buildings.building_type_id[buildings.development_type_id == 2] = 3
buildings.building_type_id[buildings.development_type_id == 5] = 12
buildings.building_type_id[buildings.development_type_id == 7] = 10
buildings.building_type_id[buildings.development_type_id == 9] = 5
buildings.building_type_id[buildings.development_type_id == 10] = 4
buildings.building_type_id[buildings.development_type_id == 13] = 8
buildings.building_type_id[buildings.development_type_id == 14] = 7
buildings.building_type_id[buildings.development_type_id == 15] = 9
buildings.building_type_id[buildings.development_type_id == 17] = 6
buildings.building_type_id[buildings.development_type_id == 24] = 16
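
# The same recode expressed as a lookup table, which may be easier to audit
# than the per-value assignments above (a sketch; the mapping is copied from
# those assignments, with unmapped development types left at 0).
dev_to_building_type = {1: 1, 2: 3, 5: 12, 7: 10, 9: 5, 10: 4,
                        13: 8, 14: 7, 15: 9, 17: 6, 24: 16}
buildings['building_type_id'] = (
    buildings.development_type_id.map(dev_to_building_type).fillna(0).astype(int))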
Example #14
from spandex.utils import load_config
from spandex.io import exec_sql, df_to_db
import pandas.io.sql as sql

def db_to_df(query):
    """Executes SQL query and returns DataFrame."""
    conn = loader.database._connection
    return sql.read_frame(query, conn)

# Build parcels TableFrame.
loader = TableLoader()
table = loader.database.tables.public.parcels
tf = TableFrame(table, index_col='gid')

# Load TAZ residential unit control totals.
taz_controls_csv = loader.get_path('hh/taz2010_imputation.csv')
targetunits = pd.read_csv(taz_controls_csv, index_col='taz1454')['targetunits']

# Get CSV output file directory.
output_dir = loader.get_path('out/regeneration/summaries')

# Generate summary CSV by county and TAZ.
for grouper in ['county_id', 'taz']:
    df = tf[[grouper, 'non_residential_sqft', 'residential_units']]
    df.dropna(subset=[grouper], inplace=True)

    if grouper == 'taz':
        df[grouper] = df[grouper].astype(int)

    df['count'] = 1
    summary = df.groupby(grouper).sum()
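
    # A hedged sketch of the write-out implied by "Generate summary CSV" above;
    # the file name is an assumption, and `import os` is assumed.
    summary.to_csv(os.path.join(output_dir, 'parcel_summary_by_%s.csv' % grouper))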
Example #15
import pandas as pd
import psycopg2
import pandas.io.sql as sql
from spandex import TableLoader

loader = TableLoader()

conn_string = "host='urbancanvas.cp2xwchuariu.us-west-2.rds.amazonaws.com' dbname='sandag_testing' user='******' password='******' port=5432"
conn = psycopg2.connect(conn_string)
cur = conn.cursor()

def uc_db_to_df(query):
    return sql.read_frame(query, conn)
    
parcels = uc_db_to_df("select parcel_id, zoning_id, devtype_id as development_type_id from parcel "
                      "where projects = '{1}' and valid_from = '{-infinity}';").set_index('parcel_id')
buildings = uc_db_to_df("SELECT building_id, parcel_id, building_type_id as development_type_id, improvement_value, "
                        "residential_units, non_residential_sqft, stories, year_built, residential_sqft, "
                        "note FROM building where projects = '{1}' and valid_from = '{-infinity}';").set_index('building_id')
                        
# Put tables in HDF5
h5_path = loader.get_path('out/sandag.h5')
store = pd.HDFStore(h5_path)

del store['buildings']
store['buildings'] = buildings

p_prev = store.parcels.copy()
p_prev['zoning_id'] = parcels.zoning_id
p_prev['development_type_id'] = parcels.development_type_id
del store['parcels']
store['parcels'] = p_prev

store.close()
Example #16
tags = soup.find_all(href=re.compile("csv_h..\.zip"))
hpums_links = []
for t in tags:
    hpums_links.append(t['href'])
    
tags = soup.find_all(href=re.compile("csv_p..\.zip"))
ppums_links = []
for t in tags:
    ppums_links.append(t['href'])

pums_links = hpums_links + ppums_links
for pums_file in pums_links:
    print pums_file
    pums_file_dl = urllib.URLopener()
    pums_file_dl.retrieve("http://www2.census.gov/acs2013_5yr/pums/%s" % pums_file, 
                      os.path.join(loader.get_path('pums'), pums_file))

for pums_file in pums_links:
    filepath = os.path.join(loader.get_path('pums'), pums_file)
    
    if os.path.exists(filepath):
        print 'Unzipping %s' % pums_file
        
        with zipfile.ZipFile(filepath, "r") as z:
            z.extractall(loader.get_path('pums'))

for pums_file in ['ss13husa.csv', 'ss13husb.csv', 
                  'ss13husc.csv', 'ss13husd.csv',
                  'ss13pusa.csv', 'ss13pusb.csv',
                  'ss13pusc.csv', 'ss13pusd.csv']:
    print 'Processing %s' % pums_file
Example #17
hpums_links = []
for t in tags:
    hpums_links.append(t['href'])

tags = soup.find_all(href=re.compile("csv_p..\.zip"))
ppums_links = []
for t in tags:
    ppums_links.append(t['href'])

pums_links = hpums_links + ppums_links
for pums_file in pums_links:
    print pums_file
    pums_file_dl = urllib.URLopener()
    pums_file_dl.retrieve(
        "http://www2.census.gov/acs2013_5yr/pums/%s" % pums_file,
        os.path.join(loader.get_path('pums'), pums_file))

for pums_file in pums_links:
    filepath = os.path.join(loader.get_path('pums'), pums_file)

    if os.path.exists(filepath):
        print 'Unzipping %s' % pums_file

        with zipfile.ZipFile(filepath, "r") as z:
            z.extractall(loader.get_path('pums'))

for pums_file in [
        'ss13husa.csv', 'ss13husb.csv', 'ss13husc.csv', 'ss13husd.csv',
        'ss13pusa.csv', 'ss13pusb.csv', 'ss13pusc.csv', 'ss13pusd.csv'
]:
    print 'Processing %s' % pums_file
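    # A hedged sketch of what the processing could be for each file: read the
    # CSV and load it to the staging schema with df_to_db, mirroring the
    # pattern used elsewhere in these examples; the actual processing step is
    # not shown in the excerpt.
    df = pd.read_csv(os.path.join(loader.get_path('pums'), pums_file))
    df.index.name = 'index'
    df_to_db(df, pums_file.replace('.csv', ''), schema=loader.tables.staging)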