def __init__(self, inputs):
    """
    Args:
        inputs: array containing a LeadLeft instance
    """
    # Load the ACS table and mark it as a cached target.
    acs_step = FromSQL(table='output.acs')
    acs_step.target = True
    # Expose the ACS result to this step under the 'acs' keyword.
    Step.__init__(self, inputs=inputs + [MapResults([acs_step], 'acs')])
def __init__(self, spacedeltas, dates, **kwargs):
    """Space-time aggregation of WIC enrollment records.

    Args:
        spacedeltas: spatial levels / time deltas, passed through to
            SpacetimeAggregation
        dates: aggregation (as-of) dates
    """
    SpacetimeAggregation.__init__(self,
            spacedeltas=spacedeltas,
            dates=dates,
            prefix='wicenroll',
            date_column='register_d',
            **kwargs)

    if not self.parallel:
        # Enrollment rows are linked to kids two ways: directly via
        # aux.kid_wics, and through the mother via aux.kid_mothers.
        # The three language codes and five public-assistance codes
        # are collapsed into arrays with nulls removed.
        self.inputs = [FromSQL(query="""
with enroll as (
SELECT kid_id, p.* FROM cornerstone.partenrl p
    join aux.kid_wics using (part_id_i)
UNION ALL
SELECT kid_id, p.* FROM cornerstone.partenrl p
    join aux.kid_mothers on p.part_id_i = mothr_id_i)
select *,
    array_remove(array[lang_1_c, lang_2_c, lang_3_c], null) as language,
    array_remove(array[pa_cde1_c, pa_cde2_c, pa_cde3_c,
                       pa_cde4_c, pa_cde5_c], null) as assistance
from enroll
""",
                parse_dates=['register_d', 'last_upd_d'],
                target=True)]
def __init__(self, spacedeltas, dates, **kwargs):
    """Space-time aggregation of birth records, keyed on date of birth.

    Args:
        spacedeltas: spatial levels / time deltas, passed through to
            SpacetimeAggregation
        dates: aggregation (as-of) dates
    """
    SpacetimeAggregation.__init__(self,
            spacedeltas=spacedeltas,
            dates=dates,
            prefix='wicbirth',
            date_column='date_of_birth',
            **kwargs)

    if not self.parallel:
        # nullif(..., 0): zero length/weight/head-circumference
        # measurements are treated as missing rather than literal zeros.
        # Birth complication codes are collapsed into a null-free array.
        self.inputs = [FromSQL(target=True, query="""
SELECT *, apgar_n::int as apgar,
    nullif(lgt_inch_n, 0) as length,
    nullif(wgt_grm_n, 0) as weight,
    nullif(headcirc_n, 0) as head_circumference,
    array_remove(array[
        inf_cmp1_c, inf_cmp2_c, inf_cmp3_c,
        inf_cmp4_c, inf_cmp5_c], null) as complication
FROM aux.kids
JOIN aux.kid_mothers USING (kid_id)
JOIN cornerstone.birth USING (part_id_i, mothr_id_i)
""", parse_dates=['date_of_birth'])]
def __init__(self, month, day, year_min, **kwargs):
    """
    Args:
        month: month of the annual as-of dates
        day: day of the month of the annual as-of dates
        year_min: first year to include
    """
    Step.__init__(self, month=month, day=day, year_min=year_min, **kwargs)

    # Kid-address history and kid attributes, merged per kid.
    ka = FromSQL(table='output.kid_addresses',
                 parse_dates=KID_ADDRESSES_PARSE_DATES,
                 target=True)
    kids = FromSQL(table='output.kids',
                   parse_dates=KIDS_PARSE_DATES,
                   to_str=['first_name', 'last_name'],
                   target=True)
    kid_addresses = Merge(on='kid_id', inputs=[ka, kids])

    addresses = FromSQL(table='output.addresses', target=True)

    self.inputs = [kid_addresses, addresses]
def __init__(self, spacedeltas, dates, parallel=False):
    """Space-time aggregation of kids, windowed by their address dates.

    Args:
        spacedeltas: spatial levels / time deltas
        dates: aggregation (as-of) dates
        parallel: whether to run the aggregation in parallel
    """
    SpacetimeAggregation.__init__(
        self,
        spacedeltas=spacedeltas,
        dates=dates,
        prefix='kids',
        aggregator_args=['date', 'index', 'delta'],
        date_column='address_min_date',
        max_date_column='address_max_date',
        parallel=parallel)

    if not self.parallel:
        # NOTE(review): only dates[0] is used to revise addresses —
        # presumably dates is non-empty and sorted ascending; confirm
        # with callers.
        kid_addresses = revise_kid_addresses(date=dates[0])
        addresses = FromSQL(table='output.addresses')
        addresses.target = True
        self.inputs = [
            Merge(inputs=[kid_addresses, addresses], on='address_id')
        ]
def __init__(self, **kwargs):
    """Load building inspections joined to their addresses."""
    Step.__init__(self, **kwargs)

    # min_date is the earlier of the inspection's two dates.
    query = """
select *, least(init_date, comply_date) as min_date
from output.inspections join output.addresses using (address_id)
"""
    self.inputs = [FromSQL(query=query,
                           parse_dates=['min_date', 'comply_date',
                                        'init_date'],
                           target=False)]
def __init__(self, indexes, **kwargs):
    """Aggregate assessor parcel data over the given indexes."""
    SimpleAggregation.__init__(self, indexes=indexes,
                               prefix='assessor', **kwargs)
    if self.parallel:
        return

    sql = ("select * from aux.assessor "
           "join output.addresses using (address)")
    self.inputs = [FromSQL(query=sql,
                           tables=['aux.assessor', 'output.addresses'],
                           target=True)]
def __init__(self, indexes, **kwargs):
    """Aggregate building data over the given indexes."""
    SimpleAggregation.__init__(self, indexes=indexes,
                               prefix='buildings', **kwargs)
    if self.parallel:
        return

    # One representative address per building: the lowest address_id,
    # selected with DISTINCT ON before the join.
    sql = ("select * from aux.buildings "
           "join (select distinct on (building_id) * "
           "from output.addresses order by building_id, address_id) a "
           "using (building_id)")
    self.inputs = [FromSQL(query=sql,
                           tables=['aux.buildings', 'output.addresses'],
                           target=True)]
def __init__(self, spacedeltas, dates, **kwargs):
    """Space-time aggregation of prenatal visits, keyed on visit date.

    Args:
        spacedeltas: spatial levels / time deltas, passed through to
            SpacetimeAggregation
        dates: aggregation (as-of) dates
    """
    SpacetimeAggregation.__init__(self,
            spacedeltas=spacedeltas,
            dates=dates,
            prefix='wicprenatal',
            date_column='visit_d',
            **kwargs)

    if not self.parallel:
        # Prenatal visits attach to kids through the mother's
        # participant id; only visits within 365 days (either side)
        # of the kid's date of birth are kept.
        self.inputs = [FromSQL(target=True, query="""
SELECT kid_id, date_of_birth, p.*
FROM aux.kids
JOIN aux.kid_mothers USING (kid_id)
JOIN cornerstone.birth b USING (part_id_i, mothr_id_i)
JOIN cornerstone.prenatl p ON b.mothr_id_i = p.part_id_i
where date_of_birth - visit_d between -365 and 365
""", parse_dates=['date_of_birth', 'visit_d'])]
def __init__(self, month, day, year_min, year_max, **kwargs):
    """Assemble the dataset: ACS features plus all aggregations
    joined onto the LeadLeft index.

    Args:
        month: month of the annual as-of dates
        day: day of the month of the annual as-of dates
        year_min: first year to include
        year_max: last year to include (inclusive)
    """
    Step.__init__(self, month=month, day=day,
                  year_min=year_min, year_max=year_max, **kwargs)

    acs = FromSQL(table='output.acs', target=True)
    left = LeadLeft(month=month, day=day, year_min=year_min, target=True)

    # One as-of date per year on the given month/day.
    # (tuple(...) takes the generator directly; the original's extra
    # inner parentheses were redundant.)
    dates = tuple(date(y, month, day)
                  for y in range(year_min, year_max + 1))

    self.aggregations = aggregations.all_dict(dates)
    # Join each aggregation onto the left index; the {'aux': None}
    # entry presumably drops the left's 'aux' result from the join —
    # verify against drain's inputs_mapping semantics.
    self.aggregation_joins = [
        AggregationJoin(target=True, inputs=[left, a],
                        inputs_mapping=[{'aux': None}, None])
        for a in self.aggregations.values()
    ]

    self.inputs = [acs, left] + self.aggregation_joins
    self.inputs_mapping = ['acs', {}] + [None] * len(self.aggregations)
'water', 'paint', 'window', 'wall', 'porch', 'chip', 'flak', 'peel'
]

# Violation status values paired with their boolean column names.
STATUS = (['OPEN', 'COMPLIED', 'NO ENTRY'],
          ['open', 'complied', 'no_entry'])

# One boolean column per keyword: true when the violation description
# or inspector comments match the keyword (case-insensitive regex).
KEYWORD_COLUMNS = str.join(', ', ("violation_description ~* '{0}' "
        "or violation_inspector_comments ~* '{0}' AS {0}".format(k)
        for k in KEYWORDS))
# One boolean column per status value.
STATUS_COLUMNS = str.join(', ',
        ("violation_status = '{0}' AS {1}".format(*s)
         for s in zip(*STATUS)))

# Building violations joined to addresses, with keyword and status
# indicator columns interpolated into the query.
violations = FromSQL("""
select a.*, violation_date, violation_status, violation_status_date,
%s, %s
from input.building_violations join output.addresses a using (address)
""" % (KEYWORD_COLUMNS, STATUS_COLUMNS),
        parse_dates=['violation_date', 'violation_status_date'],
        target=True)


class ViolationsAggregation(SpacetimeAggregation):
    def __init__(self, spacedeltas, dates, **kwargs):
        """Space-time aggregation of building violations.

        Violation status columns are censored by violation_status_date
        so statuses recorded after an as-of date are not leaked.
        """
        SpacetimeAggregation.__init__(
            self,
            spacedeltas=spacedeltas,
            dates=dates,
            prefix='violations',
            date_column='violation_date',
            censor_columns={'violation_status_date': ['violation_status']},
            **kwargs)
# most common combinations of event and res codes # TODO: consider including less common but useful ones # TODO: switch to Binarize step with min_freq! event_res_codes = [ 'REINS_C', 'INSSA_C', 'INSAR_N', 'CMPLY_C', 'ENVPH_C', 'INSAR_P', 'CONFL_C', 'INSAR_W', 'SATTY_C', 'INSAR_Z', 'INSAC_P', 'INSAC_N', 'INSAC_V', 'INSAR_J', 'INSAR_O', 'INSAR_G', 'INSAC_G', 'INSAC_Z', 'INSAC_W', 'INSSA_L', 'INSSA_M', 'INSAR_C', 'INSAC_J', 'INSAR_V', 'INSAC_O', 'INSSA_R', 'INSSA_W', 'INSAC_T', 'CONFL_Q', 'INSAR_T', 'INSAR_U', 'INSAC_C', 'CONTC_C', 'INSSA_D', 'INSAR_B', 'INSAC_U' ] events_table = FromSQL( """ select comp_date, event_code, res_code, addresses.* from stellar.event join aux.stellar_addresses on addr_id = id_number join output.addresses using (address_id) where class = 'I' """, tables=['stellar.event', 'aux.stellar_addresses', 'output.addresses'], parse_dates=['comp_date']) events_table.target = True class Events(Step): def __init__(self): Step.__init__(self, inputs=[events_table]) def run(self, event): # concatenate event and res code, e.g. 'REINS_C' event['event_res_code'] = event.event_code + '_' + event.res_code # binarize event code and event res codes
from drain import data
from drain.util import day
from drain.data import FromSQL, Merge
from drain.step import Step
from drain.aggregation import SpacetimeAggregation
from drain.aggregate import Count, Fraction, Aggregate, days
import pandas as pd

# Blood tests joined to their addresses, then to each kid's first
# bll>=6 and bll>=10 sample dates. Cached to disk (target).
tests = Merge(
    inputs=[
        Merge(inputs=[
            FromSQL(table='output.tests'),
            FromSQL(table='output.addresses')
        ], on='address_id'),
        # get kid first bll6 and bll10 counts to calculate incidences
        FromSQL("""
select kid_id, first_bll6_sample_date, first_bll10_sample_date
from output.kids
""")
    ],
    on='kid_id')
tests.target = True


class TestsAggregation(SpacetimeAggregation):
    def __init__(self, spacedeltas, dates, parallel=False):
        SpacetimeAggregation.__init__(self,
                inputs=[tests],
                spacedeltas=spacedeltas,
from drain.aggregate import Aggregate, Count, aggregate_counts, days
from drain.aggregation import SpacetimeAggregation
from drain.step import Construct
from drain.data import FromSQL, binarize, binarize_sets, select_regexes
from drain.util import list_filter_none, union

# Enrollment records linked to kids directly (aux.kid_wics) and via
# their mothers (aux.kid_mothers). Selected columns are renamed and
# rescaled: household income to units of $100k, medical-risk flag to a
# boolean, and language / assistance code groups to null-free arrays.
enroll = FromSQL(query="""
with enroll as (
SELECT kid_id, p.* FROM cornerstone.partenrl p
    join aux.kid_wics using (part_id_i)
UNION ALL
SELECT kid_id, p.* FROM cornerstone.partenrl p
    join aux.kid_mothers on p.part_id_i = mothr_id_i)
select kid_id, register_d, last_upd_d,
    med_risk_f = 'Y' as medical_risk,
    clinicid_i as clinic,
    emplymnt_c as employment,
    occptn_c as occupation,
    hsehld_n as household_size,
    hse_inc_a / 100000.0 as household_income,
    array_remove(array[lang_1_c, lang_2_c, lang_3_c], null) as language,
    array_remove(array[pa_cde1_c, pa_cde2_c, pa_cde3_c,
                       pa_cde4_c, pa_cde5_c], null) as assistance
from enroll
""",
        tables=['aux.kid_wics', 'aux.kid_mothers'],
        parse_dates=['register_d', 'last_upd_d'])

# One-hot encode the categorical columns, keeping only categories that
# occur at least 100 times.
enroll2 = Construct(binarize,
        inputs=[enroll],
        category_classes=['employment', 'occupation', 'clinic'],
        min_freq=100)
from drain.aggregation import SimpleAggregation
from drain.aggregate import Count, Aggregate, Proportion, Fraction
from drain.data import FromSQL
import numpy as np

# Assessor property-class flag columns.
CLASSES = ['residential', 'incentive', 'multifamily',
           'industrial', 'commercial', 'brownfield', 'nonprofit']

# Assessor parcels joined to addresses; a parcel whose apartment count
# is missing or zero counts as a single unit. Cached to disk (target).
assessor = FromSQL(query="select *, coalesce(nullif(apartments, 0), 1) as units "
                   "from aux.assessor "
                   "join output.addresses using (address)",
                   tables=['aux.assessor', 'output.addresses'])
assessor.target = True


class AssessorAggregation(SimpleAggregation):
    def __init__(self, indexes, parallel=False):
        SimpleAggregation.__init__(self, inputs=[assessor],
                indexes=indexes, prefix='assessor', parallel=parallel)

    @property
    def aggregates(self):
        return [
            Count(),
            # NOTE(review): 'assessents' looks like a typo for
            # 'assessments' — it is an output column name, kept as-is
            # to avoid breaking downstream feature names.
            Aggregate('count', 'mean', 'assessents'),
            # Dollar values are scaled to units of $100k.
            Aggregate(lambda a: a.land_value / 100000, 'mean',
                      name='land_value'),
            Aggregate(['min_age', 'max_age'], ['min', 'mean', 'max']),
            # residential total value and average value
            Fraction(
                Aggregate(lambda a: a.total_value.where(a.residential > 0) / 100000,
                          'sum', 'residential_total_value', fname=False),
                Aggregate(lambda a: a.units.where(a.residential > 0),
from drain.aggregation import SimpleAggregation
from drain.aggregate import Count, Aggregate, Proportion, Fraction
from drain.data import FromSQL
import numpy as np

# Boolean building-condition columns on aux.buildings.
CONDITIONS = [
    'condition_major', 'condition_minor', 'condition_uninhabitable',
    'condition_sound'
]

# Buildings joined to one representative address per building (the
# lowest address_id, selected with DISTINCT ON). Cached to disk.
buildings = FromSQL(
    query="select * from aux.buildings "
    "join (select distinct on (building_id) * "
    "from output.addresses order by building_id, address_id) a "
    "using (building_id)",
    tables=['aux.buildings', 'output.addresses'])
buildings.target = True


class BuildingsAggregation(SimpleAggregation):
    def __init__(self, indexes, parallel=False):
        SimpleAggregation.__init__(self,
                inputs=[buildings],
                indexes=indexes,
                prefix='buildings',
                parallel=parallel)

    @property
    def aggregates(self):
        return [
from drain.data import FromSQL
from drain.aggregate import Count
from drain.aggregation import SpacetimeAggregation

# Building-permit type flag columns.
PERMIT_TYPES = [
    'electric_wiring', 'elevator_equipment', 'signs', 'new_construction',
    'renovation_alteration', 'easy_permit_process', 'porch_construction',
    'wrecking_demolition', 'scaffolding', 'reinstate_revoked_pmt',
    'for_extension_of_pmt'
]

# Permits joined to addresses. Cached to disk (target).
permits = FromSQL(
    "select * from aux.building_permits join output.addresses using (address)",
    parse_dates=['issue_date'],
    target=True)


class PermitsAggregation(SpacetimeAggregation):
    def __init__(self, spacedeltas, dates, **kwargs):
        """Space-time aggregation of building permits by issue date."""
        SpacetimeAggregation.__init__(self,
                spacedeltas=spacedeltas,
                dates=dates,
                prefix='permits',
                date_column='issue_date',
                **kwargs)
        if not self.parallel:
            self.inputs = [permits]

    def get_aggregates(self, date, data):
        aggregates = [
from drain.step import Step
from drain import util, data
from drain.data import FromSQL, Merge
import pandas as pd
import numpy as np
import logging

# Module-level steps so the cached (target) results are shared by all
# consumers in this module.
kid_addresses = FromSQL(table='output.kid_addresses')
kid_addresses.target = True

kids = FromSQL(table='output.kids')
kids.target = True

addresses = FromSQL(table='output.addresses')
addresses.target = True


class LeadLeft(Step):
    """
    This Step produces a table with primary key (kid_id, address_id, date).
    It's called a "left" because it acts as an index onto which features
    are left joined.
    """
    def __init__(self, month, day, year_min):
        """
        Args:
            month: the month to use in the date index
            day: the day of the month to use in the date index
            year_min: the first year to include in the date index
        """
from drain.step import Step
from drain.util import timestamp, cross_join
from drain.data import FromSQL, Merge
import pandas as pd
import numpy as np
import logging

# Cached (target) addresses table shared by this module.
addresses = FromSQL(table='output.addresses')
addresses.target = True


class LeadAddressLeft(Step):
    """
    This Step simply adds dates to all addresses in the database.
    It is used by LeadData for building an address dataset.
    """
    def __init__(self, month, day, year_min, year_max):
        """
        Args:
            month: the month to use
            day: the day of the month to use
            year_min: the year to start
            year_max: the year to end
        """
        Step.__init__(self, month=month, day=day,
                      year_min=year_min, year_max=year_max,
                      inputs=[addresses])

    def run(self, addresses):
        """
        Returns:
          - left: the cross product of the output.addresses table
            with the specified dates.
from drain import data
from drain.util import day
from drain.data import FromSQL, Merge
from drain.step import Step
from drain.aggregation import SpacetimeAggregation
from drain.aggregate import Count, Fraction, Aggregate, days
import pandas as pd
import logging

# TODO: make this more efficient by not including unnecessary address columns
tests = FromSQL(table='output.tests', parse_dates=['date'], target=True)

addresses = FromSQL(table='output.addresses', target=True)


class TestsAggregation(SpacetimeAggregation):
    def __init__(self, spacedeltas, dates, **kwargs):
        """Space-time aggregation of blood tests by test date."""
        SpacetimeAggregation.__init__(self,
                spacedeltas=spacedeltas,
                dates=dates,
                prefix='tests',
                date_column='date',
                **kwargs)
        if not self.parallel:
            # Join tests to their addresses per address_id.
            self.inputs = [Merge(inputs=[tests, addresses],
                                 on='address_id')]

    def get_aggregates(self, date, delta):
        # Number of distinct kids tested in the window.
        kid_count = Aggregate('kid_id', 'nunique',
                              name='kid_count', fname=False)
        aggregates = [
            Count(),
            Aggregate('bll', ['mean', 'median', 'max', 'min', 'std']),