Example #1
    def __init__(self, inputs):
        """
        Args:
            inputs: list containing a LeadLeft instance
        """
        acs = FromSQL(table='output.acs')
        acs.target = True
        inputs = inputs + [MapResults([acs], 'acs')]

        Step.__init__(self, inputs=inputs)
Example #2
    def __init__(self, spacedeltas, dates, **kwargs):
        SpacetimeAggregation.__init__(self,
                                      spacedeltas=spacedeltas,
                                      dates=dates,
                                      prefix='wicenroll',
                                      date_column='register_d',
                                      **kwargs)

        if not self.parallel:
            self.inputs = [
                FromSQL(query="""
with enroll as (
SELECT kid_id, p.* 
FROM cornerstone.partenrl p join aux.kid_wics using (part_id_i)
UNION ALL
SELECT kid_id, p.*
FROM cornerstone.partenrl p join aux.kid_mothers on p.part_id_i = mothr_id_i)

select *, 
array_remove(array[lang_1_c, lang_2_c, lang_3_c], null) as language,
array_remove(array[pa_cde1_c, pa_cde2_c, pa_cde3_c, pa_cde4_c, pa_cde5_c], null) as assistance
from enroll 
""",
                        parse_dates=['register_d', 'last_upd_d'],
                        target=True)
            ]
Example #3
    def __init__(self, spacedeltas, dates, **kwargs):
        SpacetimeAggregation.__init__(self,
                                      spacedeltas=spacedeltas,
                                      dates=dates,
                                      prefix='wicbirth',
                                      date_column='date_of_birth',
                                      **kwargs)

        if not self.parallel:
            self.inputs = [
                FromSQL(target=True,
                        query="""
SELECT *, 
apgar_n::int as apgar,
nullif(lgt_inch_n, 0) as length,
nullif(wgt_grm_n, 0) as weight,
nullif(headcirc_n, 0) as head_circumference,
array_remove(array[
        inf_cmp1_c, inf_cmp2_c, inf_cmp3_c, inf_cmp4_c, inf_cmp5_c
], null) as complication
FROM aux.kids
JOIN aux.kid_mothers USING (kid_id)
JOIN cornerstone.birth USING (part_id_i, mothr_id_i)
""",
                        parse_dates=['date_of_birth'])
            ]
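            # Note (explanatory, not in the original): nullif(col, 0) turns
            # sentinel zeros (e.g. a length or weight that was never recorded)
            # into NULL so they drop out of downstream aggregates, and
            # apgar_n::int casts the raw score to an integer.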
Example #4
    def __init__(self, month, day, year_min, **kwargs):
        Step.__init__(self, month=month, day=day, year_min=year_min, **kwargs)

        kid_addresses = Merge(on='kid_id',
                              inputs=[
                                  FromSQL(
                                      table='output.kid_addresses',
                                      parse_dates=KID_ADDRESSES_PARSE_DATES,
                                      target=True),
                                  FromSQL(table='output.kids',
                                          parse_dates=KIDS_PARSE_DATES,
                                          to_str=['first_name', 'last_name'],
                                          target=True)
                              ])

        addresses = FromSQL(table='output.addresses', target=True)

        self.inputs = [kid_addresses, addresses]
Example #5
    def __init__(self, spacedeltas, dates, parallel=False):
        SpacetimeAggregation.__init__(
            self,
            spacedeltas=spacedeltas,
            dates=dates,
            prefix='kids',
            aggregator_args=['date', 'index', 'delta'],
            date_column='address_min_date',
            max_date_column='address_max_date',
            parallel=parallel)

        if not self.parallel:
            kid_addresses = revise_kid_addresses(date=dates[0])
            addresses = FromSQL(table='output.addresses')
            addresses.target = True
            self.inputs = [
                Merge(inputs=[kid_addresses, addresses], on='address_id')
            ]
Example #6
    def __init__(self, **kwargs):
        Step.__init__(self, **kwargs)
        self.inputs = [
            FromSQL(query="""
select *, least(init_date, comply_date) as min_date
from output.inspections join output.addresses using (address_id) 
""",
                    parse_dates=['min_date', 'comply_date', 'init_date'],
                    target=False)
        ]
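        # Note (explanatory, not in the original): Postgres least() picks the
        # earlier of init_date and comply_date and ignores a NULL argument, so
        # min_date is defined whenever either date is present.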
Example #7
    def __init__(self, indexes, **kwargs):
        SimpleAggregation.__init__(self,
                                   indexes=indexes,
                                   prefix='assessor',
                                   **kwargs)
        if not self.parallel:
            self.inputs = [
                FromSQL(query="select * from aux.assessor "
                        "join output.addresses using (address)",
                        tables=['aux.assessor', 'output.addresses'],
                        target=True)
            ]
Example #8
    def __init__(self, indexes, **kwargs):
        SimpleAggregation.__init__(self,
                                   indexes=indexes,
                                   prefix='buildings',
                                   **kwargs)
        if not self.parallel:
            self.inputs = [
                FromSQL(
                    query="select * from aux.buildings "
                    "join (select distinct on (building_id) * "
                    "from output.addresses order by building_id, address_id) a "
                    "using (building_id)",
                    tables=['aux.buildings', 'output.addresses'],
                    target=True)
            ]
Example #9
    def __init__(self, spacedeltas, dates, **kwargs):
        SpacetimeAggregation.__init__(self,
                                      spacedeltas=spacedeltas,
                                      dates=dates,
                                      prefix='wicprenatal',
                                      date_column='visit_d',
                                      **kwargs)

        if not self.parallel:
            self.inputs = [
                FromSQL(target=True,
                        query="""
SELECT kid_id, date_of_birth, p.*
FROM aux.kids
JOIN aux.kid_mothers USING (kid_id)
JOIN cornerstone.birth b USING (part_id_i, mothr_id_i)
JOIN cornerstone.prenatl p ON b.mothr_id_i = p.part_id_i
where date_of_birth - visit_d between -365 and 365
""",
                        parse_dates=['date_of_birth', 'visit_d'])
            ]
Example #10
    def __init__(self, month, day, year_min, year_max, **kwargs):
        Step.__init__(self,
                      month=month,
                      day=day,
                      year_min=year_min,
                      year_max=year_max,
                      **kwargs)

        acs = FromSQL(table='output.acs', target=True)
        left = LeadLeft(month=month, day=day, year_min=year_min, target=True)

        dates = tuple(
            (date(y, month, day) for y in range(year_min, year_max + 1)))
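        # For example (illustration only): month=1, day=1, year_min=2010 and
        # year_max=2012 give dates == (date(2010, 1, 1), date(2011, 1, 1),
        # date(2012, 1, 1)).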
        self.aggregations = aggregations.all_dict(dates)
        self.aggregation_joins = [
            AggregationJoin(target=True,
                            inputs=[left, a],
                            inputs_mapping=[{
                                'aux': None
                            }, None]) for a in self.aggregations.values()
        ]

        self.inputs = [acs, left] + self.aggregation_joins
        self.inputs_mapping = ['acs', {}] + [None] * len(self.aggregations)
Example #11
    'water', 'paint', 'window', 'wall', 'porch', 'chip', 'flak', 'peel'
]
STATUS = (['OPEN', 'COMPLIED', 'NO ENTRY'], ['open', 'complied', 'no_entry'])

KEYWORD_COLUMNS = str.join(
    ', ', ("violation_description ~* '{0}' "
           "or violation_inspector_comments ~* '{0}' AS {0}".format(k)
           for k in KEYWORDS))

STATUS_COLUMNS = str.join(', ', ("violation_status = '{0}' AS {1}".format(*s)
                                 for s in zip(*STATUS)))
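
# For illustration (not in the original): with k = 'water' the KEYWORD_COLUMNS
# fragment reads
#   violation_description ~* 'water' or violation_inspector_comments ~* 'water' AS water
# and the first STATUS pair ('OPEN', 'open') produces
#   violation_status = 'OPEN' AS open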

violations = FromSQL("""
select a.*, violation_date, violation_status, 
    violation_status_date, %s, %s
from input.building_violations 
join output.addresses a using (address)
""" % (KEYWORD_COLUMNS, STATUS_COLUMNS),
                     parse_dates=['violation_date', 'violation_status_date'],
                     target=True)


class ViolationsAggregation(SpacetimeAggregation):
    def __init__(self, spacedeltas, dates, **kwargs):
        SpacetimeAggregation.__init__(
            self,
            spacedeltas=spacedeltas,
            dates=dates,
            prefix='violations',
            date_column='violation_date',
            censor_columns={'violation_status_date': ['violation_status']},
            **kwargs)
Example #12
# most common combinations of event and res codes
# TODO: consider including less common but useful ones
# TODO: switch to Binarize step with min_freq!
event_res_codes = [
    'REINS_C', 'INSSA_C', 'INSAR_N', 'CMPLY_C', 'ENVPH_C', 'INSAR_P',
    'CONFL_C', 'INSAR_W', 'SATTY_C', 'INSAR_Z', 'INSAC_P', 'INSAC_N',
    'INSAC_V', 'INSAR_J', 'INSAR_O', 'INSAR_G', 'INSAC_G', 'INSAC_Z',
    'INSAC_W', 'INSSA_L', 'INSSA_M', 'INSAR_C', 'INSAC_J', 'INSAR_V',
    'INSAC_O', 'INSSA_R', 'INSSA_W', 'INSAC_T', 'CONFL_Q', 'INSAR_T',
    'INSAR_U', 'INSAC_C', 'CONTC_C', 'INSSA_D', 'INSAR_B', 'INSAC_U'
]
events_table = FromSQL(
    """
                select comp_date, event_code, res_code, addresses.*
                from stellar.event
                join aux.stellar_addresses on addr_id = id_number
                join output.addresses using (address_id)
                where class = 'I'
            """,
    tables=['stellar.event', 'aux.stellar_addresses', 'output.addresses'],
    parse_dates=['comp_date'])
events_table.target = True


class Events(Step):
    def __init__(self):
        Step.__init__(self, inputs=[events_table])

    def run(self, event):
        # concatenate event and res code, e.g. 'REINS_C'
        event['event_res_code'] = event.event_code + '_' + event.res_code
        # binarize event code and event res codes
Example #13
from drain import data
from drain.util import day
from drain.data import FromSQL, Merge
from drain.step import Step
from drain.aggregation import SpacetimeAggregation
from drain.aggregate import Count, Fraction, Aggregate, days

import pandas as pd

tests = Merge(
    inputs=[
        Merge(inputs=[
            FromSQL(table='output.tests'),
            FromSQL(table='output.addresses')
        ],
              on='address_id'),
        # get kid first bll6 and bll10 counts to calculate incidences
        FromSQL("""
        select kid_id, first_bll6_sample_date, first_bll10_sample_date 
        from output.kids
    """)
    ],
    on='kid_id')
tests.target = True


class TestsAggregation(SpacetimeAggregation):
    def __init__(self, spacedeltas, dates, parallel=False):
        SpacetimeAggregation.__init__(self,
                                      inputs=[tests],
                                      spacedeltas=spacedeltas,
Example #14
from drain.aggregate import Aggregate, Count, aggregate_counts, days
from drain.aggregation import SpacetimeAggregation
from drain.step import Construct
from drain.data import FromSQL, binarize, binarize_sets, select_regexes
from drain.util import list_filter_none, union

enroll = FromSQL(query="""
        with enroll as (
        SELECT kid_id, p.* 
        FROM cornerstone.partenrl p join aux.kid_wics using (part_id_i)
        UNION ALL
        SELECT kid_id, p.*
        FROM cornerstone.partenrl p join aux.kid_mothers on p.part_id_i = mothr_id_i)

select kid_id, register_d, last_upd_d,
    med_risk_f = 'Y' as medical_risk,
    clinicid_i as clinic,
    emplymnt_c as employment, 
    occptn_c as occupation,
    hsehld_n as household_size, hse_inc_a / 100000.0 as household_income,
array_remove(array[lang_1_c, lang_2_c, lang_3_c], null) as language,
array_remove(array[pa_cde1_c, pa_cde2_c, pa_cde3_c, pa_cde4_c, pa_cde5_c], null) as assistance
from enroll 
""",
                 tables=['aux.kid_wics', 'aux.kid_mothers'],
                 parse_dates=['register_d', 'last_upd_d'])

enroll2 = Construct(binarize,
                    inputs=[enroll],
                    category_classes=['employment', 'occupation', 'clinic'],
                    min_freq=100)
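
# Presumably (an assumption about drain.data.binarize, not stated in this
# snippet) this one-hot encodes the employment, occupation and clinic
# categoricals, keeping only categories that occur at least min_freq=100 times.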
Example #15
from drain.aggregation import SimpleAggregation
from drain.aggregate import Count, Aggregate, Proportion, Fraction
from drain.data import FromSQL

import numpy as np

CLASSES = ['residential', 'incentive', 'multifamily', 'industrial', 'commercial', 'brownfield', 'nonprofit']

assessor = FromSQL(query="select *, coalesce(nullif(apartments, 0), 1) as units "
                "from aux.assessor "
                "join output.addresses using (address)",
                tables=['aux.assessor', 'output.addresses'])
assessor.target = True
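# Note (explanatory, not in the original): coalesce(nullif(apartments, 0), 1)
# maps an apartment count of 0 or NULL to a single unit; any other count is
# used as-is.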

class AssessorAggregation(SimpleAggregation):
    def __init__(self, indexes, parallel=False):
        SimpleAggregation.__init__(self, inputs=[assessor], indexes=indexes, prefix='assessor', parallel=parallel)

    @property
    def aggregates(self):
        return [
            Count(),
            Aggregate('count', 'mean', 'assessments'),
            Aggregate(lambda a: a.land_value / 100000, 'mean', name='land_value'),
            Aggregate(['min_age', 'max_age'], ['min', 'mean', 'max']),

            # residential total value and average value
            Fraction(
                Aggregate(lambda a: a.total_value.where(a.residential > 0) / 100000,
                          'sum', 'residential_total_value', fname=False),
                Aggregate(lambda a: a.units.where(a.residential > 0),
Example #16
from drain.aggregation import SimpleAggregation
from drain.aggregate import Count, Aggregate, Proportion, Fraction
from drain.data import FromSQL

import numpy as np

CONDITIONS = [
    'condition_major', 'condition_minor', 'condition_uninhabitable',
    'condition_sound'
]

buildings = FromSQL(
    query="select * from aux.buildings "
    "join (select distinct on (building_id) * "
    "from output.addresses order by building_id, address_id) a "
    "using (building_id)",
    tables=['aux.buildings', 'output.addresses'])
buildings.target = True


class BuildingsAggregation(SimpleAggregation):
    def __init__(self, indexes, parallel=False):
        SimpleAggregation.__init__(self,
                                   inputs=[buildings],
                                   indexes=indexes,
                                   prefix='buildings',
                                   parallel=parallel)

    @property
    def aggregates(self):
        return [
Example #17
from drain.data import FromSQL
from drain.aggregate import Count
from drain.aggregation import SpacetimeAggregation

PERMIT_TYPES = [
    'electric_wiring', 'elevator_equipment', 'signs', 'new_construction',
    'renovation_alteration', 'easy_permit_process', 'porch_construction',
    'wrecking_demolition', 'scaffolding', 'reinstate_revoked_pmt',
    'for_extension_of_pmt'
]

permits = FromSQL(
    "select * from aux.building_permits join output.addresses using (address)",
    parse_dates=['issue_date'],
    target=True)


class PermitsAggregation(SpacetimeAggregation):
    def __init__(self, spacedeltas, dates, **kwargs):
        SpacetimeAggregation.__init__(self,
                                      spacedeltas=spacedeltas,
                                      dates=dates,
                                      prefix='permits',
                                      date_column='issue_date',
                                      **kwargs)

        if not self.parallel:
            self.inputs = [permits]

    def get_aggregates(self, date, data):
        aggregates = [
Example #18
from drain.step import Step
from drain import util, data
from drain.data import FromSQL, Merge

import pandas as pd
import numpy as np
import logging

kid_addresses = FromSQL(table='output.kid_addresses')
kid_addresses.target = True

kids = FromSQL(table='output.kids')
kids.target = True

addresses = FromSQL(table='output.addresses')
addresses.target = True


class LeadLeft(Step):
    """
    This Step produces a table with primary key (kid_id, address_id, date).
    It's called a "left" because it acts as an index onto which features are
    left joined.
    """
    def __init__(self, month, day, year_min):
        """
        Args:
            month: the month to use in the date index
            day: the day of the month to use in the date index
            year_min: the first year to include in the date index
        """
Example #19
from drain.step import Step
from drain.util import timestamp, cross_join
from drain.data import FromSQL, Merge

import pandas as pd
import numpy as np
import logging

addresses = FromSQL(table='output.addresses')
addresses.target = True

class LeadAddressLeft(Step):
    """
    This Step simply adds dates to all addresses in the database. It is used
    by LeadData for building an address dataset.
    """
    def __init__(self, month, day, year_min, year_max):
        """
        Args:
            month: the month to use
            day: the day of the month to use
            year_min: the first year to include
            year_max: the last year to include
        """
        Step.__init__(self, month=month, day=day, year_min=year_min, year_max=year_max, inputs=[addresses])

    def run(self, addresses):
        """
        Returns:
            - left: the cross product of the output.addresses table with the
                specified dates.
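
For illustration (a sketch, not the original run() body), such a cross product can be formed in pandas by joining on a constant key; the cross_join imported from drain.util above presumably does something equivalent.

import pandas as pd
from datetime import date

# hypothetical inputs, for illustration only
addresses = pd.DataFrame({'address_id': [1, 2]})
dates = pd.DataFrame({'date': [date(2012, 1, 1), date(2013, 1, 1)]})

# cross join: every address is paired with every date
left = (addresses.assign(_key=1)
        .merge(dates.assign(_key=1), on='_key')
        .drop(columns='_key'))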
Example #20
from drain import data
from drain.util import day
from drain.data import FromSQL, Merge
from drain.step import Step
from drain.aggregation import SpacetimeAggregation
from drain.aggregate import Count, Fraction, Aggregate, days

import pandas as pd
import logging

# TODO: make this more efficient by not including unnecessary address columns
tests = FromSQL(table='output.tests', parse_dates=['date'], 
        target=True)
addresses = FromSQL(table='output.addresses', target=True)

class TestsAggregation(SpacetimeAggregation):
    def __init__(self, spacedeltas, dates, **kwargs):
        SpacetimeAggregation.__init__(self,
            spacedeltas=spacedeltas, dates=dates, prefix='tests',
            date_column='date', **kwargs)

        if not self.parallel:
            self.inputs = [Merge(inputs=[tests, addresses], on='address_id')]

    def get_aggregates(self, date, delta):
        kid_count = Aggregate('kid_id', 'nunique', 
                name='kid_count', fname=False)

        aggregates = [
            Count(),
            Aggregate('bll', ['mean', 'median', 'max', 'min', 'std']),