def __init__(self, inputs):
    """
    Args:
        inputs: array containing a LeadLeft instance
    """
    # Load the ACS table and mark it as a cached target.
    acs_step = FromSQL(table='output.acs')
    acs_step.target = True
    # Expose the ACS result to this step under the 'acs' keyword.
    Step.__init__(self, inputs=inputs + [MapResults([acs_step], 'acs')])
def __init__(self, spacedeltas, dates, **kwargs):
    """Space-time aggregation of WIC enrollment records.

    Args:
        spacedeltas: spatial levels / time deltas, passed through to
            SpacetimeAggregation
        dates: aggregation (as-of) dates
    """
    SpacetimeAggregation.__init__(self,
            spacedeltas=spacedeltas,
            dates=dates,
            prefix='wicenroll',
            date_column='register_d',
            **kwargs)

    if not self.parallel:
        # Enrollment rows are linked to kids two ways: directly via
        # aux.kid_wics, and through the mother via aux.kid_mothers.
        # The three language codes and five public-assistance codes
        # are collapsed into arrays with nulls removed.
        self.inputs = [FromSQL(query="""
with enroll as (
SELECT kid_id, p.* FROM cornerstone.partenrl p
    join aux.kid_wics using (part_id_i)
UNION ALL
SELECT kid_id, p.* FROM cornerstone.partenrl p
    join aux.kid_mothers on p.part_id_i = mothr_id_i)
select *,
    array_remove(array[lang_1_c, lang_2_c, lang_3_c], null) as language,
    array_remove(array[pa_cde1_c, pa_cde2_c, pa_cde3_c,
                       pa_cde4_c, pa_cde5_c], null) as assistance
from enroll
""",
                parse_dates=['register_d', 'last_upd_d'],
                target=True)]
def __init__(self, spacedeltas, dates, **kwargs):
    """Space-time aggregation of birth records, keyed on date of birth.

    Args:
        spacedeltas: spatial levels / time deltas, passed through to
            SpacetimeAggregation
        dates: aggregation (as-of) dates
    """
    SpacetimeAggregation.__init__(self,
            spacedeltas=spacedeltas,
            dates=dates,
            prefix='wicbirth',
            date_column='date_of_birth',
            **kwargs)

    if not self.parallel:
        # nullif(..., 0): zero length/weight/head-circumference
        # measurements are treated as missing rather than literal zeros.
        # Birth complication codes are collapsed into a null-free array.
        self.inputs = [FromSQL(target=True, query="""
SELECT *, apgar_n::int as apgar,
    nullif(lgt_inch_n, 0) as length,
    nullif(wgt_grm_n, 0) as weight,
    nullif(headcirc_n, 0) as head_circumference,
    array_remove(array[
        inf_cmp1_c, inf_cmp2_c, inf_cmp3_c,
        inf_cmp4_c, inf_cmp5_c], null) as complication
FROM aux.kids
JOIN aux.kid_mothers USING (kid_id)
JOIN cornerstone.birth USING (part_id_i, mothr_id_i)
""", parse_dates=['date_of_birth'])]
def __init__(self, month, day, year_min, **kwargs):
    """
    Args:
        month: month of the annual as-of dates
        day: day of the month of the annual as-of dates
        year_min: first year to include
    """
    Step.__init__(self, month=month, day=day, year_min=year_min, **kwargs)

    # Kid-address history and kid attributes, merged per kid.
    ka = FromSQL(table='output.kid_addresses',
                 parse_dates=KID_ADDRESSES_PARSE_DATES,
                 target=True)
    kids = FromSQL(table='output.kids',
                   parse_dates=KIDS_PARSE_DATES,
                   to_str=['first_name', 'last_name'],
                   target=True)
    kid_addresses = Merge(on='kid_id', inputs=[ka, kids])

    addresses = FromSQL(table='output.addresses', target=True)

    self.inputs = [kid_addresses, addresses]
def __init__(self, spacedeltas, dates, parallel=False):
    """Space-time aggregation of kids, windowed by their address dates.

    Args:
        spacedeltas: spatial levels / time deltas
        dates: aggregation (as-of) dates
        parallel: whether to run the aggregation in parallel
    """
    SpacetimeAggregation.__init__(
        self,
        spacedeltas=spacedeltas,
        dates=dates,
        prefix='kids',
        aggregator_args=['date', 'index', 'delta'],
        date_column='address_min_date',
        max_date_column='address_max_date',
        parallel=parallel)

    if not self.parallel:
        # NOTE(review): only dates[0] is used to revise addresses —
        # presumably dates is non-empty and sorted ascending; confirm
        # with callers.
        kid_addresses = revise_kid_addresses(date=dates[0])
        addresses = FromSQL(table='output.addresses')
        addresses.target = True
        self.inputs = [
            Merge(inputs=[kid_addresses, addresses], on='address_id')
        ]
def __init__(self, **kwargs):
    """Load building inspections joined to their addresses."""
    Step.__init__(self, **kwargs)

    # min_date is the earlier of the inspection's two dates.
    query = """
select *, least(init_date, comply_date) as min_date
from output.inspections join output.addresses using (address_id)
"""
    self.inputs = [FromSQL(query=query,
                           parse_dates=['min_date', 'comply_date',
                                        'init_date'],
                           target=False)]
def __init__(self, indexes, **kwargs):
    """Aggregate assessor parcel data over the given indexes."""
    SimpleAggregation.__init__(self, indexes=indexes,
                               prefix='assessor', **kwargs)
    if self.parallel:
        return

    sql = ("select * from aux.assessor "
           "join output.addresses using (address)")
    self.inputs = [FromSQL(query=sql,
                           tables=['aux.assessor', 'output.addresses'],
                           target=True)]
def __init__(self, indexes, **kwargs):
    """Aggregate building data over the given indexes."""
    SimpleAggregation.__init__(self, indexes=indexes,
                               prefix='buildings', **kwargs)
    if self.parallel:
        return

    # One representative address per building: the lowest address_id,
    # selected with DISTINCT ON before the join.
    sql = ("select * from aux.buildings "
           "join (select distinct on (building_id) * "
           "from output.addresses order by building_id, address_id) a "
           "using (building_id)")
    self.inputs = [FromSQL(query=sql,
                           tables=['aux.buildings', 'output.addresses'],
                           target=True)]
def __init__(self, spacedeltas, dates, **kwargs):
    """Space-time aggregation of prenatal visits, keyed on visit date.

    Args:
        spacedeltas: spatial levels / time deltas, passed through to
            SpacetimeAggregation
        dates: aggregation (as-of) dates
    """
    SpacetimeAggregation.__init__(self,
            spacedeltas=spacedeltas,
            dates=dates,
            prefix='wicprenatal',
            date_column='visit_d',
            **kwargs)

    if not self.parallel:
        # Prenatal visits attach to kids through the mother's
        # participant id; only visits within 365 days (either side)
        # of the kid's date of birth are kept.
        self.inputs = [FromSQL(target=True, query="""
SELECT kid_id, date_of_birth, p.*
FROM aux.kids
JOIN aux.kid_mothers USING (kid_id)
JOIN cornerstone.birth b USING (part_id_i, mothr_id_i)
JOIN cornerstone.prenatl p ON b.mothr_id_i = p.part_id_i
where date_of_birth - visit_d between -365 and 365
""", parse_dates=['date_of_birth', 'visit_d'])]
def __init__(self, month, day, year_min, year_max, **kwargs):
    """Assemble the dataset: ACS features plus all aggregations
    joined onto the LeadLeft index.

    Args:
        month: month of the annual as-of dates
        day: day of the month of the annual as-of dates
        year_min: first year to include
        year_max: last year to include (inclusive)
    """
    Step.__init__(self, month=month, day=day,
                  year_min=year_min, year_max=year_max, **kwargs)

    acs = FromSQL(table='output.acs', target=True)
    left = LeadLeft(month=month, day=day, year_min=year_min, target=True)

    # One as-of date per year on the given month/day.
    # (tuple(...) takes the generator directly; the original's extra
    # inner parentheses were redundant.)
    dates = tuple(date(y, month, day)
                  for y in range(year_min, year_max + 1))

    self.aggregations = aggregations.all_dict(dates)
    # Join each aggregation onto the left index; the {'aux': None}
    # entry presumably drops the left's 'aux' result from the join —
    # verify against drain's inputs_mapping semantics.
    self.aggregation_joins = [
        AggregationJoin(target=True, inputs=[left, a],
                        inputs_mapping=[{'aux': None}, None])
        for a in self.aggregations.values()
    ]

    self.inputs = [acs, left] + self.aggregation_joins
    self.inputs_mapping = ['acs', {}] + [None] * len(self.aggregations)
'water', 'paint', 'window', 'wall', 'porch', 'chip', 'flak', 'peel'
]

# Violation status values paired with their boolean column names.
STATUS = (['OPEN', 'COMPLIED', 'NO ENTRY'],
          ['open', 'complied', 'no_entry'])

# One boolean column per keyword: true when the violation description
# or inspector comments match the keyword (case-insensitive regex).
KEYWORD_COLUMNS = str.join(', ', ("violation_description ~* '{0}' "
        "or violation_inspector_comments ~* '{0}' AS {0}".format(k)
        for k in KEYWORDS))
# One boolean column per status value.
STATUS_COLUMNS = str.join(', ',
        ("violation_status = '{0}' AS {1}".format(*s)
         for s in zip(*STATUS)))

# Building violations joined to addresses, with keyword and status
# indicator columns interpolated into the query.
violations = FromSQL("""
select a.*, violation_date, violation_status, violation_status_date,
%s, %s
from input.building_violations join output.addresses a using (address)
""" % (KEYWORD_COLUMNS, STATUS_COLUMNS),
        parse_dates=['violation_date', 'violation_status_date'],
        target=True)


class ViolationsAggregation(SpacetimeAggregation):
    def __init__(self, spacedeltas, dates, **kwargs):
        """Space-time aggregation of building violations.

        Violation status columns are censored by violation_status_date
        so statuses recorded after an as-of date are not leaked.
        """
        SpacetimeAggregation.__init__(
            self,
            spacedeltas=spacedeltas,
            dates=dates,
            prefix='violations',
            date_column='violation_date',
            censor_columns={'violation_status_date': ['violation_status']},
            **kwargs)
# most common combinations of event and res codes # TODO: consider including less common but useful ones # TODO: switch to Binarize step with min_freq! event_res_codes = [ 'REINS_C', 'INSSA_C', 'INSAR_N', 'CMPLY_C', 'ENVPH_C', 'INSAR_P', 'CONFL_C', 'INSAR_W', 'SATTY_C', 'INSAR_Z', 'INSAC_P', 'INSAC_N', 'INSAC_V', 'INSAR_J', 'INSAR_O', 'INSAR_G', 'INSAC_G', 'INSAC_Z', 'INSAC_W', 'INSSA_L', 'INSSA_M', 'INSAR_C', 'INSAC_J', 'INSAR_V', 'INSAC_O', 'INSSA_R', 'INSSA_W', 'INSAC_T', 'CONFL_Q', 'INSAR_T', 'INSAR_U', 'INSAC_C', 'CONTC_C', 'INSSA_D', 'INSAR_B', 'INSAC_U' ] events_table = FromSQL( """ select comp_date, event_code, res_code, addresses.* from stellar.event join aux.stellar_addresses on addr_id = id_number join output.addresses using (address_id) where class = 'I' """, tables=['stellar.event', 'aux.stellar_addresses', 'output.addresses'], parse_dates=['comp_date']) events_table.target = True class Events(Step): def __init__(self): Step.__init__(self, inputs=[events_table]) def run(self, event): # concatenate event and res code, e.g. 'REINS_C' event['event_res_code'] = event.event_code + '_' + event.res_code # binarize event code and event res codes
from drain import data
from drain.util import day
from drain.data import FromSQL, Merge
from drain.step import Step
from drain.aggregation import SpacetimeAggregation
from drain.aggregate import Count, Fraction, Aggregate, days
import pandas as pd

# Blood tests joined to their addresses, then to each kid's first
# bll>=6 and bll>=10 sample dates. Cached to disk (target).
tests = Merge(
    inputs=[
        Merge(inputs=[
            FromSQL(table='output.tests'),
            FromSQL(table='output.addresses')
        ], on='address_id'),
        # get kid first bll6 and bll10 counts to calculate incidences
        FromSQL("""
select kid_id, first_bll6_sample_date, first_bll10_sample_date
from output.kids
""")
    ],
    on='kid_id')
tests.target = True


class TestsAggregation(SpacetimeAggregation):
    def __init__(self, spacedeltas, dates, parallel=False):
        SpacetimeAggregation.__init__(self,
                inputs=[tests],
                spacedeltas=spacedeltas,
from drain.aggregate import Aggregate, Count, aggregate_counts, days
from drain.aggregation import SpacetimeAggregation
from drain.step import Construct
from drain.data import FromSQL, binarize, binarize_sets, select_regexes
from drain.util import list_filter_none, union

# Enrollment records linked to kids directly (aux.kid_wics) and via
# their mothers (aux.kid_mothers). Selected columns are renamed and
# rescaled: household income to units of $100k, medical-risk flag to a
# boolean, and language / assistance code groups to null-free arrays.
enroll = FromSQL(query="""
with enroll as (
SELECT kid_id, p.* FROM cornerstone.partenrl p
    join aux.kid_wics using (part_id_i)
UNION ALL
SELECT kid_id, p.* FROM cornerstone.partenrl p
    join aux.kid_mothers on p.part_id_i = mothr_id_i)
select kid_id, register_d, last_upd_d,
    med_risk_f = 'Y' as medical_risk,
    clinicid_i as clinic,
    emplymnt_c as employment,
    occptn_c as occupation,
    hsehld_n as household_size,
    hse_inc_a / 100000.0 as household_income,
    array_remove(array[lang_1_c, lang_2_c, lang_3_c], null) as language,
    array_remove(array[pa_cde1_c, pa_cde2_c, pa_cde3_c,
                       pa_cde4_c, pa_cde5_c], null) as assistance
from enroll
""",
        tables=['aux.kid_wics', 'aux.kid_mothers'],
        parse_dates=['register_d', 'last_upd_d'])

# One-hot encode the categorical columns, keeping only categories that
# occur at least 100 times.
enroll2 = Construct(binarize,
        inputs=[enroll],
        category_classes=['employment', 'occupation', 'clinic'],
        min_freq=100)
from drain.aggregation import SimpleAggregation
from drain.aggregate import Count, Aggregate, Proportion, Fraction
from drain.data import FromSQL
import numpy as np

# Assessor property-class flag columns.
CLASSES = ['residential', 'incentive', 'multifamily',
           'industrial', 'commercial', 'brownfield', 'nonprofit']

# Assessor parcels joined to addresses; a parcel whose apartment count
# is missing or zero counts as a single unit. Cached to disk (target).
assessor = FromSQL(query="select *, coalesce(nullif(apartments, 0), 1) as units "
                   "from aux.assessor "
                   "join output.addresses using (address)",
                   tables=['aux.assessor', 'output.addresses'])
assessor.target = True


class AssessorAggregation(SimpleAggregation):
    def __init__(self, indexes, parallel=False):
        SimpleAggregation.__init__(self, inputs=[assessor],
                indexes=indexes, prefix='assessor', parallel=parallel)

    @property
    def aggregates(self):
        return [
            Count(),
            # NOTE(review): 'assessents' looks like a typo for
            # 'assessments' — it is an output column name, kept as-is
            # to avoid breaking downstream feature names.
            Aggregate('count', 'mean', 'assessents'),
            # Dollar values are scaled to units of $100k.
            Aggregate(lambda a: a.land_value / 100000, 'mean',
                      name='land_value'),
            Aggregate(['min_age', 'max_age'], ['min', 'mean', 'max']),
            # residential total value and average value
            Fraction(
                Aggregate(lambda a: a.total_value.where(a.residential > 0) / 100000,
                          'sum', 'residential_total_value', fname=False),
                Aggregate(lambda a: a.units.where(a.residential > 0),
from drain.aggregation import SimpleAggregation
from drain.aggregate import Count, Aggregate, Proportion, Fraction
from drain.data import FromSQL
import numpy as np

# Boolean building-condition columns on aux.buildings.
CONDITIONS = [
    'condition_major', 'condition_minor', 'condition_uninhabitable',
    'condition_sound'
]

# Buildings joined to one representative address per building (the
# lowest address_id, selected with DISTINCT ON). Cached to disk.
buildings = FromSQL(
    query="select * from aux.buildings "
    "join (select distinct on (building_id) * "
    "from output.addresses order by building_id, address_id) a "
    "using (building_id)",
    tables=['aux.buildings', 'output.addresses'])
buildings.target = True


class BuildingsAggregation(SimpleAggregation):
    def __init__(self, indexes, parallel=False):
        SimpleAggregation.__init__(self,
                inputs=[buildings],
                indexes=indexes,
                prefix='buildings',
                parallel=parallel)

    @property
    def aggregates(self):
        return [
from drain.data import FromSQL
from drain.aggregate import Count
from drain.aggregation import SpacetimeAggregation

# Building-permit type flag columns.
PERMIT_TYPES = [
    'electric_wiring', 'elevator_equipment', 'signs', 'new_construction',
    'renovation_alteration', 'easy_permit_process', 'porch_construction',
    'wrecking_demolition', 'scaffolding', 'reinstate_revoked_pmt',
    'for_extension_of_pmt'
]

# Permits joined to addresses. Cached to disk (target).
permits = FromSQL(
    "select * from aux.building_permits join output.addresses using (address)",
    parse_dates=['issue_date'],
    target=True)


class PermitsAggregation(SpacetimeAggregation):
    def __init__(self, spacedeltas, dates, **kwargs):
        """Space-time aggregation of building permits by issue date."""
        SpacetimeAggregation.__init__(self,
                spacedeltas=spacedeltas,
                dates=dates,
                prefix='permits',
                date_column='issue_date',
                **kwargs)
        if not self.parallel:
            self.inputs = [permits]

    def get_aggregates(self, date, data):
        aggregates = [
from drain.step import Step
from drain import util, data
from drain.data import FromSQL, Merge
import pandas as pd
import numpy as np
import logging

# Module-level steps so the cached (target) results are shared by all
# consumers in this module.
kid_addresses = FromSQL(table='output.kid_addresses')
kid_addresses.target = True

kids = FromSQL(table='output.kids')
kids.target = True

addresses = FromSQL(table='output.addresses')
addresses.target = True


class LeadLeft(Step):
    """
    This Step produces a table with primary key (kid_id, address_id, date).
    It's called a "left" because it acts as an index onto which features
    are left joined.
    """
    def __init__(self, month, day, year_min):
        """
        Args:
            month: the month to use in the date index
            day: the day of the month to use in the date index
            year_min: the first year to include in the date index
        """
from drain.step import Step
from drain.util import timestamp, cross_join
from drain.data import FromSQL, Merge
import pandas as pd
import numpy as np
import logging

# Cached (target) addresses table shared by this module.
addresses = FromSQL(table='output.addresses')
addresses.target = True


class LeadAddressLeft(Step):
    """
    This Step simply adds dates to all addresses in the database.
    It is used by LeadData for building an address dataset.
    """
    def __init__(self, month, day, year_min, year_max):
        """
        Args:
            month: the month to use
            day: the day of the month to use
            year_min: the year to start
            year_max: the year to end
        """
        Step.__init__(self, month=month, day=day,
                      year_min=year_min, year_max=year_max,
                      inputs=[addresses])

    def run(self, addresses):
        """
        Returns:
          - left: the cross product of the output.addresses table
            with the specified dates.
from drain import data
from drain.util import day
from drain.data import FromSQL, Merge
from drain.step import Step
from drain.aggregation import SpacetimeAggregation
from drain.aggregate import Count, Fraction, Aggregate, days
import pandas as pd
import logging

# TODO: make this more efficient by not including unnecessary address columns
tests = FromSQL(table='output.tests', parse_dates=['date'], target=True)

addresses = FromSQL(table='output.addresses', target=True)


class TestsAggregation(SpacetimeAggregation):
    def __init__(self, spacedeltas, dates, **kwargs):
        """Space-time aggregation of blood tests by test date."""
        SpacetimeAggregation.__init__(self,
                spacedeltas=spacedeltas,
                dates=dates,
                prefix='tests',
                date_column='date',
                **kwargs)
        if not self.parallel:
            # Join tests to their addresses per address_id.
            self.inputs = [Merge(inputs=[tests, addresses],
                                 on='address_id')]

    def get_aggregates(self, date, delta):
        # Number of distinct kids tested in the window.
        kid_count = Aggregate('kid_id', 'nunique',
                              name='kid_count', fname=False)
        aggregates = [
            Count(),
            Aggregate('bll', ['mean', 'median', 'max', 'min', 'std']),