Python Aggregate示例，drain.aggregate.Aggregate Python示例

示例#1

0

显示文件

    def get_aggregates(self, date, delta):
        """Assemble the aggregate definitions for this step.

        Args:
            date: reference date captured by the ``from_inspection`` and
                ``from_compliance`` lambdas.
            delta: not used in this method.

        Returns:
            A list of ``Count`` and ``Aggregate`` objects, ending with one
            ``closure_<code>`` proportion per entry of ``CLOSURE_CODES``.
        """
        specs = [Count()]

        for column in ('inspected', 'complied'):
            specs.append(Aggregate(column, 'max', fname=False))

        for column in ('hazard_int', 'hazard_ext', 'hazard', 'hazard_both'):
            specs.append(Count(column, prop=True))

        specs.append(Count('inspected'))
        specs.append(Count('complied', prop=True))
        specs.append(
            Aggregate('inspection_to_compliance', ['mean', 'min', 'max']))

        # Difference between ``date`` and each row's init/comply date, divided
        # by the externally defined ``day`` (presumably a one-day timedelta).
        specs.append(
            Aggregate(lambda row: (date - row.init_date) / day,
                      ['mean', 'min', 'max'],
                      name='from_inspection'))
        specs.append(
            Aggregate(lambda row: (date - row.comply_date) / day,
                      ['mean', 'min', 'max'],
                      name='from_compliance'))

        # ``c=code`` binds the current code as a default so each lambda keeps
        # its own value instead of reading the loop variable later.
        for code in CLOSURE_CODES:
            specs.append(
                Count(lambda row, c=code: row['closure'] == c,
                      name='closure_%s' % code,
                      prop=True))

        return specs

示例#2

0

显示文件

文件： wic.py 项目： xiaochi-liu/lead-public

    def get_aggregates(self, date, delta):
        """Return the aggregate definitions for prenatal records.

        The result of the first input is passed to ``select_regexes`` to
        obtain the ``service_.*`` entries that get summed. ``date`` and
        ``delta`` are not used in this method.
        """
        prenatal_result = self.inputs[0].get_result()
        service_columns = list(select_regexes(prenatal_result, ['service_.*']))

        specs = [
            Count(),
            Aggregate(days('visit_d', 'date_of_birth'), ['min', 'max'],
                      'visit'),
            Aggregate(service_columns, 'sum', fname=False),
        ]

        # Pregnancy history: maximum of each raw column under a readable name.
        history = (
            ('preg_nbr_n', 'previous_pregnancies'),
            ('lv_brth_n', 'previous_births'),
            ('othr_trm_n', 'previous_terminations'),
        )
        for column, label in history:
            specs.append(Aggregate(column, 'max', label, fname=False))

        # Smoking: 'Y' flag in any row, then the maximum daily cigarettes.
        specs.append(
            Aggregate(lambda row: row.smk3_mth_f == 'Y', 'any', 'smoked_3mo',
                      fname=False))
        specs.append(
            Aggregate('cig3_day_n', 'max', 'cigarettes_per_day', fname=False))

        # Drinking: 'Y' flag in any row, then the maxima of the two counts.
        specs.append(
            Aggregate(lambda row: row.drk3_mth_f == 'Y', 'any', 'drank_3mo',
                      fname=False))
        specs.append(
            Aggregate('dr_dy_wk_n', 'max', 'days_drank_per_week', fname=False))
        specs.append(
            Aggregate('drnk_day_n', 'max', 'drinks_per_day', fname=False))

        return specs

示例#3

0

显示文件

文件： wic.py 项目： xiaochi-liu/lead-public

    def get_aggregates(self, date, delta):
        """Return the aggregate definitions for enrollment records.

        The result of the first input is passed to ``select_regexes`` to
        obtain the employment/occupation/language/assistance/clinic entries
        that get summed. ``date`` and ``delta`` are not used in this method.
        """
        enroll_result = self.inputs[0].get_result()
        indicator_pattern = (
            '(employment|occupation|language|assistance|clinic)_.*')
        indicator_columns = list(
            select_regexes(enroll_result, [indicator_pattern]))

        return [
            Aggregate('medical_risk', 'any', fname=False),
            Aggregate(['household_size', 'household_income'],
                      ['median', 'max']),
            Aggregate(indicator_columns, 'sum', fname=False),
        ]

示例#4

0

显示文件

文件： buildings.py 项目： xiaochi-liu/lead-public

 def aggregates(self):
     """Return the list of aggregate definitions for buildings.

     The list mixes ``Count``, ``Aggregate`` and ``Proportion`` objects built
     from fixed column names plus one entry per value in ``CONDITIONS``.
     """
     return [
         Count(),
         Aggregate('area', 'sum'),
         Aggregate(lambda b: b.area * b.stories, 'mean', 'volume'),
         # Each ``years_built`` value is concatenated with the others before
         # the NaN-aware statistic is taken (assumes every value is an
         # array-like of years -- TODO confirm against the upstream step).
         Aggregate('years_built', [
             lambda y: np.nanmedian(np.concatenate(y.values)),
             lambda y: np.nanmean(np.concatenate(y.values)),
             lambda y: np.nanmin(np.concatenate(y.values)),
             lambda y: np.nanmax(np.concatenate(y.values)),
         ],
                   fname=['median', 'mean', 'min', 'max']),
         Aggregate('address_count', 'sum'),
         # average proportion of sound building condition
         Proportion(['%s_prop' % c for c in CONDITIONS],
                    'condition_not_null',
                    name=CONDITIONS),
         # ``c=c`` binds the condition when the lambda is created. Without
         # it every lambda looks up ``c`` only when called, after the
         # comprehension has finished, so all of them would test the last
         # condition's column.
         Aggregate([lambda p, c=c: p['%s_prop' % c] > 0 for c in CONDITIONS],
                   'any',
                   name=CONDITIONS),
         Aggregate('stories', 'mean'),
         Aggregate('units', 'sum'),
         Proportion('pre1978_prop',
                    lambda i: i.pre1978_prop.notnull(),
                    denom_name='pre1978_not_null'),
     ]

示例#5

0

显示文件

    def get_aggregates(self, date, delta):
        """Return the aggregate definitions built on raw enrollment columns.

        Each raw column is reduced either by a named function ('any',
        'median') or by a lambda that builds a set of the values. ``date``
        and ``delta`` are not used in this method.
        """
        return [
            Aggregate(lambda row: row.med_risk_f == 'Y', 'any',
                      'medical_risk', fname=False),

            # Sets of the values left after ``list_filter_none``.
            Aggregate('emplymnt_c',
                      lambda values: set(list_filter_none(values)),
                      'employment_status', fname=False),
            Aggregate('occptn_c',
                      lambda values: set(list_filter_none(values)),
                      'occupation', fname=False),

            Aggregate(['hsehld_n', 'hse_inc_a'], 'median',
                      ['household_size', 'household_income']),

            # Each value is turned into a set and the sets are merged with
            # the external ``union`` helper.
            Aggregate('language',
                      lambda groups: union(set(group) for group in groups),
                      fname=False),
            Aggregate('assistance',
                      lambda groups: union(set(group) for group in groups),
                      fname=False),

            Aggregate('clinicid_i', lambda ids: set(ids), 'clinic',
                      fname=False)
        ]

示例#6

0

显示文件

    def get_aggregates(self, date, delta):
        """Return the aggregate definitions for blood lead tests.

        Args:
            date: reference date used by the "days since" aggregates.
            delta: when equal to ``'all'`` the "days since" aggregates are
                appended; otherwise only the base aggregates are returned.
        """
        distinct_kids = Aggregate('kid_id', 'nunique',
                                  name='kid_count', fname=False)

        specs = [
            Count(),
            Aggregate('bll', ['mean', 'median', 'max', 'min', 'std']),
            Count(lambda test: test.bll <= 2, 'bll2', prop=True),
            Fraction(Count(['first_bll6', 'first_bll10']),
                     distinct_kids,
                     include_numerator=True,
                     include_denominator=True),
        ]

        if delta != 'all':
            return specs

        specs.append(
            Aggregate(days('date', date), ['min', 'max'], 'days_since_test'))
        # ``where`` keeps the test date only for rows at or above the
        # threshold, so the difference is defined only for those rows.
        specs.append(
            Aggregate(
                [
                    lambda test: (date - test.date.where(test.bll >= 6)) / day,
                    lambda test: (date - test.date.where(test.bll >= 10)) / day,
                ],
                ['min', 'max'],
                ['days_since_bll6', 'days_since_bll10']))
        return specs

示例#7

0

显示文件

 def aggregates(self):
     """Return the list of aggregate definitions for assessor records."""
     # Total value divided by summed apartments, with zero apartment counts
     # replaced by one before summing.
     total_value = Aggregate('total_value', 'sum', fname=False)
     unit_total = Aggregate(lambda row: row.apartments.replace(0, 1),
                            'sum',
                            name='units',
                            fname=False)

     specs = [
         Aggregate('count', 'mean'),
         Aggregate('land_value', 'sum'),
         Aggregate('age', ['min', 'mean', 'max']),
         Fraction(total_value,
                  unit_total,
                  include_numerator=True,
                  include_denominator=True),
     ]

     for column in ('rooms', 'beds', 'baths', 'building_area', 'land_area'):
         specs.append(Aggregate(column, 'sum'))

     # NOTE(review): 'owner_occupied' is passed positionally here; other
     # Proportion calls pass a denominator in that position and give the
     # name by keyword -- confirm which parameter this string fills.
     specs.append(Proportion(lambda row: row.owner_occupied > 0,
                             'owner_occupied'))
     # ``c=c`` binds each class name when its lambda is created.
     specs.append(Proportion([lambda row, c=c: row[c] > 0 for c in CLASSES],
                             name=CLASSES))
     return specs

示例#8

0

显示文件

    def get_aggregates(self, date, delta):
        """Return the aggregate definitions built on raw birth columns.

        ``date`` and ``delta`` are not used in this method.
        """
        specs = []

        # Maxima of the measurement columns, keeping the column names.
        for column in ('length', 'weight', 'head_circumference'):
            specs.append(Aggregate(column, 'max', fname=False))
        specs.append(Aggregate('apgar', 'max', 'apgar_score', fname=False))

        # Raw code columns collapsed to the set of observed values.
        specs.append(
            Aggregate('brth_typ_c', lambda codes: set(codes), 'place_type',
                      fname=False))
        specs.append(
            Aggregate('inf_disp_c', lambda codes: set(codes), 'disposition',
                      fname=False))
        # Each value is turned into a set and the sets are merged with the
        # external ``union`` helper.
        specs.append(
            Aggregate('complication',
                      lambda groups: union(set(group) for group in groups),
                      fname=False))

        # 'Y' flag present in any row.
        specs.append(
            Aggregate(lambda row: row.apors_f == 'Y', 'any', 'apors',
                      fname=False))
        specs.append(
            Aggregate(lambda row: row.icu_f == 'Y', 'any', 'icu',
                      fname=False))

        specs.append(
            Aggregate('clinicid_i', lambda ids: set(ids), 'clinic',
                      fname=False))
        return specs

示例#9

0

显示文件

文件： tests.py 项目： xiaochi-liu/lead-public

    def get_aggregates(self, date, delta):
        """Return the aggregate definitions for blood lead tests.

        Args:
            date: reference date for the window start and the "days since"
                aggregates.
            delta: ``'all'`` adds the "days since" aggregates; any other
                value is passed to ``data.parse_delta`` and adds the
                incidence aggregates instead.
        """
        distinct_kids = Aggregate('kid_id',
                                  'nunique',
                                  name='kid_count',
                                  fname=False)

        specs = [
            Count(),
            Aggregate('bll', ['mean', 'median', 'max', 'min', 'std']),
            Aggregate(lambda test: test.bll.where(test.increase),
                      ['mean', 'median', 'max', 'min', 'std'],
                      'increase_bll'),
            Count(lambda test: test.bll <= 2, 'bll2', prop=True),
            # prevalences
            Fraction(Count(['first_bll6', 'first_bll10']),
                     distinct_kids,
                     include_numerator=True,
                     include_denominator=True),
        ]

        if delta == 'all':
            specs.append(
                Aggregate(days('date', date), ['min', 'max'],
                          'days_since_test'))
            specs.append(
                Aggregate(
                    [
                        lambda test:
                        (date - test.date.where(test.bll >= 6)) / day,
                        lambda test:
                        (date - test.date.where(test.bll >= 10)) / day,
                    ],
                    ['min', 'max'],
                    ['days_since_bll6', 'days_since_bll10']))
            return specs

        # incidences: distinct kids whose first bll6/bll10 sample date is not
        # before the start of the window.
        # NOTE(review): if the sample-date columns are datetime64, comparing
        # NaT already yields False, so ``fillna(True)`` may have no effect --
        # confirm kids with no such sample are counted as intended.
        window_start = date - data.parse_delta(delta)
        without_bll6 = Aggregate(
            lambda kid: kid.kid_id.where(
                (kid.first_bll6_sample_date >= window_start).fillna(True)),
            'nunique',
            name='no_bll6_count',
            fname=False)
        without_bll10 = Aggregate(
            lambda kid: kid.kid_id.where(
                (kid.first_bll10_sample_date >= window_start).fillna(True)),
            'nunique',
            name='no_bll10_count',
            fname=False)

        specs.append(without_bll6)
        specs.append(without_bll10)
        specs.append(Count('first_bll6') / without_bll6)
        specs.append(Count('first_bll10') / without_bll10)
        return specs

示例#10

0

显示文件

文件： wic.py 项目： xiaochi-liu/lead-public

    def get_aggregates(self, date, delta):
        """Return the aggregate definitions for birth records.

        The result of the first input is passed to ``select_regexes`` to
        obtain the complication/place_type/disposition entries that get
        summed. ``date`` and ``delta`` are not used in this method.
        """
        births_result = self.inputs[0].get_result()
        indicator_columns = list(
            select_regexes(births_result,
                           ['(complication|place_type|disposition)_.*']))

        specs = [
            Aggregate(column, 'max', fname=False)
            for column in ('length', 'weight', 'head_circumference')
        ]
        specs.append(Aggregate('apgar', 'max', 'apgar_score', fname=False))
        specs.append(Aggregate(indicator_columns, 'sum', fname=False))
        # 'Y' flag present in any row.
        specs.append(
            Aggregate(lambda row: row.apors_f == 'Y', 'any', 'apors',
                      fname=False))
        specs.append(
            Aggregate(lambda row: row.icu_f == 'Y', 'any', 'icu',
                      fname=False))
        return specs

示例#11

0

显示文件

文件： kids.py 项目： xiaochi-liu/lead-public

    def get_aggregates(self, date, index, delta):
        """Return the aggregate definitions for kid/address records.

        Args:
            date: reference date used by the ``date_of_birth`` aggregate.
            index: ``'kid'`` returns a short list of per-kid maxima; any
                other value returns the full list.
            delta: ``'all'`` appends the ``date_of_birth`` aggregate.
        """
        # WIC-based aggregates are intentionally left out of both branches:
        # they can't be lagged and they're not useful for predicting
        # poisoning.
        if index == 'kid':
            per_kid_columns = [
                'test_address_count', 'address_count', 'test_count'
            ]
            return [
                Aggregate(per_kid_columns, 'max', fname=False),
                Aggregate(['max_bll'], 'max', fname=False),
            ]

        # Two counts: every kid (constant 1), and kids whose last sample is
        # more than 365 * 2 ``day`` units after birth or whose max bll is at
        # least 6.
        kid_counts = Count(
            [
                np.float32(1),
                lambda k: ((k.last_sample_date - k.date_of_birth) / day >
                           365 * 2) | (k.max_bll >= 6),
            ],
            ['kid', 'kid_sample_2y'])

        specs = [kid_counts]

        specs.append(
            Aggregate(['test_address_count', 'test_count', 'address_count'],
                      ['median', 'mean', 'min', 'max']))

        specs.append(
            Count(
                [
                    lambda k: k.address_test_min_date.notnull(),
                    lambda k: k.first_sample_date.notnull(),
                ],
                prop=True,
                name=['tested_here', 'tested_ever']))

        specs.append(
            Aggregate(
                [
                    days('address_min_date', 'address_max_date'),
                    days('address_test_min_date', 'address_test_max_date'),
                ],
                ['mean'],
                ['address_total_time', 'address_test_time']))

        # the first of these are kid level, not address-kid level
        # that means kids get double counted when aggregated to above the
        # address level if they lived in multiple addresses on that e.g.
        # census tract. oh well.
        specs.append(
            Aggregate(
                [
                    'max_bll', 'avg_bll', 'cumulative_bll',
                    'avg_cumulative_bll', 'mean_bll', 'address_max_bll',
                    'address_mean_bll'
                ],
                ['mean', 'median', 'min', 'max']))

        # ebll past, present, future, ever count the number of kids who
        # moved into this address in the period defined by date and delta
        # and who were poisoned before, during, after or ever relative to
        # their time living there
        specs.append(
            Fraction(
                Count(
                    [
                        lambda k: k.first_bll6_sample_date.notnull(),
                        lambda k: k.first_bll10_sample_date.notnull(),
                    ],
                    ['bll6_ever', 'bll10_ever']),
                kid_counts,
                include_numerator=True))
        specs.append(
            Fraction(
                Count(
                    [
                        lambda k:
                        k.first_bll6_sample_date > k.address_max_date,
                        lambda k:
                        k.first_bll10_sample_date > k.address_max_date,
                    ],
                    ['bll6_future', 'bll10_future']),
                kid_counts,
                include_numerator=True))
        specs.append(
            Fraction(
                Count(
                    [
                        lambda k:
                        k.first_bll6_sample_date < k.address_min_date,
                        lambda k:
                        k.first_bll10_sample_date < k.address_min_date,
                    ],
                    ['bll6_past', 'bll10_past']),
                kid_counts,
                include_numerator=True))
        specs.append(
            Fraction(
                Count(
                    [
                        lambda k: k.first_bll6_sample_date.between(
                            k.address_min_date, k.address_max_date),
                        lambda k: k.first_bll10_sample_date.between(
                            k.address_min_date, k.address_max_date),
                    ],
                    ['bll6_present', 'bll10_present']),
                kid_counts,
                include_numerator=True))

        specs.append(
            Aggregate('last_name', 'nunique', fname='count', astype=str))
        # TODO: min_last_sample_age cutoffs

        if delta == 'all':
            specs.append(
                Aggregate(days('date_of_birth', date),
                          ['min', 'max', 'mean'],
                          'date_of_birth'))

        return specs

示例#12

0

显示文件

    def get_aggregates(self, date, index, delta):
        """Return the aggregate definitions for kid/address records.

        Args:
            date: reference date used by the ``days(..., date)`` aggregates.
            index: ``'kid'`` returns a short list of per-kid aggregates; any
                other value returns the full list.
            delta: ``'all'`` appends the ``days_since_wic`` and
                ``date_of_birth`` aggregates.
        """
        if index == 'kid':
            # NOTE(review): the series below start with address_wic_max_date
            # while the names start with 'address_wic_min_date' -- confirm
            # whether the first two names are meant to be swapped.
            wic_day_series = [
                days('address_wic_max_date', date),
                days('address_wic_min_date', date),
                days('last_wic_date', date),
                days('first_wic_date', date),
            ]
            wic_day_names = [
                'address_wic_min_date', 'address_wic_max_date',
                'last_wic_date', 'first_wic_date'
            ]
            return [
                Aggregate(['address_count', 'test_count'], 'max',
                          fname=False),
                Aggregate(['max_bll'], 'max', fname=False),
                Aggregate(
                    lambda k: k.last_wic_date == k.address_wic_max_date,
                    'any', 'last_wic_address', fname=False),
                Aggregate(['address_wic_mother', 'address_wic_infant'],
                          'any', fname=False),
                Aggregate(wic_day_series, ['max'], wic_day_names,
                          fname=False),
            ]

        # Two counts: every kid (constant 1), and kids whose last sample is
        # more than 365 * 2 ``day`` units after birth or whose max bll is at
        # least 6.
        kid_counts = Count(
            [
                np.float32(1),
                lambda k: ((k.last_sample_date - k.date_of_birth) / day >
                           365 * 2) | (k.max_bll >= 6),
            ],
            ['kid', 'kid_sample_2y'])

        specs = [kid_counts]

        specs.append(
            Aggregate(['address_count', 'test_count'],
                      ['median', 'mean', 'min', 'max']))

        specs.append(
            Count(
                [
                    lambda k: k.address_test_min_date.notnull(),
                    lambda k: k.first_sample_date.notnull(),
                    lambda k: k.first_wic_date.notnull(),
                ],
                prop=True,
                name=['tested_here', 'tested_ever', 'wic']))

        # Tested-while-on-WIC counts, with kids that have a first WIC date
        # passed as ``parent``.
        specs.append(
            Count(
                [
                    lambda k: (k.address_wic_min_date.notnull() &
                               k.address_test_min_date.notnull()),
                    lambda k: (k.address_wic_min_date.notnull() &
                               k.first_sample_date.notnull()),
                ],
                name=['wic_tested_here', 'wic_tested_ever'],
                parent=lambda k: k.first_wic_date.notnull()))

        specs.append(
            Aggregate(
                [
                    days('address_min_date', 'address_max_date'),
                    days('address_wic_min_date', 'address_wic_max_date'),
                    days('address_test_min_date', 'address_test_max_date'),
                ],
                ['mean'],
                ['address_total_time', 'address_wic_time',
                 'address_test_time']))

        specs.append(
            Aggregate(
                ['max_bll', 'mean_bll', 'address_max_bll',
                 'address_mean_bll'],
                ['mean', 'median', 'min', 'max']))

        # Kids with a first bll6/bll10 sample ever, after, before or within
        # their time at the address, each as a fraction of ``kid_counts``.
        specs.append(
            Fraction(
                Count(
                    [
                        lambda k: k.first_bll6_sample_date.notnull(),
                        lambda k: k.first_bll10_sample_date.notnull(),
                    ],
                    ['bll6_ever', 'bll10_ever']),
                kid_counts,
                include_numerator=True))
        specs.append(
            Fraction(
                Count(
                    [
                        lambda k:
                        k.first_bll6_sample_date > k.address_max_date,
                        lambda k:
                        k.first_bll10_sample_date > k.address_max_date,
                    ],
                    ['bll6_future', 'bll10_future']),
                kid_counts,
                include_numerator=True))
        specs.append(
            Fraction(
                Count(
                    [
                        lambda k:
                        k.first_bll6_sample_date < k.address_min_date,
                        lambda k:
                        k.first_bll10_sample_date < k.address_min_date,
                    ],
                    ['bll6_past', 'bll10_past']),
                kid_counts,
                include_numerator=True))
        specs.append(
            Fraction(
                Count(
                    [
                        lambda k: k.first_bll6_sample_date.between(
                            k.address_min_date, k.address_max_date),
                        lambda k: k.first_bll10_sample_date.between(
                            k.address_min_date, k.address_max_date),
                    ],
                    ['bll6_present', 'bll10_present']),
                kid_counts,
                include_numerator=True))

        specs.append(
            Aggregate('last_name', 'nunique', fname='count', astype=str))
        # TODO: min_last_sample_age cutoffs

        if delta == 'all':
            specs.append(
                Aggregate(days('address_wic_min_date', date),
                          ['min', 'max'],
                          'days_since_wic'))
            specs.append(
                Aggregate(days('date_of_birth', date),
                          ['min', 'max', 'mean'],
                          'date_of_birth'))

        return specs

示例#13

0

显示文件

import pandas as pd
import numpy as np

from drain import util, data
from drain.aggregate import Count, Aggregate, Aggregator, Fraction
from drain.util import PgSQLDatabase

from sqlalchemy.dialects.postgresql import ARRAY
from sqlalchemy.types import Float

# Values of ``bldg_condi`` that get their own indicator, in the same order as
# the output names passed to ``cond`` below.
conditions = (
    'SOUND',
    'NEEDS MAJOR REPAIR',
    'NEEDS MINOR REPAIR',
    'UNINHABITABLE',
)

# One boolean series per condition, summed; ``c=c`` binds each condition to
# its own lambda at creation time.
cond = Aggregate(
    [(lambda bldg, c=c: bldg.bldg_condi == c) for c in conditions],
    'sum',
    name=[
        'condition_sound',
        'condition_major',
        'condition_minor',
        'condition_uninhabitable',
    ],
    fname=False)

aggregates = [
    Aggregate('area', 'mean', fname=False),
    Aggregate('year_built', lambda l: list(l), name='years_built',
              fname=False),
    Aggregate(lambda b: (b.t_add1 - b.f_add1) / 2 + 1,
              'max',
              name='address_count',
              fname=False),
    Aggregate('bldg_condi_not_null',
              'any',
              name='condition_not_null',

示例#14

0

显示文件

文件： assessor.py 项目： xiaochi-liu/lead-public

    def aggregates(self):
        """Return the list of aggregate definitions for assessor records."""
        # NOTE(review): 'assessents' looks like a typo for 'assessments', but
        # it is an output name, so it is kept as-is here.
        basics = [
            Count(),
            Aggregate('count', 'mean', 'assessents'),
            Aggregate(lambda row: row.land_value / 100000, 'mean',
                      name='land_value'),
            Aggregate(['min_age', 'max_age'], ['min', 'mean', 'max']),
        ]

        # residential total value and average value
        residential = Fraction(
            Aggregate(
                lambda row:
                row.total_value.where(row.residential > 0) / 100000,
                'sum', 'residential_total_value', fname=False),
            Aggregate(lambda row: row.units.where(row.residential > 0),
                      'sum', name='residential_units', fname=False),
            include_numerator=True,
            include_denominator=True)

        # non-residential total and average value
        non_residential = Fraction(
            Aggregate(
                lambda row:
                row.total_value.where(row.residential == 0) / 100000,
                'sum', 'non_residential_total_value', fname=False),
            Aggregate(lambda row: row.units.where(row.residential == 0),
                      'sum', name='non_residential_units', fname=False),
            include_numerator=True,
            include_denominator=True)

        sizes = [
            Aggregate('apartments', 'mean'),
            Aggregate('units', 'mean'),
            Aggregate(lambda row: row.rooms / row.units, 'mean',
                      name='rooms_per_unit'),
            Aggregate(lambda row: row.beds / row.units, 'mean',
                      name='beds_per_unit'),
            Aggregate(lambda row: row.baths / row.units, 'mean',
                      name='baths_per_unit'),
        ]

        # ``c=c`` binds each class name when its lambda is created.
        shares = [
            Proportion(lambda row: row.owner_occupied > 0,
                       name='owner_occupied'),
            Proportion([lambda row, c=c: row[c] > 0 for c in CLASSES],
                       name=CLASSES),
        ]

        return basics + [residential, non_residential] + sizes + shares

示例#15

0

显示文件

    def get_aggregates(self, date, delta):
        """Return the aggregate definitions built on raw prenatal columns.

        ``date`` and ``delta`` are not used in this method.
        """
        specs = [
            Count(),
            Aggregate(days('visit_d', 'date_of_birth'), ['min', 'max'],
                      'visit'),
            # Raw service codes collapsed to the set of observed values.
            Aggregate('serv_typ_c', lambda codes: set(codes), 'service',
                      fname=False),
        ]

        # Pregnancy history: maximum of each raw column under a readable name.
        history = (
            ('preg_nbr_n', 'previous_pregnancies'),
            ('lv_brth_n', 'previous_births'),
            ('othr_trm_n', 'previous_terminations'),
        )
        for column, label in history:
            specs.append(Aggregate(column, 'max', label, fname=False))

        # Smoking: 'Y' flag in any row, then the maximum daily cigarettes.
        specs.append(
            Aggregate(lambda row: row.smk3_mth_f == 'Y', 'any', 'smoked_3mo',
                      fname=False))
        specs.append(
            Aggregate('cig3_day_n', 'max', 'cigarettes_per_day', fname=False))

        # Drinking: 'Y' flag in any row, then the maxima of the two counts.
        specs.append(
            Aggregate(lambda row: row.drk3_mth_f == 'Y', 'any', 'drank_3mo',
                      fname=False))
        specs.append(
            Aggregate('dr_dy_wk_n', 'max', 'days_drank_per_week', fname=False))
        specs.append(
            Aggregate('drnk_day_n', 'max', 'drinks_per_day', fname=False))

        specs.append(
            Aggregate('clinicid_i', lambda ids: set(ids), 'clinic',
                      fname=False))
        return specs