Пример #1
0
    def __init__(self,
                 query=None,
                 to_str=None,
                 table=None,
                 tables=None,
                 **kwargs):
        """
        Build the step from either a raw SQL query or a table name.

        Args:
            query: SQL text; when None a ``SELECT *`` over ``table`` is
                generated instead.
            to_str: forwarded to Step.__init__; None is replaced by [].
            table: table name used to generate the query when ``query`` is
                None. In that case ``tables`` is overwritten with [table].
            tables: table names used to set dependencies automatically; only
                used when the SQL_DIR environment variable is set.
            **kwargs: forwarded unchanged to Step.__init__.

        Raises:
            ValueError: if neither ``query`` nor ``table`` is given.
        """
        if query is None and table is None:
            raise ValueError("Must specify query or table")
        if query is None:
            query = "SELECT * FROM %s" % table
            tables = [table]

        if tables is not None and 'SQL_DIR' in os.environ:
            # each dotted table name maps to a path below SQL_DIR
            sql_dir = os.environ['SQL_DIR']
            dependencies = []
            for name in tables:
                dependencies.append(
                    os.path.join(sql_dir, name.replace('.', '/')))
            self.dependencies = dependencies

        Step.__init__(self,
                      query=query,
                      to_str=[] if to_str is None else to_str,
                      **kwargs)

        # without explicit inputs the step gets a single CreateEngine input
        if 'inputs' not in kwargs:
            self.inputs = [CreateEngine()]
Пример #2
0
    def __init__(self,
                 month,
                 day,
                 year,
                 train_years,
                 wic_lag=None,
                 train_query=None):
        """
        Register the split parameters and build the two inputs of the step.

        Args:
            month: the month of the train-test split
            day: the day of the train-test split
            year: the year of the train-test split
            train_years: the number of training years
            wic_lag: an optional lag for the wic data, in days
            train_query: an optional additional query for training

        Raises:
            ValueError: if year is not within YEAR_MIN..YEAR_MAX (inclusive).
                The check runs after Step.__init__ has been called.
        """
        step_args = dict(month=month,
                         day=day,
                         year=year,
                         train_years=train_years,
                         wic_lag=wic_lag,
                         train_query=train_query)
        Step.__init__(self, **step_args)

        year_in_range = YEAR_MIN <= year <= YEAR_MAX
        if not year_in_range:
            raise ValueError('Invalid year: %s' % year)

        # the revised kid addresses give a revised aux matrix for temporally
        # valid training queries
        split_date = date(year, month, day)
        revised_addresses = revise_kid_addresses(date=split_date)
        features = lead_data(month, day, wic_lag)
        self.inputs = [features, revised_addresses]
Пример #3
0
Файл: data.py Проект: dssg/drain
    def __init__(self, sql, id_column, max_date_column, min_date_column,
                 date_column, date, from_sql_args=None,
                 source_id_column=None, **kwargs):
        """
        Revise a query to the specified date.

        Args:
            sql: a path to a file or a string containing sql
            id_column: the entity id column(s) linking the result of the
                query with its source tables
            max_date_column: the maximum date column name for an entry in
                the result
            min_date_column: the minimum date column name for an entry in
                the result
            date_column: name of the date column in the source
            date: the date to revise at
            from_sql_args: dictionary of keyword arguments to pass to the
                input FromSQL steps, e.g. target=True, parse_dates
            source_id_column: forwarded to Step.__init__ and revise_sql
            **kwargs: forwarded to Step.__init__
        """
        Step.__init__(
            self,
            sql=sql,
            id_column=id_column,
            max_date_column=max_date_column,
            min_date_column=min_date_column,
            date_column=date_column,
            date=date,
            source_id_column=source_id_column,
            from_sql_args=from_sql_args,
            **kwargs)

        # an existing path is recorded as a dependency and replaced by the
        # file's contents
        if os.path.exists(sql):
            self.dependencies = [os.path.abspath(sql)]
            sql = util.read_file(sql)

        source_table, base_query = revise_helper(sql)
        revised_query = revise_sql(
            query=base_query,
            id_column=id_column,
            output_table=source_table,
            max_date_column=max_date_column,
            min_date_column=min_date_column,
            date_column=date_column,
            date=date,
            source_id_column=source_id_column)

        sql_args = {} if from_sql_args is None else from_sql_args
        source_step = FromSQL(table=source_table, **sql_args)
        # by depending on table, the revised query is given the right dependencies
        revised_step = FromSQL(revised_query, tables=[source_table], **sql_args)

        self.inputs = [source_step, revised_step]
        self.inputs_mapping = ['source', 'revised']
Пример #4
0
 def __init__(self, month, day, year_min, year_max):
     """
     Forward the date parameters to Step.__init__ with ``addresses`` as
     the only input.

     Args:
         month: the month to use
         day: the day of the month to use
         year_min: the year to start
         year_max: the year to end
     """
     Step.__init__(self,
                   month=month,
                   day=day,
                   year_min=year_min,
                   year_max=year_max,
                   inputs=[addresses])
Пример #5
0
 def __init__(self,
              return_estimator=False,
              return_feature_importances=True,
              return_predictions=True,
              prefit=False,
              **kwargs):
     """
     Forward all options to Step.__init__.

     Args:
         return_estimator: whether or not to return the fitted estimator
             object
         return_feature_importances: whether or not to return a DataFrame
             of feature names and their importances
         return_predictions: forwarded to Step.__init__ (default True)
         prefit: whether the estimator input is already fitted
         **kwargs: forwarded unchanged to Step.__init__
     """
     Step.__init__(self,
                   return_estimator=return_estimator,
                   return_feature_importances=return_feature_importances,
                   return_predictions=return_predictions,
                   prefit=prefit,
                   **kwargs)
Пример #6
0
Файл: data.py Проект: dssg/drain
    def __init__(self, table_name, **kwargs):
        """
        Forward the arguments to Step.__init__ and, when exactly one input
        is present afterwards, append a CreateDatabase step to the inputs.

        Args:
            table_name: a hack because name is a special kwarg currently
                TODO: use name once refactor/init is merged
            **kwargs: forwarded unchanged to Step.__init__
        """
        Step.__init__(self, table_name=table_name, **kwargs)

        has_single_input = len(self.inputs) == 1
        if has_single_input:
            self.inputs.append(CreateDatabase())
Пример #7
0
    def __init__(self, **kwargs):
        """
        Forward kwargs to Step.__init__ and set a single FromSQL input that
        joins output.inspections with output.addresses.
        """
        Step.__init__(self, **kwargs)

        # the query text is kept verbatim, including the leading newline and
        # the trailing space and newline
        inspections_query = (
            "\n"
            "select *, least(init_date, comply_date) as min_date\n"
            "from output.inspections join output.addresses using (address_id) \n"
        )
        date_columns = ['min_date', 'comply_date', 'init_date']
        inspections = FromSQL(query=inspections_query,
                              parse_dates=date_columns,
                              target=False)
        self.inputs = [inspections]
Пример #8
0
    def __init__(self, inputs):
        """
        Append an 'acs'-mapped FromSQL step over output.acs to the inputs.

        Args:
            inputs: array containing a LeadLeft instance; it is not modified,
                a new list is passed to Step.__init__
        """
        acs_table = FromSQL(table='output.acs')
        acs_table.target = True
        acs_input = MapResults([acs_table], 'acs')

        Step.__init__(self, inputs=inputs + [acs_input])
Пример #9
0
    def __init__(self,
                 sql,
                 id_column,
                 max_date_column,
                 min_date_column,
                 date_column,
                 date,
                 from_sql_args=None,
                 source_id_column=None,
                 **kwargs):
        """
        Revise a query to the specified date.

        Args:
            sql: a path to a file or a string containing sql
            id_column: the entity id column(s) linking the result of the
                query with its source tables
            max_date_column: the maximum date column name for an entry in
                the result
            min_date_column: the minimum date column name for an entry in
                the result
            date_column: name of the date column in the source
            date: the date to revise at
            from_sql_args: dictionary of keyword arguments to pass to the
                input FromSQL steps, e.g. target=True, parse_dates
            source_id_column: forwarded to Step.__init__ and revise_sql
            **kwargs: forwarded to Step.__init__
        """
        Step.__init__(self, sql=sql, id_column=id_column,
                      max_date_column=max_date_column,
                      min_date_column=min_date_column,
                      date_column=date_column, date=date,
                      source_id_column=source_id_column,
                      from_sql_args=from_sql_args, **kwargs)

        # sql may name an existing file: record it as a dependency and use
        # the file's contents as the query text
        if os.path.exists(sql):
            self.dependencies = [os.path.abspath(sql)]
            sql = util.read_file(sql)

        table, query = revise_helper(sql)
        revised_sql = revise_sql(query=query, id_column=id_column,
                                 output_table=table,
                                 max_date_column=max_date_column,
                                 min_date_column=min_date_column,
                                 date_column=date_column, date=date,
                                 source_id_column=source_id_column)

        extra_args = {} if from_sql_args is None else from_sql_args
        source = FromSQL(table=table, **extra_args)
        # by depending on table, the revised query is given the right dependencies
        revised = FromSQL(revised_sql, tables=[table], **extra_args)

        self.inputs = [source, revised]
        self.inputs_mapping = ['source', 'revised']
Пример #10
0
    def __init__(self, month, day, year_min):
        """
        Register the date-index parameters and set the two inputs: a Merge
        of ``kid_addresses`` and ``kids`` on kid_id, followed by ``addresses``.

        Args:
            month: the month to use in the date index
            day: the day of the month to use in the date index
            year_min: the first year to include in the date index
        """
        Step.__init__(self,
                      month=month,
                      day=day,
                      year_min=year_min)

        merged_kids = Merge(on='kid_id', inputs=[kid_addresses, kids])
        self.inputs = [merged_kids, addresses]
Пример #11
0
 def __init__(self,
              return_estimator=False,
              return_feature_importances=True,
              return_predictions=True,
              prefit=False,
              **kwargs):
     """
     Forward the four named options and any extra keyword arguments to
     Step.__init__ without modification.
     """
     options = dict(return_estimator=return_estimator,
                    return_feature_importances=return_feature_importances,
                    return_predictions=return_predictions,
                    prefit=prefit)
     Step.__init__(self, **dict(options, **kwargs))
Пример #12
0
def test_inputs_no_target(drain_setup):
    """
    get_inputs(step, target=False) must equal the set holding the
    non-target input Step(value=3, ...) and its own input Step(value=4).
    """
    inputs = [Step(value=2), Step(value=3, inputs=[Step(value=4)])]
    inputs[0].target = True

    step = Step(value=1, inputs=inputs)
    step.target = True

    expected = {Step(value=3, inputs=[Step(value=4)]), Step(value=4)}
    found = get_inputs(step, target=False)
    assert found == expected
Пример #13
0
 def __init__(self, fit, indexes, pars=None, parameter_keys=None):
     """
     Register ``fit`` and ``indexes`` as the inputs of this step.

     Args:
         fit: Step producing a StanFit object
         indexes: second input of the step. NOTE(review): presumably
             produces a dictionary of key: index pairs,
             e.g. {'mu': [1150, 2251,...]} - confirm against callers
         pars: forwarded unchanged to Step.__init__ (default None)
         parameter_keys: optional dictionary mapping parameter names
             to index keys, e.g. {'mu':'address_id'}; None becomes {}
     """
     Step.__init__(
         self,
         inputs=[fit, indexes],
         pars=pars,
         parameter_keys={} if parameter_keys is None else parameter_keys)
Пример #14
0
    def __init__(self, month, day, year_min, **kwargs):
        """
        Register the date parameters and set two inputs: a Merge on kid_id of
        output.kid_addresses with output.kids, and the output.addresses table.
        All FromSQL steps are created with target=True.
        """
        Step.__init__(self,
                      month=month,
                      day=day,
                      year_min=year_min,
                      **kwargs)

        kid_addresses_table = FromSQL(table='output.kid_addresses',
                                      parse_dates=KID_ADDRESSES_PARSE_DATES,
                                      target=True)
        kids_table = FromSQL(table='output.kids',
                             parse_dates=KIDS_PARSE_DATES,
                             to_str=['first_name', 'last_name'],
                             target=True)
        merged = Merge(on='kid_id', inputs=[kid_addresses_table, kids_table])

        address_table = FromSQL(table='output.addresses', target=True)

        self.inputs = [merged, address_table]
Пример #15
0
    def __init__(self, month, day, year_min, year_max, **kwargs):
        """
        Register the date parameters and wire up the inputs: the output.acs
        table, a LeadLeft step, and one AggregationJoin per aggregation
        returned by aggregations.all_dict.
        """
        Step.__init__(self,
                      month=month,
                      day=day,
                      year_min=year_min,
                      year_max=year_max,
                      **kwargs)

        acs = FromSQL(table='output.acs', target=True)
        left = LeadLeft(month=month, day=day, year_min=year_min, target=True)

        # one date per year, year_min through year_max inclusive
        years = range(year_min, year_max + 1)
        dates = tuple(date(y, month, day) for y in years)
        self.aggregations = aggregations.all_dict(dates)

        joins = []
        for aggregation in self.aggregations.values():
            joins.append(AggregationJoin(target=True,
                                         inputs=[left, aggregation],
                                         inputs_mapping=[{'aux': None}, None]))
        self.aggregation_joins = joins

        self.inputs = [acs, left] + joins
        self.inputs_mapping = ['acs', {}] + [None] * len(self.aggregations)
Пример #16
0
 def __init__(self, inputs,
              return_estimator=False,
              return_feature_importances=True,
              return_predictions=True,
              prefit=False,
              predict_train=False):
     """
     Forward the inputs and all options to Step.__init__.

     Args:
         inputs: forwarded to Step.__init__ as the step's inputs
         return_estimator: whether to return the fitted estimator object
         return_feature_importances: whether to return a DataFrame of
             feature importances
         return_predictions: forwarded to Step.__init__ (default True)
         prefit: whether the estimator input is already fitted
         predict_train: whether to make predictions on training set
     """
     Step.__init__(self,
                   inputs=inputs,
                   return_estimator=return_estimator,
                   return_feature_importances=return_feature_importances,
                   return_predictions=return_predictions,
                   prefit=prefit,
                   predict_train=predict_train)
Пример #17
0
    def __init__(self, month, day, year_min, **kwargs):
        """
        Register the date parameters, then set the inputs to a kid_id Merge
        of the kid address and kid tables followed by the address table.
        """
        Step.__init__(self, month=month, day=day, year_min=year_min, **kwargs)

        # both sides of the merge are read with target=True
        merge_sources = [
            FromSQL(table='output.kid_addresses',
                    parse_dates=KID_ADDRESSES_PARSE_DATES,
                    target=True),
            FromSQL(table='output.kids',
                    parse_dates=KIDS_PARSE_DATES,
                    to_str=['first_name', 'last_name'],
                    target=True),
        ]
        kid_addresses = Merge(on='kid_id', inputs=merge_sources)

        self.inputs = [
            kid_addresses,
            FromSQL(table='output.addresses', target=True),
        ]
Пример #18
0
def test_inputs_target2(drain_setup):
    """
    With inputs assigned after construction, get_inputs(step, target=True)
    must equal the set holding only the input flagged as target.
    """
    inputs = [Step(value=2), Step(value=3)]
    inputs[0].target = True

    step = Step(value=1)
    step.inputs = inputs
    step.target = True

    expected = {Step(value=2)}
    assert get_inputs(step, target=True) == expected
Пример #19
0
 def __init__(self,
              inputs,
              outcome_expr,
              aggregations,
              wic_sample_weight=0,
              exclude=None,
              include=None):
     """
     Forward the inputs and all options to Step.__init__.

     Args:
         inputs: list containing a LeadCrossValidate step
         outcome_expr: the query to perform on the auxillary information
             to produce an outcome variable
         aggregations: defines which of the SpacetimeAggregations to
             include and which to drop
         wic_sample_weight: optional different sample weight for wic kids
         exclude: forwarded to Step.__init__; None (the default) is
             replaced by a new empty list
         include: forwarded to Step.__init__; None (the default) is
             replaced by a new empty list
     """
     # A literal [] default is created once and shared by every call, so a
     # mutation through one step would leak into all others. None is used
     # as the sentinel and a fresh list is created per call instead.
     if exclude is None:
         exclude = []
     if include is None:
         include = []

     Step.__init__(self,
                   inputs=inputs,
                   outcome_expr=outcome_expr,
                   aggregations=aggregations,
                   wic_sample_weight=wic_sample_weight,
                   exclude=exclude,
                   include=include)
Пример #20
0
Файл: data.py Проект: dssg/drain
    def __init__(self, query=None, to_str=None, table=None, tables=None, **kwargs):
        """
        Create the step from a SQL query or, failing that, a table name.

        Use tables to automatically set dependencies: when tables is given
        and the SQL_DIR environment variable is set, each table name is
        mapped to a path under SQL_DIR (dots become path separators).

        Raises:
            ValueError: if both query and table are None.
        """
        if query is None:
            if table is None:
                raise ValueError("Must specify query or table")
            # a bare table supplies both the query and the dependency list
            tables = [table]
            query = "SELECT * FROM %s" % table

        sql_dir_set = 'SQL_DIR' in os.environ
        if tables is not None and sql_dir_set:
            root = os.environ['SQL_DIR']
            self.dependencies = [
                os.path.join(root, name.replace('.', '/'))
                for name in tables
            ]

        if to_str is None:
            to_str = []

        Step.__init__(self, query=query, to_str=to_str, **kwargs)

        inputs_given = 'inputs' in kwargs
        if not inputs_given:
            self.inputs = [CreateEngine()]
Пример #21
0
def test_drake_data4(drain_setup):
    """
    Two steps that each take an equal target input Step(b=1): the drake data
    must map Step(b=1) to an empty set and each step to {Step(b=1)}.
    """
    first = Step(a=1, inputs=[Step(b=1, target=True)])
    second = Step(a=2, inputs=[Step(b=1, target=True)])

    data = get_drake_data([first, second])

    expected = {
        Step(b=1): set(),
        first: {Step(b=1)},
        second: {Step(b=1)},
    }
    assert data == expected
Пример #22
0
    def __init__(self, month, day, year, outcome_expr, train_years,
                 aggregations,
                 train_query=None,
                 spacetime_normalize=False,
                 wic_sample_weight=1, exclude=None, include=None, **kwargs):
        """
        Register all parameters with Step.__init__ and build the two inputs
        of the step.

        Args:
            month, day, year: components of the date passed to
                revise_kid_addresses; month and day are also passed to
                lead_data
            outcome_expr, train_years, aggregations, train_query,
            spacetime_normalize, wic_sample_weight: forwarded unchanged to
                Step.__init__
            exclude: forwarded to Step.__init__; None (the default) is
                replaced by a new empty list
            include: forwarded to Step.__init__; None (the default) is
                replaced by a new empty list
            **kwargs: forwarded unchanged to Step.__init__

        Raises:
            ValueError: if year is not within 2003..2016 (inclusive). The
                check runs after Step.__init__ has been called.
        """
        # A literal [] default is created once and shared by every call, so a
        # mutation through one step would leak into all others. None is used
        # as the sentinel and a fresh list is created per call instead.
        if exclude is None:
            exclude = []
        if include is None:
            include = []

        Step.__init__(self, month=month, day=day, year=year,
                      outcome_expr=outcome_expr,
                      train_years=train_years,
                      aggregations=aggregations,
                      train_query=train_query,
                      spacetime_normalize=spacetime_normalize,
                      wic_sample_weight=wic_sample_weight,
                      exclude=exclude, include=include, **kwargs)

        year_min = 2003
        year_max = 2016
        if not year_min <= year <= year_max:
            raise ValueError('Invalid year: %s' % year)

        today = date(year, month, day)
        kid_addresses_revised = revise_kid_addresses(date=today)
        self.inputs = [lead_data(month, day), kid_addresses_revised]
Пример #23
0
    def __init__(self, insert_args, aggregator_args, concat_args, 
            parallel=False, target=False, prefix=None, **kwargs):
        """
        Store the argument-name collections on the instance, initialise the
        Step and, when parallel is true, replace the inputs of this step by
        one non-parallel instance of the same class per entry in
        self.parallel_kwargs.

        insert_args: collection of argument names to insert into results
        aggregator_args: collection of argument names to pass 
                to get_aggregator
        concat_args: collection of argument names on which to 
                concatenate results. Typically a subset (or equal 
                to) aggregator_args.
        parallel: when true, this step is never a target itself and its
                inputs are replaced as described above
        target: forwarded to Step.__init__ as ``target and not parallel``;
                passed unchanged to the sub-steps created when parallel
        prefix: stored as self.prefix
        kwargs: forwarded unchanged to Step.__init__

        """

        # these attributes are assigned before Step.__init__ is called
        self.insert_args = insert_args
        self.concat_args = concat_args
        self.aggregator_args = aggregator_args
        self.prefix = prefix

        Step.__init__(self, parallel=parallel, target=target and not parallel, **kwargs)

        if parallel:
            # keep the inputs set so far (if any) to hand to the sub-steps
            inputs = self.inputs if hasattr(self, 'inputs') else []
            self.inputs = []
            # create a new Aggregation according to parallel_kwargs
            # pass our input to those steps
            # those become the inputs to this step
            # NOTE(review): parallel_kwargs is not defined in this block;
            # presumably provided elsewhere on the class - confirm.
            # The loop variable rebinds the kwargs parameter, which is not
            # used again afterwards.
            for kwargs in self.parallel_kwargs:
                a = self.__class__(parallel=False, target=target, inputs=inputs, **kwargs)
                self.inputs.append(a)

        self._aggregators = {}
    
        """
        arguments is a list of dictionaries of argument names and values.
        it must include the special 'index' argument, whose values are keys to plug into the self.indexes dictionary, whose values are the actual index
        the index is used for aggregation its index name is used to prefix the results
        """
        """
Пример #24
0
    def __init__(self, month, day, year_min, year_max, **kwargs):
        """
        Register the date parameters and set the inputs to the output.acs
        table, a LeadLeft step and one AggregationJoin for every value of
        aggregations.all_dict(dates), where dates holds one date per year
        from year_min through year_max inclusive.
        """
        Step.__init__(self, month=month, day=day, year_min=year_min,
                      year_max=year_max, **kwargs)

        acs = FromSQL(table='output.acs', target=True)
        left = LeadLeft(month=month, day=day, year_min=year_min, target=True)

        def join_with_left(aggregation):
            # a fresh mapping list is built for every join
            return AggregationJoin(target=True,
                                   inputs=[left, aggregation],
                                   inputs_mapping=[{'aux': None}, None])

        dates = tuple(date(y, month, day)
                      for y in range(year_min, year_max + 1))
        self.aggregations = aggregations.all_dict(dates)
        self.aggregation_joins = [
            join_with_left(a) for a in self.aggregations.values()
        ]

        unmapped = [None] * len(self.aggregations)
        self.inputs = [acs, left] + self.aggregation_joins
        self.inputs_mapping = ['acs', {}] + unmapped
Пример #25
0
 def __init__(self, inputs, **kwargs):
     """
     Forward ``inputs`` and any extra keyword arguments to Step.__init__.
     """
     Step.__init__(self,
                   inputs=inputs,
                   **kwargs)
Пример #26
0
Файл: data.py Проект: dssg/drain
 def __init__(self, filepath_or_buffer, **kwargs):
     """
     Forward ``filepath_or_buffer`` and any extra keyword arguments to
     Step.__init__.
     """
     Step.__init__(self,
                   filepath_or_buffer=filepath_or_buffer,
                   **kwargs)
Пример #27
0
 def __init__(self):
     """
     Initialise the step with ``events_table`` as its only input.
     """
     step_inputs = [events_table]
     Step.__init__(self, inputs=step_inputs)
Пример #28
0
def test_drakefile(drain_setup):
    """
    Smoke test: build a drakefile preview for two steps that each take an
    equal target input Step(b=1), and print the result.
    """
    steps = [
        Step(a=1, inputs=[Step(b=1, target=True)]),
        Step(a=2, inputs=[Step(b=1, target=True)])
    ]
    # print with parentheses: the bare print statement is a SyntaxError on
    # Python 3, while this single-argument form also works on Python 2
    print(to_drakefile(steps, preview=True))
Пример #29
0
 def __init__(self, return_estimator=False, return_feature_importances=True,
              return_predictions=True, prefit=False, **kwargs):
     """
     Forward the four named options and any extra keyword arguments to
     Step.__init__ without modification.
     """
     Step.__init__(self,
                   return_estimator=return_estimator,
                   return_feature_importances=return_feature_importances,
                   return_predictions=return_predictions,
                   prefit=prefit,
                   **kwargs)
Пример #30
0
 def __init__(self, metrics, **kwargs):
     """
     Forward ``metrics`` and any extra keyword arguments to Step.__init__.
     """
     Step.__init__(self,
                   metrics=metrics,
                   **kwargs)
Пример #31
0
 def __init__(self, target=True, objects_to_ascii=False, **kwargs):
     """
     Forward ``objects_to_ascii`` and any extra keyword arguments to
     Step.__init__, always with target=True.

     NOTE(review): the ``target`` argument is accepted but never read; the
     literal True is passed on regardless of its value. Confirm that this
     is intended before relying on target=False.
     """
     Step.__init__(self, target=True, objects_to_ascii=objects_to_ascii, **kwargs)
Пример #32
0
 def __init__(self, model, data, **kwargs):
     """
     Register ``model`` and ``data`` (in that order) as the inputs of the
     step; ``model`` is also passed on under the ``model`` keyword.

     NOTE(review): the extra keyword arguments are handed to Step.__init__
     as one dictionary under the keyword ``kwargs`` and are not unpacked.
     Confirm that this is intended.
     """
     step_inputs = [model, data]
     Step.__init__(self,
                   model=model,
                   inputs=step_inputs,
                   kwargs=kwargs)
Пример #33
0
def test_drake_data2(drain_setup):
    """
    A step whose only input is not flagged as target: the drake data must
    contain just the step itself, mapped to an empty set.
    """
    step = Step(a=1, inputs=[Step(b=1)])
    expected = {step: set()}
    assert get_drake_data([step]) == expected
Пример #34
0
    def __init__(self, **kwargs):
        """
        Forward kwargs to Step.__init__ and set a single FromSQL input that
        joins output.inspections with output.addresses.
        """
        Step.__init__(self, **kwargs)

        # joined with newlines this yields the query text verbatim: leading
        # newline, two query lines (the second ends in a space), trailing
        # newline
        query_lines = [
            "",
            "select *, least(init_date, comply_date) as min_date",
            "from output.inspections join output.addresses using (address_id) ",
            "",
        ]
        self.inputs = [
            FromSQL(query="\n".join(query_lines),
                    parse_dates=['min_date', 'comply_date', 'init_date'],
                    target=False)
        ]
Пример #35
0
 def __init__(self, metrics, **kwargs):
     """
     Pass ``metrics`` together with all remaining keyword arguments on to
     Step.__init__.
     """
     Step.__init__(
         self,
         metrics=metrics,
         **kwargs)
Пример #36
0
    def __init__(self,
                 month,
                 day,
                 year_min,
                 year_max,
                 wic_lag=None,
                 dtype=None,
                 address=False):
        """
        Register the parameters and assemble a single MapResults input made
        of the ACS features, the left step and one aggregation join per
        aggregation, each cast to ``dtype``.

        Args:
            month: the month for feature generation
            day: the day of the month for feature generation
            year_min: the year to start generating features
            year_max: the year to stop generating features
            wic_lag: a lag for the WIC aggregations, parsed by
                drain.data.parse_delta, e.g. '6m' is a six month lag.
                Defaults to None, which is no lag.
            dtype: the dtype to use for features. Defaults to np.float16.
            address: whether to build an address dataset. Defaults to False,
                which builds a kid dataset.
        """
        dtype = np.float16 if dtype is None else dtype

        Step.__init__(self, month=month, day=day, year_min=year_min,
                      year_max=year_max, wic_lag=wic_lag, dtype=dtype,
                      address=address)

        if not address:
            left = LeadLeft(month=month, day=day, year_min=year_min)
            left.target = True
            # left_only is left without aux
            left_only = MapResults([left], {'aux': None})
        else:
            left = LeadAddressLeft(month=month,
                                   day=day,
                                   year_min=year_min,
                                   year_max=year_max)
            # in the address case left_only is the same as left
            left_only = left

        acs_features = ACS(inputs=[left_only])
        acs = Call("astype", inputs=[acs_features], dtype=dtype)
        acs.target = True

        # one date per year, year_min through year_max inclusive
        years = range(year_min, year_max + 1)
        dates = tuple(date(y, month, day) for y in years)
        self.aggregations = aggregations.all_dict(dates, wic_lag)

        self.aggregation_joins = []
        for name, aggregation in self.aggregations.items():
            # only aggregations whose name starts with 'wic' get the lag
            if name.startswith('wic'):
                lag = wic_lag
            else:
                lag = None
            join = SpacetimeAggregationJoin(inputs=[aggregation, left_only],
                                            lag=lag)
            cast_join = Call("astype", inputs=[join], dtype=dtype)
            cast_join.target = True
            self.aggregation_joins.append(cast_join)

        mapped_steps = [acs, left] + self.aggregation_joins
        mapping = ['acs', {}] + [None] * len(self.aggregations)
        self.inputs = [MapResults(mapped_steps, mapping)]
Пример #37
0
def test_inputs_target(drain_setup):
    """
    With no input flagged as target, get_inputs(step, target=True) must be
    the empty set.
    """
    step = Step(value=1, inputs=[Step(value=2)])
    found = get_inputs(step, target=True)
    assert found == set()
Пример #38
0
def test_drakefile(drain_setup):
    """
    Smoke test: print the drakefile preview for two steps that share one
    list holding a single target input.
    """
    shared_input = Step(b=1)
    inputs = [shared_input]
    shared_input.target = True

    steps = [Step(a=value, inputs=inputs) for value in (1, 2)]
    preview = to_drakefile(steps, preview=True)
    print(preview)
Пример #39
0
def test_drake_data3(drain_setup):
    """
    A step with one target input: the drake data must map the input to an
    empty set and the step to the set holding that input.
    """
    target_input = Step(b=1)
    target_input.target = True
    step = Step(a=1, inputs=[target_input])

    data = get_drake_data([step])

    expected = {Step(b=1): set(), step: {Step(b=1)}}
    assert data == expected
Пример #40
0
Файл: data.py Проект: dssg/drain
 def __init__(self, target=True, objects_to_ascii=False, **kwargs):
     """
     Pass ``objects_to_ascii`` and the remaining keyword arguments on to
     Step.__init__, always with target=True.

     NOTE(review): ``target`` is part of the signature but its value is
     not used; True is forwarded unconditionally. Confirm this is intended.
     """
     Step.__init__(self,
                   target=True,
                   objects_to_ascii=objects_to_ascii,
                   **kwargs)
Пример #41
0
def test_input_targets2(drain_setup):
    """
    For a target step with one target and one non-target input,
    get_input_targets must return the set holding only the target input.
    """
    step = Step(value=1,
                target=True,
                inputs=[Step(value=2, target=True), Step(value=3)])
    found = get_input_targets(step)
    assert found == {Step(value=2)}
Пример #42
0
def test_drake_data(drain_setup):
    """
    A single step without inputs: the drake data must map an equal step to
    an empty set.
    """
    data = get_drake_data([Step(a=1)])
    expected = {Step(a=1): set()}
    assert data == expected
Пример #43
0
 def __init__(self, filepath_or_buffer, **kwargs):
     """
     Pass ``filepath_or_buffer`` together with all remaining keyword
     arguments on to Step.__init__.
     """
     Step.__init__(
         self,
         filepath_or_buffer=filepath_or_buffer,
         **kwargs)