def __init__(self, query=None, to_str=None, table=None, tables=None, **kwargs):
    """
    Use tables to automatically set dependencies
    """
    if query is None:
        if table is None:
            raise ValueError("Must specify query or table")
        query = "SELECT * FROM %s" % table
        tables = [table]

    if tables is not None and 'SQL_DIR' in os.environ:
        self.dependencies = [
            os.path.join(os.environ['SQL_DIR'], table.replace('.', '/'))
            for table in tables
        ]

    if to_str is None:
        to_str = []

    Step.__init__(self, query=query, to_str=to_str, **kwargs)

    if 'inputs' not in kwargs:
        self.inputs = [CreateEngine()]
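# Usage sketch (not part of the original source; assumes this constructor
# belongs to drain's FromSQL step and that CreateEngine supplies its engine):
kids = FromSQL(table='output.kids')
# equivalent to:
kids = FromSQL(query='SELECT * FROM output.kids', tables=['output.kids'])
# and, with SQL_DIR set, dependencies become [$SQL_DIR/output/kids]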
def __init__(self, month, day, year, train_years, wic_lag=None, train_query=None):
    """
    Args:
        month: the month of the train-test split
        day: the day of the train-test split
        year: the year of the train-test split
        train_years: the number of training years
        wic_lag: an optional lag for the wic data, in days
        train_query: an optional additional query for training
    """
    Step.__init__(self, month=month, day=day, year=year,
                  train_years=train_years, wic_lag=wic_lag,
                  train_query=train_query)

    if not YEAR_MIN <= year <= YEAR_MAX:
        raise ValueError('Invalid year: %s' % year)

    today = date(year, month, day)
    # use kid_addresses_revised for a revised aux matrix
    # for temporally valid training queries
    kid_addresses_revised = revise_kid_addresses(date=today)
    self.inputs = [lead_data(month, day, wic_lag), kid_addresses_revised]
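# Usage sketch (the class name LeadCrossValidate is an assumption here;
# wic_lag is in days per the docstring above):
cv = LeadCrossValidate(month=1, day=1, year=2014, train_years=3, wic_lag=60)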
def __init__(self, sql, id_column, max_date_column, min_date_column,
             date_column, date, from_sql_args=None, source_id_column=None,
             **kwargs):
    """
    Revise a query to the specified date.

    Args:
        sql: a path to a file or a string containing sql
        id_column: the entity id column(s) linking the result of the query
            with its source tables
        max_date_column: the maximum date column name for an entry in the result
        min_date_column: the minimum date column name for an entry in the result
        date_column: name of the date column in the source
        date: the date to revise at
        from_sql_args: dictionary of keyword arguments to pass to the input
            FromSQL steps, e.g. target=True, parse_dates
    """
    Step.__init__(self, sql=sql, id_column=id_column,
                  max_date_column=max_date_column,
                  min_date_column=min_date_column,
                  date_column=date_column, date=date,
                  source_id_column=source_id_column,
                  from_sql_args=from_sql_args, **kwargs)

    if os.path.exists(sql):
        self.dependencies = [os.path.abspath(sql)]
        sql = util.read_file(sql)

    table, query = revise_helper(sql)
    revised_sql = revise_sql(query=query, id_column=id_column,
                             output_table=table,
                             max_date_column=max_date_column,
                             min_date_column=min_date_column,
                             date_column=date_column, date=date,
                             source_id_column=source_id_column)

    if from_sql_args is None:
        from_sql_args = {}

    self.inputs = [
        FromSQL(table=table, **from_sql_args),
        # by depending on table, the revised query is given the right dependencies
        FromSQL(revised_sql, tables=[table], **from_sql_args)
    ]
    self.inputs_mapping = ['source', 'revised']
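# Usage sketch (the class name Revise and the path 'kids.sql' are
# illustrative; the file is expected to hold a query that revise_helper
# can split into a table name and a query):
revised = Revise(sql='kids.sql', id_column='kid_id',
                 max_date_column='max_date', min_date_column='min_date',
                 date_column='date', date=date(2014, 1, 1),
                 from_sql_args={'target': True})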
def __init__(self, month, day, year_min, year_max):
    """
    Args:
        month: the month to use
        day: the day of the month to use
        year_min: the year to start
        year_max: the year to end
    """
    Step.__init__(self, month=month, day=day,
                  year_min=year_min, year_max=year_max,
                  inputs=[addresses])
def __init__(self, return_estimator=False, return_feature_importances=True,
             return_predictions=True, prefit=False, **kwargs):
    """
    Args:
        return_estimator: whether or not to return the fitted estimator object
        return_feature_importances: whether or not to return a DataFrame of
            feature names and their importances
        return_predictions: whether or not to return a DataFrame of predictions
        prefit: whether the estimator input is already fitted
    """
    Step.__init__(self, return_estimator=return_estimator,
                  return_feature_importances=return_feature_importances,
                  return_predictions=return_predictions, prefit=prefit,
                  **kwargs)
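# Usage sketch (class name FitPredict and the input steps are assumptions;
# in drain the estimator and data would arrive via inputs):
fit = FitPredict(inputs=[estimator_step, data_step],
                 return_estimator=True,    # keep the fitted object
                 return_predictions=True)  # and the predictions DataFrame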
def __init__(self, table_name, **kwargs):
    """
    Args:
        table_name: a hack because name is a special kwarg currently
            TODO: use name once refactor/init is merged
    """
    Step.__init__(self, table_name=table_name, **kwargs)
    if len(self.inputs) == 1:
        self.inputs.append(CreateDatabase())
def __init__(self, **kwargs):
    Step.__init__(self, **kwargs)
    self.inputs = [
        FromSQL(query="""
                select *, least(init_date, comply_date) as min_date
                from output.inspections
                join output.addresses using (address_id)
                """,
                parse_dates=['min_date', 'comply_date', 'init_date'],
                target=False)
    ]
def __init__(self, inputs):
    """
    Args:
        inputs: array containing a LeadLeft instance
    """
    acs = FromSQL(table='output.acs')
    acs.target = True
    inputs = inputs + [MapResults([acs], 'acs')]
    Step.__init__(self, inputs=inputs)
def __init__(self, month, day, year_min):
    """
    Args:
        month: the month to use in the date index
        day: the day of the month to use in the date index
        year_min: the first year to include in the date index
    """
    Step.__init__(self, month=month, day=day, year_min=year_min)
    aux = Merge(on='kid_id', inputs=[kid_addresses, kids])
    self.inputs = [aux, addresses]
def __init__(self, return_estimator=False, return_feature_importances=True,
             return_predictions=True, prefit=False, **kwargs):
    Step.__init__(self, return_estimator=return_estimator,
                  return_feature_importances=return_feature_importances,
                  return_predictions=return_predictions, prefit=prefit,
                  **kwargs)
def test_inputs_no_target(drain_setup):
    inputs = [Step(value=2), Step(value=3, inputs=[Step(value=4)])]
    inputs[0].target = True
    step = Step(value=1, inputs=inputs)
    step.target = True

    assert get_inputs(step, target=False) == \
        set([Step(value=3, inputs=[Step(value=4)]), Step(value=4)])
def __init__(self, fit, indexes, pars=None, parameter_keys=None):
    """
    Args:
        fit: Step producing a StanFit object
        indexes: a dictionary of key: index pairs,
            e.g. {'mu': [1150, 2251, ...]}
        pars: optional collection of parameter names to extract
        parameter_keys: optional dictionary mapping parameter names to
            index keys, e.g. {'mu': 'address_id'}
    """
    if parameter_keys is None:
        parameter_keys = {}
    Step.__init__(self, inputs=[fit, indexes], pars=pars,
                  parameter_keys=parameter_keys)
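# Usage sketch (class name ExtractFit and the input step names are
# assumptions): map the 'mu' parameter back to address_id via parameter_keys:
extract = ExtractFit(fit=stan_fit_step, indexes=indexes_step,
                     pars=['mu'], parameter_keys={'mu': 'address_id'})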
def __init__(self, month, day, year_min, **kwargs):
    Step.__init__(self, month=month, day=day, year_min=year_min, **kwargs)

    kid_addresses = Merge(on='kid_id', inputs=[
        FromSQL(table='output.kid_addresses',
                parse_dates=KID_ADDRESSES_PARSE_DATES, target=True),
        FromSQL(table='output.kids', parse_dates=KIDS_PARSE_DATES,
                to_str=['first_name', 'last_name'], target=True)])

    addresses = FromSQL(table='output.addresses', target=True)
    self.inputs = [kid_addresses, addresses]
def __init__(self, month, day, year_min, year_max, **kwargs):
    Step.__init__(self, month=month, day=day,
                  year_min=year_min, year_max=year_max, **kwargs)

    acs = FromSQL(table='output.acs', target=True)
    left = LeadLeft(month=month, day=day, year_min=year_min, target=True)

    dates = tuple(date(y, month, day) for y in range(year_min, year_max + 1))
    self.aggregations = aggregations.all_dict(dates)
    self.aggregation_joins = [
        AggregationJoin(target=True, inputs=[left, a],
                        inputs_mapping=[{'aux': None}, None])
        for a in self.aggregations.values()
    ]

    self.inputs = [acs, left] + self.aggregation_joins
    self.inputs_mapping = ['acs', {}] + [None] * len(self.aggregations)
def __init__(self, inputs, return_estimator=False,
             return_feature_importances=True, return_predictions=True,
             prefit=False, predict_train=False):
    """
    Args:
        return_estimator: whether to return the fitted estimator object
        return_feature_importances: whether to return a DataFrame of
            feature importances
        return_predictions: whether to return a DataFrame of predictions
        prefit: whether the estimator input is already fitted
        predict_train: whether to make predictions on the training set
    """
    Step.__init__(self, inputs=inputs, return_estimator=return_estimator,
                  return_feature_importances=return_feature_importances,
                  return_predictions=return_predictions, prefit=prefit,
                  predict_train=predict_train)
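# Usage sketch (class and step names are assumptions): a prefit estimator
# skips fitting, and predict_train adds in-sample predictions for diagnostics:
predict = FitPredict(inputs=[estimator_step, data_step],
                     prefit=True, predict_train=True)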
def __init__(self, month, day, year_min, **kwargs): Step.__init__(self, month=month, day=day, year_min=year_min, **kwargs) kid_addresses = Merge(on='kid_id', inputs=[ FromSQL( table='output.kid_addresses', parse_dates=KID_ADDRESSES_PARSE_DATES, target=True), FromSQL(table='output.kids', parse_dates=KIDS_PARSE_DATES, to_str=['first_name', 'last_name'], target=True) ]) addresses = FromSQL(table='output.addresses', target=True) self.inputs = [kid_addresses, addresses]
def test_inputs_target2(drain_setup):
    inputs = [Step(value=2), Step(value=3)]
    inputs[0].target = True
    step = Step(value=1)
    step.inputs = inputs
    step.target = True

    assert get_inputs(step, target=True) == set([Step(value=2)])
def __init__(self, inputs, outcome_expr, aggregations,
             wic_sample_weight=0, exclude=[], include=[]):
    """
    Args:
        inputs: list containing a LeadCrossValidate step
        outcome_expr: the query to perform on the auxiliary information
            to produce an outcome variable
        aggregations: defines which of the SpacetimeAggregations to include
            and which to drop
        wic_sample_weight: optional different sample weight for wic kids
    """
    Step.__init__(self, inputs=inputs, outcome_expr=outcome_expr,
                  aggregations=aggregations,
                  wic_sample_weight=wic_sample_weight,
                  exclude=exclude, include=include)
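# Usage sketch (class name LeadTransform, the outcome expression, and the
# aggregations value are all illustrative, not from this file):
transform = LeadTransform(inputs=[cv_step],
                          outcome_expr='max_bll >= 6',
                          aggregations={'wic': ['all']},
                          wic_sample_weight=2)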
def test_drake_data4(drain_setup):
    steps = [
        Step(a=1, inputs=[Step(b=1, target=True)]),
        Step(a=2, inputs=[Step(b=1, target=True)])
    ]
    data = get_drake_data(steps)
    assert data == {
        Step(b=1): set(),
        steps[0]: {Step(b=1)},
        steps[1]: {Step(b=1)}
    }
def __init__(self, month, day, year, outcome_expr, train_years,
             aggregations, train_query=None, spacetime_normalize=False,
             wic_sample_weight=1, exclude=[], include=[], **kwargs):
    Step.__init__(self, month=month, day=day, year=year,
                  outcome_expr=outcome_expr, train_years=train_years,
                  aggregations=aggregations, train_query=train_query,
                  spacetime_normalize=spacetime_normalize,
                  wic_sample_weight=wic_sample_weight,
                  exclude=exclude, include=include, **kwargs)

    year_min = 2003
    year_max = 2016
    if not year_min <= year <= year_max:
        raise ValueError('Invalid year: %s' % year)

    today = date(year, month, day)
    kid_addresses_revised = revise_kid_addresses(date=today)
    self.inputs = [lead_data(month, day), kid_addresses_revised]
def __init__(self, insert_args, aggregator_args, concat_args,
             parallel=False, target=False, prefix=None, **kwargs):
    """
    Args:
        insert_args: collection of argument names to insert into results
        aggregator_args: collection of argument names to pass to
            get_aggregator
        concat_args: collection of argument names on which to concatenate
            results. Typically a subset of (or equal to) aggregator_args.
    """
    self.insert_args = insert_args
    self.concat_args = concat_args
    self.aggregator_args = aggregator_args
    self.prefix = prefix

    Step.__init__(self, parallel=parallel,
                  target=target and not parallel, **kwargs)

    if parallel:
        inputs = self.inputs if hasattr(self, 'inputs') else []
        self.inputs = []
        # create a new Aggregation according to parallel_kwargs,
        # pass our inputs to those steps;
        # those become the inputs to this step
        for kwargs in self.parallel_kwargs:
            a = self.__class__(parallel=False, target=target,
                               inputs=inputs, **kwargs)
            self.inputs.append(a)

    self._aggregators = {}

"""
arguments is a list of dictionaries of argument names and values. It must
include the special 'index' argument, whose values are keys into the
self.indexes dictionary, whose values are the actual indexes. The index is
used for aggregation; its index name is used to prefix the results.
"""
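# Sketch of the parallel pattern above (the subclass name and kwargs are
# illustrative): with parallel=True the step is never a target itself;
# instead it builds one non-parallel copy per entry of self.parallel_kwargs,
# and those copies become its inputs and carry the target flag.
agg = MyAggregation(insert_args=['date'],
                    aggregator_args=['date', 'delta'],
                    concat_args=['date'],
                    parallel=True, target=True)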
def __init__(self, inputs, **kwargs):
    Step.__init__(self, inputs=inputs, **kwargs)
def __init__(self, filepath_or_buffer, **kwargs):
    Step.__init__(self, filepath_or_buffer=filepath_or_buffer, **kwargs)
def __init__(self):
    Step.__init__(self, inputs=[events_table])
def test_drakefile(drain_setup):
    steps = [
        Step(a=1, inputs=[Step(b=1, target=True)]),
        Step(a=2, inputs=[Step(b=1, target=True)])
    ]
    print(to_drakefile(steps, preview=True))
def __init__(self, metrics, **kwargs):
    Step.__init__(self, metrics=metrics, **kwargs)
def __init__(self, target=True, objects_to_ascii=False, **kwargs):
    Step.__init__(self, target=target, objects_to_ascii=objects_to_ascii,
                  **kwargs)
def __init__(self, model, data, **kwargs):
    Step.__init__(self, model=model, inputs=[model, data], **kwargs)
def test_drake_data2(drain_setup):
    step = Step(a=1, inputs=[Step(b=1)])
    data = get_drake_data([step])
    assert data == {step: set()}
def __init__(self, month, day, year_min, year_max,
             wic_lag=None, dtype=None, address=False):
    """
    Args:
        month: the month for feature generation
        day: the day of the month for feature generation
        year_min: the year to start generating features
        year_max: the year to stop generating features
        wic_lag: a lag for the WIC aggregations, parsed by
            drain.data.parse_delta, e.g. '6m' is a six month lag.
            Defaults to None, which is no lag.
        dtype: the dtype to use for features. Defaults to np.float16.
        address: whether to build an address dataset. Defaults to False,
            which builds a kid dataset.
    """
    if dtype is None:
        dtype = np.float16

    Step.__init__(self, month=month, day=day,
                  year_min=year_min, year_max=year_max,
                  wic_lag=wic_lag, dtype=dtype, address=address)

    if address:
        left = LeadAddressLeft(month=month, day=day,
                               year_min=year_min, year_max=year_max)
        # left_only is left without aux;
        # in the address case it's the same as left
        left_only = left
    else:
        left = LeadLeft(month=month, day=day, year_min=year_min)
        left.target = True
        left_only = MapResults([left], {'aux': None})

    acs = Call("astype", inputs=[ACS(inputs=[left_only])], dtype=dtype)
    acs.target = True

    dates = tuple(date(y, month, day) for y in range(year_min, year_max + 1))
    self.aggregations = aggregations.all_dict(dates, wic_lag)
    self.aggregation_joins = []
    for name, a in self.aggregations.items():
        aj = SpacetimeAggregationJoin(
            inputs=[a, left_only],
            lag=wic_lag if name.startswith('wic') else None)
        aj = Call("astype", inputs=[aj], dtype=dtype)
        aj.target = True
        self.aggregation_joins.append(aj)

    self.inputs = [
        MapResults([acs, left] + self.aggregation_joins,
                   ['acs', {}] + [None] * len(self.aggregations))
    ]
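# Usage sketch (the class name LeadData is an assumption; arguments mirror
# the docstring above, with '6m' lagging the WIC aggregations six months):
data = LeadData(month=1, day=1, year_min=2003, year_max=2016,
                wic_lag='6m', address=False)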
def test_inputs_target(drain_setup):
    assert get_inputs(Step(value=1, inputs=[Step(value=2)]),
                      target=True) == set()
def test_drakefile(drain_setup):
    inputs = [Step(b=1)]
    inputs[0].target = True
    steps = [Step(a=1, inputs=inputs), Step(a=2, inputs=inputs)]
    print(to_drakefile(steps, preview=True))
def test_drake_data3(drain_setup):
    inputs = [Step(b=1)]
    inputs[0].target = True
    step = Step(a=1, inputs=inputs)
    data = get_drake_data([step])
    assert data == {Step(b=1): set(), step: {Step(b=1)}}
def test_input_targets2(drain_setup):
    assert get_input_targets(
        Step(value=1, target=True,
             inputs=[Step(value=2, target=True), Step(value=3)])
    ) == set([Step(value=2)])
def test_drake_data(drain_setup):
    step = Step(a=1)
    data = get_drake_data([step])
    assert data == {Step(a=1): set()}