def __transpose_dict_of_lists(self, dol):
    """Expand a dict of parameter lists into a list of parameter dicts.

    Thin private wrapper around ``utils.transpose_dict_of_lists``; see
    that function for the exact semantics.
    """
    return utils.transpose_dict_of_lists(dol)
def subset_over(
        self,
        label_col,
        interval_train_window_start,
        interval_train_window_size,
        interval_test_window_start,
        interval_test_window_size,
        interval_inc_value,
        interval_expanding=False,
        row_M_col_name=None,
        row_M_train_window_start=None,
        row_M_train_window_size=None,
        row_M_test_window_start=None,
        row_M_test_window_size=None,
        row_M_inc_value=None,
        row_M_expanding=False,
        clfs=None,
        feature_gen_lambda=None):
    """Generates ArrayGenerators according to some subsetting directive.

    There are two ways that we determine what the train and test sets
    are for each trial:

    1. The start time/stop time interval. This is the interval used to
       create features in the M-formatted matrix. Setting the start
       time/stop time of this interval is equivalent to passing values
       to set_interval. Variables pertaining to this interval have the
       interval* prefix.
    2. The rows of the M matrix to select, based on the value of some
       column in the M matrix. Setting the start and end of this
       interval is equivalent to passing values to select_rows_in_M.
       Values pertaining to this set of rows have the row_M* prefix.

    Taking subsets over rows of M is optional, and it will only occur
    if row_M_col_name is not None.

    Parameters
    ----------
    label_col : str
        The name of the column containing labels
    interval_train_window_start : number or datetime
        start of training interval
    interval_train_window_size : number or datetime
        (Initial) size of training interval
    interval_test_window_start : number or datetime
        start of testing interval
    interval_test_window_size : number or datetime
        size of testing interval
    interval_inc_value : datetime, timedelta, or number
        interval to increment train and test interval
    interval_expanding : boolean
        whether or not the training interval is expanding
    row_M_col_name : str or None
        If not None, the name of the feature which will be used to
        select different training and testing sets in addition to the
        interval. If None, train and testing sets will use all rows
        given a particular time interval
    row_M_train_window_start : ? or None
        Start of train window for M rows. If None, uses
        interval_train_window_start
    row_M_train_window_size : ? or None
        (Initial) size of train window for M rows. If None, uses
        interval_train_window_size
    row_M_test_window_start : ? or None
        Start of test window for M rows. If None, uses
        interval_test_window_start
    row_M_test_window_size : ? or None
        size of test window for M rows. If None, uses
        interval_test_window_size
    row_M_inc_value : ? or None
        interval to increment train and test window for M rows.
        If None, uses interval_inc_value
    row_M_expanding : bool
        whether or not the training window for M rows is expanding
    clfs : list of dict or None
        classifiers and parameters to run with each train/test set.
        If None, defaults to [{'clf': RandomForestClassifier}].
        See documentation for
        diogenes.grid_search.experiment.Experiment.
    feature_gen_lambda : (np.ndarray, str, ?, ?, ?, ?) -> np.ndarray or None
        If not None, function to be applied to generated arrays before
        they are fit to classifiers. Must be a function of signature:

        f(M, test_or_train, interval_start, interval_end, row_M_start,
          row_M_end)

        Where:
        * M is the generated array,
        * test_or_train is 'test' if this is a test set or 'train' if
          it's a train set
        * interval_start and interval_end define the interval
        * row_M_start and row_M_end define the rows of M that are
          included

    Returns
    -------
    diogenes.grid_search.experiment.Experiment
        Experiment collecting train/test sets that have been run
    """
    # Avoid a mutable default argument; None is the documented sentinel.
    if clfs is None:
        clfs = [{'clf': RandomForestClassifier}]
    # Row-of-M window parameters default to their interval counterparts.
    if row_M_train_window_start is None:
        row_M_train_window_start = interval_train_window_start
    if row_M_train_window_size is None:
        row_M_train_window_size = interval_train_window_size
    if row_M_test_window_start is None:
        row_M_test_window_start = interval_test_window_start
    if row_M_test_window_size is None:
        row_M_test_window_size = interval_test_window_size
    if row_M_inc_value is None:
        row_M_inc_value = interval_inc_value

    conn = self.__conn
    col_specs = self.__col_specs
    table_name = self.__rg_table_name

    # Find the latest stop time in the table; iteration ends once the
    # test interval passes it.
    # NOTE(review): SQL is assembled by string formatting. col_specs and
    # table_name appear to be internal configuration, but row_M_col_name
    # is caller-supplied — confirm it is never user-controlled, or switch
    # to parameterized queries.
    sql_get_max_interval_end = 'SELECT MAX({}) FROM {}'.format(
        col_specs['stop_time'],
        table_name)
    interval_end = conn.execute(sql_get_max_interval_end)[0][0]
    if row_M_col_name is not None:
        # Latest value of the selected feature bounds the row-of-M windows.
        sql_get_max_col = ("SELECT MAX({}) FROM {} "
                           "WHERE {} = '{}'").format(
                               col_specs['val'],
                               table_name,
                               col_specs['feature'],
                               row_M_col_name)
        row_M_end = conn.execute(sql_get_max_col)[0][0]
    else:
        row_M_end = interval_end

    # One directive per (classifier, concrete parameter set); the third
    # element accumulates Runs across all train/test windows.
    trial_directives = []
    for clf_params in clfs:
        clf = clf_params['clf']
        all_clf_ps = clf_params.copy()
        del all_clf_ps['clf']
        for param_dict in utils.transpose_dict_of_lists(all_clf_ps):
            trial_directives.append((clf, param_dict, []))

    current_interval_train_start = interval_train_window_start
    current_interval_train_end = (interval_train_window_start +
                                  interval_train_window_size)
    current_interval_test_start = interval_test_window_start
    current_interval_test_end = (interval_test_window_start +
                                 interval_test_window_size)
    current_row_M_train_start = row_M_train_window_start
    current_row_M_train_end = (row_M_train_window_start +
                               row_M_train_window_size)
    current_row_M_test_start = row_M_test_window_start
    current_row_M_test_end = (row_M_test_window_start +
                              row_M_test_window_size)

    while (current_interval_test_end <= interval_end and
           current_row_M_test_end <= row_M_end):
        ae_train = self.set_interval(
            current_interval_train_start,
            current_interval_train_end)
        ae_test = self.set_interval(
            current_interval_test_start,
            current_interval_test_end)
        if row_M_col_name is not None:
            # Further restrict rows of M by the selected feature's value.
            ae_train = ae_train.select_rows_in_M(
                '{col} >= {start} AND {col} <= {stop}'.format(
                    col=row_M_col_name,
                    start=current_row_M_train_start,
                    stop=current_row_M_train_end))
            ae_test = ae_test.select_rows_in_M(
                '{col} >= {start} AND {col} <= {stop}'.format(
                    col=row_M_col_name,
                    start=current_row_M_test_start,
                    stop=current_row_M_test_end))
        # TODO this should actually run clfs and build an experiment
        # rather than doing this
        data_train = ae_train.emit_M()
        M_train = utils.remove_cols(data_train, label_col)
        y_train = data_train[label_col]
        data_test = ae_test.emit_M()
        M_test = utils.remove_cols(data_test, label_col)
        y_test = data_test[label_col]
        if feature_gen_lambda is not None:
            M_train = feature_gen_lambda(
                M_train,
                'train',
                current_interval_train_start,
                current_interval_train_end,
                current_row_M_train_start,
                current_row_M_train_end)
            M_test = feature_gen_lambda(
                M_test,
                'test',
                current_interval_test_start,
                current_interval_test_end,
                current_row_M_test_start,
                current_row_M_test_end)
        col_names = M_train.dtype.names
        M_train_nd = utils.cast_np_sa_to_nd(M_train)
        M_test_nd = utils.cast_np_sa_to_nd(M_test)
        for clf, params, runs in trial_directives:
            clf_inst = clf(**params)
            clf_inst.fit(M_train_nd, y_train)
            runs.append(exp.Run(
                M_train_nd,
                y_train,
                col_names,
                clf_inst,
                None,
                None,
                col_names,
                np.arange(len(col_names)),
                {'train_interval_start': current_interval_train_start,
                 'train_interval_end': current_interval_train_end,
                 'test_interval_start': current_interval_test_start,
                 'test_interval_end': current_interval_test_end},
                {'train_start': current_row_M_train_start,
                 'train_end': current_row_M_train_end,
                 'test_start': current_row_M_test_start,
                 'test_end': current_row_M_test_end},
                M_test_nd,
                y_test))
        # Advance the windows. When a window is "expanding", only the
        # train-window *start* is held fixed; the ends and the test
        # windows must always advance, otherwise the loop condition
        # above would never terminate.
        if not interval_expanding:
            current_interval_train_start += interval_inc_value
        current_interval_train_end += interval_inc_value
        current_interval_test_start += interval_inc_value
        current_interval_test_end += interval_inc_value
        if not row_M_expanding:
            current_row_M_train_start += row_M_inc_value
        current_row_M_train_end += row_M_inc_value
        current_row_M_test_start += row_M_inc_value
        current_row_M_test_end += row_M_inc_value

    # Package the accumulated runs into Trials and a final Experiment.
    trials = [exp.Trial(
        None,
        None,
        None,
        clf,
        params,
        'Array Emitter',
        {},
        'Array Emitter',
        {},
        [runs]) for clf, params, runs in trial_directives]
    return exp.Experiment(
        None,
        None,
        clfs,
        [{'subset': 'Array Emitter'}],
        [{'cv': 'Array Emitter'}],
        trials)