def output(self): """ We output a log file specifying the model type """ experiments_path = self.conf.get(self.pipeline_task, "experiments") with open(experiments_path) as experiments_file: experiments_dict = json.load(experiments_file) cur_exper = pg_sed.fill_experiment_defaults( experiments_dict[self.exper_id], self.conf, self.pipeline_task) features_dict = OrderedDict( (("subset_name", cur_exper["subset_name"]), ("features", cur_exper["features"]), ("preprocessing", cur_exper["preprocessing"]))) features_basename = self.conf.get(self.pipeline_task, "features_basename") features_str = features_basename + \ pg_sed.hash_if_needed("".join(features_dict.values())) responses_basename = self.conf.get(self.pipeline_task, "responses_basename") responses_str = responses_basename + \ pg_sed.hash_if_needed(cur_exper["subset_name"]) basenames = { "train_features": "%s_train_%s" % (features_str, self.cur_fold), "train_responses": "%s_train_%s" % (responses_str, self.cur_fold), "test_features": "%s_test_%s" % (features_str, self.cur_fold), "test_responses": "%s_test_%s" % (responses_str, self.cur_fold) } model_dict = model_funs.get_model_dict( self.theta, self.cur_fold, cur_exper, self.conf.get(self.pipeline_task, "responses"), self.seed, basenames, self.conf.get(self.pipeline_task, "model_data_schema")) # if successful, log it output_path = "%s/models/model%s_%s.log" % ( self.logging_path, self.pipeline_task, model_dict["string"]) return luigi.LocalTarget(output_path)
def get_model_dict(theta, cur_fold, cur_exper, responses, seed,
                   data_path=None, model_data_schema=None):
    """ Return a dictionary (and unique string) specifying a model run

    :param string theta A string encoding the dictionary of parameter
     values for the current model run. This comes from a single entry of
     calling ParameterGrid() on a value in the models.json.
    :param int cur_fold The fold for the current training data.
    :param dict cur_exper The current experiment specification, with
     defaults filled in by fill_experiment_defaults().
    :param string responses The response types for the current run.
    :param string seed The random seed associated with this run.
    :param dict data_path Basenames for the train / test features and
     responses tables used by this run.
    :param string model_data_schema The schema containing the model data
     tables.
    :return A dictionary giving the specification of the model.
    :rtype dict
    """
    # load json object
    features_path = cur_exper["features"]
    preprocessing_path = cur_exper["preprocessing"]
    with open(features_path) as json_file:
        features = json.load(json_file)

    params_string = pg_sed.strip_punct(str(theta))
    model_dict = OrderedDict([
        ("model", cur_exper["model"]["name"]),
        ("responses", responses),
        ("features", cur_exper["features"]),
        ("subset", cur_exper["subset_name"]),
        ("params", params_string),
        ("preprocessing", cur_exper["preprocessing"]),
        ("fold", str(cur_fold)),
        ("seed", str(seed))  # cast to str so the join below works for int seeds
    ])

    # return the results
    model_string = pg_sed.hash_if_needed("".join(model_dict.values()))
    return {
        "string": model_string,
        "response": responses,
        "features_path": features_path,
        "features": features,
        "subset": cur_exper["subset_name"],
        "model": cur_exper["model"]["name"],
        "param": theta,
        "preprocessing": preprocessing_path,
        "fold": cur_fold,
        "seed": seed,
        "model_data_schema": model_data_schema,
        "model_data": data_path
    }
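# theta is passed around as the string form of a parameter dictionary (one
# entry from sklearn's ParameterGrid, per the docstring), and is recovered
# downstream with ast.literal_eval (see the fit_model call in run() below).
# A small, self-contained illustration of that round trip:

import ast
from sklearn.model_selection import ParameterGrid

grid = ParameterGrid({"max_depth": [2, 5], "n_estimators": [10]})
theta = str(list(grid)[0])          # e.g. "{'max_depth': 2, 'n_estimators': 10}"
params = ast.literal_eval(theta)    # back to a plain dict, without eval()'s risks
assert params["n_estimators"] == 10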
def output(self): """ Write responses obtained by leaving out a fold at a time """ responses_basename = self.conf.get(self.pipeline_task, "responses_basename") responses_str = responses_basename + \ pg_sed.hash_if_needed(self.subset_table) base_path = pg_sed.process_filter_name(self.responses_dir, self.filter_condition, responses_str) responses_path = base_path + ".csv" schema_path = base_path + "_schema.csv" return [ luigi.LocalTarget(responses_path), luigi.LocalTarget(schema_path) ]
def output(self):
    """ Targets for the features csv and its schema """
    features_basename = self.conf.get(self.pipeline_task, "features_basename")
    features_dict = json.loads(self.features_dict,
                               object_pairs_hook=OrderedDict)
    features_str = features_basename + \
        pg_sed.hash_if_needed("".join(features_dict.values()))

    base_path = pg_sed.process_filter_name(self.features_dir,
                                           self.filter_condition,
                                           features_str)
    features_path = base_path + ".csv"
    schema_path = base_path + "_schema.csv"
    return [
        luigi.LocalTarget(features_path),
        luigi.LocalTarget(schema_path)
    ]
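# self.features_dict is a json string whose key order matters: it is loaded
# with object_pairs_hook=OrderedDict and the concatenated values feed the
# hash above, so reordering the keys would change the output path. Based on
# how the same dictionary is built in the model tasks, it looks roughly like
# this (the values here are illustrative, not real config):

import json
from collections import OrderedDict

features_dict_example = json.dumps(OrderedDict([
    ("subset_name", "semantic.all_visits"),          # hypothetical table name
    ("features", "conf/features.json"),              # hypothetical path
    ("preprocessing", "conf/preprocessing.json")     # hypothetical path
]))

loaded = json.loads(features_dict_example, object_pairs_hook=OrderedDict)
assert "".join(loaded.values()).startswith("semantic.all_visits")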
def requires(self): """ Use the filter_condition parameter to select all folds except for the held out one """ models_data_table = self.conf.get(self.pipeline_task, "model_data_schema") if self.table_type == "features": table_basename = self.conf.get(self.pipeline_task, "features_basename") elif self.table_type == "responses": table_basename = self.conf.get(self.pipeline_task, "responses_basename") else: raise ValueError("""'table_type' parameter must be either features or responses.""") loaded_dict = json.loads(self.data_dict, object_pairs_hook=OrderedDict) table_str = table_basename + \ pg_sed.hash_if_needed("".join(loaded_dict.values())) # create train data filter_condition = "WHERE cv_index <> %s" % self.cur_fold train_table = "%s.%s_train_%s" % tuple( [models_data_table, table_str, self.cur_fold] ) train_task = LoadData(pipeline_task=self.pipeline_task, table=train_table, table_type=self.table_type, filter_condition=filter_condition, data_dict=self.data_dict) # create test data filter_condition = "WHERE cv_index = %s" % self.cur_fold test_table = "%s.%s_test_%s" % tuple( [models_data_table, table_str, self.cur_fold] ) test_task = LoadData(pipeline_task=self.pipeline_task, table=test_table, table_type=self.table_type, filter_condition=filter_condition, data_dict=self.data_dict) return [train_task, test_task]
def write_responses(responses, schema_name, subset_type, filter_condition=None,
                    responses_dir="./", responses_basename="responses"):
    """ Extract responses and write them, with a schema, to csv

    This wraps the get_responses() function, so that response extraction
    and writing can be done in a single call.

    :param list responses A list of response types to extract. Each type of
     response must correspond to a function in this module.
    :param string schema_name The schema containing the subset_type table.
    :param string subset_type The table in the semantic schema to use in
     generating responses. This is specified by the subset_type field in the
     luigi.cfg file, usually.
    :param string filter_condition A condition specifying the subset of rows
     to filter down to, within the specified table. This is useful when
     combined with the cv_index column for cross-validation.
    :param string responses_dir The path to which to write the
     leave-fold-out response csv files, along with the schema.
    :param string responses_basename The basename of the files to which we
     write the leave-fold-out response csv files, along with the schema.
    :return None
    :side-effects Writes the responses for each left-out fold to
     responses_dir, along with the schema.
    """
    responses = get_responses(responses, schema_name, subset_type,
                              filter_condition)

    # write responses to file
    responses_str = responses_basename + pg_sed.hash_if_needed(subset_type)
    base_path = pg_sed.process_filter_name(responses_dir, filter_condition,
                                           responses_str)
    responses_path = base_path + ".csv"
    schema_path = base_path + "_schema.csv"

    logger.info("Writing responses to %s", responses_path)
    pg_sed.write_data_with_schema(responses, responses_path, schema_path)
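# pg_sed.write_data_with_schema() is assumed to write the data to one csv
# and the column names / types to a companion "_schema.csv", so the table
# can be recreated in the database later. A minimal pandas sketch of that
# idea (hypothetical; the real helper may record types differently):

import pandas as pd

def write_data_with_schema_sketch(data, data_path, schema_path):
    """Write a DataFrame to data_path and its column dtypes to schema_path."""
    data.to_csv(data_path, index=False)
    schema = pd.DataFrame({"column": data.columns,
                           "type": [str(t) for t in data.dtypes]})
    schema.to_csv(schema_path, index=False)

# e.g. write_data_with_schema_sketch(df, "responses.csv", "responses_schema.csv")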
def run(self):
    """ Extract, preprocess, and write features for one filter condition """
    semantic = self.conf.get(self.pipeline_task, "semantic_schema")
    features_basename = self.conf.get(self.pipeline_task, "features_basename")
    features_dict = json.loads(self.features_dict,
                               object_pairs_hook=OrderedDict)

    # get the appropriate aggregation level
    grouping_cols = self.conf.get(self.pipeline_task, "grouping_cols")
    grouping_cols = pg_sed.parse_cfg_string(grouping_cols)

    features = ft.get_features(features_dict["features"],
                               semantic,
                               features_dict["subset_name"],
                               self.filter_condition,
                               grouping_cols)
    processed_features = ft.preprocess_features(features,
                                                features_dict["preprocessing"])

    features_str = features_basename + \
        pg_sed.hash_if_needed("".join(features_dict.values()))
    pg_sed.write_data_with_schema_wrapper(processed_features,
                                          self.features_dir,
                                          self.filter_condition,
                                          features_str)
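# grouping_cols comes out of luigi.cfg as a single string, so
# pg_sed.parse_cfg_string() presumably splits it into a list. A plausible
# version of that helper (hypothetical):

def parse_cfg_string_sketch(cfg_string, sep=","):
    """Split a separator-delimited cfg value into a list of trimmed strings."""
    return [s.strip() for s in cfg_string.split(sep) if s.strip()]

# e.g. parse_cfg_string_sketch("patient_id, visit_date")
#      -> ["patient_id", "visit_date"]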
def run(self): """ Run, evaluate, and load a model """ experiments_path = self.conf.get(self.pipeline_task, "experiments") with open(experiments_path) as experiments_file: experiments_dict = json.load(experiments_file) cur_exper = pg_sed.fill_experiment_defaults( experiments_dict[self.exper_id], self.conf, self.pipeline_task) features_dict = OrderedDict( (("subset_name", cur_exper["subset_name"]), ("features", cur_exper["features"]), ("preprocessing", cur_exper["preprocessing"]))) features_basename = self.conf.get(self.pipeline_task, "features_basename") features_str = features_basename + \ pg_sed.hash_if_needed("".join(features_dict.values())) responses_basename = self.conf.get(self.pipeline_task, "responses_basename") responses_str = responses_basename + \ pg_sed.hash_if_needed(features_dict["subset_name"]) basenames = { "train_features": "%s_train_%s" % (features_str, self.cur_fold), "train_responses": "%s_train_%s" % (responses_str, self.cur_fold), "test_features": "%s_test_%s" % (features_str, self.cur_fold), "test_responses": "%s_test_%s" % (responses_str, self.cur_fold) } # get model data data = model_funs.get_model_data( basenames, self.conf.get(self.pipeline_task, "model_data_schema")) model_dict = model_funs.get_model_dict( self.theta, self.cur_fold, cur_exper, self.conf.get(self.pipeline_task, "responses"), self.seed, basenames, self.conf.get(self.pipeline_task, "model_data_schema")) # fit the model start = time.time() model_fit = model_funs.fit_model(cur_exper["model"]["name"], ast.literal_eval(self.theta), data["train_features"], data["train_responses"]) model_dict["run_date"] = time.strftime('%Y-%m-%d %H:%M:%S', time.localtime(start)) model_dict["time_to_run"] = time.time() - start # save model model_dict["binary_path"] = os.path.join( self.binaries_path, "%s.pkl" % model_dict["string"]) with open(model_dict["binary_path"], "wb") as file_obj: pickle.dump(model_fit, file_obj) # evaluate the model metrics_list = pg_sed.parse_cfg_string(cur_exper["metrics"]) model_eval = model_funs.evaluate_model(model_fit, data, metrics_list) # load model model_funs.load_model_results( model_eval, model_dict, self.models_schema, self.conf.get(self.pipeline_task, "models_table")) # if successful, log it output_path = os.path.join( self.logging_path, "models", "model%s_%s.log" % (self.pipeline_task, model_dict["string"])) open(output_path, "a").close()