def get_model_data(basenames=None, schema_name="model_data",
                   resolution_type="intersection"):
    """ Extract train and test data for a fold

    :param dict basenames The base of the table names associated with each
     fold, keyed by the type of table (train_features, train_responses,
     test_features, and test_responses). This is used to identify the table
     to read from the database.
    :param string schema_name The schema within which to look for the tables
     specified by basenames.
    :param string resolution_type The strategy passed to
     resolve_unshared_features() for reconciling features that are not shared
     across the extracted tables.

    :return dict data A dictionary keyed by the values in basenames, with
     values giving pandas dataframes for the correct train / test folds for
     these data.
    """
    n_tables = len(basenames)
    data = dict(zip(basenames.keys(), [None] * n_tables))

    conn = pg_sed.db_connection()
    for table_type, table_name in basenames.items():
        query = "SELECT * FROM %s.%s" % (schema_name, table_name)
        logger.info("Executing: %s", query)
        data[table_type] = pd.io.sql.read_sql(query, conn)

    data = resolve_unshared_features(data, resolution_type)
    conn.close()
    return data
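# A minimal usage sketch for get_model_data. The table names below are
# hypothetical; in the pipeline they are assembled from the configured
# features / responses basenames plus the fold index (see RunModel.run below).
basenames = {
    "train_features": "features_abc123_train_0",
    "train_responses": "responses_abc123_train_0",
    "test_features": "features_abc123_test_0",
    "test_responses": "responses_abc123_test_0",
}
data = get_model_data(basenames, schema_name="model_data")
print(data["train_features"].shape)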
def get_matching_models(models_schema, models_table, filter_condition):
    """ Get the models that satisfy a SQL condition

    :param string models_schema The name of the schema containing the models
     to download.
    :param string models_table The name of the table containing the model
     results.
    :param string filter_condition The condition which the models returned
     must satisfy. This string must be able to follow the word WHERE in a SQL
     query. E.g., "subset_type = 'sifode'" is a legitimate string argument.

    :return list of strings The IDs for models matching the specified
     condition, in the required models database table.
    :rtype list of strings

    Example
    -------
    matching_models = get_matching_models(
        "models",
        "pub_imputation_spatial_subsets",
        "subset = 'sifode_geo_census_subset_31'"
    )
    """
    model_query = """SELECT model FROM {}.{} WHERE {}""".format(
        models_schema, models_table, filter_condition)

    conn = db_connection()
    cursor = conn.cursor()
    cursor.execute(model_query)
    model_metadata = cursor.fetchall()
    conn.close()
    return [m[0] for m in model_metadata]
def load_model_results(metrics, model_dict, models_schema="models",
                       models_table="models"):
    """ Load a model's fit and evaluation metrics to the sedesol database

    :param dict metrics A dictionary of evaluation metrics, with the computed
     metrics filled in with values and all others set to None.
    :param dict model_dict A dict specifying the form of the model. This is
     the output of get_model_dict().
    :param string models_schema The name of the schema in which to store the
     model results.
    :param string models_table The base name of the table / view in which to
     store the model results.

    :return None
    :side-effects Loads a model to the sedesol database.
    :rtype None
    """
    connection = pg_sed.db_connection()
    cursor = connection.cursor()

    # create and insert a row into the model metadata table
    create_sql = """CREATE TABLE IF NOT EXISTS %s.%s_json
                    (model BIGSERIAL, metadata JSONB);""" % (models_schema,
                                                             models_table)
    cursor.execute(create_sql)

    insert_sql = """INSERT INTO %s.%s_json(metadata) VALUES(%s);""" % (
        models_schema, models_table, "%s")
    metadata = str(json.dumps({"config": model_dict, "metrics": metrics}))
    cursor.execute(insert_sql, (metadata, ))

    # expose the JSONB metadata as a flat relational view
    view_sql = """CREATE OR REPLACE VIEW %s.%s AS
    SELECT model,
    metadata->'config'->>'response' AS response,
    metadata->'config'->>'fold' AS fold,
    metadata->'config'->>'features_path' AS features_path,
    metadata->'config'->>'preprocessing' AS preprocessing,
    metadata->'config'->>'subset' AS subset,
    metadata->'config'->>'model_data' AS model_data,
    metadata->'config'->>'model_data_schema' AS model_data_schema,
    metadata->'config'->>'model' AS model_type,
    metadata->'config'->>'param' AS param,
    metadata->'config'->>'time_to_run' AS time_to_run,
    metadata->'metrics' AS metrics,
    metadata->'metrics'->>'mae' AS mae,
    metadata->'metrics'->>'mse' AS mse,
    metadata->'metrics'->>'enigh_income_precision_recall' AS enigh_income_precision_recall,
    metadata->'config'->>'seed' AS seed,
    metadata->'config'->>'string' AS string,
    metadata->'config'->>'run_date' AS run_date,
    metadata->'config'->>'binary_path' AS binary_path
    FROM %s.%s_json;""" % (models_schema, models_table,
                           models_schema, models_table)
    cursor.execute(view_sql)

    # commit the insert and view definition before closing
    connection.commit()
    connection.close()
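# A minimal sketch of loading one model's results. The model_dict fields
# mirror those unpacked by the view above; all values here are hypothetical.
metrics = {"mae": 0.42, "mse": 0.31, "enigh_income_precision_recall": None}
model_dict = {
    "response": "monthly_income",
    "fold": "0",
    "model": "random_forest",
    "param": "{'n_estimators': 100}",
    "string": "rf_fold0_abc123",
    "binary_path": "/mnt/data/sedesol/binaries/rf_fold0_abc123.pkl",
}
load_model_results(metrics, model_dict, "models", "underreporting")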
def run(self):
    conn = pg_sed.db_connection()
    cursor = conn.cursor()

    # create a row_number column in the table
    cv_grouping_cols = self.conf.get(self.pipeline_task, "cv_grouping_cols")
    cv_grouping_cols = pg_sed.parse_cfg_string(cv_grouping_cols)
    substitutions = {
        "\n": " ",
        "subset_table": self.subset_table,
        "semantic_schema": self.conf.get(self.pipeline_task, "semantic_schema"),
        "cv_order_condition": "ORDER BY {}".format(cv_grouping_cols[0])
    }

    queries = pg_sed.parse_query_strings(self.row_number_generator, substitutions)
    for query in queries:
        logger.info("Executing %s", query)
        cursor.execute(query)
    conn.commit()

    n_rows = ft.get_n_rows_in_table(substitutions["semantic_schema"],
                                    substitutions["subset_table"])

    logger.info("Creating CV indices using sklearn")
    cv_iterator = get_cv_indices(n_rows, self.k_folds, self.seed,
                                 substitutions, cv_grouping_cols)

    logger.info("Uploading CV indices to postgres")
    for fold, fold_indices in enumerate(cv_iterator):
        # update rows in the current fold (note that python is zero indexed)
        cur_rows = ",".join([str(indices + 1) for indices in fold_indices[1]])
        update_query = "UPDATE %s.%s SET cv_index = %s WHERE row_number in (%s) " % (
            substitutions["semantic_schema"],
            substitutions["subset_table"],
            str(fold),
            cur_rows
        )

        logger.info("Executing CV update for fold %s", fold)
        cursor.execute(update_query)

    logger.info("Creating postgres indices on cv_index column")
    index_query = "CREATE INDEX %s_cv_index_idx ON %s.%s(cv_index);" % (
        substitutions["subset_table"],
        substitutions["semantic_schema"],
        substitutions["subset_table"]
    )
    cursor.execute(index_query)

    # commit the fold updates and index creation before closing
    conn.commit()
    conn.close()

    # touch the output file
    output_path = "%screated_cv_%s.log" % (
        self.conf.get("etl", "logging_path"),
        self.pipeline_task
    )
    open(output_path, 'a').close()
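# A hedged sketch of what get_cv_indices might yield, assuming it wraps
# sklearn's KFold: an iterator of (train_indices, test_indices) pairs. This
# is why the loop above takes fold_indices[1], the held-out test rows for
# the current fold. get_cv_indices_sketch and its signature are hypothetical.
import numpy as np
from sklearn.model_selection import KFold

def get_cv_indices_sketch(n_rows, k_folds, seed):
    kf = KFold(n_splits=int(k_folds), shuffle=True, random_state=int(seed))
    return kf.split(np.arange(n_rows).reshape(-1, 1))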
def get_generalization_predictions(models_schema, models_table, model_id,
                                   training_schema, generalization_schema,
                                   generalization_table, training_grouping,
                                   generalization_grouping):
    """ Get predictions on a generalization set

    This is mainly used to extract final predicted probabilities for subsets
    of PUB, given models trained on associated subsets of SIFODE.

    :param string models_schema The name of the schema containing the models
     to download.
    :param string models_table The name of the table containing the model
     results.
    :param string model_id The database model ID of the model to retrain and
     predict with.
    :param string training_schema The schema containing the data on which the
     model is retrained.
    :param string generalization_schema The schema from which to draw the
     data to compute predictions on.
    :param string generalization_table The table within the generalization
     schema that will be the source of predictions.
    :param training_grouping string or list of strings The grouping parameter
     when calling get_features for the training model data.
    :param generalization_grouping string or list of strings The grouping
     parameter when calling get_features for the generalization data.

    :return pd.DataFrame y_hat A dataframe whose columns index predictions
     to different responses.

    Example
    -------
    get_generalization_predictions(
        "models",
        "pub_imputation_spatial_subsets",
        "14",
        "semantic_sifode",  # hypothetical training schema
        "semantic_pub_imputation",
        "pub_geo_census_subset_17",
        "person_id",
        "sifode_pub_id"
    )
    """
    model = get_models(models_schema, models_table, [model_id])[model_id]
    retrained = retrain_model(training_schema, model, training_grouping)

    model_copy = deepcopy(model)
    model_copy["subset"] = generalization_table
    generalization_data = get_model_specified_features(
        generalization_schema, model_copy, generalization_grouping)

    y_hat = pd.DataFrame()
    # predict with the fits retrained on the full training data
    for response, model_fit in retrained["fits"].items():
        y_hat[response] = model_fit.predict_proba(generalization_data)[:, 1]
    return y_hat
def get_models(models_schema, models_table, ids=None):
    """ Download metadata and model objects from a models database table

    :param string models_schema The name of the schema containing the models
     to download.
    :param string models_table The name of the table containing the model
     results.
    :param list of strings ids A list of database model IDs to filter the
     results down to.

    :return dict A dictionary keyed by the 'model' id variable in the
     database. The model object that can be used in prediction is stored in
     the 'fits' field.
    :rtype dict

    Examples
    --------
    # retrieves models from the underreporting schema for model ids 14 and 18
    models = get_models("models", "underreporting", ['14', '18'])
    """
    if ids is None:
        models_filter = ""
    else:
        models_filter = "WHERE model IN (%s)" % (",".join(ids))

    connection = db_connection()
    cursor = connection.cursor()
    cursor.execute("SELECT * from %s.%s %s" % (models_schema, models_table,
                                               models_filter))
    field_names = [i[0] for i in cursor.description]
    metadata = pd.DataFrame(cursor.fetchall(), columns=field_names)
    connection.close()

    metadata = metadata.set_index('model', drop=True)
    metadata = json.loads(metadata.to_json(orient="index"))

    # combine binaries and metadata into one dict
    model_results = dict()
    for k in metadata.keys():
        model_results[k] = metadata[k].copy()
        try:
            with open(metadata[k]["binary_path"], "rb") as binary_obj:
                model_binary = pickle.load(binary_obj)
            model_results[k]["fits"] = model_binary
        except IOError:
            warnings.warn("Binary not found for model %s" % k)
            model_results[k]["fits"] = None

    return model_results
def run(self):
    query_filename = self.conf.get(self.pipeline_task, 'clean_table_indexer')
    connection = pg_sed.db_connection()
    connection.autocommit = True
    cursor = connection.cursor()

    with open(query_filename, 'r') as query_file:
        all_queries = query_file.read().split(';')

    for query in all_queries:
        # skip empty or whitespace-only fragments left by the split
        if query.strip():
            logger.info("Executing query: " + query)
            cursor.execute(query)

    connection.commit()
    connection.close()

    output_path = "%s%s_clean_indices_created.log" % (self.logging_path,
                                                      self.pipeline_task)
    open(output_path, 'a').close()
def get_data_log_household_income(schema_name, subset_type,
                                  filter_condition=None):
    """ Retrieve income data on household level, sorted by family id

    The underlying ingresos table contains several rows for each household x
    expansion factor, one row for each type of income source; this query sums
    them into a single monthly income per household x expansion factor.

    :param string schema_name The name of the schema from which to extract
     this data
    :param string subset_type The name of the table within the specified
     schema from which to extract the required data.
    :param string filter_condition A filtering condition from which to
     extract the required data.

    :return pd.DataFrame Y A DataFrame with one row per household x
     expansion factor, giving monthly household income. These come from the
     ingresos table in ENIGH, see page 17 in
     http://internet.contenidos.inegi.org.mx/contenidos/productos//prod_serv/contenidos/espanol/bvinegi/productos/nueva_estruc/702825070373.pdf
     (or, in case of linkrot, search for Encuesta Nacional de Ingresos y
     Gastos de los Hogares ENIGH 2014)
    """
    if filter_condition is None:
        filter_condition = ""

    query = """SELECT folioviv,
               expansion_ix,
               SUM(ing_tri) / 4.0 AS monthly_income
               FROM %s.%s %s
               GROUP BY folioviv, expansion_ix
               ORDER BY folioviv, expansion_ix;""" \
            % (schema_name, subset_type, filter_condition)

    logger.info("Executing: %s", query)
    conn = pg_sed.db_connection()
    response = pd.io.sql.read_sql(query, conn)
    conn.close()
    return response
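# A minimal usage sketch, assuming a semantic ENIGH schema and table with
# these (hypothetical) names exist in the database. Note that the filter
# condition is spliced in right after the table name, so it must carry its
# own WHERE keyword.
income = get_data_log_household_income(
    "semantic_underreporting", "enigh_subset",
    filter_condition="WHERE ing_tri IS NOT NULL")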
def get_data_log_income_without_transferences(schema_name, subset_type,
                                              filter_condition=None):
    """ Retrieve income without transferences data on household level,
    sorted by family id

    This takes into account the original variable that is estimated: income
    without transferences from social programs. The query returns one row per
    household x expansion factor.

    :param string schema_name The name of the schema from which to extract
     this data
    :param string subset_type The name of the table within the specified
     schema from which to extract the required data.
    :param string filter_condition A filtering condition from which to
     extract the required data.

    :return pd.DataFrame Y A DataFrame containing several columns related to
     household income. These come from the concentradohogar table in ENIGH,
     see page 21 in
     http://internet.contenidos.inegi.org.mx/contenidos/productos//prod_serv/contenidos/espanol/bvinegi/productos/nueva_estruc/702825070373.pdf
     (or, in case of linkrot, search for Encuesta Nacional de Ingresos y
     Gastos de los Hogares ENIGH 2014)
    """
    if filter_condition is None:
        filter_condition = ""

    query = """SELECT DISTINCT ON (folioviv, expansion_ix)
               folioviv,
               expansion_ix,
               concentradohogar_ing_cor - concentradohogar_transfer -
                   concentradohogar_bene_gob - concentradohogar_rentas
                   AS income_without_transferences
               FROM %s.%s %s
               ORDER BY folioviv, expansion_ix;""" \
            % (schema_name, subset_type, filter_condition)

    logger.info("Executing: %s", query)
    conn = pg_sed.db_connection()
    response = pd.io.sql.read_sql(query, conn)
    conn.close()
    return response
class RunModel(luigi.Task):
    """ Run, evaluate, and load a model for a specific data / parameter
    combination """

    # setup parameters
    exper_id = luigi.Parameter()
    theta = luigi.Parameter()
    cur_fold = luigi.Parameter()
    pipeline_task = luigi.Parameter()

    conf = configuration.get_config()
    logging_path = conf.get("etl", "logging_path")
    binaries_path = conf.get("etl", "binaries_path")
    seed = conf.get("shared", "seed")
    models_schema = conf.get("shared", "models_schema")
    conn = pg_sed.db_connection()

    def requires(self):
        """ We need the cross-validation-fold featurized data """
        experiments_path = self.conf.get(self.pipeline_task, "experiments")
        with open(experiments_path) as experiments_file:
            experiments_dict = json.load(experiments_file)

        cur_exper = pg_sed.fill_experiment_defaults(
            experiments_dict[self.exper_id], self.conf, self.pipeline_task)
        features_dict = OrderedDict(
            (("subset_name", cur_exper["subset_name"]),
             ("features", cur_exper["features"]),
             ("preprocessing", cur_exper["preprocessing"])))
        responses_dict = {"subset_name": cur_exper["subset_name"]}

        yield CreateSchema(self.models_schema)
        yield {
            "load_features": LoadCVData(self.pipeline_task,
                                        json.dumps(features_dict),
                                        "features", self.cur_fold),
            "load_responses": LoadCVData(self.pipeline_task,
                                         json.dumps(responses_dict),
                                         "responses", self.cur_fold)
        }

    def run(self):
        """ Run, evaluate, and load a model """
        experiments_path = self.conf.get(self.pipeline_task, "experiments")
        with open(experiments_path) as experiments_file:
            experiments_dict = json.load(experiments_file)

        cur_exper = pg_sed.fill_experiment_defaults(
            experiments_dict[self.exper_id], self.conf, self.pipeline_task)
        features_dict = OrderedDict(
            (("subset_name", cur_exper["subset_name"]),
             ("features", cur_exper["features"]),
             ("preprocessing", cur_exper["preprocessing"])))

        features_basename = self.conf.get(self.pipeline_task,
                                          "features_basename")
        features_str = features_basename + \
            pg_sed.hash_if_needed("".join(features_dict.values()))
        responses_basename = self.conf.get(self.pipeline_task,
                                           "responses_basename")
        responses_str = responses_basename + \
            pg_sed.hash_if_needed(features_dict["subset_name"])

        basenames = {
            "train_features": "%s_train_%s" % (features_str, self.cur_fold),
            "train_responses": "%s_train_%s" % (responses_str, self.cur_fold),
            "test_features": "%s_test_%s" % (features_str, self.cur_fold),
            "test_responses": "%s_test_%s" % (responses_str, self.cur_fold)
        }

        # get model data
        data = model_funs.get_model_data(
            basenames, self.conf.get(self.pipeline_task, "model_data_schema"))
        model_dict = model_funs.get_model_dict(
            self.theta, self.cur_fold, cur_exper,
            self.conf.get(self.pipeline_task, "responses"),
            self.seed, basenames,
            self.conf.get(self.pipeline_task, "model_data_schema"))

        # fit the model
        start = time.time()
        model_fit = model_funs.fit_model(cur_exper["model"]["name"],
                                         ast.literal_eval(self.theta),
                                         data["train_features"],
                                         data["train_responses"])
        model_dict["run_date"] = time.strftime('%Y-%m-%d %H:%M:%S',
                                               time.localtime(start))
        model_dict["time_to_run"] = time.time() - start

        # save model
        model_dict["binary_path"] = os.path.join(
            self.binaries_path, "%s.pkl" % model_dict["string"])
        with open(model_dict["binary_path"], "wb") as file_obj:
            pickle.dump(model_fit, file_obj)

        # evaluate the model
        metrics_list = pg_sed.parse_cfg_string(cur_exper["metrics"])
        model_eval = model_funs.evaluate_model(model_fit, data, metrics_list)

        # load model
        model_funs.load_model_results(
            model_eval, model_dict, self.models_schema,
            self.conf.get(self.pipeline_task, "models_table"))

        # if successful, log it
        output_path = os.path.join(
            self.logging_path, "models",
            "model%s_%s.log" % (self.pipeline_task, model_dict["string"]))
        open(output_path, "a").close()

    def output(self):
        """ We output a log file specifying the model type """
        experiments_path = self.conf.get(self.pipeline_task, "experiments")
        with open(experiments_path) as experiments_file:
            experiments_dict = json.load(experiments_file)

        cur_exper = pg_sed.fill_experiment_defaults(
            experiments_dict[self.exper_id], self.conf, self.pipeline_task)
        features_dict = OrderedDict(
            (("subset_name", cur_exper["subset_name"]),
             ("features", cur_exper["features"]),
             ("preprocessing", cur_exper["preprocessing"])))

        features_basename = self.conf.get(self.pipeline_task,
                                          "features_basename")
        features_str = features_basename + \
            pg_sed.hash_if_needed("".join(features_dict.values()))
        responses_basename = self.conf.get(self.pipeline_task,
                                           "responses_basename")
        responses_str = responses_basename + \
            pg_sed.hash_if_needed(cur_exper["subset_name"])

        basenames = {
            "train_features": "%s_train_%s" % (features_str, self.cur_fold),
            "train_responses": "%s_train_%s" % (responses_str, self.cur_fold),
            "test_features": "%s_test_%s" % (features_str, self.cur_fold),
            "test_responses": "%s_test_%s" % (responses_str, self.cur_fold)
        }

        model_dict = model_funs.get_model_dict(
            self.theta, self.cur_fold, cur_exper,
            self.conf.get(self.pipeline_task, "responses"),
            self.seed, basenames,
            self.conf.get(self.pipeline_task, "model_data_schema"))

        # the same path touched at the end of run()
        output_path = os.path.join(
            self.logging_path, "models",
            "model%s_%s.log" % (self.pipeline_task, model_dict["string"]))
        return luigi.LocalTarget(output_path)
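# A minimal sketch of scheduling this task directly with luigi, assuming the
# experiment file for the given pipeline task defines an experiment "exp0".
# All parameter values here are hypothetical.
import luigi

luigi.build(
    [RunModel(exper_id="exp0", theta="{'n_estimators': 100}",
              cur_fold="0", pipeline_task="underreporting")],
    local_scheduler=True
)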
"pub_" + subset, "person_id", "sifode_pub_id") output_path = "mnt/data/sedesol/pub_preds/single_model/pub_predictions_subset_%s_model_%s" % ( subset, model) cur_probs.round(2).to_csv(output_path, index=False) # average these per model predictions base_path = "/mnt/data/sedesol/pub_preds/single_model/pub_predictions_subset_geo_census_subset" ensemble_path = "/mnt/data/sedesol/pub_preds/ensemble/" state_ids = ["0" + str(s) for s in range(1, 10)] + \ [str(s) for s in range(10, 15)] + \ ["1503", "150_except_1503", "15_except_150"] + \ [str(s) for s in range(16, 33)] conn = pg_sed.db_connection() cursor = conn.cursor() for state in state_ids: print("starting state %s" % state) home_ids = pg_sed.get_original_features(["sifode_pub_id"], "semantic_pub_imputation", "pub_geo_census_subset_%s" % state, None, "sifode_pub_id") cur_paths = glob.glob("%s_%s*" % (base_path, state)) p_hats = [] for ix, path in enumerate(cur_paths): p_hats.append(np.array(pd.read_csv(path))) p_hat = np.dstack(tuple(p_hats)) p_final = ensemble.get_average_pred(p_hat) p_final = pd.concat([home_ids, pd.DataFrame(p_final)], axis=1)