Example No. 1
def get_model_data(basenames=None,
                   schema_name="model_data",
                   resolution_type="intersection"):
    """
    Extract train and test data for a fold

    :param dict basenames The base of the table names associated with each
     fold, keyed by the type of table (train_features, train_response,
     test_features, and test_response). This is used to identify the table to
     read from the database. For example, if
    :param int cur_fold The fold of train / test data to extract
    :param string schema_name The schema within whith to look for the tables
     specified by basenames.
    :return dict data A dictionary keyed by the values in basenames, with values
     giving pandas dataframes for the correct train / test folds for these data.
    """
    n_tables = len(basenames)
    data = dict(zip(basenames.keys(), [None] * n_tables))
    conn = pg_sed.db_connection()

    for table_type, table_name in basenames.items():
        query = "SELECT * FROM %s.%s" % (schema_name, table_name)
        logger.info("Executing: %s", query)
        data[table_type] = pd.io.sql.read_sql(query, conn)

    data = resolve_unshared_features(data, resolution_type)
    conn.close()
    return data
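
For orientation, a hedged usage sketch follows; the table names and schema below are invented placeholders, and the keys mirror the naming convention used by RunModel later in this listing.

# Hypothetical usage sketch: the table names below are invented placeholders.
basenames = {
    "train_features": "enigh_features_train_0",
    "train_responses": "enigh_responses_train_0",
    "test_features": "enigh_features_test_0",
    "test_responses": "enigh_responses_test_0"
}
data = get_model_data(basenames, schema_name="model_data",
                      resolution_type="intersection")
# data["train_features"] etc. are pandas DataFrames whose columns have been
# reconciled by resolve_unshared_features()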
Example No. 2
def get_matching_models(models_schema, models_table, filter_condition):
    """
    Get the models that satisfy a SQL condition

    :param string models_schema The name of the schema containing the models to
     download.
    :param string models_table The name of the table containing the model
     results.
    :param string filter_condition The condition that the returned models must
     satisfy. This string must be able to follow the word WHERE in a SQL query.
     E.g., "subset_type = 'sifode'" is a legitimate string argument.
    :return list of strings The IDs for models matching the specified condition,
     in the required models database table.
    :rtype list of strings

    Example
    -------
    matching_models = get_matching_models(
        "models",
        "pub_imputation_spatial_subsets",
        "subset = 'sifode_geo_census_subset_31'"
    )
    """
    model_query = """SELECT model
    FROM {}.{} WHERE {}""".format(models_schema, models_table,
                                  filter_condition)

    conn = db_connection()
    cursor = conn.cursor()
    cursor.execute(model_query)
    model_metadata = cursor.fetchall()
    conn.close()
    return [m[0] for m in model_metadata]
Example No. 3
def load_model_results(metrics,
                       model_dict,
                       models_schema="models",
                       models_table="models"):
    """
    Load a model's fit and evaluation metrics to the sedesol database

    :param model_fit The fitted model object (typically from sklearn.fit()). The
     only requirement though is that it has a .predict() or .predict_proba()
     method.
    :param dict metrics A dictionary with the metrics specified in metrics
     filled in with values and all others set to None.
    :param dict model_dict A dict specifying the form of the model. This is the
     output of get_model_dict().
    :return None
    :side-effects Loads a model to the sedesol database.
    :rtype None
    """
    connection = pg_sed.db_connection()
    cursor = connection.cursor()

    # create and insert a row into the model metadata table
    create_sql = """CREATE TABLE IF NOT EXISTS
    %s.%s_json
    (model BIGSERIAL, metadata JSONB);""" % (models_schema, models_table)
    cursor.execute(create_sql)

    insert_sql = """INSERT INTO %s.%s_json(metadata) VALUES(%s);""" % (
        models_schema, models_table, "%s")

    metadata = str(json.dumps({"config": model_dict, "metrics": metrics}))
    cursor.execute(insert_sql, (metadata, ))

    view_sql = """CREATE OR REPLACE VIEW %s.%s AS
    SELECT model,
    metadata->'config'->>'response' AS response,
    metadata->'config'->>'fold' AS fold,
    metadata->'config'->>'features_path' AS features_path,
    metadata->'config'->>'preprocessing' AS preprocessing,
    metadata->'config'->>'subset' AS subset,
    metadata->'config'->>'model_data' AS model_data,
    metadata->'config'->>'model_data_schema' AS model_data_schema,
    metadata->'config'->>'model' AS model_type,
    metadata->'config'->>'param' AS param,
    metadata->'config'->>'time_to_run' AS time_to_run,
    metadata->'metrics' AS metrics,
    metadata->'metrics'->>'mae' AS mae,
    metadata->'metrics'->>'mse' AS mse,
    metadata->'metrics'->>'enigh_income_precision_recall' AS enigh_income_precision_recall,
    metadata->'config'->>'seed' AS seed,
    metadata->'config'->>'string' AS string,
    metadata->'config'->>'run_date' AS run_date,
    metadata->'config'->>'binary_path' AS binary_path
    FROM %s.%s_json;""" % (models_schema, models_table, models_schema,
                           models_table)
    cursor.execute(view_sql)
    connection.commit()
    connection.close()
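
To make the JSONB layout concrete, here is a sketch of the document written to the metadata column; the values are invented for illustration, but the top-level config / metrics split and the keys pulled out by the view match the code above.

import json

# Illustrative only: the values are made up, the keys mirror what the view extracts.
metadata = {
    "config": {
        "response": "log_income",
        "fold": "0",
        "model": "gradient_boosting",
        "param": "{'n_estimators': 100}",
        "seed": "109",
        "run_date": "2016-08-01 12:00:00",
        "binary_path": "/path/to/model.pkl"
    },
    "metrics": {"mae": 0.42, "mse": 0.31}
}
row_value = json.dumps(metadata)  # the string inserted into the metadata column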
Example No. 4
    def run(self):
        conn = pg_sed.db_connection()
        cursor = conn.cursor()

        # create a row_number column in the table
        cv_grouping_cols = self.conf.get(self.pipeline_task, "cv_grouping_cols")
        cv_grouping_cols = pg_sed.parse_cfg_string(cv_grouping_cols)
        substitutions = {
            "\n": " ",
            "subset_table": self.subset_table,
            "semantic_schema": self.conf.get(self.pipeline_task, "semantic_schema"),
            "cv_order_condition": "ORDER BY {}".format(cv_grouping_cols[0])
        }
        queries = pg_sed.parse_query_strings(self.row_number_generator,
                                             substitutions)

        for query in queries:
            logger.info("Executing %s", query)
            cursor.execute(query)
            conn.commit()

        n_rows = ft.get_n_rows_in_table(substitutions["semantic_schema"],
                                        substitutions["subset_table"])

        logger.info("Creating CV indices using sklearn")
        cv_iterator = get_cv_indices(n_rows, self.k_folds, self.seed,
                                     substitutions, cv_grouping_cols)

        logger.info("Uploading CV indices to postgres")
        for fold, fold_indices in enumerate(cv_iterator):
            # update rows in the current fold (note that python is zero indexed)
            cur_rows = ",".join([str(indices + 1) for indices in fold_indices[1]])
            update_query = "UPDATE %s.%s SET cv_index = %s WHERE row_number in (%s) " % (
                substitutions["semantic_schema"], substitutions["subset_table"],
                str(fold), cur_rows
            )
            logger.info("Executing CV update for fold %s", fold)
            cursor.execute(update_query)


        logger.info("Creating postgres indices on cv_index column")
        index_query = "CREATE INDEX %s_cv_index_idx ON %s.%s(cv_index);" %(
            substitutions["subset_table"],
            substitutions["semantic_schema"],
            substitutions["subset_table"]
        )
        cursor.execute(index_query)
        conn.commit()

        conn.close()

        # touch the output file
        output_path = "%screated_cv_%s.log" % (
            self.conf.get("etl", "logging_path"),
            self.pipeline_task
        )
        open(output_path, 'a').close()
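
The zero-based to one-based offset in the UPDATE loop above is easier to see in isolation. The sketch below assumes that get_cv_indices yields sklearn-style (train_indices, test_indices) pairs, which the code suggests but does not show; the KFold call here is only a stand-in.

import numpy as np
from sklearn.model_selection import KFold

n_rows = 10
rows = np.zeros((n_rows, 1))                           # stand-in for the table rows
cv_iterator = KFold(n_splits=5, shuffle=True, random_state=109).split(rows)
for fold, fold_indices in enumerate(cv_iterator):
    test_idx = fold_indices[1]                         # zero-based positions from sklearn
    cur_rows = ",".join(str(i + 1) for i in test_idx)  # one-based row_number values
    print(fold, cur_rows)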
Example No. 5
def get_generalization_predictions(models_schema, models_table, model_id,
                                   training_schema, generalization_schema,
                                   generalization_table, training_grouping,
                                   generalization_grouping):
    """
    Get predictions on a generalization set

    This is mainly used to extract final predicted probabilities for subsets of 
    PUB, given models trained on associated subsets of SIFODE.

    :param string models_schema The name of the schema containing the models to
     download.
    :param string models_table The name of the table containing the model
     results.
    :param string model_id The database ID of the model to use for prediction.
    :param string training_schema The schema from which to draw the data used
     to retrain the model.
    :param string generalization_schema The schema from which to draw the data
     to compute predictions on.
    :param string generalization_table The table within the generalization
     schema that will be the source of predictions.
    :param string or list of strings training_grouping The grouping parameter
     when calling get_features for the training model data.
    :param string or list of strings generalization_grouping The grouping
     parameter when calling get_features for the generalization data.
    :return pd.DataFrame y_hat A dataframe whose columns index predictions to
     different responses.

    Example
    -------
    get_generalization_predictions(
        "models", 
        "pub_imputation_spatial_subsets",
        "14",
        "semantic_pub_imputation",
        "pub_geo_census_subset_17",
        "person_id",
        "sifode_pub_id"
    )
    """
    conn = db_connection()
    cursor = conn.cursor()

    model = get_models(models_schema, models_table, [model_id])[model_id]
    retrained = retrain_model(training_schema, model, training_grouping)

    model_copy = deepcopy(model)
    model_copy["subset"] = generalization_table
    generalization_data = get_model_specified_features(
        generalization_schema, model_copy, generalization_grouping)

    y_hat = pd.DataFrame()
    for response, model_fit in model["fits"].items():
        y_hat[response] = model_fit.predict_proba(generalization_data)[:, 1]

    return y_hat
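
The [:, 1] slice in the loop above relies on sklearn's predict_proba convention for binary classifiers; a minimal, self-contained illustration (not project code) follows.

import numpy as np
from sklearn.linear_model import LogisticRegression

# For a binary classifier, predict_proba returns shape (n_samples, 2);
# column 1 is the probability of the positive class.
X = np.array([[0.0], [1.0], [2.0], [3.0]])
y = np.array([0, 0, 1, 1])
clf = LogisticRegression().fit(X, y)
positive_probs = clf.predict_proba(X)[:, 1]  # what gets stored in y_hat[response]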
Example No. 6
def get_models(models_schema, models_table, ids=None):
    """
    Download metadata and model objects from a models database table

    :param string models_schema The name of the schema containing the models to
     download.
    :param string models_table The name of the table containing the model
     results.
    :param list of strings ids A list of database model IDs to filter the
     results down to.
    :return dict A dictionary keyed by the 'model' id variable in the database.
     The model object that can be used in prediction is stored in the 'fits'
     field.
    :rtype dict

    Examples
    --------
    # retrieves models from the underreporting schema for model ids 14 and 18
    models = get_models("models", "underreporting", ['14', '18'])
    """
    if ids is None:
        models_filter = ""
    else:
        models_filter = "WHERE model IN (%s)" % (",".join(ids))

    connection = db_connection()
    cursor = connection.cursor()
    cursor.execute("SELECT * from %s.%s %s" %
                   (models_schema, models_table, models_filter))
    field_names = [i[0] for i in cursor.description]
    metadata = pd.DataFrame(cursor.fetchall(), columns=field_names)
    connection.close()
    metadata = metadata.set_index('model', drop=True)
    metadata = json.loads(metadata.to_json(orient="index"))

    # combine binaries and metadata into one list
    model_results = dict()
    for k in metadata.keys():
        model_results[k] = metadata[k].copy()

        try:
            with open(metadata[k]["binary_path"], "rb") as binary_obj:
                model_binary = pickle.load(binary_obj)
            model_results[k]["fits"] = model_binary
        except IOError:
            warnings.warn("Binary not found for model %s" % k)
            model_results[k]["fits"] = None

    return model_results
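
A hedged usage sketch; the model IDs and the feature DataFrame X_new are hypothetical. Because the metadata round-trips through JSON, the returned dictionary is keyed by model IDs as strings, and the 'fits' entry is the unpickled dictionary of per-response estimators (or None when the binary is missing).

# Hypothetical usage: the IDs and X_new are placeholders.
models = get_models("models", "underreporting", ["14", "18"])
fits = models["14"]["fits"]
if fits is not None:
    for response, estimator in fits.items():
        preds = estimator.predict_proba(X_new)[:, 1]  # X_new: features to score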
Example No. 7
    def run(self):
        query_filename = self.conf.get(self.pipeline_task,
                                       'clean_table_indexer')
        connection = pg_sed.db_connection()
        connection.autocommit = True
        cursor = connection.cursor()

        with open(query_filename, 'r') as query_file:
            all_queries = query_file.read().split(';')

        for query in all_queries:
            if not query.isspace():
                logger.info("Executing query: %s", query)
                cursor.execute(query)
                connection.commit()

        connection.close()
        output_path = "%s%s_clean_indices_created.log" % (self.logging_path,
                                                          self.pipeline_task)
        open(output_path, 'a').close()
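
For reference, a hypothetical clean_table_indexer file could look like the string below (the table and column names are invented); the loop above simply splits on ';' and skips whitespace-only fragments.

# Hypothetical file contents, shown here as a string for illustration only.
example_sql = """
CREATE INDEX clean_cuis_folio_idx ON clean.cuis(folio);
CREATE INDEX clean_sifode_id_idx ON clean.sifode(sifode_id);
"""
statements = [q for q in example_sql.split(';') if not q.isspace()]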
Example No. 8
def get_data_log_household_income(schema_name,
                                  subset_type,
                                  filter_condition=None):
    """
    Retrieve income data on household level, sorted by family id

    This will include several rows for each household x expansion factor, one
    row for each type of income source.

    :param string schema_name The name fo the schema from which to extract this
     data
    :param string subset_type The name of the table within the specified schema
     from which to extract the required data.
    :param string filter_condition A filtering condition from which to extract
     the required data.

    :return pd.Frame Y A DataFrame containing several columns related to
     household income. These come from the ingresos table in ENIGH, see
     page 17 in http://internet.contenidos.inegi.org.mx/contenidos/productos//prod_serv/contenidos/espanol/bvinegi/productos/nueva_estruc/702825070373.pdf
     (or, inc ase of linkrot, search for Encuesta Nacional de Ingresos
     y Gastos de los Hogares ENIGH 2014)
    """
    if filter_condition is None:
        filter_condition = ""

    query = """SELECT folioviv,
                expansion_ix,
                SUM(ing_tri) / 4.0 as monthly_income
                FROM %s.%s %s
                GROUP BY folioviv, expansion_ix
                ORDER BY folioviv, expansion_ix;""" \
             % (schema_name, subset_type, filter_condition if filter_condition else '')

    logger.info("Executing: %s", query)
    conn = pg_sed.db_connection()
    response = pd.io.sql.read_sql(query, conn)

    conn.close()
    return response
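
A hedged usage sketch; the schema, table, and filter below are placeholders. Note that the filter string is spliced directly after the table name, so it must include the WHERE keyword.

# Hypothetical usage: schema, table, and filter condition are placeholders.
income = get_data_log_household_income(
    "semantic_underreporting",    # schema_name
    "enigh_subset",               # subset_type (table name)
    "WHERE cv_index = 0"          # filter_condition, spliced into the query as-is
)
# income has columns folioviv, expansion_ix, and monthly_income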
Example No. 9
def get_data_log_income_without_transferences(schema_name,
                                              subset_type,
                                              filter_condition=None):
    """
    Retrieve income without transferences data on household level, sorted by family id
    This takes into account the original variable that is estimated: income without transferences from social programs
    This will include several rows for each household x expansion factor, one
    row for each type of income source.
    :param string schema_name The name fo the schema from which to extract this
     data
    :param string subset_type The name of the table within the specified schema
     from which to extract the required data.
    :param string filter_condition A filtering condition from which to extract
     the required data.
    :return pd.Frame Y A DataFrame containing several columns related to
     household income. These come from the concentradohogar table in ENIGH, see
     page 21 in http://internet.contenidos.inegi.org.mx/contenidos/productos//prod_serv/contenidos/espanol/bvinegi/productos/nueva_estruc/702825070373.pdf
     (or, inc ase of linkrot, search for Encuesta Nacional de Ingresos
     y Gastos de los Hogares ENIGH 2014)
    """
    if filter_condition is None:
        filter_condition = ""

    query = """SELECT distinct on (folioviv,expansion_ix)
                folioviv,
                expansion_ix,
                concentradohogar_ing_cor - concentradohogar_transfer - concentradohogar_bene_gob - concentradohogar_rentas as income_without_transferences
                FROM %s.%s %s
        ORDER BY folioviv, expansion_ix;""" \
             % (schema_name, subset_type, filter_condition if filter_condition else '')

    logger.info("Executing: %s", query)
    conn = pg_sed.db_connection()
    response = pd.io.sql.read_sql(query, conn)

    conn.close()
    return response
Example No. 10
class RunModel(luigi.Task):
    """
    Run, evaluate, and load a model for a specific data / parameter combination
    """

    # setup parameters
    exper_id = luigi.Parameter()
    theta = luigi.Parameter()
    cur_fold = luigi.Parameter()
    pipeline_task = luigi.Parameter()

    conf = configuration.get_config()
    logging_path = conf.get("etl", "logging_path")
    binaries_path = conf.get("etl", "binaries_path")
    seed = conf.get("shared", "seed")
    models_schema = conf.get("shared", "models_schema")
    conn = pg_sed.db_connection()

    def requires(self):
        """
        We need the cross-validation-fold featurized data
        """
        experiments_path = self.conf.get(self.pipeline_task, "experiments")
        with open(experiments_path) as experiments_file:
            experiments_dict = json.load(experiments_file)

        cur_exper = pg_sed.fill_experiment_defaults(
            experiments_dict[self.exper_id], self.conf, self.pipeline_task)

        features_dict = OrderedDict([
            ("subset_name", cur_exper["subset_name"]),
            ("features", cur_exper["features"]),
            ("preprocessing", cur_exper["preprocessing"])
        ])
        responses_dict = {"subset_name": cur_exper["subset_name"]}
        yield CreateSchema(self.models_schema)
        yield {
            "load_features":
            LoadCVData(self.pipeline_task, json.dumps(features_dict),
                       "features", self.cur_fold),
            "load_responses":
            LoadCVData(self.pipeline_task, json.dumps(responses_dict),
                       "responses", self.cur_fold)
        }

    def run(self):
        """
        Run, evaluate, and load a model
        """
        experiments_path = self.conf.get(self.pipeline_task, "experiments")
        with open(experiments_path) as experiments_file:
            experiments_dict = json.load(experiments_file)

        cur_exper = pg_sed.fill_experiment_defaults(
            experiments_dict[self.exper_id], self.conf, self.pipeline_task)

        features_dict = OrderedDict([
            ("subset_name", cur_exper["subset_name"]),
            ("features", cur_exper["features"]),
            ("preprocessing", cur_exper["preprocessing"])
        ])

        features_basename = self.conf.get(self.pipeline_task,
                                          "features_basename")
        features_str = features_basename + \
                       pg_sed.hash_if_needed("".join(features_dict.values()))

        responses_basename = self.conf.get(self.pipeline_task,
                                           "responses_basename")
        responses_str = responses_basename + \
                        pg_sed.hash_if_needed(features_dict["subset_name"])

        basenames = {
            "train_features": "%s_train_%s" % (features_str, self.cur_fold),
            "train_responses": "%s_train_%s" % (responses_str, self.cur_fold),
            "test_features": "%s_test_%s" % (features_str, self.cur_fold),
            "test_responses": "%s_test_%s" % (responses_str, self.cur_fold)
        }

        # get model data
        data = model_funs.get_model_data(
            basenames, self.conf.get(self.pipeline_task, "model_data_schema"))

        model_dict = model_funs.get_model_dict(
            self.theta, self.cur_fold, cur_exper,
            self.conf.get(self.pipeline_task, "responses"), self.seed,
            basenames, self.conf.get(self.pipeline_task, "model_data_schema"))

        # fit the model
        start = time.time()
        model_fit = model_funs.fit_model(cur_exper["model"]["name"],
                                         ast.literal_eval(self.theta),
                                         data["train_features"],
                                         data["train_responses"])
        model_dict["run_date"] = time.strftime('%Y-%m-%d %H:%M:%S',
                                               time.localtime(start))
        model_dict["time_to_run"] = time.time() - start

        # save model
        model_dict["binary_path"] = os.path.join(
            self.binaries_path, "%s.pkl" % model_dict["string"])
        with open(model_dict["binary_path"], "wb") as file_obj:
            pickle.dump(model_fit, file_obj)

        # evaluate the model
        metrics_list = pg_sed.parse_cfg_string(cur_exper["metrics"])
        model_eval = model_funs.evaluate_model(model_fit, data, metrics_list)

        # load model
        model_funs.load_model_results(
            model_eval, model_dict, self.models_schema,
            self.conf.get(self.pipeline_task, "models_table"))

        # if successful, log it
        output_path = os.path.join(
            self.logging_path, "models",
            "model%s_%s.log" % (self.pipeline_task, model_dict["string"]))
        open(output_path, "a").close()

    def output(self):
        """
        We output a log file specifying the model type
        """
        experiments_path = self.conf.get(self.pipeline_task, "experiments")
        with open(experiments_path) as experiments_file:
            experiments_dict = json.load(experiments_file)

        cur_exper = pg_sed.fill_experiment_defaults(
            experiments_dict[self.exper_id], self.conf, self.pipeline_task)

        features_dict = OrderedDict([
            ("subset_name", cur_exper["subset_name"]),
            ("features", cur_exper["features"]),
            ("preprocessing", cur_exper["preprocessing"])
        ])

        features_basename = self.conf.get(self.pipeline_task,
                                          "features_basename")
        features_str = features_basename + \
                       pg_sed.hash_if_needed("".join(features_dict.values()))

        responses_basename = self.conf.get(self.pipeline_task,
                                           "responses_basename")
        responses_str = responses_basename + \
                        pg_sed.hash_if_needed(cur_exper["subset_name"])
        basenames = {
            "train_features": "%s_train_%s" % (features_str, self.cur_fold),
            "train_responses": "%s_train_%s" % (responses_str, self.cur_fold),
            "test_features": "%s_test_%s" % (features_str, self.cur_fold),
            "test_responses": "%s_test_%s" % (responses_str, self.cur_fold)
        }

        model_dict = model_funs.get_model_dict(
            self.theta, self.cur_fold, cur_exper,
            self.conf.get(self.pipeline_task, "responses"), self.seed,
            basenames, self.conf.get(self.pipeline_task, "model_data_schema"))

        # if successful, log it
        output_path = "%s/models/model%s_%s.log" % (
            self.logging_path, self.pipeline_task, model_dict["string"])

        return luigi.LocalTarget(output_path)
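
For completeness, a hedged sketch of triggering the task programmatically; the parameter values are placeholders and would normally come from a wrapper task or the luigi command line.

# Hypothetical invocation: all parameter values are placeholders.
import luigi

luigi.build(
    [RunModel(exper_id="experiment_1",
              theta="{'n_estimators': 100}",   # parsed with ast.literal_eval in run()
              cur_fold="0",
              pipeline_task="income_underreporting")],
    local_scheduler=True
)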
Example No. 11
            "pub_" + subset, "person_id", "sifode_pub_id")

        output_path = "mnt/data/sedesol/pub_preds/single_model/pub_predictions_subset_%s_model_%s" % (
            subset, model)
        cur_probs.round(2).to_csv(output_path, index=False)

# average these per model predictions
base_path = "/mnt/data/sedesol/pub_preds/single_model/pub_predictions_subset_geo_census_subset"
ensemble_path = "/mnt/data/sedesol/pub_preds/ensemble/"

state_ids = ["0" + str(s) for s in range(1, 10)] + \
            [str(s) for s in range(10, 15)] + \
            ["1503", "150_except_1503", "15_except_150"] + \
            [str(s) for s in range(16, 33)]

conn = pg_sed.db_connection()
cursor = conn.cursor()

for state in state_ids:
    print("starting state %s" % state)
    home_ids = pg_sed.get_original_features(["sifode_pub_id"],
                                            "semantic_pub_imputation",
                                            "pub_geo_census_subset_%s" % state,
                                            None, "sifode_pub_id")
    cur_paths = glob.glob("%s_%s*" % (base_path, state))
    p_hats = []
    for ix, path in enumerate(cur_paths):
        p_hats.append(np.array(pd.read_csv(path)))
    p_hat = np.dstack(tuple(p_hats))
    p_final = ensemble.get_average_pred(p_hat)
    p_final = pd.concat([home_ids, pd.DataFrame(p_final)], axis=1)
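
To make the stacking step concrete, a minimal numpy illustration follows; the final averaging shown here is an assumption about what ensemble.get_average_pred does, based only on its name.

import numpy as np

# Two (n_rows x n_responses) prediction arrays from two models, values invented.
p_hats = [np.array([[0.2, 0.8], [0.6, 0.4]]),
          np.array([[0.4, 0.6], [0.5, 0.5]])]
p_hat = np.dstack(tuple(p_hats))   # shape (n_rows, n_responses, n_models)
p_final = p_hat.mean(axis=2)       # assumed behavior of get_average_pred: per-cell mean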