def getCategoryByAgeQuantilesFast(sdf, queries, product, state, plot=False):
    df = sdf.df
    results = {}
    for query in queries:
        res = sdftools.categoryByAgeQuantiles(df,
                                              sdf.schema,
                                              query,
                                              labels=True)
        results.update(res)

    if plot:
        for key, df in results.items():
            queryname, category, datatype = parseAgeQuantileKey(key,
                                                                fsname=False)
            sdftools.print_item(
                df.count(),
                "Number of rows in the Spark DF before transforming to Pandas DF"
            )
            if datatype == "quantile_df":
                age_quantile_pandas_df = df.toPandas()
                saveloc = du.getdir(sdf.metric_save_location)
                rp.age_quantile_lineplot(age_quantile_pandas_df, saveloc,
                                         product, state)
            else:  # datatype == "survival_props"
                pass

    return results
def analyzeQuery(query, table_name, analysis, spark, geolevels, eps, buckets=default_buckets, schema="DHCP_HHGQ"):
    """
        Main plotting function.
            query           : str, name of a valid query for the target experiment's schema
            table_name      : str, name of a table (used for file-naming conventions)
            analysis        : Analysis object, organizes Analysis metadata
            spark           : SparkSession object, attached to analysis object
            geolevels       : [str, ...], geolevels to compute over for the current query
            eps             : privacy-loss budget (epsilon) used to select the experiment paths
            buckets         : [(int,int), ...], list of mutually exclusive bucket boundaries for Tab(CEF) bucketing
            schema          : str, name of ../programs/schema/schemas/schemamaker.py schema associated with target data
        Note also that major control parameters are hard-coded in getPaths for setting experiment ingest locations from s3.
    """
    print(f"For table {table_name}, analyzing query {query} at geolevels {geolevels} with schema {schema}")
    schema_name = schema
    num_trials, paths, experiment_name, eps_str = getPathsAndName(schema_name, query, table_name, eps)
    print(f"Passing paths to Analysis experiment maker: {paths}")
    experiment = analysis.make_experiment(experiment_name, paths, schema_name=schema_name, dasruntype=AC.EXPERIMENT_FRAMEWORK_FLAT)
    sdftools.print_item(experiment.__dict__, "Experiment Attributes")

    spark_df = experiment.getDF()
    print("df looks like:")
    spark_df.show()
    schema = experiment.schema
    sdftools.print_item(spark_df, "Flat Experiment DF")

    queries = [query]

    spark_df = sdftools.aggregateGeolevels(spark, spark_df, geolevels)
    spark_df = sdftools.answerQueries(spark_df, schema, queries)
    spark_df = sdftools.getAvgAbsErrorByTrueCountRuns(spark_df, bins=[0,1,10,100,1000,10000]).persist()
    missing_rows_pandas_df = sdftools.getMissingRowCounts(spark_df, schema, queries, groupby=[AC.GEOCODE, AC.GEOLEVEL, AC.PLB, AC.BUDGET_GROUP])
    missing_rows_dict = defaultdict(int)
    for index, row in missing_rows_pandas_df.iterrows():
        #print(f"missing df row # {index} geolevel, sum(missing) = {row['geolevel']},{row['sum(missing)']}")
        missing_rows_dict[row['geolevel']] = row['sum(missing)']
    spark_df.show()
    print("^^^^ with abs error, DF looks like ^^^^")

    metric_name = "Avg( |q(MDF) - q(CEF)| )"
    x_axis_variable_name = 'CEF Count, Binned'

    pandas_df = spark_df.toPandas()
    pandas_df = pandas_df.rename(columns={"abs_error":metric_name, "orig_count_bin":x_axis_variable_name})
    plt.figure(1, figsize=(11,8.5))
    plt.rc('axes', labelsize=8)
    print(f"pandas df before plotting has cols: {pandas_df.columns.values}")
    print(f"{x_axis_variable_name} column has distinct levels: {pandas_df[x_axis_variable_name].unique()}")
    buckets = pandas_df[x_axis_variable_name].unique()
    buckets = sorted(buckets, key=lambda bucket_name: largestIntInStr(bucket_name))
    print(f"Sorted bucket names: {buckets}")

    # Saving data frame
    csv_savepath = experiment.save_location_linux + f"{experiment_name}.csv"
    du.makePath(du.getdir(csv_savepath))
    pandas_df.to_csv(csv_savepath, index=False)

    makePlots(experiment, experiment_name, table_name, queries, x_axis_variable_name,
                                           metric_name, geolevels, pandas_df, buckets,
                                           schema_name, eps_str, missing_rows_dict, num_trials)
Example #3
def setup(save_location=None, spark_loglevel="ERROR", cli_args=True, logname=None, num_core_nodes=None,
          analysis_script=None):
    """
    Create a new spark session and Analysis object (from DataTools) to run experiments
    :param save_location: filepath of desired location
    :param spark_loglevel: level at which to set logging
    :param cli_args: optional boolean: use CLI arguments (will engage argument parser) or function args. default True
    :param logname: optional filepath of logfile. default None (if CLI)
    :param num_core_nodes: optional int: number of nodes. default None (if CLI)
    :param analysis_script: name of Analysis script to run. default None (if CLI)
    :return: Analysis object from the arguments
    """
    assert save_location is not None, "Need to specify local directory where the results of analysis will be saved"
    assert spark_loglevel in ["ALL", "DEBUG", "ERROR", "FATAL", "INFO", "OFF", "TRACE", "WARN"], \
        "Invalid Spark loglevel"


    # if parsing args from CLI (default True)
    if cli_args:
        parser = ArgumentParser(formatter_class=ArgumentDefaultsHelpFormatter)
        parser.add_argument("--logname", help="logname used to output log data and for saving analysis results")
        parser.add_argument("--num_core_nodes", help="number of core nodes in the cluster upon spark start")
        parser.add_argument("--analysis_script", help="the analysis script being called by the run script")

        args, unknown = parser.parse_known_args()

        logname = args.logname.split('.')[0] # get rid of the .log part
        logname = logname.split('/')[1] # get rid of the logs/ part for the save_location and app_name
    # otherwise get args from function header
    else:
        assert logname is not None and num_core_nodes is not None and analysis_script is not None, \
            "If not using command line, must pass logname, num_core_nodes, and analysis_script"
        assert type(logname) is str and type(analysis_script) is str, "logname and analysis_script must be strings"
        assert type(num_core_nodes) is int and num_core_nodes > 0, "num_core_nodes must be positive integer"

        class Arguments():
            def __init__(self, logname, num_core_nodes, analysis_script):
                self.logname = logname
                self.num_core_nodes = num_core_nodes
                self.analysis_script = analysis_script
        args = Arguments(logname=logname, num_core_nodes=num_core_nodes, analysis_script=analysis_script)

    #save_location = f"/mnt/users/{os.environ['JBID']}/analysis_results/{logname}/"
    save_location = f"{du.addslash(save_location)}{logname}/"

    app_name = f"DAS_Analysis | {os.environ['JBID']} | {logname}"

    spark = SparkSession.builder.appName(app_name).getOrCreate()
    
    sdftools.print_item(du.pretty(dict(spark.sparkContext.getConf().getAll())), "Spark configurations being used")

    sdftools.print_item(args.num_core_nodes, "Number of Core Nodes on this cluster")
    sdftools.print_item(spark_loglevel, "Spark loglevel")
    spark.sparkContext.setLogLevel(spark_loglevel)

    analysis_script_path = args.analysis_script    
    sdftools.print_item(analysis_script_path, "The Analysis Script's path")

    return datatools.Analysis(spark, save_location, analysis_script_path)
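# A minimal, hedged usage sketch for setup() in non-CLI mode. It assumes the DAS
# analysis tooling imported by this module is available on the cluster and that
# the JBID environment variable is set (setup() reads it for the Spark app name).
# The save_location below is illustrative only.
if __name__ == "__main__":
    analysis = setup(save_location="/mnt/users/example/analysis_results/",
                     spark_loglevel="ERROR",
                     cli_args=False,
                     logname="example_run",
                     num_core_nodes=4,
                     analysis_script=__file__)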
def getGeolevelTVDFast(sdf, product, state, plot=False):
    sdf = sdf.geolevel_tvd(groupby=[AC.GEOLEVEL, AC.RUN_ID, AC.QUERY, AC.PLB])

    if plot:
        saveloc = du.getdir(sdf.metric_save_location)
        sdftools.print_item(
            sdf.df.count(),
            "Number of rows in the Spark DF before transforming to Pandas DF")
        geolevel_tvd_pandas_df = sdf.toPandas()
        rp.geolevel_tvd_lineplot(geolevel_tvd_pandas_df, saveloc, product,
                                 state)
        rp.geolevel_tvd_heatmap(geolevel_tvd_pandas_df, saveloc, product,
                                state)

    results = {'geolevel_tvd': sdf}

    return results
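# A self-contained sketch of the geolevel 1-TVD metric that getGeolevelTVDFast
# delegates to sdf.geolevel_tvd(). The formula (half the L1 distance between the
# protected and original counts, normalized by the original total, per group) is
# an assumption about what that helper computes; the column names are illustrative.
import pandas as pd

def geolevel_one_tvd_sketch(df):
    """df has columns: geolevel, orig, priv (one row per histogram cell)."""
    def one_tvd(group):
        return 0.5 * (group["priv"] - group["orig"]).abs().sum() / group["orig"].sum()
    return df.groupby("geolevel").apply(one_tvd)

toy = pd.DataFrame({"geolevel": ["STATE", "STATE", "COUNTY", "COUNTY"],
                    "orig":     [100,     50,      60,       90],
                    "priv":     [98,      53,      61,       88]})
print(geolevel_one_tvd_sketch(toy))  # one 1-TVD value per geolevel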
def generateReport(sdf,
                   geolevels,
                   queries,
                   age_queries,
                   product,
                   state,
                   plot=False):
    geolevel_sdf = sdf.getGeolevels(geolevels)

    results = {}

    # for getSignedError, getGeolevelTVD, and getGeolevelSparsity
    queries_sdf = geolevel_sdf.answerQueries(queries)

    # signed error calculations
    res = getSignedErrorFast(queries_sdf.clone())
    results.update(res)
    sdftools.print_item(results, "Results after Signed Error Calculations")

    # geolevel 1-tvd calculations
    res = getGeolevelTVDFast(queries_sdf.clone(), product, state, plot)
    results.update(res)
    sdftools.print_item(
        results, "Results after Signed Error and Geolevel 1-TVD Calculations")

    # geolevel sparsity calculations
    res = getGeolevelSparsityFast(queries_sdf.clone())
    results.update(res)
    sdftools.print_item(
        results,
        "Results after Signed Error, Geolevel 1-TVD, and Sparsity Calculations"
    )

    # age quantile calculations
    res = getCategoryByAgeQuantilesFast(geolevel_sdf.clone(), age_queries,
                                        product, state, plot)
    results.update(res)
    sdftools.print_item(
        results,
        "Results after Signed Error, Geolevel 1-TVD, Sparsity, and Age Quantile Calculations"
    )

    return results
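# A hedged usage sketch for generateReport(): the helper below only wires the
# call together. It assumes `sdf` is the Analysis spark-dataframe wrapper used
# throughout this module (exposing getGeolevels, answerQueries, and clone); the
# query names, product, and state code are illustrative placeholders.
def run_example_report(sdf):
    return generateReport(sdf,
                          geolevels=[C.STATE, C.COUNTY],
                          queries=["total", "hispanic * cenrace"],
                          age_queries=["sex * age"],
                          product="DHCP",
                          state="44",
                          plot=True)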
def analyzeQuery(query, table_name, analysis, spark, geolevels, buckets=default_buckets, schema="DHCP_HHGQ"):
    """
        Main plotting function.

            query           : str, name of a valid query for the target experiment's schema
            table_name      : str, name of a table (used for file-naming conventions)
            analysis        : Analysis object, organizes Analysis metadata
            spark           : SparkSession object, attached to analysis object
            geolevels       : [str, ...], geolevels to compute over for the current query
            buckets         : [(int,int), ...], list of mutually exclusive bucket boundaries for Tab(CEF) bucketing

        Note also that major control parameters are hard-coded in getPaths for setting experiment ingest locations from s3.
    """
    print(f"For table {table_name}, analyzing query {query} at geolevels {geolevels} with schema {schema}")
    schema_name = schema
    paths, experiment_name, eps_str = getPathsAndName(schema_name)
    experiment = analysis.make_experiment(experiment_name, paths, schema_name=schema_name, dasruntype=AC.EXPERIMENT_FRAMEWORK_FLAT)
    sdftools.print_item(experiment.__dict__, "Experiment Attributes")

    spark_df = experiment.getDF()
    print("df looks like:")
    spark_df.show()
    schema = experiment.schema
    sdftools.print_item(spark_df, "Flat Experiment DF")

    queries = [query]
    #y=sdftools.getAnswers(spark,df,geolevels,schema,queries)

    # Old approach to computing df with abs diff, bucketed by true count:
    #sparkDFWithAbsDiff = getSparkDFWithAbsDiff(spark, spark_df, geolevels, queries, schema)
    #getSignedErrorByTrueCountRuns(spark_df, bins=[0,1,10,100,1000,10000]):
    #rdd = sdftools.getRowGroupsAsRDD(sparkDFWithAbsDiff, groupby=[AC.GEOLEVEL, AC.QUERY])
    #rdd = rdd.flatMapValues(lambda rows: sepBounds(rows, 'orig', buckets)).persist()
    #rdd = rdd.map(lambda row: Row(**row[1]))
    #spark_df = rdd.toDF().persist()

    # New (actually preexisting) approach to computing spark_df with abs diff, bucketed by true count:
    # (avoids pandas dfs inside mappers, which is RAM-hungry)
    spark_df = sdftools.aggregateGeolevels(spark, spark_df, geolevels)
    spark_df = sdftools.answerQueries(spark_df, schema, queries)
    spark_df = sdftools.getFullWorkloadDF(spark_df, schema, queries,groupby=[AC.GEOCODE, AC.GEOLEVEL, AC.RUN_ID, AC.PLB, AC.BUDGET_GROUP])
    spark_df = sdftools.getAvgAbsErrorByTrueCountRuns(spark_df, bins=[0,1,10,100,1000,10000]).persist()

    spark_df.show()
    print("^^^^ with abs error, DF looks like ^^^^")

    metric_name = "Avg( |q(MDF) - q(CEF)| )"
    x_axis_variable_name = 'CEF Count, Binned'

    # spark_df = spark_df.groupby(['geocode','geolevel','level','Bin0','Bin1','Bin2','Bin3','Bin4','Bin5']).avg()
    # Below spark_df has cols: geocode, geolevel, run_id, plb, budget_group, query, orig_count_bin, signed_error, re
    #spark_df = spark_df.groupby(['geocode', 'geolevel', 'plb', 'budget_group', 'query', 'orig_count_bin']).avg()
    #print("^^^^ after averaging, spark_df looks like ^^^^")
    pandas_df = spark_df.toPandas()
    #pandas_df = pandas_df.rename(columns={"avg(signed_error)":metric_name, "avg(orig)":"orig"})
    #pandas_df[x_axis_variable_name] = pandas_df.apply(lambda row: binIndexToInteger(row, buckets), axis=1)
    #pandas_df = pandas_df.rename(columns={"avg(signed_error)":metric_name, "avg(orig_count_bin)":"orig"})
    pandas_df = pandas_df.rename(columns={"abs_error":metric_name, "orig_count_bin":x_axis_variable_name})
    plt.figure(1, figsize=(11,8.5))
    plt.rc('axes', labelsize=8)
    print(f"pandas df before plotting has cols: {pandas_df.columns.values}")
    print(f"{x_axis_variable_name} column has distinct levels: {pandas_df[x_axis_variable_name].unique()}")
    buckets = pandas_df[x_axis_variable_name].unique()
    buckets = sorted(buckets, key=lambda bucket_name: largestIntInStr(bucket_name))
    print(f"Sorted bucket names: {buckets}")
    new_bucket_order = [0,1,2,3,5,4] # Reorder so that the open-ended 10000+ bucket comes last
    buckets = [buckets[i] for i in new_bucket_order]
    print(f"Reordered bucket names: {buckets}")


    """
    print(pandas_df.head(30))
    print(f"pandas_df headers: {list(pandas_df.columns.values)}")
    tmpDf = pandas_df[[x_axis_variable_name, 'orig', metric_name]]
    print("tmpDf looks like:")
    with pandas.option_context('display.max_rows', None, 'display.max_columns', None):
        print(tmpDf)
    print("^^^^ pandas df looks like ^^^^")
    print("And first 3 rows:")
    print(pandas_df.iloc[:3])
    #print(df.dtypes)
    print("And first 100 rows, subset to Bins:")
    print(pandas_df.iloc[0:101,3:9])
    print(pandas_df.iloc[0:101,-1])
    """

    # Saving data frame
    csv_savepath = experiment.save_location_linux + f"Executive_Priority_Tabulations_#1_{experiment_name}_{table_name}_{query}.csv"
    du.makePath(du.getdir(csv_savepath))
    pandas_df.to_csv(csv_savepath, index=False)

    makePlots(experiment, experiment_name, table_name, queries, x_axis_variable_name,
                                           metric_name, geolevels, pandas_df, buckets,
                                           schema_name, eps_str)
    # save the analysis script?
    # toggle to_linux=True|False to save|not save this analysis script locally
    # toggle to_s3=True|False to save|not save this analysis script to s3
    analysis.save_analysis_script(to_linux=False, to_s3=False)

    # save/copy the log file?
    analysis.save_log(to_linux=False, to_s3=False)

    # zip the local results to s3?
    analysis.zip_results_to_s3(flag=False)

    spark = analysis.spark

    # build an example schema
    schema = Schema("example", ['a', 'b', 'c'], (2, 3, 5))
    sdftools.print_item(schema, "Toy example Schema")

    # build a set of GeounitNodes to use
    geocodes = ['000', '001', '002', '003', '010', '011', '012', '020', '022']
    geocode_dict = {3: 'block', 2: 'county'}

    # build geounits
    geounits = toytools.getToyGeounitData(schema, geocodes, geocode_dict)

    rdd = spark.sparkContext.parallelize(geounits).persist()

    sdftools.print_item(rdd.take(1), "One of the toy example geounits")

    # use Analysis to transform the rdd of geounitnodes into a spark dataframe
    df = datatools.rdd2df(rdd, schema)
    sdftools.print_item(df, "Toy example DF", 300)
Example #8
def analyzeQuery(query, analysis, spark, geolevel, schema_name, path):
    """
        Main plotting function.
            query           : str, name of a valid query for the target experiment's schema
            analysis        : Analysis object, organizes Analysis metadata
            spark           : SparkSession object, attached to analysis object
            geolevel        : str, single geolevel to compute over for the current query
            schema_name     : str, name of ../programs/schema/schemas/schemamaker.py schema associated with target data
            path            : str, location of a single DAS run to ingest (e.g., from s3)
        Note also that major control parameters are hard-coded in getPaths for setting experiment ingest locations from s3.
    """

    experiment_name = "NA"
    quantiles = [xi / 20. for xi in np.arange(20)] + [.975, .99, 1.]
    experiment = analysis.make_experiment(
        experiment_name, [path],
        schema_name=schema_name,
        dasruntype=AC.EXPERIMENT_FRAMEWORK_FLAT,
        budget_group='1',
        run_id='run1.0')
    spark_df = experiment.getDF()
    sdftools.print_item(experiment.__dict__, "Experiment Attributes")

    schema = experiment.schema
    sdftools.print_item(spark_df, "Flat Experiment DF")

    spark_df = sdftools.aggregateGeolevels(spark, spark_df, geolevel)

    if geolevel == C.PLACE:
        spark_df = spark_df.filter(spark_df.geocode[2:7] != "99999")
    elif geolevel == 'AIAN_AREAS':
        spark_df = spark_df.filter(spark_df.geocode != "9999")
    elif geolevel == 'OSE':
        spark_df = spark_df.filter(
            sf.col(AC.GEOCODE).substr(
                sf.length(sf.col(AC.GEOCODE)) -
                4, sf.length(sf.col(AC.GEOCODE))) != "99999")
    elif geolevel == 'AIANTract':
        spark_df = spark_df.filter(spark_df.geocode != "9" * 11)
    elif geolevel == 'AIANState':
        spark_df = spark_df.filter(spark_df.geocode != "99")
    elif geolevel == 'AIANBlock':
        spark_df = spark_df.filter(spark_df.geocode != "9" * 16)
    elif geolevel == 'COUNTY_NSMCD':
        spark_df = spark_df.filter(spark_df.geocode != "999")

    spark_df = sdftools.answerQueries(spark_df, schema, [query, denom_query])

    spark_df = sdftools.getL1Relative(spark_df,
                                      colname="L1Relative",
                                      denom_query=denom_query,
                                      denom_level=denom_level).persist()

    spark_rdd_prop_lt = spark_df.rdd.map(
        lambda row: (int(np.digitize(row["orig"], POPULATION_BIN_STARTS)), 1.
                     if row["L1Relative"] <= THRESHOLD else 0.))
    spark_df_prop_lt = spark_rdd_prop_lt.toDF(["pop_bin", "prop_lt"])

    # Find the proportion of geounits that have L1Relative errors less than threshold for each bin:
    grouped_df_prop_lt = spark_df_prop_lt.groupBy("pop_bin").agg({
        "prop_lt": "avg",
        "*": "count"
    })
    # print("RCM", grouped_df_prop_lt.first())
    prop_lt = grouped_df_prop_lt.collect()
    prop_lt_dict = {}
    prop_lt_counts = {}
    for row in prop_lt:
        prop_lt_dict[int(row["pop_bin"])] = np.round(row["avg(prop_lt)"], 5)
        prop_lt_counts[int(row["pop_bin"])] = int(row["count(1)"])
    print(prop_lt_dict)
    pop_bin_indices = list(prop_lt_dict.keys())
    for k in range(len(POPULATION_BIN_STARTS)):
        if k not in pop_bin_indices:
            prop_lt_dict[k] = None
            prop_lt_counts[k] = 0
    print(
        f"geounits counts for each bin: {[(POPULATION_BIN_STARTS[k], prop_lt_counts[k]) for k in range(len(POPULATION_BIN_STARTS))]}"
    )
    prop_lt_reformat = [(POPULATION_BIN_STARTS[k], prop_lt_dict[k])
                        for k in range(len(POPULATION_BIN_STARTS))]

    spark_df = spark_df.filter(spark_df.orig >= POPULATION_CUTOFF)
    # Count above POPULATION_CUTOFF
    count = spark_df.count()
    # For the quantiles and the avg, we omit geounits that would not have had a well-defined L1Relative metric
    # due to division by zero. (See the comments in the UDF used in sdftools.getL1Relative() for more detail.)
    spark_df = spark_df.filter(spark_df.L1Relative != 2.)
    count_correct_sign = spark_df.count()

    quantiles_df = sdftools.getGroupQuantiles(spark_df,
                                              columns=["L1Relative"],
                                              groupby=[AC.QUERY, AC.GEOLEVEL],
                                              quantiles=QUANTILES).collect()
    avg = spark_df.groupBy([AC.QUERY, AC.GEOLEVEL]).avg("L1Relative").collect()

    quantiles_dict = {}
    for row in quantiles_df:
        quantiles_dict[float(row["quantile"])] = np.round(row["L1Relative"], 5)
    quantiles_reformat = [(quant, quantiles_dict[quant])
                          for quant in QUANTILES]
    error_metrics = [
        np.round(avg[0]["avg(L1Relative)"], 5), count, count_correct_sign
    ] + [quantiles_reformat] + [prop_lt_reformat]

    print("error_metrics:", error_metrics)
    return error_metrics
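# A self-contained sketch of the binning/threshold step above: each geounit is
# assigned a population bin with np.digitize over the bin start points, and we
# take, per bin, the share of geounits whose L1Relative error is at or below the
# threshold. POP_BIN_STARTS_SKETCH and THRESHOLD_SKETCH are illustrative values,
# not the module's POPULATION_BIN_STARTS / THRESHOLD.
import numpy as np

POP_BIN_STARTS_SKETCH = np.array([0, 100, 1000, 10000])
THRESHOLD_SKETCH = 0.05

orig = np.array([50, 250, 2500, 20000, 500])              # CEF counts per geounit
l1_relative = np.array([0.02, 0.10, 0.04, 0.01, 0.07])    # L1Relative error per geounit

bins = np.digitize(orig, POP_BIN_STARTS_SKETCH)            # 1-based bin index per geounit
below = (l1_relative <= THRESHOLD_SKETCH).astype(float)    # 1.0 if within threshold
prop_below = {int(b): float(below[bins == b].mean()) for b in np.unique(bins)}
print(prop_below)  # {1: 1.0, 2: 0.0, 3: 1.0, 4: 1.0}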
Example #9
def MattsMetrics(query,
                 table_name,
                 analysis,
                 spark,
                 geolevels,
                 schema="DHCP_HHGQ"):
    """
    This function computes metrics for MAE, MALPE, CoV, RMS, MAPE, and percent thresholds"

    """
    print(
        f"For table {table_name}, analyzing query {query} at geolevels {geolevels} with schema {schema}"
    )
    schema_name = schema
    paths, experiment_name, eps_str = getPathsAndName(schema_name)
    experiment = analysis.make_experiment(
        experiment_name,
        paths,
        schema_name=schema_name,
        dasruntype=AC.EXPERIMENT_FRAMEWORK_FLAT)
    sdftools.print_item(experiment.__dict__, "Experiment Attributes")

    spark_df = experiment.getDF()
    print("df looks like:")
    spark_df.show()
    schema = experiment.schema
    sdftools.print_item(spark_df, "Flat Experiment DF")

    queries = [query]
    spark_df = sdftools.aggregateGeolevels(spark, spark_df, geolevels)
    spark_df = sdftools.answerQueries(spark_df, schema, queries)
    spark_df = sdftools.getFullWorkloadDF(
        spark_df,
        schema,
        queries,
        groupby=[AC.GEOCODE, AC.GEOLEVEL, AC.RUN_ID, AC.PLB, AC.BUDGET_GROUP])

    #spark_df.show(spark_df.count(), False)

    # AC.PRIV means "protected via the differential privacy routines in this code base" variable to be renamed after P.L.94-171 production
    spark_df = sdftools.getL1(spark_df,
                              colname="L1",
                              col1=AC.PRIV,
                              col2=AC.ORIG)
    spark_df = sdftools.getL2(spark_df,
                              colname="L2",
                              col1=AC.PRIV,
                              col2=AC.ORIG)
    # apply bin functions for particular tables
    if (table_name in table_bucket_list1):
        spark_df = sdftools.getCountBins(
            spark_df,
            column=AC.ORIG,
            bins=[0, 1000, 5000, 10000, 50000, 100000]).persist()
    if (table_name in table_bucket_list2):
        spark_df = sdftools.getCountBins(spark_df,
                                         column=AC.ORIG,
                                         bins=[0, 10, 100]).persist()
    # This finds overall metrics
    spark_df.show(100, False)

    if table_name not in (table_list_3_plus_list_age):

        for g in geolevels:
            spark_df1 = spark_df[spark_df['geolevel'] ==
                                 g]  # Separate data for each geolevel
            if table_name in table_default_no_bucket:  # If data is not in buckets
                bucket_size = "NA"
                metrics_result = sdftools.metrics_with_popbucket(spark_df1,
                                                                 bucket_size,
                                                                 spark,
                                                                 key="A")
                file_name = f"{table_name}_{g}.csv"

            if table_name in table_bucket_list2:  # if data is bucketed in 3 buckets,
                bucket_size = default_buckets2
                print("BUCKET SIZE IS:", bucket_size)
                metrics_result = sdftools.metrics_with_popbucket(spark_df1,
                                                                 bucket_size,
                                                                 spark,
                                                                 key="B")
                file_name = f"{table_name}_{g}.csv"

            if table_name in table_bucket_list1:  # Table 1 and 2, six buckets
                bucket_size = default_buckets1
                print("BUCKET SIZE IS:", bucket_size)
                metrics_result = sdftools.metrics_with_popbucket(spark_df1,
                                                                 bucket_size,
                                                                 spark,
                                                                 key="B")
                file_name = f"{table_name}_{g}.csv"

    if table_name in table_list_3geolevels:  #three geolevels, state, county, place, Tables 10,14,18,22

        metrics_result = sdftools.metrics_with_3geolevels(
            spark_df, spark, geolevels)
        file_name = f"{table_name}.csv"

    if table_name in table_list_age:  # Tables 32-35

        if table_name in table_age_bracket1:

            metrics_result = sdftools.metrics_with_age(spark_df,
                                                       spark,
                                                       age_range_list,
                                                       key="A")
        if table_name in table_age_bracket2:
            metrics_result = sdftools.metrics_with_age(spark_df,
                                                       spark,
                                                       age_range_list,
                                                       key="B")
        if table_name in table_age_bracket3:
            metrics_result = sdftools.metrics_with_age(spark_df,
                                                       spark,
                                                       age_range_list2,
                                                       key="A")
        if table_name in table_age_bracket4:
            metrics_result = sdftools.metrics_with_age(spark_df,
                                                       spark,
                                                       age_range_list2,
                                                       key="B")

        file_name = f"{table_name}.csv"
    pandas_df = metrics_result.toPandas()
    csv_savepath = experiment.save_location_linux + file_name
    du.makePath(du.getdir(csv_savepath))
    pandas_df.to_csv(csv_savepath, index=False)
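# A self-contained sketch of the summary statistics named in the docstring
# (MAE, MALPE, RMS, CoV, MAPE, and a percent-difference threshold count). The
# formulas below are standard definitions and are an assumption about what the
# sdftools helpers compute; `priv` and `orig` mirror the AC.PRIV / AC.ORIG columns.
import numpy as np

def summary_metrics_sketch(priv, orig, pct_threshold=0.10):
    priv, orig = np.asarray(priv, dtype=float), np.asarray(orig, dtype=float)
    err = priv - orig
    mae = np.mean(np.abs(err))                    # mean absolute error
    malpe = np.mean(err / orig) * 100             # mean algebraic percent error (signed)
    rms = np.sqrt(np.mean(err ** 2))              # root mean squared error
    cov = rms / np.mean(orig)                     # coefficient of variation (RMS / mean CEF count)
    mape = np.mean(np.abs(err) / orig) * 100      # mean absolute percent error
    n_over = int(np.sum(np.abs(err) / orig > pct_threshold))
    return {"MAE": mae, "MALPE": malpe, "RMS": rms, "CoV": cov,
            "MAPE": mape, f"count_over_{int(pct_threshold * 100)}pct": n_over}

print(summary_metrics_sketch(priv=[102, 48, 995], orig=[100, 50, 1000]))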
Example #10
    analysis.save_log(to_linux=False, to_s3=False)

    # zip the local results to s3?
    analysis.zip_results_to_s3(flag=False)

    spark = analysis.spark

    #######################################################
    # Create an experiment using one or more DAS Run paths
    #######################################################
    paths = [
        f"{AC.S3_BASE}kifer001/Aug29_experiments_hierarchicalAgeRangeTopDown_branchFactor4_output_danVariant1-2/td4/"
    ]

    experiment = analysis.make_experiment("danVariant1-2", paths)
    sdftools.print_item(experiment.__dict__, "Experiment Attributes")

    ##############################
    # Work with the Experiment DF
    ##############################
    df = experiment.getDF()
    schema = experiment.schema
    sdftools.print_item(df, "Experiment DF")

    ######################
    # Aggregate Geolevels
    ######################
    geolevels = [C.STATE, C.COUNTY]

    # OPTIONAL
    # ========
Example #11
    # save the analysis script?
    # toggle to_linux=True|False to save|not save this analysis script locally
    # toggle to_s3=True|False to save|not save this analysis script to s3
    analysis.save_analysis_script(to_linux=False, to_s3=False)

    # save/copy the log file?
    analysis.save_log(to_linux=False, to_s3=False)

    # zip the local results to s3?
    analysis.zip_results_to_s3(flag=False)

    spark = analysis.spark

    schema_name = "DHCP_HHGQ"
    schema = SchemaMaker.fromName(name=schema_name)
    sdftools.print_item(schema, "Schema")

    num_geocodes = 5
    density = 0.00001
    scale = 10
    geounits = getToyGeounitData_dict(num_geocodes, schema, density, scale)

    sdftools.print_item(geounits, "Randomly-generated Geounits")

    rdd = spark.sparkContext.parallelize(geounits)
    sdftools.print_item(rdd, "RDD of random geounits")

    df = datatools.rdd2df(rdd, schema)
    # 'priv' means "protected via the differential privacy routines in this code base" variable to be renamed after P.L.94-171 production
    df = df.select(['geocode'] + schema.dimnames + ['orig', 'priv'])
    sdftools.print_item(df, "DF of random geounit data", 1000)
def MattsMetrics(query,
                 table_name,
                 analysis,
                 spark,
                 geolevels,
                 buckets=default_buckets,
                 schema="DHCP_HHGQ"):
    """
    This function computes metrics for MAE, MALPE, CoV, RMS, MAPE, and percent thresholds"

    """
    print(
        f"For table {table_name}, analyzing query {query} at geolevels {geolevels} with schema {schema}"
    )
    schema_name = schema
    paths, experiment_name, eps_str = getPathsAndName(schema_name)
    experiment = analysis.make_experiment(
        experiment_name,
        paths,
        schema_name=schema_name,
        dasruntype=AC.EXPERIMENT_FRAMEWORK_FLAT)
    sdftools.print_item(experiment.__dict__, "Experiment Attributes")

    spark_df = experiment.getDF()
    print("df looks like:")
    spark_df.show()
    schema = experiment.schema
    sdftools.print_item(spark_df, "Flat Experiment DF")

    queries = [query]
    spark_df = sdftools.aggregateGeolevels(spark, spark_df, geolevels)
    spark_df = sdftools.answerQueries(spark_df, schema, queries)

    #spark_df.show(spark_df.count(), False)
    # AC.PRIV means "protected via the differential privacy routines in this code base" variable to be renamed after P.L.94-171 production
    spark_df = sdftools.getL1(spark_df,
                              colname="L1",
                              col1=AC.PRIV,
                              col2=AC.ORIG)
    spark_df = sdftools.getL2(spark_df,
                              colname="L2",
                              col1=AC.PRIV,
                              col2=AC.ORIG)
    # apply bin functions for particular tables
    if (table_name in table_default_bucket_list):
        spark_df = sdftools.getCountBins(
            spark_df,
            column=AC.ORIG,
            bins=[0, 1000, 5000, 10000, 50000, 100000]).persist()
    if (table_name in table_default_bucket_list2):
        spark_df = sdftools.getCountBins(spark_df,
                                         column=AC.ORIG,
                                         bins=[0, 10, 100]).persist()
    # This finds overall metrics
    #spark_df.show(spark_df.count(), False)

    for g in geolevels:
        # Subset to one geolevel per iteration; use a separate variable so the
        # full DataFrame is preserved for the next geolevel.
        spark_df1 = spark_df[spark_df['geolevel'] == g]
        print("This has all levels")
        spark_df1.show(150, False)

        metrics_dataframe = sdftools.mattsmetrics(spark_df1, spark)
        Counts = spark_df1.count()
        print("Counts are", Counts)
        newRow = spark.createDataFrame([(Counts, "Counts")])
        metrics_dataframe = metrics_dataframe.union(newRow)
        pandas_df = metrics_dataframe.toPandas()
        csv_savepath = experiment.save_location_linux + f"{table_name}_{g}.csv"
        du.makePath(du.getdir(csv_savepath))
        pandas_df.to_csv(csv_savepath, index=False)
        if table_name in table_default_bucket_list2:  # If data needs bucketing

            for b in default_buckets2:  # calculate Metrics at each bucket
                subset_sparkdf = spark_df1[spark_df1['orig_count_bin'] == b]  # subset into bins
                subset_sparkdf = subset_sparkdf.subtract(
                    subset_sparkdf.filter(subset_sparkdf.level.rlike("Not"))
                )  # Removes instances of Not Hispanic..from dataframe
                subset_sparkdf.show(100, False)
                print("Make sure its bucketed and without 'Not' values")
                subset_metrics = sdftools.mattsmetrics(subset_sparkdf, spark)
                Counts = subset_sparkdf.count()
                newRow = spark.createDataFrame([(b, "Bucket")])
                newRow1 = spark.createDataFrame([(Counts, "Counts")])
                subset_metrics = subset_metrics.union(newRow).union(newRow1)
                pandas_df = subset_metrics.toPandas()
                csv_savepath = experiment.save_location_linux + f"{table_name}_{g}_{b}.csv"
                du.makePath(du.getdir(csv_savepath))
                pandas_df.to_csv(csv_savepath, index=False)
    analysis.save_log(to_linux=False, to_s3=False)

    # zip the local results to s3?
    analysis.zip_results_to_s3(flag=False)

    spark = analysis.spark

    #######################################################
    # Create an experiment using one or more DAS Run paths
    #######################################################
    paths = [
        f"{AC.S3_BASE}kifer001/Aug29_experiments_hierarchicalAgeRangeTopDown_branchFactor4_output_danVariant1-2/td4/run_0000/"
    ]

    experiment = analysis.make_experiment("danVariant1-2", paths)
    sdftools.print_item(experiment.__dict__, "Experiment Attributes")

    ##############################
    # Work with the Experiment DF
    ##############################
    df = experiment.getDF()
    schema = experiment.schema
    sdftools.print_item(df, "Experiment DF")

    #################################
    # Calculate Population Densities
    #################################
    geolevels = [C.STATE, C.COUNTY]
    popdf = sdftools.population_density(spark, df, schema, geolevels)
    sdftools.print_item(popdf, "Population Density DF", 1000)
Example #14
def MattsMetrics(query,
                 table_name,
                 analysis,
                 spark,
                 geolevels,
                 buckets=default_buckets,
                 schema="DHCP_HHGQ"):
    """
    This function computes metrics for MAE, MALPE, CoV, RMS, MAPE, and percent thresholds"

    """
    print(
        f"For table {table_name}, analyzing query {query} at geolevels {geolevels} with schema {schema}"
    )
    schema_name = schema
    paths, experiment_name, eps_str = getPathsAndName(schema_name)
    experiment = analysis.make_experiment(
        experiment_name,
        paths,
        schema_name=schema_name,
        dasruntype=AC.EXPERIMENT_FRAMEWORK_FLAT)
    sdftools.print_item(experiment.__dict__, "Experiment Attributes")

    spark_df = experiment.getDF()
    print("df looks like:")
    spark_df.show()
    schema = experiment.schema
    sdftools.print_item(spark_df, "Flat Experiment DF")

    queries = [query]
    spark_df = sdftools.aggregateGeolevels(spark, spark_df, geolevels)
    spark_df = sdftools.answerQueries(spark_df, schema, queries)
    spark_df = sdftools.getFullWorkloadDF(
        spark_df,
        schema,
        queries,
        groupby=[AC.GEOCODE, AC.GEOLEVEL, AC.RUN_ID, AC.PLB, AC.BUDGET_GROUP])

    #spark_df.show(spark_df.count(), False)
    # AC.PRIV means "protected via the differential privacy routines in this code base" variable to be renamed after P.L.94-171 production
    spark_df = sdftools.getL1(spark_df,
                              colname="L1",
                              col1=AC.PRIV,
                              col2=AC.ORIG)
    spark_df = sdftools.getL2(spark_df,
                              colname="L2",
                              col1=AC.PRIV,
                              col2=AC.ORIG)
    # apply bin functions for particular tables
    if (table_name in table_default_bucket_list):
        spark_df = sdftools.getCountBins(
            spark_df,
            column=AC.ORIG,
            bins=[0, 1000, 5000, 10000, 50000, 100000]).persist()
    if (table_name in table_default_bucket_list2):
        spark_df = sdftools.getCountBins(spark_df,
                                         column=AC.ORIG,
                                         bins=[0, 10, 100]).persist()
    # This finds overall metrics
    spark_df.show(100, False)

    for g in geolevels:
        spark_df1 = spark_df[spark_df['geolevel'] ==
                             g]  # Separate data for each geolevel
        if table_name in table_default_no_bucket:  # If data is not in buckets
            if table_name in table_race_query:  # Table 17, 18, 21 and others
                print("no buckets, with race query")
                spark_df2 = spark_df1.subtract(
                    spark_df1.filter(spark_df1.level.rlike("Not")))
                spark_df2.show(100, False)
                print("Make sure 'Not' values are removed")
                metrics_dataframe = sdftools.mattsmetrics(spark_df2, spark)
                Counts = spark_df2.count()
                print("Counts are", Counts)
                newRow = spark.createDataFrame([(Counts, "Counts")])
                metrics_dataframe = metrics_dataframe.union(newRow)
                pandas_df = metrics_dataframe.toPandas()
                csv_savepath = experiment.save_location_linux + f"{table_name}_{g}.csv"
                du.makePath(du.getdir(csv_savepath))
                pandas_df.to_csv(csv_savepath, index=False)

            else:
                print("no buckets, without race query")
                spark_df1.show(100, False)
                spark_df2 = spark_df1.subtract(
                    spark_df1.filter(spark_df1.level.rlike("Not")))
                print("with Not removed")
                spark_df2.show(100, False)
                metrics_dataframe = sdftools.mattsmetrics(spark_df2, spark)
                Counts = spark_df2.count()
                print("Counts are", Counts)
                newRow = spark.createDataFrame([(Counts, "Counts")])
                metrics_dataframe = metrics_dataframe.union(newRow)
                pandas_df = metrics_dataframe.toPandas()
                csv_savepath = experiment.save_location_linux + f"{table_name}_{g}.csv"
                du.makePath(du.getdir(csv_savepath))
                pandas_df.to_csv(csv_savepath, index=False)

        if table_name in table_age_bracket1:
            print("Data is in age brackets, 0 to 17, 18 to 64, 65+")
            spark_df1.show(100, False)
            for age_range in age_range_list:
                subset_sparkdf1 = spark_df1.filter(
                    spark_df1.level.rlike(age_range))
                subset_sparkdf1.show(100, False)
                metrics_dataframe = sdftools.mattsmetrics(
                    subset_sparkdf1, spark)
                #subset_sparkdf1.show(100, False)
                Counts = subset_sparkdf1.count()
                print("Counts are", Counts)
                newRow = spark.createDataFrame([(Counts, "Counts")])
                metrics_dataframe = metrics_dataframe.union(newRow)
                pandas_df = metrics_dataframe.toPandas()
                csv_savepath = experiment.save_location_linux + f"{table_name}_{g}_{age_range}.csv"
                du.makePath(du.getdir(csv_savepath))
                pandas_df.to_csv(csv_savepath, index=False)
        if table_name in table_age_bracket2:
            print("Data is age buckets, with sex query")
            spark_df1.show(100, False)
            for sexlevel in sex_range_list:
                subset_sparkdf1 = spark_df1.filter(
                    spark_df1.level.rlike(sexlevel))
                for age_range in age_range_list:
                    subset_sparkdf2 = subset_sparkdf1.filter(
                        subset_sparkdf1.level.rlike(age_range))
                    subset_sparkdf2.show(100, False)
                    metrics_dataframe = sdftools.mattsmetrics(
                        subset_sparkdf2, spark)
                    #subset_sparkdf1.show(100, False)
                    Counts = subset_sparkdf2.count()
                    print("Counts are", Counts)
                    newRow = spark.createDataFrame([(Counts, "Counts")])
                    metrics_dataframe = metrics_dataframe.union(newRow)
                    pandas_df = metrics_dataframe.toPandas()
                    csv_savepath = experiment.save_location_linux + f"{table_name}_{g}_{age_range}_{sexlevel}.csv"
                    du.makePath(du.getdir(csv_savepath))
                    pandas_df.to_csv(csv_savepath, index=False)
        if table_name in table_age_bracket3:
            print("Data is in age brackets of 5 year age groups")
            spark_df1.show(100, False)
            for age_range in age_range_list2:
                subset_sparkdf1 = spark_df1.filter(
                    spark_df1.level.rlike(age_range))
                subset_sparkdf1.show(100, False)
                metrics_dataframe = sdftools.mattsmetrics(
                    subset_sparkdf1, spark)
                #subset_sparkdf1.show(100, False)
                Counts = subset_sparkdf1.count()
                print("Counts are", Counts)
                newRow = spark.createDataFrame([(Counts, "Counts")])
                metrics_dataframe = metrics_dataframe.union(newRow)
                pandas_df = metrics_dataframe.toPandas()
                csv_savepath = experiment.save_location_linux + f"{table_name}_{g}_{age_range}.csv"
                du.makePath(du.getdir(csv_savepath))
                pandas_df.to_csv(csv_savepath, index=False)
        if table_name in table_age_bracket4:
            print("Data is age buckets of 5 year age groups, with sex query")
            spark_df1.show(100, False)
            for sexlevel in sex_range_list:
                subset_sparkdf1 = spark_df1.filter(
                    spark_df1.level.rlike(sexlevel))
                for age_range in age_range_list2:
                    subset_sparkdf2 = subset_sparkdf1.filter(
                        subset_sparkdf1.level.rlike(age_range))
                    subset_sparkdf2.show(100, False)
                    metrics_dataframe = sdftools.mattsmetrics(
                        subset_sparkdf2, spark)
                    #subset_sparkdf1.show(100, False)
                    Counts = subset_sparkdf2.count()
                    print("Counts are", Counts)
                    newRow = spark.createDataFrame([(Counts, "Counts")])
                    metrics_dataframe = metrics_dataframe.union(newRow)
                    pandas_df = metrics_dataframe.toPandas()
                    csv_savepath = experiment.save_location_linux + f"{table_name}_{g}_{age_range}_{sexlevel}.csv"
                    du.makePath(du.getdir(csv_savepath))
                    pandas_df.to_csv(csv_savepath, index=False)

        if table_name in table_default_bucket_list2:  # If data is in buckets [0,10],[10,100),[100+)
            print("data is bucketed and treated accordingly")
            #if table_name in table_race_query:

            for b in default_buckets2:  # calculate Metrics at each bucket
                print("Bucket is:", b)
                subset_sparkdf = spark_df1[spark_df1['orig_count_bin'] ==
                                           b]  #subset into bins
                subset_sparkdf.show(100, False)
                print("Bucketed data")
                subset_sparkdf1 = subset_sparkdf.subtract(
                    subset_sparkdf.filter(subset_sparkdf.level.rlike("Not")))
                subset_sparkdf1.show(100, False)
                print("Make sure its bucketed and 'Not' values are removed")
                subset_metrics = sdftools.mattsmetrics(subset_sparkdf1, spark)
                Counts = subset_sparkdf1.count()
                newRow = spark.createDataFrame([(b, "Bucket")])
                newRow1 = spark.createDataFrame([(Counts, "Counts")])
                subset_metrics = subset_metrics.union(newRow).union(newRow1)
                pandas_df = subset_metrics.toPandas()
                csv_savepath = experiment.save_location_linux + f"{table_name}_{g}_{b}.csv"
                du.makePath(du.getdir(csv_savepath))
                pandas_df.to_csv(csv_savepath, index=False)

        if table_name in table_default_bucket_list:  # If data is in buckets [0,1000],[1000,5000),etc. Table 1 and 2
            print("data is bucketed and treated accordingly")
            #if table_name in table_race_query:

            for b in default_buckets:  # calculate Metrics at each bucket
                print("Bucket is:", b)
                subset_sparkdf = spark_df1[spark_df1['orig_count_bin'] ==
                                           b]  #subset into bins
                subset_sparkdf.show(100, False)
                print("Bucketed data")
                subset_sparkdf1 = subset_sparkdf.subtract(
                    subset_sparkdf.filter(subset_sparkdf.level.rlike("Not")))
                subset_sparkdf1.show(100, False)
                print("Make sure its bucketed and 'Not' values are removed")
                subset_metrics = sdftools.mattsmetrics(subset_sparkdf1, spark)
                Counts = subset_sparkdf1.count()
                newRow = spark.createDataFrame([(b, "Bucket")])
                newRow1 = spark.createDataFrame([(Counts, "Counts")])
                subset_metrics = subset_metrics.union(newRow).union(newRow1)
                pandas_df = subset_metrics.toPandas()
                csv_savepath = experiment.save_location_linux + f"{table_name}_{g}_{b}.csv"
                du.makePath(du.getdir(csv_savepath))
                pandas_df.to_csv(csv_savepath, index=False)
Example #15
def MattsMetrics(query,
                 table_name,
                 analysis,
                 spark,
                 geolevels,
                 buckets=default_buckets,
                 schema="DHCP_HHGQ"):
    """
    This function computes metrics for MAE, MALPE, CoV, RMS, MAPE, and percent thresholds"

    """
    print(
        f"For table {table_name}, analyzing query {query} at geolevels {geolevels} with schema {schema}"
    )
    schema_name = schema
    paths, experiment_name, eps_str = getPathsAndName(schema_name)
    experiment = analysis.make_experiment(
        experiment_name,
        paths,
        schema_name=schema_name,
        dasruntype=AC.EXPERIMENT_FRAMEWORK_FLAT)
    sdftools.print_item(experiment.__dict__, "Experiment Attributes")

    spark_df = experiment.getDF()
    print("df looks like:")
    spark_df.show()
    schema = experiment.schema
    sdftools.print_item(spark_df, "Flat Experiment DF")

    queries = [query]
    spark_df = sdftools.aggregateGeolevels(spark, spark_df, geolevels)
    spark_df = sdftools.answerQueries(spark_df, schema, queries)

    spark_df.show()
    # AC.PRIV means "protected via the differential privacy routines in this code base" variable to be renamed after P.L.94-171 production
    spark_df = sdftools.getL1(spark_df,
                              colname="L1",
                              col1=AC.PRIV,
                              col2=AC.ORIG)
    spark_df = sdftools.getL2(spark_df,
                              colname="L2",
                              col1=AC.PRIV,
                              col2=AC.ORIG)
    spark_df = sdftools.getCountBins(
        spark_df, column=AC.ORIG, bins=[0, 1000, 5000, 10000, 50000,
                                        100000]).persist()

    for b in default_buckets:  # calculate Metrics
        subset_sparkdf = spark_df[spark_df['orig_count_bin'] ==
                                  b]  #subset into bins
        subset_sparkdf.show()
        MAE_value = sdftools.MAE(subset_sparkdf)
        print("Bucket size is", b)
        print("MAE value is", MAE_value)

        RMS_value = sdftools.RMS(subset_sparkdf)
        CoV_value = sdftools.Coe_of_variation(subset_sparkdf, RMS_value)

        print("RMS value is", RMS_value)
        print("Coefficient of Variation is", CoV_value)
        MAPE_value = sdftools.MAPE(subset_sparkdf)
        print("MAPE value is", MAPE_value)

        MALPE_value = sdftools.MALPE(subset_sparkdf)

        print("MALPE value is", MALPE_value)

        print("Counts of percent differences between 5 and 10 percent: ")
        # 5to10percentCount = sdftools.Count_percentdiff_5to10percent(subset_spark)
        # This function disabled for now
        greaterthan10percentCount = sdftools.Count_percentdiff_10percent(
            subset_sparkdf)

        #ze.groupBy().agg(F.count(F.when(F.col("abs diff div cef")>0.05, True)),F.count(F.when(F.col("abs diff div cef")<0.1,True))).show()
        #  ze.groupBy().agg(F.count(F.when(F.col("abs diff div cef")>0.05 and F.col("abs diff div cef")<0.1),True)).show()
        print("Counts of percent differences greater than 10 percent: ")

        greaterthan10percentCount.show()
Example #16
def analyzeQuery(query,
                 table_name,
                 analysis,
                 spark,
                 geolevels,
                 buckets=default_buckets,
                 schema="DHCP_HHGQ"):
    """
        Main plotting function.

            query           : str, name of a valid query for the target experiment's schema
            table_name      : str, name of a table (used for file-naming conventions)
            analysis        : Analysis object, organizes Analysis metadata
            spark           : SparkSession object, attached to analysis object
            geolevels       : [str, ...], geolevels to compute over for the current query
            buckets         : [(int,int), ...], list of mutually exclusive bucket boundaries for Tab(CEF) bucketing

        Note also that major control parameters are hard-coded in getPaths for setting experiment ingest locations from s3.
    """
    print(
        f"For table {table_name}, analyzing query {query} at geolevels {geolevels} with schema {schema}"
    )
    schema_name = schema
    paths, experiment_name = getPathsAndName(schema_name)
    experiment = analysis.make_experiment(
        experiment_name,
        paths,
        schema_name=schema_name,
        dasruntype=AC.EXPERIMENT_FRAMEWORK_FLAT)
    sdftools.print_item(experiment.__dict__, "Experiment Attributes")

    df = experiment.getDF()
    if TEST:
        df = df.limit(TEST_NUM)
    print("df looks like:")
    df.show()
    schema = experiment.schema
    sdftools.print_item(df, "Flat Experiment DF")

    queries = [query]
    #y=sdftools.getAnswers(spark,df,geolevels,schema,queries)
    rddWithAbsDiff = getRddWithAbsDiff(spark, df, geolevels, queries, schema)
    rddWithAbsDiff = sdftools.getFullWorkloadDF(
        rddWithAbsDiff,
        schema,
        queries,
        groupby=[AC.GEOCODE, AC.GEOLEVEL, AC.RUN_ID, AC.PLB, AC.BUDGET_GROUP])

    rdd = sdftools.getRowGroupsAsRDD(rddWithAbsDiff,
                                     groupby=[AC.GEOLEVEL, AC.QUERY])
    rdd = rdd.flatMapValues(
        lambda rows: sepBounds(rows, 'orig', buckets)).persist()
    rdd = rdd.map(lambda row: Row(**row[1]))
    df = rdd.toDF().persist()

    metric_name = "Avg( |q(MDF) - q(CEF)| )"
    x_axis_variable_name = 'CEF Count, Binned'

    df = df.groupby([
        'geocode', 'geolevel', 'level', 'Bin0', 'Bin1', 'Bin2', 'Bin3', 'Bin4',
        'Bin5'
    ]).avg()
    pandas_df = df.toPandas()
    pandas_df = pandas_df.rename(columns={
        "avg(abs diff)": metric_name,
        "avg(orig)": "orig"
    })
    pandas_df[x_axis_variable_name] = pandas_df.apply(
        lambda row: binIndexToInteger(row, buckets), axis=1)
    plt.figure(1, figsize=(11, 8.5))
    plt.rc('axes', labelsize=8)
    """
    print(pandas_df.head(30))
    print(f"pandas_df headers: {list(pandas_df.columns.values)}")
    tmpDf = pandas_df[[x_axis_variable_name, 'orig', metric_name]]
    print("tmpDf looks like:")
    with pandas.option_context('display.max_rows', None, 'display.max_columns', None):
        print(tmpDf)
    print("^^^^ pandas df looks like ^^^^")
    print("And first 3 rows:")
    print(pandas_df.iloc[:3])
    #print(df.dtypes)
    print("And first 100 rows, subset to Bins:")
    print(pandas_df.iloc[0:101,3:9])
    print(pandas_df.iloc[0:101,-1])
    """

    # Saving data frame
    csv_savepath = experiment.save_location_linux + f"Executive_Priority_Tabulations_#1_{experiment_name}.csv"
    du.makePath(du.getdir(csv_savepath))
    pandas_df.to_csv(csv_savepath, index=False)

    makePlots(experiment, experiment_name, table_name, queries,
              x_axis_variable_name, metric_name, geolevels, pandas_df, buckets,
              schema_name)
    analysis.save_analysis_script(to_linux=False, to_s3=False)
    
    # save/copy the log file?
    analysis.save_log(to_linux=False, to_s3=False)
    
    # zip the local results to s3?
    analysis.zip_results_to_s3(flag=False)

    spark = analysis.spark




    # build an example schema
    schema = Schema("example", ['a', 'b', 'c'], (2,3,5))
    sdftools.print_item(schema, "Toy example Schema")
    
    # build a set of GeounitNodes to use
    geocodes = ['000', '001', '002', '003', '010', '011', '012', '020', '022']
    geocode_dict = {3: 'block', 2: 'county'}
    
    # build geounits
    geounits = toytools.getToyGeounitData_GeounitNode(schema, geocodes, geocode_dict, raw_params={'low': 0, 'high': 100})
        
    rdd = spark.sparkContext.parallelize(geounits).persist()

    sdftools.print_item(rdd.take(1), "One of the toy example geounits")

    # use Analysis to transform the rdd of geounitnodes into a spark dataframe
    df = datatools.rdd2df(rdd, schema)
    sdftools.print_item(df, "Toy example DF", 300)
Example #18
def MattsMetrics(query,
                 table_name,
                 analysis,
                 spark,
                 geolevels,
                 key,
                 agekey,
                 sexkey,
                 bucketkey,
                 schema="DHCP_HHGQ"):
    """
    This function computes metrics for MAE, MALPE, CoV, RMS, MAPE, and percent thresholds"

    """
    print(
        f"For table {table_name}, analyzing query {query} at geolevels {geolevels} with schema {schema}"
    )
    schema_name = schema
    paths, experiment_name, eps_str = getPathsAndName(schema_name)
    experiment = analysis.make_experiment(
        experiment_name,
        paths,
        schema_name=schema_name,
        dasruntype=AC.EXPERIMENT_FRAMEWORK_FLAT)
    sdftools.print_item(experiment.__dict__, "Experiment Attributes")

    spark_df = experiment.getDF()
    print("df looks like:")
    spark_df.show()
    schema = experiment.schema
    sdftools.print_item(spark_df, "Flat Experiment DF")

    queries = [query]
    spark_df = sdftools.aggregateGeolevels(spark, spark_df, geolevels)
    spark_df = sdftools.answerQueries(spark_df, schema, queries)
    spark_df = sdftools.getFullWorkloadDF(
        spark_df,
        schema,
        queries,
        groupby=[AC.GEOCODE, AC.GEOLEVEL, AC.RUN_ID, AC.PLB, AC.BUDGET_GROUP])

    #spark_df.show(spark_df.count(), False)
    # AC.PRIV means "protected via the differential privacy routines in this code base" variable to be renamed after P.L.94-171 production
    spark_df = sdftools.getL1(spark_df,
                              colname="L1",
                              col1=AC.PRIV,
                              col2=AC.ORIG)
    spark_df = sdftools.getL2(spark_df,
                              colname="L2",
                              col1=AC.PRIV,
                              col2=AC.ORIG)
    # apply bin functions for particular tables
    if (table_name in table_bucket_list1):
        spark_df = sdftools.getCountBins(
            spark_df,
            column=AC.ORIG,
            bins=[0, 1000, 5000, 10000, 50000, 100000]).persist()
    if (table_name in table_bucket_list2):
        spark_df = sdftools.getCountBins(spark_df,
                                         column=AC.ORIG,
                                         bins=[0, 10, 100]).persist()
    # This finds overall metrics
    spark_df.show(100, False)

    metrics_result = sdftools.combined_metrics(spark_df, spark, geolevels,
                                               agekey, sexkey, bucketkey, key)
    file_name = f"{table_name}.csv"
    pandas_df = metrics_result.toPandas()
    csv_savepath = experiment.save_location_linux + file_name
    du.makePath(du.getdir(csv_savepath))
    pandas_df.to_csv(csv_savepath, index=False)
    analysis.save_log(to_linux=False, to_s3=False)

    # zip the local results to s3?
    analysis.zip_results_to_s3(flag=False)

    spark = analysis.spark

    #######################################################
    # Create an experiment using one or more DAS Run paths
    #######################################################
    paths = [
        f"{AC.S3_BASE}kifer001/Aug29_experiments_hierarchicalAgeRangeTopDown_branchFactor4_output_danVariant1-2/td4/"
    ]

    experiment = analysis.make_experiment("danVariant1-2", paths)
    sdftools.print_item(experiment.__dict__, "Experiment Attributes")

    ##############################
    # Work with the Experiment DF
    ##############################
    df = experiment.getDF()
    schema = experiment.schema
    sdftools.print_item(df, "Experiment DF")

    ######################
    # Aggregate Geolevels
    ######################
    geolevels = [C.STATE, C.COUNTY]

    # OPTIONAL
    # ========
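# A minimal numpy sketch of the summary metrics named in MattsMetrics above, computed over
# paired protected (priv) and original (orig) counts. These are the conventional definitions;
# the exact formulas implemented by sdftools.combined_metrics are not shown here and may differ.
import numpy as np

def summary_metrics(priv, orig):
    priv = np.asarray(priv, dtype=float)
    orig = np.asarray(orig, dtype=float)
    err = priv - orig
    safe_orig = np.where(orig == 0, np.nan, orig)       # avoid division by zero
    mae = np.mean(np.abs(err))                          # Mean Absolute Error
    malpe = 100 * np.nanmean(err / safe_orig)           # Mean Algebraic Percent Error
    rms = np.sqrt(np.mean(err ** 2))                    # Root Mean Squared error
    cov = rms / np.mean(orig)                           # Coefficient of Variation (RMS over mean true count)
    mape = 100 * np.nanmean(np.abs(err) / safe_orig)    # Mean Absolute Percent Error
    return {"MAE": mae, "MALPE": malpe, "RMS": rms, "CoV": cov, "MAPE": mape}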
def analyzeQuery(query,
                 table_name,
                 analysis,
                 spark,
                 geolevels,
                 eps,
                 schema_name="DHCP_HHGQ"):
    """
        Main plotting fxn.
            query           : str, name of a valid query for the target experiment's schema
            table_name      : str, name of a table (used for file-naming conventions)
            analysis        : Analysis setuptools.setup object, organizes Analysis metadata
            spark           : SparkSession object, attached to analysis object
            geolevels       : [str, ...], geolevels to compute over for the current query
            eps             : float, privacy-loss budget (epsilon) of the target experiment, used to locate experiment paths and for file naming
            schema_name     : str, name of ../programs/schema/schemas/schemamaker.py schema associated with target data
        Note, also, major control parameters hard-coded in getPathsAndName for setting experiment ingest locations from s3.
    """
    geolevel = geolevels[0]
    EPT = table_name[:4] + "_" + schema_name
    graph_title = f"Error for query: {table_name}-{query}, eps: {int(eps)}, geography: {geolevels}\nDisclosure Prohibited - Title 13 U.S.C."
    plt.figure(1, figsize=(20, 40))
    sns.set(style="ticks")
    fig, axes = plt.subplots(ncols=3, nrows=2, sharey=True, sharex=True)
    axes_flat = axes.ravel()
    sns.despine(fig=fig)
    #.set_title(graph_title)
    print(
        f"For table {table_name}, analyzing query {query} at geolevel {geolevel} with schema_name {schema_name} and eps: {eps}"
    )
    num_trials, paths, experiment_name, eps_str, spines, mechanisms = getPathsAndName(
        schema_name, query, table_name, eps)
    plt.xscale('log')
    plt.yscale('symlog', linthreshy=100)
    for k, path in enumerate(paths):
        axes_flat[k].set_title(spines[k] + '_' + mechanisms[k])
        experiment = analysis.make_experiment(
            experiment_name, [path],
            schema_name=schema_name,
            dasruntype=AC.EXPERIMENT_FRAMEWORK_FLAT)
        spark_df = experiment.getDF()
        sdftools.print_item(experiment.__dict__, "Experiment Attributes")

        schema = experiment.schema
        sdftools.print_item(spark_df, "Flat Experiment DF")

        queries = [query]
        spark_df = sdftools.aggregateGeolevels(spark, spark_df, geolevels)
        # jitter points to make them visually distinct:
        spark_df = sdftools.answerQueries(spark_df, schema, queries) \
                           .withColumn("Error", sf.col("priv") - sf.col("orig") + sf.rand() - 1/2.) \
                           .withColumn("orig", sf.col("orig") + sf.rand() - 1/2.)
        if geolevel == "AIAN_AREAS":
            spark_df = spark_df.filter(spark_df.geocode != "9999")
        elif geolevel == 'OSE':
            spark_df = spark_df.filter(
                sf.col(AC.GEOCODE).substr(
                    sf.length(sf.col(AC.GEOCODE)) -
                    4, sf.length(sf.col(AC.GEOCODE))) != "99999")
        elif geolevel == 'AIANTract':
            spark_df = spark_df.filter(spark_df.geocode != "9" * 11)
        elif geolevel == 'AIANState':
            spark_df = spark_df.filter(spark_df.geocode != "99")
        elif geolevel == 'AIANBlock':
            spark_df = spark_df.filter(spark_df.geocode != "9" * 16)
        # t = spark_df.filter(sf.abs(spark_df.Error) > 1000)
        spark_df = spark_df.select(["orig", "Error"])

        pandas_df = spark_df.toPandas()
        #if pandas_df.max()["Error"] == pandas_df.min()["Error"]:
        #    continue
        sns.scatterplot(x="orig",
                        y="Error",
                        data=pandas_df,
                        alpha=.6,
                        s=10,
                        marker="+",
                        ax=axes_flat[k])
        axes_flat[k].axhline(0., ls='--')

    filename = f"{table_name}_{query.replace(' ', '_')}_{geolevel}"
    plot_path = f"{experiment.save_location_linux}epsilon_{eps_str}/"
    du.makePath(plot_path)
    plt.savefig(plot_path + filename + ".png")
    plt.clf()
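# A standalone sketch (with made-up counts) of the jitter applied above: priv and orig are
# integer counts, so many (orig, Error) points coincide exactly; adding Uniform(-0.5, 0.5)
# noise to each coordinate separates coincident points while moving none of them more than
# half a count from its true value.
import numpy as np

rng = np.random.default_rng(0)
orig = np.array([10, 10, 10, 250, 250])
priv = np.array([9, 11, 10, 252, 249])
error = (priv - orig) + rng.uniform(-0.5, 0.5, size=orig.size)
orig_jittered = orig + rng.uniform(-0.5, 0.5, size=orig.size)
# orig_jittered vs. error can now be scatterplotted without perfect overplotting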
Example #21
    # setup analysis
    #################
    analysis_results_save_location = f"/mnt/users/moran331/analysis_reports/"
    spark_loglevel = "ERROR"
    analysis = setuptools.setup(save_location=analysis_results_save_location,
                                spark_loglevel=spark_loglevel)
    spark = analysis.spark

    schema = SchemaMaker.fromName("DHCP_SCHEMA")
    num_geocodes = 2
    geocodes = [str(x).zfill(16) for x in range(num_geocodes)]
    geounits = [
        testdata_random_geounit_generator(x, schema, density=0.00001, scale=10)
        for x in geocodes
    ]
    sdftools.print_item(geounits, "Random Geounit Data")

    rdd = spark.sparkContext.parallelize(geounits).persist()
    sdftools.print_item(rdd, "Parallelized RDD data")

    df = rdd.flatMap(lambda node: mappers.getSparseDF_mapper(node, schema)
                     ).map(lambda row: Row(**row)).toDF().persist()
    sdftools.print_item(df, "DF of Random Geounit Data")

    df = df.withColumn("STATE", sf.col("geocode")[0:2]).persist()
    sdftools.print_item(df, "DF with STATE code")

    df = sdftools.aggregateGeolevels(spark, df, 'STATE')
    sdftools.print_item(df, "Aggregated to the STATE geolevel")

    query = 'sex * age'
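# A small self-contained PySpark sketch (with made-up cells) of the pattern used above:
# each geounit is flattened into plain dicts (presumably one per nonzero histogram cell by
# mappers.getSparseDF_mapper), which become Rows and then a Spark DataFrame.
from pyspark.sql import Row, SparkSession

spark = SparkSession.builder.getOrCreate()
fake_cells = [
    {"geocode": "0000000000000000", "cell": 0, "orig": 3},
    {"geocode": "0000000000000001", "cell": 7, "orig": 1},
]
df = spark.sparkContext.parallelize(fake_cells).map(lambda d: Row(**d)).toDF()
df.show()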
Example #22
    spark = analysis.spark


    #######################################################
    # Create an experiment using one or more DAS Run paths
    #######################################################
    s3_path_exp = "s3://uscb-decennial-ite-das/users/heiss002/cnstatDdpSchema_SinglePassRegular_va_cnstatDpqueries_cnstatGeolevels"
    paths = [
        f"{s3_path_exp}/data-run1.0-epsilon16.0-BlockNodeDicts/",
        f"{s3_path_exp}/data-run1.0-epsilon4.0-BlockNodeDicts/"
    ]
    
    schema_name = "DHCP_HHGQ"
    experiment = analysis.make_experiment("cnstat", paths, schema_name=schema_name, dasruntype=AC.EXPERIMENT_FRAMEWORK_FLAT)
    sdftools.print_item(experiment.__dict__, "Experiment Attributes")
    
    ##############################
    # Work with the Experiment DF
    ##############################
    df = experiment.getDF()
    schema = experiment.schema
    sdftools.print_item(df, "Flat Experiment DF")
    
    ######################
    # Aggregate Geolevels
    ######################
    geolevels = [C.STATE, C.COUNTY]
    
    
    df = sdftools.aggregateGeolevels(spark, df, geolevels)
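# A minimal PySpark sketch of what geolevel aggregation amounts to conceptually (the real
# logic lives in sdftools.aggregateGeolevels and may differ): truncate each block geocode to
# the target geolevel's prefix length and sum the counts within each prefix.
import pyspark.sql.functions as sf

def aggregate_to_prefix(df, prefix_len, geolevel_name):
    # df is assumed to carry a 'geocode' column plus numeric 'orig' and 'priv' count columns
    return (df.withColumn("geocode", sf.col("geocode").substr(1, prefix_len))
              .withColumn("geolevel", sf.lit(geolevel_name))
              .groupBy("geocode", "geolevel")
              .sum("orig", "priv"))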
def household_tvd(sdf, geolevels, queries, spark):  
    """
    Calculating 1-TVD for the Household information
    
    1 - [ (A + B) / C ], where
    
    A = SUM( L1( detailed(CEF_g), detailed(DAS_g) ) ) for all g in the geolevel
    
    B = SUM( L1( total(CEF_g), total(DAS_g) ) ) for all g in the geolevel
    
    C = 2 * SUM(_invar['gqhh_vect'][0]_g) <- household invariant total in the geolevel
    """
    results = {}

    # get the invariants path
    block_level_invar_path = sdf.runs[0].data_path
    
    # prep invar_df (household info only)
    invar_rdd = spark.sparkContext.pickleFile(block_level_invar_path)
    
    # since the household total invariant, summed over all blocks, is the same no matter which geolevel it is aggregated to, we can compute the value once here
    invar_val = invar_rdd.map(hh_invar_mapper).reduce(operator.add)
    sdftools.print_item(invar_val, "Household Total Invariant")
    
    double_invar_val = 2 * invar_val
    sdftools.print_item(double_invar_val, "2 * Household Total Invariant")
    
    # compute, for each geolevel, the 1-TVD for household info
    
    # old way... compute each geolevel individually
    # for geolevel in du.aslist(geolevels) + [geolevels]:
    
    # new way... compute all geolevels at once
    for geolevel in [geolevels]:
        # A
        geosdf = sdf.clone()
        sdftools.print_item(geosdf, "Original SDF")
        
        geosdf = geosdf.getGeolevels(geolevel)
        sdftools.print_item(geosdf, f"Geolevel '{geolevel}'")
        
        geosdf = geosdf.getQueryAnswers(queries)
        sdftools.print_item(geosdf, f"Detailed Query at geolevel '{geolevel}'")
        
        geosdf = geosdf.L1(colname="L1_A")
        sdftools.print_item(geosdf, f"L1_A")
        
        geosdf = geosdf.sum(groupby=[AC.GEOCODE, AC.GEOLEVEL])
        sdftools.print_item(geosdf, "sum(L1_A)")
        
        # B
        geosdf = geosdf.L1(colname="L1_B")
        sdftools.print_item(geosdf, "L1_B")
        
        geosdf = geosdf.sum(groupby=[AC.GEOLEVEL])
        sdftools.print_item(geosdf, "sum(L1_B)")

        # 1-[(A+B)/C]
        geosdf.df = geosdf.df.withColumn("A+B", sf.col("L1_A") + sf.col("L1_B")).persist()
        sdftools.print_item(geosdf, "A+B")
        
        geosdf.df = geosdf.df.withColumn("2*geolevel_hh_invar", sf.lit(double_invar_val)).persist()
        
        geosdf.df = geosdf.df.withColumn("TVD", sf.col("A+B") / sf.col("2*geolevel_hh_invar")).persist()
        sdftools.print_item(geosdf, "TVD")
        
        geosdf.df = geosdf.df.withColumn("1-TVD", sf.lit(1) - sf.col("TVD")).persist()
        sdftools.print_item(geosdf, "1-TVD")
        
        
        if isinstance(geolevel, list):
            key = f"All_Geolevels_Household_1-TVD"
        else:
            key = f"{geolevel}_Household_1-TVD"
            
        results[key] = geosdf
    
    return results
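# A tiny worked example (made-up numbers, not DAS data) of the 1-TVD arithmetic in the
# docstring above: A is the summed L1 error of the detailed query, B the summed L1 error
# of the totals, and C is twice the household invariant total for the geolevel.
A = 120.0         # sum of L1( detailed(CEF_g), detailed(DAS_g) ) over geounits g
B = 40.0          # sum of L1( total(CEF_g), total(DAS_g) ) over geounits g
C = 2 * 10_000.0  # 2 * household invariant total for the geolevel
one_minus_tvd = 1 - (A + B) / C
print(one_minus_tvd)  # 0.992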
Example #24
    nm_state = noisy_measurements_state
    print(nm_state)

    geounit = nm_state.take(1).pop()

    print(geounit)
    geocode = geounit.geocode
    print(geocode)
    dp_queries = geounit.dp_queries

    experiment_name = "dhcp_eps4_run36"
    experiment_path = f"{S3_BASE}/lecle301/dhcp_eps4/run36_of_25/full_person/"
    experiment = analysis.make_experiment(experiment_name, experiment_path)

    df = experiment.getDF()
    sdftools.print_item(df, "Experiment DF", show=100)

    geolevel_df = sdftools.aggregateGeolevels(experiment.spark, df, ['STATE'])
    sdftools.print_item(geolevel_df, "Geolevel DF")

    filtered_df = df.filter(df.geocode == geocode).persist()
    sdftools.print_item(filtered_df, "Experiment DF", show=1000)
Example #25
    analysis.save_log(to_linux=False, to_s3=False)

    # zip the local results to s3?
    analysis.zip_results_to_s3(flag=False)

    spark = analysis.spark

    #######################################################
    # Create an experiment using one or more DAS Run paths
    #######################################################
    paths = [
        f"{AC.S3_BASE}kifer001/Aug29_experiments_hierarchicalAgeRangeTopDown_branchFactor4_output_danVariant1-2/td4/run_0000/"
    ]

    experiment = analysis.make_experiment("danVariant1-2", paths)
    sdftools.print_item(experiment.__dict__, "Experiment Attributes")

    ##############################
    # Work with the Experiment DF
    ##############################
    df = experiment.getDF()
    schema = experiment.schema
    sdftools.print_item(df, "Experiment DF")

    geolevels = [
        C.STATE, C.COUNTY, C.TRACT_GROUP, C.TRACT, C.BLOCK_GROUP, C.BLOCK,
        C.SLDL, C.SLDU
    ]

    queries = [
        'total', 'hhgq', 'votingage * citizen', 'numraces * hispanic',
Example #26
    ################################
    S3_BASE = "s3://uscb-decennial-ite-das/users"
    
    #################
    # setup analysis
    #################
    analysis_results_save_location = f"/mnt/users/moran331/analysis_reports/"
    spark_loglevel = "ERROR"
    analysis = setuptools.setup(save_location=analysis_results_save_location, spark_loglevel=spark_loglevel)
    spark = analysis.spark

    schema = SchemaMaker.fromName("DHCP_SCHEMA")
    num_geocodes = 5
    geocodes = [str(x).zfill(16) for x in range(num_geocodes)]
    geounits = [testdata_random_geounit_generator(x, schema, density=0.00001, scale=10) for x in geocodes]
    sdftools.print_item(geounits, "Random Geounit Data")
    
    rdd = spark.sparkContext.parallelize(geounits).persist()
    sdftools.print_item(rdd, "Parallelized RDD data")
    
    df = rdd.flatMap(lambda node: mappers.getSparseDF_mapper(node, schema)).map(lambda row: Row(**row)).toDF().persist()
    sdftools.print_item(df, "DF of Random Geounit Data")
    
    recoder = dhcp_to_mdf2020.DHCPToMDF2020Recoder().recode
    mdf = (
        # privatized means "protected via the differential privacy routines in this code base" variable to be renamed after P.L.94-171 production
        rdd.flatMap(lambda node: mappers.getMicrodataDF_mapper(node, schema, privatized=True, mangled_names=True, recoders=recoder))
           .zipWithIndex()
           .map(addIndexToRow)
           .map(lambda row: Row(**row))
           .toDF()
def analyzeQuery(query, analysis, spark, geolevel, schema_name, path):
    """
        Main plotting fxn.
            query           : str, name of a valid query for the target experiment's schema
            analysis        : Analysis setuptools.setup object, organizes Analysis metadata
            spark           : SparkSession object, attached to analysis object
            geolevel        : str, geolevel to compute over for the current query
            schema_name     : str, name of ../programs/schema/schemas/schemamaker.py schema associated with target data
            path            : str, s3 path of the DAS run to ingest as the experiment
        Note, also, module-level constants (denom_query, denom_level, POPULATION_BIN_STARTS, THRESHOLD, QUANTILES, POPULATION_CUTOFF) act as hard-coded control parameters for this metric.
    """
    # To avoid cases in which max(numerator_query_levels)/denom_query_level >= 1:
    assert query != denom_query

    experiment_name = "NA"
    experiment = analysis.make_experiment(
        experiment_name, [path],
        schema_name=schema_name,
        dasruntype=AC.EXPERIMENT_FRAMEWORK_FLAT,
        budget_group='1',
        run_id='run1.0')
    spark_df = experiment.getDF()
    sdftools.print_item(experiment.__dict__, "Experiment Attributes")

    schema = experiment.schema
    sdftools.print_item(spark_df, "Flat Experiment DF")

    spark_df = sdftools.aggregateGeolevels(spark, spark_df, geolevel)
    spark_df = sdftools.remove_not_in_area(spark_df, [geolevel])
    spark_df = sdftools.answerQueries(spark_df, schema, [query, denom_query])

    spark_df = sdftools.getL1Relative(spark_df,
                                      colname="L1Relative",
                                      denom_query=denom_query,
                                      denom_level=denom_level).persist()
    query_counts = spark_df.rdd.map(lambda row: (row[AC.QUERY], )).countByKey()
    query_counts_keys = list(query_counts.keys())
    assert len(query_counts_keys) == 1 and query_counts_keys[0] == query

    spark_rdd_prop_lt = spark_df.rdd.map(
        lambda row: (int(np.digitize(row["orig"], POPULATION_BIN_STARTS)), 1.
                     if row["L1Relative"] <= THRESHOLD else 0.))
    spark_df_prop_lt = spark_rdd_prop_lt.toDF(["pop_bin", "prop_lt"])

    # Find the proportion of geounits that have L1Relative errors less than threshold for each bin:
    grouped_df_prop_lt = spark_df_prop_lt.groupBy("pop_bin").agg({
        "prop_lt": "avg",
        "*": "count"
    })
    prop_lt = grouped_df_prop_lt.collect()
    n_bins = len(POPULATION_BIN_STARTS) + 1
    prop_lt_list = [None] * n_bins
    prop_lt_counts = [0] * n_bins
    for row in prop_lt:
        prop_lt_list[int(row["pop_bin"])] = np.round(row["avg(prop_lt)"], 5)
        prop_lt_counts[int(row["pop_bin"])] = int(row["count(1)"])
    print(prop_lt_list)
    print(
        f"geounits counts for each bin: {[(POPULATION_BIN_STARTS[k], prop_lt_counts[k]) for k in range(len(POPULATION_BIN_STARTS))]}"
    )

    population_bin_starts = np.concatenate(
        ([-np.inf], POPULATION_BIN_STARTS, [np.inf]))
    ranges = list(
        zip(population_bin_starts[:-1], population_bin_starts[1:] - 1))
    assert len(prop_lt_list) == (len(population_bin_starts) - 1)
    prop_lt_reformat = list(zip(ranges, prop_lt_list))

    spark_df = spark_df.filter(spark_df.orig >= POPULATION_CUTOFF)
    # Count above POPULATION_CUTOFF
    count = spark_df.count()
    # For the quantiles and the average, omit geounits whose L1Relative metric is not well defined
    # due to division by zero (see the comments in the UDF used in sdftools.getL1Relative() for more detail):
    spark_df = spark_df.filter(spark_df.L1Relative != 2.)
    count_correct_sign = spark_df.count()

    quantiles_df = sdftools.getGroupQuantiles(spark_df,
                                              columns=["L1Relative"],
                                              groupby=[AC.QUERY, AC.GEOLEVEL],
                                              quantiles=QUANTILES).collect()
    avg = spark_df.groupBy([AC.QUERY, AC.GEOLEVEL]).avg("L1Relative").collect()

    quantiles_dict = {}
    for row in quantiles_df:
        quantiles_dict[float(row["quantile"])] = np.round(row["L1Relative"], 5)
    quantiles_reformat = [(quant, quantiles_dict[quant])
                          for quant in QUANTILES]
    error_metrics = [
        np.round(avg[0]["avg(L1Relative)"], 5), count, count_correct_sign
    ] + [quantiles_reformat] + [prop_lt_reformat]

    print("error_metrics:", error_metrics)
    return error_metrics
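# A numpy-only sketch (with made-up POPULATION_BIN_STARTS, THRESHOLD, and data) of the
# per-bin proportion computed above: each geounit is assigned a population bin by its true
# count, and within each bin we average the indicator [L1Relative <= THRESHOLD].
import numpy as np

POPULATION_BIN_STARTS = np.array([100, 1000, 10000])
THRESHOLD = 0.05
orig = np.array([50, 120, 900, 5000, 20000])
l1_relative = np.array([0.20, 0.04, 0.06, 0.01, 0.02])

bins = np.digitize(orig, POPULATION_BIN_STARTS)   # bin index in 0 .. len(POPULATION_BIN_STARTS)
below = (l1_relative <= THRESHOLD).astype(float)
prop_lt = [below[bins == b].mean() if np.any(bins == b) else None
           for b in range(len(POPULATION_BIN_STARTS) + 1)]
# prop_lt[b] is the share of geounits in population bin b with L1Relative <= THRESHOLD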
Example #28
def mf01(sdf, geolevels, queries):
    """ Answers queries at the geolevels specified """
    sdf = sdf.getGeolevels(geolevels).answerQueries(queries)
    sdftools.print_item(
        sdf, f"Query answers for geolevels {geolevels} and queries {queries}")
    return sdf