def getCategoryByAgeQuantilesFast(sdf, queries, product, state, plot=False):
    df = sdf.df
    results = {}
    for query in queries:
        res = sdftools.categoryByAgeQuantiles(df, sdf.schema, query, labels=True)
        results.update(res)
    if plot:
        for key, df in results.items():
            queryname, category, datatype = parseAgeQuantileKey(key, fsname=False)
            sdftools.print_item(df.count(), "Number of rows in the Spark DF before transforming to Pandas DF")
            if datatype == "quantile_df":
                age_quantile_pandas_df = df.toPandas()
                saveloc = du.getdir(sdf.metric_save_location)
                rp.age_quantile_lineplot(age_quantile_pandas_df, saveloc, product, state)
            else:  # datatype == "survival_props"
                pass
    return results
def analyzeQuery(query, table_name, analysis, spark, geolevels, eps, buckets=default_buckets, schema="DHCP_HHGQ"):
    """
    Main plotting fxn.

    query      : str, name of a valid query for the target experiment's schema
    table_name : str, name of a table (used for file-naming conventions)
    analysis   : Analysis setuptools.setup object, organizes Analysis metadata
    spark      : SparkSession object, attached to analysis object
    geolevels  : [str, ...], geolevels to compute over for the current query
    buckets    : [(int,int), ...], list of mutually exclusive bucket boundaries for Tab(CEF) bucketing
    schema     : str, name of ../programs/schema/schemas/schemamaker.py schema associated with target data

    Note, also, major control parameters hard-coded in getPaths for setting experiment ingest locations from s3.
    """
    print(f"For table {table_name}, analyzing query {query} at geolevels {geolevels} with schema {schema}")
    schema_name = schema
    num_trials, paths, experiment_name, eps_str = getPathsAndName(schema_name, query, table_name, eps)
    print(f"Passing paths to Analysis experiment maker: {paths}")
    experiment = analysis.make_experiment(experiment_name, paths, schema_name=schema_name, dasruntype=AC.EXPERIMENT_FRAMEWORK_FLAT)
    sdftools.print_item(experiment.__dict__, "Experiment Attributes")

    spark_df = experiment.getDF()
    print("df looks like:")
    spark_df.show()
    schema = experiment.schema
    sdftools.print_item(spark_df, "Flat Experiment DF")

    queries = [query]
    spark_df = sdftools.aggregateGeolevels(spark, spark_df, geolevels)
    spark_df = sdftools.answerQueries(spark_df, schema, queries)
    spark_df = sdftools.getAvgAbsErrorByTrueCountRuns(spark_df, bins=[0, 1, 10, 100, 1000, 10000]).persist()

    missing_rows_pandas_df = sdftools.getMissingRowCounts(spark_df, schema, queries, groupby=[AC.GEOCODE, AC.GEOLEVEL, AC.PLB, AC.BUDGET_GROUP])
    missing_rows_dict = defaultdict(int)
    for index, row in missing_rows_pandas_df.iterrows():
        # print(f"missing df row # {index} geolevel, sum(missing) = {row['geolevel']},{row['sum(missing)']}")
        missing_rows_dict[row['geolevel']] = row['sum(missing)']

    spark_df.show()
    print("^^^^ with abs error, DF looks like ^^^^")

    metric_name = "Avg( |q(MDF) - q(CEF)| )"
    x_axis_variable_name = 'CEF Count, Binned'

    pandas_df = spark_df.toPandas()
    pandas_df = pandas_df.rename(columns={"abs_error": metric_name, "orig_count_bin": x_axis_variable_name})

    plt.figure(1, figsize=(11, 8.5))
    plt.rc('axes', labelsize=8)
    print(f"pandas df before plotting has cols: {pandas_df.columns.values}")
    print(f"{x_axis_variable_name} column has distinct levels: {pandas_df[x_axis_variable_name].unique()}")

    buckets = pandas_df[x_axis_variable_name].unique()
    buckets = sorted(buckets, key=lambda bucket_name: largestIntInStr(bucket_name))
    print(f"Sorted bucket names: {buckets}")

    # Saving data frame
    csv_savepath = experiment.save_location_linux + f"{experiment_name}.csv"
    du.makePath(du.getdir(csv_savepath))
    pandas_df.to_csv(csv_savepath, index=False)

    makePlots(experiment, experiment_name, table_name, queries, x_axis_variable_name, metric_name, geolevels, pandas_df, buckets, schema_name, eps_str, missing_rows_dict, num_trials)
def setup(save_location=None, spark_loglevel="ERROR", cli_args=True, logname=None, num_core_nodes=None, analysis_script=None):
    """
    Create a new spark session and Analysis object (from DataTools) to run experiments

    :param save_location: filepath of desired location
    :param spark_loglevel: level at which to set logging
    :param cli_args: optional boolean: use CLI arguments (will engage argument parser) or function args. default True
    :param logname: optional filepath of logfile. default None (if CLI)
    :param num_core_nodes: optional int: number of nodes. default None (if CLI)
    :param analysis_script: name of Analysis script to run. default None (if CLI)
    :return: Analysis object from the arguments
    """
    assert save_location is not None, "Need to specify local directory where the results of analysis will be saved"
    assert spark_loglevel in ["ALL", "DEBUG", "ERROR", "FATAL", "INFO", "OFF", "TRACE", "WARN"], "Invalid Spark loglevel"

    # if parsing args from CLI (default True)
    if cli_args:
        parser = ArgumentParser(formatter_class=ArgumentDefaultsHelpFormatter)
        parser.add_argument("--logname", help="logname used to output log data and for saving analysis results")
        parser.add_argument("--num_core_nodes", help="number of core nodes in the cluster upon spark start")
        parser.add_argument("--analysis_script", help="the analysis script being called by the run script")
        args, unknown = parser.parse_known_args()
        logname = args.logname.split('.')[0]  # get rid of the .log part
        logname = logname.split('/')[1]       # get rid of the logs/ part for the save_location and app_name
    # otherwise get args from function header
    else:
        assert logname is not None and num_core_nodes is not None and analysis_script is not None, \
            "If not using command line, must pass logname, num_core_nodes, and analysis_script"
        assert type(logname) is str and type(analysis_script) is str, "logname and analysis_script must be strings"
        assert type(num_core_nodes) is int and num_core_nodes > 0, "num_core_nodes must be positive integer"

        class Arguments():
            def __init__(self, logname, num_core_nodes, analysis_script):
                self.logname = logname
                self.num_core_nodes = num_core_nodes
                self.analysis_script = analysis_script

        args = Arguments(logname=logname, num_core_nodes=num_core_nodes, analysis_script=analysis_script)

    # save_location = f"/mnt/users/{os.environ['JBID']}/analysis_results/{logname}/"
    save_location = f"{du.addslash(save_location)}{logname}/"
    app_name = f"DAS_Analysis | {os.environ['JBID']} | {logname}"

    spark = SparkSession.builder.appName(app_name).getOrCreate()
    sdftools.print_item(du.pretty(dict(spark.sparkContext.getConf().getAll())), "Spark configurations being used")
    sdftools.print_item(args.num_core_nodes, "Number of Core Nodes on this cluster")
    sdftools.print_item(spark_loglevel, "Spark loglevel")
    spark.sparkContext.setLogLevel(spark_loglevel)

    analysis_script_path = args.analysis_script
    sdftools.print_item(analysis_script_path, "The Analysis Script's path")

    return datatools.Analysis(spark, save_location, analysis_script_path)
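# Standalone sketch (not part of setup above) of how the CLI branch derives the run
# name from a --logname value such as "logs/myrun.log". The sample value is a
# hypothetical placeholder used only for illustration.
from argparse import ArgumentParser, ArgumentDefaultsHelpFormatter

_parser = ArgumentParser(formatter_class=ArgumentDefaultsHelpFormatter)
_parser.add_argument("--logname")
_args, _unknown = _parser.parse_known_args(["--logname", "logs/myrun.log"])
_logname = _args.logname.split('.')[0]  # "logs/myrun"  (drop the .log part)
_logname = _logname.split('/')[1]       # "myrun"       (drop the logs/ part)
print(_logname)                         # -> myrun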
def getGeolevelTVDFast(sdf, product, state, plot=False):
    sdf = sdf.geolevel_tvd(groupby=[AC.GEOLEVEL, AC.RUN_ID, AC.QUERY, AC.PLB])
    if plot:
        saveloc = du.getdir(sdf.metric_save_location)
        sdftools.print_item(sdf.df.count(), "Number of rows in the Spark DF before transforming to Pandas DF")
        geolevel_tvd_pandas_df = sdf.toPandas()
        rp.geolevel_tvd_lineplot(geolevel_tvd_pandas_df, saveloc, product, state)
        rp.geolevel_tvd_heatmap(geolevel_tvd_pandas_df, saveloc, product, state)
    results = {'geolevel_tvd': sdf}
    return results
def generateReport(sdf, geolevels, queries, age_queries, product, state, plot=False):
    geolevel_sdf = sdf.getGeolevels(geolevels)
    results = {}

    # for getSignedError, getGeolevelTVD, and getGeolevelSparsity
    queries_sdf = geolevel_sdf.answerQueries(queries)

    # signed error calculations
    res = getSignedErrorFast(queries_sdf.clone())
    results.update(res)
    sdftools.print_item(results, "Results after Signed Error Calculations")

    # geolevel 1-tvd calculations
    res = getGeolevelTVDFast(queries_sdf.clone(), product, state, plot)
    results.update(res)
    sdftools.print_item(results, "Results after Signed Error and Geolevel 1-TVD Calculations")

    # geolevel sparsity calculations
    res = getGeolevelSparsityFast(queries_sdf.clone())
    results.update(res)
    sdftools.print_item(results, "Results after Signed Error, Geolevel 1-TVD, and Sparsity Calculations")

    # age quantile calculations
    res = getCategoryByAgeQuantilesFast(geolevel_sdf.clone(), age_queries, product, state, plot)
    results.update(res)
    sdftools.print_item(results, "Results after Signed Error, Geolevel 1-TVD, Sparsity, and Age Quantile Calculations")

    return results
def analyzeQuery(query, table_name, analysis, spark, geolevels, buckets=default_buckets, schema="DHCP_HHGQ"):
    """
    Main plotting fxn.

    query      : str, name of a valid query for the target experiment's schema
    table_name : str, name of a table (used for file-naming conventions)
    analysis   : Analysis setuptools.setup object, organizes Analysis metadata
    spark      : SparkSession object, attached to analysis object
    geolevels  : [str, ...], geolevels to compute over for the current query
    buckets    : [(int,int), ...], list of mutually exclusive bucket boundaries for Tab(CEF) bucketing

    Note, also, major control parameters hard-coded in getPaths for setting experiment ingest locations from s3.
    """
    print(f"For table {table_name}, analyzing query {query} at geolevels {geolevels} with schema {schema}")
    schema_name = schema
    paths, experiment_name, eps_str = getPathsAndName(schema_name)
    experiment = analysis.make_experiment(experiment_name, paths, schema_name=schema_name, dasruntype=AC.EXPERIMENT_FRAMEWORK_FLAT)
    sdftools.print_item(experiment.__dict__, "Experiment Attributes")

    spark_df = experiment.getDF()
    print("df looks like:")
    spark_df.show()
    schema = experiment.schema
    sdftools.print_item(spark_df, "Flat Experiment DF")

    queries = [query]
    # y = sdftools.getAnswers(spark, df, geolevels, schema, queries)

    # Old approach to computing df with abs diff, bucketed by true count:
    # sparkDFWithAbsDiff = getSparkDFWithAbsDiff(spark, spark_df, geolevels, queries, schema)
    # getSignedErrorByTrueCountRuns(spark_df, bins=[0,1,10,100,1000,10000]):
    # rdd = sdftools.getRowGroupsAsRDD(sparkDFWithAbsDiff, groupby=[AC.GEOLEVEL, AC.QUERY])
    # rdd = rdd.flatMapValues(lambda rows: sepBounds(rows, 'orig', buckets)).persist()
    # rdd = rdd.map(lambda row: Row(**row[1]))
    # spark_df = rdd.toDF().persist()

    # New (actually preexisting) approach to computing spark_df with abs diff, bucketed by true count:
    # (avoids pandas dfs inside mappers, which is RAM-hungry)
    spark_df = sdftools.aggregateGeolevels(spark, spark_df, geolevels)
    spark_df = sdftools.answerQueries(spark_df, schema, queries)
    spark_df = sdftools.getFullWorkloadDF(spark_df, schema, queries, groupby=[AC.GEOCODE, AC.GEOLEVEL, AC.RUN_ID, AC.PLB, AC.BUDGET_GROUP])
    spark_df = sdftools.getAvgAbsErrorByTrueCountRuns(spark_df, bins=[0, 1, 10, 100, 1000, 10000]).persist()
    spark_df.show()
    print("^^^^ with abs error, DF looks like ^^^^")

    metric_name = "Avg( |q(MDF) - q(CEF)| )"
    x_axis_variable_name = 'CEF Count, Binned'

    # spark_df = spark_df.groupby(['geocode','geolevel','level','Bin0','Bin1','Bin2','Bin3','Bin4','Bin5']).avg()
    # Below spark_df has cols: geocode, geolevel, run_id, plb, budget_group, query, orig_count_bin, signed_error, re
    # spark_df = spark_df.groupby(['geocode', 'geolevel', 'plb', 'budget_group', 'query', 'orig_count_bin']).avg()
    # print("^^^^ after averaging, spark_df looks like ^^^^")

    pandas_df = spark_df.toPandas()
    # pandas_df = pandas_df.rename(columns={"avg(signed_error)":metric_name, "avg(orig)":"orig"})
    # pandas_df[x_axis_variable_name] = pandas_df.apply(lambda row: binIndexToInteger(row, buckets), axis=1)
    # pandas_df = pandas_df.rename(columns={"avg(signed_error)":metric_name, "avg(orig_count_bin)":"orig"})
    pandas_df = pandas_df.rename(columns={"abs_error": metric_name, "orig_count_bin": x_axis_variable_name})

    plt.figure(1, figsize=(11, 8.5))
    plt.rc('axes', labelsize=8)
    print(f"pandas df before plotting has cols: {pandas_df.columns.values}")
    print(f"{x_axis_variable_name} column has distinct levels: {pandas_df[x_axis_variable_name].unique()}")

    buckets = pandas_df[x_axis_variable_name].unique()
    buckets = sorted(buckets, key=lambda bucket_name: largestIntInStr(bucket_name))
    print(f"Sorted bucket names: {buckets}")
    new_bucket_order = [0, 1, 2, 3, 5, 4]  # Apply ordering system to make 10000+ the last bucket
    buckets = [buckets[i] for i in new_bucket_order]
    print(f"Sorted bucket names: {buckets}")

    """
    print(pandas_df.head(30))
    print(f"pandas_df headers: {list(pandas_df.columns.values)}")
    tmpDf = pandas_df[[x_axis_variable_name, 'orig', metric_name]]
    print("tmpDf looks like:")
    with pandas.option_context('display.max_rows', None, 'display.max_columns', None):
        print(tmpDf)
    print("^^^^ pandas df looks like ^^^^")
    print("And first 3 rows:")
    print(pandas_df.iloc[:3])
    #print(df.dtypes)
    print("And first 100 rows, subset to Bins:")
    print(pandas_df.iloc[0:101,3:9])
    print(pandas_df.iloc[0:101,-1])
    """

    # Saving data frame
    csv_savepath = experiment.save_location_linux + f"Executive_Priority_Tabulations_#1_{experiment_name}_{table_name}_{query}.csv"
    du.makePath(du.getdir(csv_savepath))
    pandas_df.to_csv(csv_savepath, index=False)

    makePlots(experiment, experiment_name, table_name, queries, x_axis_variable_name, metric_name, geolevels, pandas_df, buckets, schema_name, eps_str)
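# Standalone sketch of the bucket-label sort used above, assuming largestIntInStr
# returns the largest integer embedded in a label string. The helper and the labels
# below are hypothetical stand-ins, not the project's implementation.
import re

def largest_int_in_str(label):
    ints = [int(tok) for tok in re.findall(r"\d+", label)]
    return max(ints) if ints else -1

labels = ["[100-1000)", "[0-1)", "10000 +", "[1-10)", "[1000-10000)", "[10-100)"]
print(sorted(labels, key=largest_int_in_str))
# -> ['[0-1)', '[1-10)', '[10-100)', '[100-1000)', '10000 +', '[1000-10000)']
# Note the tie on 10000 leaves "10000 +" before "[1000-10000)" (sorted() is stable),
# which is consistent with the new_bucket_order reindexing above that moves the
# "10000 +" label to the end.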
# save the analysis script?
# toggle to_linux=True|False to save|not save this analysis script locally
# toggle to_s3=True|False to save|not save this analysis script to s3
analysis.save_analysis_script(to_linux=False, to_s3=False)

# save/copy the log file?
analysis.save_log(to_linux=False, to_s3=False)

# zip the local results to s3?
analysis.zip_results_to_s3(flag=False)

spark = analysis.spark

# build an example schema
schema = Schema("example", ['a', 'b', 'c'], (2, 3, 5))
sdftools.print_item(schema, "Toy example Schema")

# build a set of GeounitNodes to use
geocodes = ['000', '001', '002', '003', '010', '011', '012', '020', '022']
geocode_dict = {3: 'block', 2: 'county'}

# build geounits
geounits = toytools.getToyGeounitData(schema, geocodes, geocode_dict)
rdd = spark.sparkContext.parallelize(geounits).persist()
sdftools.print_item(rdd.take(1), "One of the toy example geounits")

# use Analysis to transform the rdd of geounitnodes into a spark dataframe
df = datatools.rdd2df(rdd, schema)
sdftools.print_item(df, "Toy example DF", 300)
def analyzeQuery(query, analysis, spark, geolevel, schema_name, path):
    """
    Main plotting fxn.

    query      : str, name of a valid query for the target experiment's schema
    table_name : str, name of a table (used for file-naming conventions)
    analysis   : Analysis setuptools.setup object, organizes Analysis metadata
    spark      : SparkSession object, attached to analysis object
    geolevels  : [str, ...], geolevels to compute over for the current query
    buckets    : [(int,int), ...], list of mutually exclusive bucket boundaries for Tab(CEF) bucketing
    schema     : str, name of ../programs/schema/schemas/schemamaker.py schema associated with target data

    Note, also, major control parameters hard-coded in getPaths for setting experiment ingest locations from s3.
    """
    experiment_name = "NA"
    quantiles = [xi / 20. for xi in np.arange(20)] + [.975, .99, 1.]
    experiment = analysis.make_experiment(experiment_name, [path], schema_name=schema_name, dasruntype=AC.EXPERIMENT_FRAMEWORK_FLAT, budget_group='1', run_id='run1.0')
    spark_df = experiment.getDF()
    sdftools.print_item(experiment.__dict__, "Experiment Attributes")

    schema = experiment.schema
    sdftools.print_item(spark_df, "Flat Experiment DF")

    spark_df = sdftools.aggregateGeolevels(spark, spark_df, geolevel)
    if geolevel == C.PLACE:
        spark_df = spark_df.filter(spark_df.geocode[2:7] != "99999")
    elif geolevel == 'AIAN_AREAS':
        spark_df = spark_df.filter(spark_df.geocode != "9999")
    elif geolevel == 'OSE':
        spark_df = spark_df.filter(sf.col(AC.GEOCODE).substr(sf.length(sf.col(AC.GEOCODE)) - 4, sf.length(sf.col(AC.GEOCODE))) != "99999")
    elif geolevel == 'AIANTract':
        spark_df = spark_df.filter(spark_df.geocode != "9" * 11)
    elif geolevel == 'AIANState':
        spark_df = spark_df.filter(spark_df.geocode != "99")
    elif geolevel == 'AIANBlock':
        spark_df = spark_df.filter(spark_df.geocode != "9" * 16)
    elif geolevel == 'COUNTY_NSMCD':
        spark_df = spark_df.filter(spark_df.geocode != "999")

    spark_df = sdftools.answerQueries(spark_df, schema, [query, denom_query])
    spark_df = sdftools.getL1Relative(spark_df, colname="L1Relative", denom_query=denom_query, denom_level=denom_level).persist()

    spark_rdd_prop_lt = spark_df.rdd.map(lambda row: (int(np.digitize(row["orig"], POPULATION_BIN_STARTS)), 1. if row["L1Relative"] <= THRESHOLD else 0.))
    spark_df_prop_lt = spark_rdd_prop_lt.toDF(["pop_bin", "prop_lt"])

    # Find the proportion of geounits that have L1Relative errors less than threshold for each bin:
    grouped_df_prop_lt = spark_df_prop_lt.groupBy("pop_bin").agg({"prop_lt": "avg", "*": "count"})
    # print("RCM", grouped_df_prop_lt.first())
    prop_lt = grouped_df_prop_lt.collect()

    prop_lt_dict = {}
    prop_lt_counts = {}
    for row in prop_lt:
        prop_lt_dict[int(row["pop_bin"])] = np.round(row["avg(prop_lt)"], 5)
        prop_lt_counts[int(row["pop_bin"])] = int(row["count(1)"])
    print(prop_lt_dict)

    pop_bin_indices = list(prop_lt_dict.keys())
    for k in range(len(POPULATION_BIN_STARTS)):
        if k not in pop_bin_indices:
            prop_lt_dict[k] = None
            prop_lt_counts[k] = 0
    print(f"geounits counts for each bin: {[(POPULATION_BIN_STARTS[k], prop_lt_counts[k]) for k in range(len(POPULATION_BIN_STARTS))]}")
    prop_lt_reformat = [(POPULATION_BIN_STARTS[k], prop_lt_dict[k]) for k in range(len(POPULATION_BIN_STARTS))]

    spark_df = spark_df.filter(spark_df.orig >= POPULATION_CUTOFF)
    # Count above POPULATION_CUTOFF
    count = spark_df.count()

    # For the quantiles and the avg, we will omit geounits that would not have had a well defined L1Relative metric
    # due to division by zero. (See the comments in the UDF used in sdftools.getL1Relative() for more detail.)
    spark_df = spark_df.filter(spark_df.L1Relative != 2.)
    count_correct_sign = spark_df.count()

    quantiles_df = sdftools.getGroupQuantiles(spark_df, columns=["L1Relative"], groupby=[AC.QUERY, AC.GEOLEVEL], quantiles=QUANTILES).collect()
    avg = spark_df.groupBy([AC.QUERY, AC.GEOLEVEL]).avg("L1Relative").collect()

    quantiles_dict = {}
    for row in quantiles_df:
        quantiles_dict[float(row["quantile"])] = np.round(row["L1Relative"], 5)
    quantiles_reformat = [(quant, quantiles_dict[quant]) for quant in QUANTILES]

    error_metrics = [np.round(avg[0]["avg(L1Relative)"], 5), count, count_correct_sign] + [quantiles_reformat] + [prop_lt_reformat]
    print("error_metrics:", error_metrics)
    return error_metrics
def MattsMetrics(query, table_name, analysis, spark, geolevels, schema="DHCP_HHGQ"):
    """
    This function computes metrics for MAE, MALPE, CoV, RMS, MAPE, and percent thresholds
    """
    print(f"For table {table_name}, analyzing query {query} at geolevels {geolevels} with schema {schema}")
    schema_name = schema
    paths, experiment_name, eps_str = getPathsAndName(schema_name)
    experiment = analysis.make_experiment(experiment_name, paths, schema_name=schema_name, dasruntype=AC.EXPERIMENT_FRAMEWORK_FLAT)
    sdftools.print_item(experiment.__dict__, "Experiment Attributes")

    spark_df = experiment.getDF()
    print("df looks like:")
    spark_df.show()
    schema = experiment.schema
    sdftools.print_item(spark_df, "Flat Experiment DF")

    queries = [query]
    spark_df = sdftools.aggregateGeolevels(spark, spark_df, geolevels)
    spark_df = sdftools.answerQueries(spark_df, schema, queries)
    spark_df = sdftools.getFullWorkloadDF(spark_df, schema, queries, groupby=[AC.GEOCODE, AC.GEOLEVEL, AC.RUN_ID, AC.PLB, AC.BUDGET_GROUP])
    # spark_df.show(spark_df.count(), False)

    # AC.PRIV means "protected via the differential privacy routines in this code base" variable to be renamed after P.L.94-171 production
    spark_df = sdftools.getL1(spark_df, colname="L1", col1=AC.PRIV, col2=AC.ORIG)
    spark_df = sdftools.getL2(spark_df, colname="L2", col1=AC.PRIV, col2=AC.ORIG)

    # apply bin functions for particular tables
    if table_name in table_bucket_list1:
        spark_df = sdftools.getCountBins(spark_df, column=AC.ORIG, bins=[0, 1000, 5000, 10000, 50000, 100000]).persist()
    if table_name in table_bucket_list2:
        spark_df = sdftools.getCountBins(spark_df, column=AC.ORIG, bins=[0, 10, 100]).persist()

    # This finds overall metrics
    spark_df.show(100, False)

    if table_name not in table_list_3_plus_list_age:
        for g in geolevels:
            spark_df1 = spark_df[spark_df['geolevel'] == g]  # Separate data for each geolevel
            if table_name in table_default_no_bucket:  # If data is not in buckets
                bucket_size = "NA"
                metrics_result = sdftools.metrics_with_popbucket(spark_df1, bucket_size, spark, key="A")
                file_name = f"{table_name}_{g}.csv"
            if table_name in table_bucket_list2:  # if data is bucketed in 3 buckets
                bucket_size = default_buckets2
                print("BUCKET SIZE IS:", bucket_size)
                metrics_result = sdftools.metrics_with_popbucket(spark_df1, bucket_size, spark, key="B")
                file_name = f"{table_name}_{g}.csv"
            if table_name in table_bucket_list1:  # Table 1 and 2, six buckets
                bucket_size = default_buckets1
                print("BUCKET SIZE IS:", bucket_size)
                metrics_result = sdftools.metrics_with_popbucket(spark_df1, bucket_size, spark, key="B")
                file_name = f"{table_name}_{g}.csv"

    if table_name in table_list_3geolevels:  # three geolevels, state, county, place, Tables 10, 14, 18, 22
        metrics_result = sdftools.metrics_with_3geolevels(spark_df, spark, geolevels)
        file_name = f"{table_name}.csv"

    if table_name in table_list_age:  # Tables 32-35
        if table_name in table_age_bracket1:
            metrics_result = sdftools.metrics_with_age(spark_df, spark, age_range_list, key="A")
        if table_name in table_age_bracket2:
            metrics_result = sdftools.metrics_with_age(spark_df, spark, age_range_list, key="B")
        if table_name in table_age_bracket3:
            metrics_result = sdftools.metrics_with_age(spark_df, spark, age_range_list2, key="A")
        if table_name in table_age_bracket4:
            metrics_result = sdftools.metrics_with_age(spark_df, spark, age_range_list2, key="B")
        file_name = f"{table_name}.csv"

    pandas_df = metrics_result.toPandas()
    csv_savepath = experiment.save_location_linux + file_name
    du.makePath(du.getdir(csv_savepath))
    pandas_df.to_csv(csv_savepath, index=False)
analysis.save_log(to_linux=False, to_s3=False)

# zip the local results to s3?
analysis.zip_results_to_s3(flag=False)

spark = analysis.spark

#######################################################
# Create an experiment using one or more DAS Run paths
#######################################################
paths = [
    f"{AC.S3_BASE}kifer001/Aug29_experiments_hierarchicalAgeRangeTopDown_branchFactor4_output_danVariant1-2/td4/"
]
experiment = analysis.make_experiment("danVariant1-2", paths)
sdftools.print_item(experiment.__dict__, "Experiment Attributes")

##############################
# Work with the Experiment DF
##############################
df = experiment.getDF()
schema = experiment.schema
sdftools.print_item(df, "Experiment DF")

######################
# Aggregate Geolevels
######################
geolevels = [C.STATE, C.COUNTY]

# OPTIONAL
# ========
# save the analysis script?
# toggle to_linux=True|False to save|not save this analysis script locally
# toggle to_s3=True|False to save|not save this analysis script to s3
analysis.save_analysis_script(to_linux=False, to_s3=False)

# save/copy the log file?
analysis.save_log(to_linux=False, to_s3=False)

# zip the local results to s3?
analysis.zip_results_to_s3(flag=False)

spark = analysis.spark

schema_name = "DHCP_HHGQ"
schema = SchemaMaker.fromName(name=schema_name)
sdftools.print_item(schema, "Schema")

num_geocodes = 5
density = 0.00001
scale = 10
geounits = getToyGeounitData_dict(num_geocodes, schema, density, scale)
sdftools.print_item(geounits, "Randomly-generated Geounits")

rdd = spark.sparkContext.parallelize(geounits)
sdftools.print_item(rdd, "RDD of random geounits")

df = datatools.rdd2df(rdd, schema)
# 'priv' means "protected via the differential privacy routines in this code base" variable to be renamed after P.L.94-171 production
df = df.select(['geocode'] + schema.dimnames + ['orig', 'priv'])
sdftools.print_item(df, "DF of random geounit data", 1000)
def MattsMetrics(query, table_name, analysis, spark, geolevels, buckets=default_buckets, schema="DHCP_HHGQ"):
    """
    This function computes metrics for MAE, MALPE, CoV, RMS, MAPE, and percent thresholds
    """
    print(f"For table {table_name}, analyzing query {query} at geolevels {geolevels} with schema {schema}")
    schema_name = schema
    paths, experiment_name, eps_str = getPathsAndName(schema_name)
    experiment = analysis.make_experiment(experiment_name, paths, schema_name=schema_name, dasruntype=AC.EXPERIMENT_FRAMEWORK_FLAT)
    sdftools.print_item(experiment.__dict__, "Experiment Attributes")

    spark_df = experiment.getDF()
    print("df looks like:")
    spark_df.show()
    schema = experiment.schema
    sdftools.print_item(spark_df, "Flat Experiment DF")

    queries = [query]
    spark_df = sdftools.aggregateGeolevels(spark, spark_df, geolevels)
    spark_df = sdftools.answerQueries(spark_df, schema, queries)
    # spark_df.show(spark_df.count(), False)

    # AC.PRIV means "protected via the differential privacy routines in this code base" variable to be renamed after P.L.94-171 production
    spark_df = sdftools.getL1(spark_df, colname="L1", col1=AC.PRIV, col2=AC.ORIG)
    spark_df = sdftools.getL2(spark_df, colname="L2", col1=AC.PRIV, col2=AC.ORIG)

    # apply bin functions for particular tables
    if table_name in table_default_bucket_list:
        spark_df = sdftools.getCountBins(spark_df, column=AC.ORIG, bins=[0, 1000, 5000, 10000, 50000, 100000]).persist()
    if table_name in table_default_bucket_list2:
        spark_df = sdftools.getCountBins(spark_df, column=AC.ORIG, bins=[0, 10, 100]).persist()

    # This finds overall metrics
    # spark_df.show(spark_df.count(), False)
    for g in geolevels:
        spark_df = spark_df[spark_df['geolevel'] == g]
        print("This has all levels")
        spark_df.show(150, False)
        metrics_dataframe = sdftools.mattsmetrics(spark_df, spark)
        Counts = spark_df.count()
        print("Counts are", Counts)
        newRow = spark.createDataFrame([(Counts, "Counts")])
        metrics_dataframe = metrics_dataframe.union(newRow)
        pandas_df = metrics_dataframe.toPandas()
        csv_savepath = experiment.save_location_linux + f"{table_name}_{g}.csv"
        du.makePath(du.getdir(csv_savepath))
        pandas_df.to_csv(csv_savepath, index=False)

        if table_name in table_default_bucket_list2:  # If data needs bucketing
            for b in default_buckets2:  # calculate Metrics at each bucket
                subset_sparkdf = spark_df[spark_df['orig_count_bin'] == b]  # subset into bins
                # Removes instances of Not Hispanic.. from dataframe
                subset_sparkdf = subset_sparkdf.subtract(subset_sparkdf.filter(subset_sparkdf.level.rlike("Not")))
                subset_sparkdf.show(100, False)
                print("Make sure its bucketed and without 'Not' values")
                subset_metrics = sdftools.mattsmetrics(subset_sparkdf, spark)
                Counts = subset_sparkdf.count()
                newRow = spark.createDataFrame([(b, "Bucket")])
                newRow1 = spark.createDataFrame([(Counts, "Counts")])
                subset_metrics = subset_metrics.union(newRow).union(newRow1)
                pandas_df = subset_metrics.toPandas()
                csv_savepath = experiment.save_location_linux + f"{table_name}_{g}_{b}.csv"
                du.makePath(du.getdir(csv_savepath))
                pandas_df.to_csv(csv_savepath, index=False)
analysis.save_log(to_linux=False, to_s3=False)

# zip the local results to s3?
analysis.zip_results_to_s3(flag=False)

spark = analysis.spark

#######################################################
# Create an experiment using one or more DAS Run paths
#######################################################
paths = [
    f"{AC.S3_BASE}kifer001/Aug29_experiments_hierarchicalAgeRangeTopDown_branchFactor4_output_danVariant1-2/td4/run_0000/"
]
experiment = analysis.make_experiment("danVariant1-2", paths)
sdftools.print_item(experiment.__dict__, "Experiment Attributes")

##############################
# Work with the Experiment DF
##############################
df = experiment.getDF()
schema = experiment.schema
sdftools.print_item(df, "Experiment DF")

#################################
# Calculate Population Densities
#################################
geolevels = [C.STATE, C.COUNTY]
popdf = sdftools.population_density(spark, df, schema, geolevels)
sdftools.print_item(popdf, "Population Density DF", 1000)
def MattsMetrics(query, table_name, analysis, spark, geolevels, buckets=default_buckets, schema="DHCP_HHGQ"):
    """
    This function computes metrics for MAE, MALPE, CoV, RMS, MAPE, and percent thresholds
    """
    print(f"For table {table_name}, analyzing query {query} at geolevels {geolevels} with schema {schema}")
    schema_name = schema
    paths, experiment_name, eps_str = getPathsAndName(schema_name)
    experiment = analysis.make_experiment(experiment_name, paths, schema_name=schema_name, dasruntype=AC.EXPERIMENT_FRAMEWORK_FLAT)
    sdftools.print_item(experiment.__dict__, "Experiment Attributes")

    spark_df = experiment.getDF()
    print("df looks like:")
    spark_df.show()
    schema = experiment.schema
    sdftools.print_item(spark_df, "Flat Experiment DF")

    queries = [query]
    spark_df = sdftools.aggregateGeolevels(spark, spark_df, geolevels)
    spark_df = sdftools.answerQueries(spark_df, schema, queries)
    spark_df = sdftools.getFullWorkloadDF(spark_df, schema, queries, groupby=[AC.GEOCODE, AC.GEOLEVEL, AC.RUN_ID, AC.PLB, AC.BUDGET_GROUP])
    # spark_df.show(spark_df.count(), False)

    # AC.PRIV means "protected via the differential privacy routines in this code base" variable to be renamed after P.L.94-171 production
    spark_df = sdftools.getL1(spark_df, colname="L1", col1=AC.PRIV, col2=AC.ORIG)
    spark_df = sdftools.getL2(spark_df, colname="L2", col1=AC.PRIV, col2=AC.ORIG)

    # apply bin functions for particular tables
    if table_name in table_default_bucket_list:
        spark_df = sdftools.getCountBins(spark_df, column=AC.ORIG, bins=[0, 1000, 5000, 10000, 50000, 100000]).persist()
    if table_name in table_default_bucket_list2:
        spark_df = sdftools.getCountBins(spark_df, column=AC.ORIG, bins=[0, 10, 100]).persist()

    # This finds overall metrics
    spark_df.show(100, False)

    for g in geolevels:
        spark_df1 = spark_df[spark_df['geolevel'] == g]  # Separate data for each geolevel

        if table_name in table_default_no_bucket:  # If data is not in buckets
            if table_name in table_race_query:  # Table 17, 18, 21 and others
                print("no buckets, with race query")
                spark_df2 = spark_df1.subtract(spark_df1.filter(spark_df1.level.rlike("Not")))
                spark_df2.show(100, False)
                print("Make sure 'Not' values are removed")
                metrics_dataframe = sdftools.mattsmetrics(spark_df2, spark)
                Counts = spark_df2.count()
                print("Counts are", Counts)
                newRow = spark.createDataFrame([(Counts, "Counts")])
                metrics_dataframe = metrics_dataframe.union(newRow)
                pandas_df = metrics_dataframe.toPandas()
                csv_savepath = experiment.save_location_linux + f"{table_name}_{g}.csv"
                du.makePath(du.getdir(csv_savepath))
                pandas_df.to_csv(csv_savepath, index=False)
            else:
                print("no buckets, without race query")
                spark_df1.show(100, False)
                spark_df2 = spark_df1.subtract(spark_df1.filter(spark_df1.level.rlike("Not")))
                print("with Not removed")
                spark_df2.show(100, False)
                metrics_dataframe = sdftools.mattsmetrics(spark_df2, spark)
                Counts = spark_df2.count()
                print("Counts are", Counts)
                newRow = spark.createDataFrame([(Counts, "Counts")])
                metrics_dataframe = metrics_dataframe.union(newRow)
                pandas_df = metrics_dataframe.toPandas()
                csv_savepath = experiment.save_location_linux + f"{table_name}_{g}.csv"
                du.makePath(du.getdir(csv_savepath))
                pandas_df.to_csv(csv_savepath, index=False)

        if table_name in table_age_bracket1:
            print("Data is in age brackets, 0 to 17, 18 to 64, 65+")
            spark_df1.show(100, False)
            for age_range in age_range_list:
                subset_sparkdf1 = spark_df1.filter(spark_df1.level.rlike(age_range))
                subset_sparkdf1.show(100, False)
                metrics_dataframe = sdftools.mattsmetrics(subset_sparkdf1, spark)
                # subset_sparkdf1.show(100, False)
                Counts = subset_sparkdf1.count()
                print("Counts are", Counts)
                newRow = spark.createDataFrame([(Counts, "Counts")])
                metrics_dataframe = metrics_dataframe.union(newRow)
                pandas_df = metrics_dataframe.toPandas()
                csv_savepath = experiment.save_location_linux + f"{table_name}_{g}_{age_range}.csv"
                du.makePath(du.getdir(csv_savepath))
                pandas_df.to_csv(csv_savepath, index=False)

        if table_name in table_age_bracket2:
            print("Data is age buckets, with sex query")
            spark_df1.show(100, False)
            for sexlevel in sex_range_list:
                subset_sparkdf1 = spark_df1.filter(spark_df1.level.rlike(sexlevel))
                for age_range in age_range_list:
                    subset_sparkdf2 = subset_sparkdf1.filter(subset_sparkdf1.level.rlike(age_range))
                    subset_sparkdf2.show(100, False)
                    metrics_dataframe = sdftools.mattsmetrics(subset_sparkdf2, spark)
                    # subset_sparkdf1.show(100, False)
                    Counts = subset_sparkdf2.count()
                    print("Counts are", Counts)
                    newRow = spark.createDataFrame([(Counts, "Counts")])
                    metrics_dataframe = metrics_dataframe.union(newRow)
                    pandas_df = metrics_dataframe.toPandas()
                    csv_savepath = experiment.save_location_linux + f"{table_name}_{g}_{age_range}_{sexlevel}.csv"
                    du.makePath(du.getdir(csv_savepath))
                    pandas_df.to_csv(csv_savepath, index=False)

        if table_name in table_age_bracket3:
            print("Data is in age brackets of 5 year age groups")
            spark_df1.show(100, False)
            for age_range in age_range_list2:
                subset_sparkdf1 = spark_df1.filter(spark_df1.level.rlike(age_range))
                subset_sparkdf1.show(100, False)
                metrics_dataframe = sdftools.mattsmetrics(subset_sparkdf1, spark)
                # subset_sparkdf1.show(100, False)
                Counts = subset_sparkdf1.count()
                print("Counts are", Counts)
                newRow = spark.createDataFrame([(Counts, "Counts")])
                metrics_dataframe = metrics_dataframe.union(newRow)
                pandas_df = metrics_dataframe.toPandas()
                csv_savepath = experiment.save_location_linux + f"{table_name}_{g}_{age_range}.csv"
                du.makePath(du.getdir(csv_savepath))
                pandas_df.to_csv(csv_savepath, index=False)

        if table_name in table_age_bracket4:
            print("Data is age buckets of 5 year age groups, with sex query")
            spark_df1.show(100, False)
            for sexlevel in sex_range_list:
                subset_sparkdf1 = spark_df1.filter(spark_df1.level.rlike(sexlevel))
                for age_range in age_range_list2:
                    subset_sparkdf2 = subset_sparkdf1.filter(subset_sparkdf1.level.rlike(age_range))
                    subset_sparkdf2.show(100, False)
                    metrics_dataframe = sdftools.mattsmetrics(subset_sparkdf2, spark)
                    # subset_sparkdf1.show(100, False)
                    Counts = subset_sparkdf2.count()
                    print("Counts are", Counts)
                    newRow = spark.createDataFrame([(Counts, "Counts")])
                    metrics_dataframe = metrics_dataframe.union(newRow)
                    pandas_df = metrics_dataframe.toPandas()
                    csv_savepath = experiment.save_location_linux + f"{table_name}_{g}_{age_range}_{sexlevel}.csv"
                    du.makePath(du.getdir(csv_savepath))
                    pandas_df.to_csv(csv_savepath, index=False)

        if table_name in table_default_bucket_list2:  # If data is in buckets [0,10], [10,100), [100+)
            print("data is bucketed and treated accordingly")
            # if table_name in table_race_query:
            for b in default_buckets2:  # calculate Metrics at each bucket
                print("Bucket is:", b)
                subset_sparkdf = spark_df1[spark_df1['orig_count_bin'] == b]  # subset into bins
                subset_sparkdf.show(100, False)
                print("Bucketed data")
                subset_sparkdf1 = subset_sparkdf.subtract(subset_sparkdf.filter(subset_sparkdf.level.rlike("Not")))
                subset_sparkdf1.show(100, False)
                print("Make sure its bucketed and 'Not' values are removed")
                subset_metrics = sdftools.mattsmetrics(subset_sparkdf1, spark)
                Counts = subset_sparkdf1.count()
                newRow = spark.createDataFrame([(b, "Bucket")])
                newRow1 = spark.createDataFrame([(Counts, "Counts")])
                subset_metrics = subset_metrics.union(newRow).union(newRow1)
                pandas_df = subset_metrics.toPandas()
                csv_savepath = experiment.save_location_linux + f"{table_name}_{g}_{b}.csv"
                du.makePath(du.getdir(csv_savepath))
                pandas_df.to_csv(csv_savepath, index=False)

        if table_name in table_default_bucket_list:  # If data is in buckets [0,1000], [1000,5000), etc. Table 1 and 2
            print("data is bucketed and treated accordingly")
            # if table_name in table_race_query:
            for b in default_buckets:  # calculate Metrics at each bucket
                print("Bucket is:", b)
                subset_sparkdf = spark_df1[spark_df1['orig_count_bin'] == b]  # subset into bins
                subset_sparkdf.show(100, False)
                print("Bucketed data")
                subset_sparkdf1 = subset_sparkdf.subtract(subset_sparkdf.filter(subset_sparkdf.level.rlike("Not")))
                subset_sparkdf1.show(100, False)
                print("Make sure its bucketed and 'Not' values are removed")
                subset_metrics = sdftools.mattsmetrics(subset_sparkdf1, spark)
                Counts = subset_sparkdf1.count()
                newRow = spark.createDataFrame([(b, "Bucket")])
                newRow1 = spark.createDataFrame([(Counts, "Counts")])
                subset_metrics = subset_metrics.union(newRow).union(newRow1)
                pandas_df = subset_metrics.toPandas()
                csv_savepath = experiment.save_location_linux + f"{table_name}_{g}_{b}.csv"
                du.makePath(du.getdir(csv_savepath))
                pandas_df.to_csv(csv_savepath, index=False)
def MattsMetrics(query, table_name, analysis, spark, geolevels, buckets=default_buckets, schema="DHCP_HHGQ"):
    """
    This function computes metrics for MAE, MALPE, CoV, RMS, MAPE, and percent thresholds
    """
    print(f"For table {table_name}, analyzing query {query} at geolevels {geolevels} with schema {schema}")
    schema_name = schema
    paths, experiment_name, eps_str = getPathsAndName(schema_name)
    experiment = analysis.make_experiment(experiment_name, paths, schema_name=schema_name, dasruntype=AC.EXPERIMENT_FRAMEWORK_FLAT)
    sdftools.print_item(experiment.__dict__, "Experiment Attributes")

    spark_df = experiment.getDF()
    print("df looks like:")
    spark_df.show()
    schema = experiment.schema
    sdftools.print_item(spark_df, "Flat Experiment DF")

    queries = [query]
    spark_df = sdftools.aggregateGeolevels(spark, spark_df, geolevels)
    spark_df = sdftools.answerQueries(spark_df, schema, queries)
    spark_df.show()

    # AC.PRIV means "protected via the differential privacy routines in this code base" variable to be renamed after P.L.94-171 production
    spark_df = sdftools.getL1(spark_df, colname="L1", col1=AC.PRIV, col2=AC.ORIG)
    spark_df = sdftools.getL2(spark_df, colname="L2", col1=AC.PRIV, col2=AC.ORIG)
    spark_df = sdftools.getCountBins(spark_df, column=AC.ORIG, bins=[0, 1000, 5000, 10000, 50000, 100000]).persist()

    for b in default_buckets:  # calculate Metrics
        subset_sparkdf = spark_df[spark_df['orig_count_bin'] == b]  # subset into bins
        subset_sparkdf.show()

        MAE_value = sdftools.MAE(subset_sparkdf)
        print("Bucket size is", b)
        print("MAE value is", MAE_value)

        RMS_value = sdftools.RMS(subset_sparkdf)
        CoV_value = sdftools.Coe_of_variation(subset_sparkdf, RMS_value)
        print("RMS value is", RMS_value)
        print("Coefficient of Variation is", CoV_value)

        MAPE_value = sdftools.MAPE(subset_sparkdf)
        print("MAPE value is", MAPE_value)

        MALPE_value = sdftools.MALPE(subset_sparkdf)
        print("MALPE value is", MALPE_value)

        print("Counts of percent differences between 5 and 10 percent: ")
        # 5to10percentCount = sdftools.Count_percentdiff_5to10percent(subset_spark)  # This function disabled for now
        greaterthan10percentCount = sdftools.Count_percentdiff_10percent(subset_sparkdf)
        # ze.groupBy().agg(F.count(F.when(F.col("abs diff div cef")>0.05, True)), F.count(F.when(F.col("abs diff div cef")<0.1, True))).show()
        # ze.groupBy().agg(F.count(F.when(F.col("abs diff div cef")>0.05 and F.col("abs diff div cef")<0.1), True)).show()
        print("Counts of percent differences greater than 10 percent: ")
        greaterthan10percentCount.show()
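# Standalone sketch of the per-bucket error metrics named above, computed with numpy on
# toy protected/true count pairs. The formulas here are the usual textbook definitions
# and are assumptions -- sdftools.MAE/RMS/MAPE/MALPE/Coe_of_variation may differ in detail.
import numpy as np

priv = np.array([102.0, 48.0, 11.0, 250.0])    # protected (noisy) counts
orig = np.array([100.0, 50.0, 10.0, 240.0])    # true (CEF) counts
diff = priv - orig

mae = np.mean(np.abs(diff))                     # mean absolute error
malpe = np.mean(diff / orig) * 100              # mean algebraic (signed) percent error -- assumed definition
rms = np.sqrt(np.mean(diff ** 2))               # root mean squared error
cov = rms / np.mean(orig)                       # coefficient of variation relative to the mean true count -- assumed definition
mape = np.mean(np.abs(diff) / orig) * 100       # mean absolute percent error
pct_gt_10 = int(np.sum(np.abs(diff) / orig > 0.10))  # count of percent differences above 10 percent
print(mae, malpe, rms, cov, mape, pct_gt_10)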
def analyzeQuery(query, table_name, analysis, spark, geolevels, buckets=default_buckets, schema="DHCP_HHGQ"):
    """
    Main plotting fxn.

    query      : str, name of a valid query for the target experiment's schema
    table_name : str, name of a table (used for file-naming conventions)
    analysis   : Analysis setuptools.setup object, organizes Analysis metadata
    spark      : SparkSession object, attached to analysis object
    geolevels  : [str, ...], geolevels to compute over for the current query
    buckets    : [(int,int), ...], list of mutually exclusive bucket boundaries for Tab(CEF) bucketing

    Note, also, major control parameters hard-coded in getPaths for setting experiment ingest locations from s3.
    """
    print(f"For table {table_name}, analyzing query {query} at geolevels {geolevels} with schema {schema}")
    schema_name = schema
    paths, experiment_name = getPathsAndName(schema_name)
    experiment = analysis.make_experiment(experiment_name, paths, schema_name=schema_name, dasruntype=AC.EXPERIMENT_FRAMEWORK_FLAT)
    sdftools.print_item(experiment.__dict__, "Experiment Attributes")

    df = experiment.getDF()
    if TEST:
        df = df.limit(TEST_NUM)
    print("df looks like:")
    df.show()
    schema = experiment.schema
    sdftools.print_item(df, "Flat Experiment DF")

    queries = [query]
    # y = sdftools.getAnswers(spark, df, geolevels, schema, queries)
    rddWithAbsDiff = getRddWithAbsDiff(spark, df, geolevels, queries, schema)
    rddWithAbsDiff = sdftools.getFullWorkloadDF(rddWithAbsDiff, schema, queries, groupby=[AC.GEOCODE, AC.GEOLEVEL, AC.RUN_ID, AC.PLB, AC.BUDGET_GROUP])
    rdd = sdftools.getRowGroupsAsRDD(rddWithAbsDiff, groupby=[AC.GEOLEVEL, AC.QUERY])
    rdd = rdd.flatMapValues(lambda rows: sepBounds(rows, 'orig', buckets)).persist()
    rdd = rdd.map(lambda row: Row(**row[1]))
    df = rdd.toDF().persist()

    metric_name = "Avg( |q(MDF) - q(CEF)| )"
    x_axis_variable_name = 'CEF Count, Binned'

    df = df.groupby(['geocode', 'geolevel', 'level', 'Bin0', 'Bin1', 'Bin2', 'Bin3', 'Bin4', 'Bin5']).avg()
    pandas_df = df.toPandas()
    pandas_df = pandas_df.rename(columns={"avg(abs diff)": metric_name, "avg(orig)": "orig"})
    pandas_df[x_axis_variable_name] = pandas_df.apply(lambda row: binIndexToInteger(row, buckets), axis=1)

    plt.figure(1, figsize=(11, 8.5))
    plt.rc('axes', labelsize=8)

    """
    print(pandas_df.head(30))
    print(f"pandas_df headers: {list(pandas_df.columns.values)}")
    tmpDf = pandas_df[[x_axis_variable_name, 'orig', metric_name]]
    print("tmpDf looks like:")
    with pandas.option_context('display.max_rows', None, 'display.max_columns', None):
        print(tmpDf)
    print("^^^^ pandas df looks like ^^^^")
    print("And first 3 rows:")
    print(pandas_df.iloc[:3])
    #print(df.dtypes)
    print("And first 100 rows, subset to Bins:")
    print(pandas_df.iloc[0:101,3:9])
    print(pandas_df.iloc[0:101,-1])
    """

    # Saving data frame
    csv_savepath = experiment.save_location_linux + f"Executive_Priority_Tabulations_#1_{experiment_name}.csv"
    du.makePath(du.getdir(csv_savepath))
    pandas_df.to_csv(csv_savepath, index=False)

    makePlots(experiment, experiment_name, table_name, queries, x_axis_variable_name, metric_name, geolevels, pandas_df, buckets, schema_name)
analysis.save_analysis_script(to_linux=False, to_s3=False)

# save/copy the log file?
analysis.save_log(to_linux=False, to_s3=False)

# zip the local results to s3?
analysis.zip_results_to_s3(flag=False)

spark = analysis.spark

# build an example schema
schema = Schema("example", ['a', 'b', 'c'], (2, 3, 5))
sdftools.print_item(schema, "Toy example Schema")

# build a set of GeounitNodes to use
geocodes = ['000', '001', '002', '003', '010', '011', '012', '020', '022']
geocode_dict = {3: 'block', 2: 'county'}

# build geounits
geounits = toytools.getToyGeounitData_GeounitNode(schema, geocodes, geocode_dict, raw_params={'low': 0, 'high': 100})
rdd = spark.sparkContext.parallelize(geounits).persist()
sdftools.print_item(rdd.take(1), "One of the toy example geounits")

# use Analysis to transform the rdd of geounitnodes into a spark dataframe
df = datatools.rdd2df(rdd, schema)
sdftools.print_item(df, "Toy example DF", 300)
def MattsMetrics(query, table_name, analysis, spark, geolevels, key, agekey, sexkey, bucketkey, schema="DHCP_HHGQ"):
    """
    This function computes metrics for MAE, MALPE, CoV, RMS, MAPE, and percent thresholds
    """
    print(f"For table {table_name}, analyzing query {query} at geolevels {geolevels} with schema {schema}")
    schema_name = schema
    paths, experiment_name, eps_str = getPathsAndName(schema_name)
    experiment = analysis.make_experiment(experiment_name, paths, schema_name=schema_name, dasruntype=AC.EXPERIMENT_FRAMEWORK_FLAT)
    sdftools.print_item(experiment.__dict__, "Experiment Attributes")

    spark_df = experiment.getDF()
    print("df looks like:")
    spark_df.show()
    schema = experiment.schema
    sdftools.print_item(spark_df, "Flat Experiment DF")

    queries = [query]
    spark_df = sdftools.aggregateGeolevels(spark, spark_df, geolevels)
    spark_df = sdftools.answerQueries(spark_df, schema, queries)
    spark_df = sdftools.getFullWorkloadDF(spark_df, schema, queries, groupby=[AC.GEOCODE, AC.GEOLEVEL, AC.RUN_ID, AC.PLB, AC.BUDGET_GROUP])
    # spark_df.show(spark_df.count(), False)

    # AC.PRIV means "protected via the differential privacy routines in this code base" variable to be renamed after P.L.94-171 production
    spark_df = sdftools.getL1(spark_df, colname="L1", col1=AC.PRIV, col2=AC.ORIG)
    spark_df = sdftools.getL2(spark_df, colname="L2", col1=AC.PRIV, col2=AC.ORIG)

    # apply bin functions for particular tables
    if table_name in table_bucket_list1:
        spark_df = sdftools.getCountBins(spark_df, column=AC.ORIG, bins=[0, 1000, 5000, 10000, 50000, 100000]).persist()
    if table_name in table_bucket_list2:
        spark_df = sdftools.getCountBins(spark_df, column=AC.ORIG, bins=[0, 10, 100]).persist()

    # This finds overall metrics
    spark_df.show(100, False)
    metrics_result = sdftools.combined_metrics(spark_df, spark, geolevels, agekey, sexkey, bucketkey, key)
    file_name = f"{table_name}.csv"

    pandas_df = metrics_result.toPandas()
    csv_savepath = experiment.save_location_linux + file_name
    du.makePath(du.getdir(csv_savepath))
    pandas_df.to_csv(csv_savepath, index=False)
analysis.save_log(to_linux=False, to_s3=False)

# zip the local results to s3?
analysis.zip_results_to_s3(flag=False)

spark = analysis.spark

#######################################################
# Create an experiment using one or more DAS Run paths
#######################################################
paths = [
    f"{AC.S3_BASE}kifer001/Aug29_experiments_hierarchicalAgeRangeTopDown_branchFactor4_output_danVariant1-2/td4/"
]
experiment = analysis.make_experiment("danVariant1-2", paths)
sdftools.print_item(experiment.__dict__, "Experiment Attributes")

##############################
# Work with the Experiment DF
##############################
df = experiment.getDF()
schema = experiment.schema
sdftools.print_item(df, "Experiment DF")

######################
# Aggregate Geolevels
######################
geolevels = [C.STATE, C.COUNTY]

# OPTIONAL
# ========
def analyzeQuery(query, table_name, analysis, spark, geolevels, eps, schema_name="DHCP_HHGQ"):
    """
    Main plotting fxn.

    query      : str, name of a valid query for the target experiment's schema
    table_name : str, name of a table (used for file-naming conventions)
    analysis   : Analysis setuptools.setup object, organizes Analysis metadata
    spark      : SparkSession object, attached to analysis object
    geolevels  : [str, ...], geolevels to compute over for the current query
    buckets    : [(int,int), ...], list of mutually exclusive bucket boundaries for Tab(CEF) bucketing
    schema     : str, name of ../programs/schema/schemas/schemamaker.py schema associated with target data

    Note, also, major control parameters hard-coded in getPaths for setting experiment ingest locations from s3.
    """
    geolevel = geolevels[0]
    EPT = table_name[:4] + "_" + schema_name
    graph_title = f"Error for query: {table_name}-{query}, eps: {int(eps)}, geography: {geolevels}\nDisclosure Prohibited - Title 13 U.S.C."

    plt.figure(1, figsize=(20, 40))
    sns.set(style="ticks")
    fig, axes = plt.subplots(ncols=3, nrows=2, sharey=True, sharex=True)
    axes_flat = axes.ravel()
    sns.despine(fig=fig)
    # .set_title(graph_title)

    print(f"For table {table_name}, analyzing query {query} at geolevel {geolevel} with schema_name {schema_name} and eps: {eps}")
    num_trials, paths, experiment_name, eps_str, spines, mechanisms = getPathsAndName(schema_name, query, table_name, eps)

    plt.xscale('log')
    plt.yscale('symlog', linthreshy=100)

    for k, path in enumerate(paths):
        axes_flat[k].set_title(spines[k] + '_' + mechanisms[k])
        experiment = analysis.make_experiment(experiment_name, [path], schema_name=schema_name, dasruntype=AC.EXPERIMENT_FRAMEWORK_FLAT)
        spark_df = experiment.getDF()
        sdftools.print_item(experiment.__dict__, "Experiment Attributes")

        schema = experiment.schema
        sdftools.print_item(spark_df, "Flat Experiment DF")

        queries = [query]
        spark_df = sdftools.aggregateGeolevels(spark, spark_df, geolevels)
        # jitter points to make them visually distinct:
        spark_df = sdftools.answerQueries(spark_df, schema, queries) \
            .withColumn("Error", sf.col("priv") - sf.col("orig") + sf.rand() - 1/2.) \
            .withColumn("orig", sf.col("orig") + sf.rand() - 1/2.)

        if geolevel == "AIAN_AREAS":
            spark_df = spark_df.filter(spark_df.geocode != "9999")
        elif geolevel == 'OSE':
            spark_df = spark_df.filter(sf.col(AC.GEOCODE).substr(sf.length(sf.col(AC.GEOCODE)) - 4, sf.length(sf.col(AC.GEOCODE))) != "99999")
        elif geolevel == 'AIANTract':
            spark_df = spark_df.filter(spark_df.geocode != "9" * 11)
        elif geolevel == 'AIANState':
            spark_df = spark_df.filter(spark_df.geocode != "99")
        elif geolevel == 'AIANBlock':
            spark_df = spark_df.filter(spark_df.geocode != "9" * 16)

        # t = spark_df.filter(sf.abs(spark_df.Error) > 1000)
        spark_df = spark_df.select(["orig", "Error"])
        pandas_df = spark_df.toPandas()
        # if pandas_df.max()["Error"] == pandas_df.min()["Error"]:
        #     continue
        sns.scatterplot(x="orig", y="Error", data=pandas_df, alpha=.6, s=10, marker="+", ax=axes_flat[k])
        axes_flat[k].axhline(0., ls='--')

    filename = f"{table_name}_{query.replace(' ', '_')}_{geolevel}"
    plot_path = f"{experiment.save_location_linux}epsilon_{eps_str}/"
    du.makePath(plot_path)
    plt.savefig(plot_path + filename + ".png")
    plt.clf()
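# Standalone sketch of the uniform jitter applied above: adding rand() - 0.5 spreads
# integer-valued counts over a unit-wide band so overlapping scatterplot points stay
# visually distinct. The count values below are illustrative only.
import numpy as np

rng = np.random.default_rng(0)
orig = np.array([5, 5, 5, 120, 120])
priv = np.array([6, 4, 5, 118, 125])
error_jittered = priv - orig + rng.uniform(-0.5, 0.5, size=orig.size)
orig_jittered = orig + rng.uniform(-0.5, 0.5, size=orig.size)
print(np.round(error_jittered, 3), np.round(orig_jittered, 3))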
# setup analysis
#################
analysis_results_save_location = f"/mnt/users/moran331/analysis_reports/"
spark_loglevel = "ERROR"
analysis = setuptools.setup(save_location=analysis_results_save_location, spark_loglevel=spark_loglevel)
spark = analysis.spark

schema = SchemaMaker.fromName("DHCP_SCHEMA")

num_geocodes = 2
geocodes = [str(x).zfill(16) for x in range(num_geocodes)]
geounits = [testdata_random_geounit_generator(x, schema, density=0.00001, scale=10) for x in geocodes]
sdftools.print_item(geounits, "Random Geounit Data")

rdd = spark.sparkContext.parallelize(geounits).persist()
sdftools.print_item(rdd, "Parallelized RDD data")

df = rdd.flatMap(lambda node: mappers.getSparseDF_mapper(node, schema)).map(lambda row: Row(**row)).toDF().persist()
sdftools.print_item(df, "DF of Random Geounit Data")

df = df.withColumn("STATE", sf.col("geocode")[0:2]).persist()
sdftools.print_item(df, "DF with STATE code")

df = sdftools.aggregateGeolevels(spark, df, 'STATE')
sdftools.print_item(df, "Aggregated to the STATE geolevel")

query = 'sex * age'
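# Quick standalone illustration of the geocode construction above: zero-padding the
# integer index to the 16-character width used by these toy geocodes.
print([str(x).zfill(16) for x in range(2)])
# -> ['0000000000000000', '0000000000000001']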
spark = analysis.spark

#######################################################
# Create an experiment using one or more DAS Run paths
#######################################################
s3_path_exp = "s3://uscb-decennial-ite-das/users/heiss002/cnstatDdpSchema_SinglePassRegular_va_cnstatDpqueries_cnstatGeolevels"
paths = [
    f"{s3_path_exp}/data-run1.0-epsilon16.0-BlockNodeDicts/",
    f"{s3_path_exp}/data-run1.0-epsilon4.0-BlockNodeDicts/"
]

schema_name = "DHCP_HHGQ"
experiment = analysis.make_experiment("cnstat", paths, schema_name=schema_name, dasruntype=AC.EXPERIMENT_FRAMEWORK_FLAT)
sdftools.print_item(experiment.__dict__, "Experiment Attributes")

##############################
# Work with the Experiment DF
##############################
df = experiment.getDF()
schema = experiment.schema
sdftools.print_item(df, "Flat Experiment DF")

######################
# Aggregate Geolevels
######################
geolevels = [C.STATE, C.COUNTY]
df = sdftools.aggregateGeolevels(spark, df, geolevels)
def household_tvd(sdf, geolevels, queries, spark):
    """
    Calculating 1-TVD for the Household information

    1 - [ (A + B) / C ], where
        A = SUM( L1( detailed(CEF_g), detailed(DAS_g) ) ) for all g in the geolevel
        B = SUM( L1( total(CEF_g), total(DAS_g) ) ) for all g in the geolevel
        C = 2 * SUM(_invar['gqhh_vect'][0]_g) <- household invariant total in the geolevel
    """
    results = {}

    # get the invariants path
    block_level_invar_path = sdf.runs[0].data_path

    # prep invar_df (household info only)
    invar_rdd = spark.sparkContext.pickleFile(block_level_invar_path)

    # since the household total invariant for the geolevel will be the same across all geolevels, we can just calculate the value
    invar_val = invar_rdd.map(hh_invar_mapper).reduce(operator.add)
    sdftools.print_item(invar_val, "Household Total Invariant")
    double_invar_val = 2 * invar_val
    sdftools.print_item(double_invar_val, "2 * Household Total Invariant")

    # compute, for each geolevel, the 1-TVD for household info
    # old way... compute each geolevel individually
    # for geolevel in du.aslist(geolevels) + [geolevels]:
    # new way... compute all geolevels at once
    for geolevel in [geolevels]:
        # A
        geosdf = sdf.clone()
        sdftools.print_item(geosdf, "Original SDF")
        geosdf = geosdf.getGeolevels(geolevel)
        sdftools.print_item(geosdf, f"Geolevel '{geolevel}'")
        geosdf = geosdf.getQueryAnswers(queries)
        sdftools.print_item(geosdf, f"Detailed Query at geolevel '{geolevel}'")
        geosdf = geosdf.L1(colname="L1_A")
        sdftools.print_item(geosdf, "L1_A")
        geosdf = geosdf.sum(groupby=[AC.GEOCODE, AC.GEOLEVEL])
        sdftools.print_item(geosdf, "sum(L1_A)")

        # B
        geosdf = geosdf.L1(colname="L1_B")
        sdftools.print_item(geosdf, "L1_B")
        geosdf = geosdf.sum(groupby=[AC.GEOLEVEL])
        sdftools.print_item(geosdf, "sum(L1_B)")

        # 1 - [(A+B)/C]
        geosdf.df = geosdf.df.withColumn("A+B", sf.col("L1_A") + sf.col("L1_B")).persist()
        sdftools.print_item(geosdf, "A+B")
        geosdf.df = geosdf.df.withColumn("2*geolevel_hh_invar", sf.lit(double_invar_val)).persist()
        geosdf.df = geosdf.df.withColumn("TVD", sf.col("A+B") / sf.col("2*geolevel_hh_invar")).persist()
        sdftools.print_item(geosdf, "TVD")
        geosdf.df = geosdf.df.withColumn("1-TVD", sf.lit(1) - sf.col("TVD")).persist()
        sdftools.print_item(geosdf, "1-TVD")

        if isinstance(geolevel, list):
            key = f"All_Geolevels_Household_1-TVD"
        else:
            key = f"{geolevel}_Household_1-TVD"
        results[key] = geosdf

    return results
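# Toy numeric check of the 1-TVD formula in the docstring above, with made-up values:
# A and B are summed L1 distances and C is twice the household invariant total.
A = 120.0        # sum over geounits of L1 between detailed CEF and DAS answers
B = 80.0         # sum over geounits of L1 between total CEF and DAS answers
C = 2 * 50000.0  # 2 * household invariant total for the geolevel
one_minus_tvd = 1 - (A + B) / C
print(one_minus_tvd)  # -> 0.998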
nm_state = noisy_measurements_state
print(nm_state)

geounit = nm_state.take(1).pop()
print(geounit)

geocode = geounit.geocode
print(geocode)

dp_queries = geounit.dp_queries

experiment_name = "dhcp_eps4_run36"
experiment_path = f"{S3_BASE}/lecle301/dhcp_eps4/run36_of_25/full_person/"
experiment = analysis.make_experiment(experiment_name, experiment_path)

df = experiment.getDF()
sdftools.print_item(df, "Experiment DF", show=100)

geolevel_df = sdftools.aggregateGeolevels(experiment.spark, df, ['STATE'])
sdftools.print_item(geolevel_df, "Geolevel DF")

filtered_df = df.filter(df.geocode == geocode).persist()
sdftools.print_item(filtered_df, "Experiment DF", show=1000)
analysis.save_log(to_linux=False, to_s3=False)

# zip the local results to s3?
analysis.zip_results_to_s3(flag=False)

spark = analysis.spark

#######################################################
# Create an experiment using one or more DAS Run paths
#######################################################
paths = [
    f"{AC.S3_BASE}kifer001/Aug29_experiments_hierarchicalAgeRangeTopDown_branchFactor4_output_danVariant1-2/td4/run_0000/"
]
experiment = analysis.make_experiment("danVariant1-2", paths)
sdftools.print_item(experiment.__dict__, "Experiment Attributes")

##############################
# Work with the Experiment DF
##############################
df = experiment.getDF()
schema = experiment.schema
sdftools.print_item(df, "Experiment DF")

geolevels = [
    C.STATE, C.COUNTY, C.TRACT_GROUP, C.TRACT, C.BLOCK_GROUP, C.BLOCK, C.SLDL, C.SLDU
]

queries = [
    'total', 'hhgq', 'votingage * citizen', 'numraces * hispanic',
################################
S3_BASE = "s3://uscb-decennial-ite-das/users"

#################
# setup analysis
#################
analysis_results_save_location = f"/mnt/users/moran331/analysis_reports/"
spark_loglevel = "ERROR"
analysis = setuptools.setup(save_location=analysis_results_save_location, spark_loglevel=spark_loglevel)
spark = analysis.spark

schema = SchemaMaker.fromName("DHCP_SCHEMA")

num_geocodes = 5
geocodes = [str(x).zfill(16) for x in range(num_geocodes)]
geounits = [testdata_random_geounit_generator(x, schema, density=0.00001, scale=10) for x in geocodes]
sdftools.print_item(geounits, "Random Geounit Data")

rdd = spark.sparkContext.parallelize(geounits).persist()
sdftools.print_item(rdd, "Parallelized RDD data")

df = rdd.flatMap(lambda node: mappers.getSparseDF_mapper(node, schema)).map(lambda row: Row(**row)).toDF().persist()
sdftools.print_item(df, "DF of Random Geounit Data")

recoder = dhcp_to_mdf2020.DHCPToMDF2020Recoder().recode
mdf = (
    # privatized means "protected via the differential privacy routines in this code base" variable to be renamed after P.L.94-171 production
    rdd.flatMap(lambda node: mappers.getMicrodataDF_mapper(node, schema, privatized=True, mangled_names=True, recoders=recoder))
       .zipWithIndex()
       .map(addIndexToRow)
       .map(lambda row: Row(**row))
       .toDF()
def analyzeQuery(query, analysis, spark, geolevel, schema_name, path):
    """
    Main plotting fxn.

    query      : str, name of a valid query for the target experiment's schema
    table_name : str, name of a table (used for file-naming conventions)
    analysis   : Analysis setuptools.setup object, organizes Analysis metadata
    spark      : SparkSession object, attached to analysis object
    geolevels  : [str, ...], geolevels to compute over for the current query
    buckets    : [(int,int), ...], list of mutually exclusive bucket boundaries for Tab(CEF) bucketing
    schema     : str, name of ../programs/schema/schemas/schemamaker.py schema associated with target data

    Note, also, major control parameters hard-coded in getPaths for setting experiment ingest locations from s3.
    """
    # To avoid cases in which max(numerator_query_levels)/denom_query_level >= 1:
    assert query != denom_query

    experiment_name = "NA"
    experiment = analysis.make_experiment(experiment_name, [path], schema_name=schema_name, dasruntype=AC.EXPERIMENT_FRAMEWORK_FLAT, budget_group='1', run_id='run1.0')
    spark_df = experiment.getDF()
    sdftools.print_item(experiment.__dict__, "Experiment Attributes")

    schema = experiment.schema
    sdftools.print_item(spark_df, "Flat Experiment DF")

    spark_df = sdftools.aggregateGeolevels(spark, spark_df, geolevel)
    spark_df = sdftools.remove_not_in_area(spark_df, [geolevel])
    spark_df = sdftools.answerQueries(spark_df, schema, [query, denom_query])
    spark_df = sdftools.getL1Relative(spark_df, colname="L1Relative", denom_query=denom_query, denom_level=denom_level).persist()

    query_counts = spark_df.rdd.map(lambda row: (row[AC.QUERY], )).countByKey()
    query_counts_keys = list(query_counts.keys())
    assert len(query_counts_keys) == 1 and query_counts_keys[0] == query

    spark_rdd_prop_lt = spark_df.rdd.map(lambda row: (int(np.digitize(row["orig"], POPULATION_BIN_STARTS)), 1. if row["L1Relative"] <= THRESHOLD else 0.))
    spark_df_prop_lt = spark_rdd_prop_lt.toDF(["pop_bin", "prop_lt"])

    # Find the proportion of geounits that have L1Relative errors less than threshold for each bin:
    grouped_df_prop_lt = spark_df_prop_lt.groupBy("pop_bin").agg({"prop_lt": "avg", "*": "count"})
    prop_lt = grouped_df_prop_lt.collect()

    n_bins = len(POPULATION_BIN_STARTS) + 1
    prop_lt_list = [None] * n_bins
    prop_lt_counts = [0] * n_bins
    for row in prop_lt:
        prop_lt_list[int(row["pop_bin"])] = np.round(row["avg(prop_lt)"], 5)
        prop_lt_counts[int(row["pop_bin"])] = int(row["count(1)"])
    print(prop_lt_list)
    print(f"geounits counts for each bin: {[(POPULATION_BIN_STARTS[k], prop_lt_counts[k]) for k in range(len(POPULATION_BIN_STARTS))]}")

    population_bin_starts = np.concatenate(([-np.inf], POPULATION_BIN_STARTS, [np.inf]))
    ranges = list(zip(population_bin_starts[:-1], population_bin_starts[1:] - 1))
    assert len(prop_lt_list) == (len(population_bin_starts) - 1)
    prop_lt_reformat = list(zip(ranges, prop_lt_list))

    spark_df = spark_df.filter(spark_df.orig >= POPULATION_CUTOFF)
    # Count above POPULATION_CUTOFF
    count = spark_df.count()

    # For the quantiles and the avg, we will omit geounits that would not have had a well defined L1Relative metric
    # due to division by zero. (See the comments in the UDF used in sdftools.getL1Relative() for more detail.)
    spark_df = spark_df.filter(spark_df.L1Relative != 2.)
    count_correct_sign = spark_df.count()

    quantiles_df = sdftools.getGroupQuantiles(spark_df, columns=["L1Relative"], groupby=[AC.QUERY, AC.GEOLEVEL], quantiles=QUANTILES).collect()
    avg = spark_df.groupBy([AC.QUERY, AC.GEOLEVEL]).avg("L1Relative").collect()

    quantiles_dict = {}
    for row in quantiles_df:
        quantiles_dict[float(row["quantile"])] = np.round(row["L1Relative"], 5)
    quantiles_reformat = [(quant, quantiles_dict[quant]) for quant in QUANTILES]

    error_metrics = [np.round(avg[0]["avg(L1Relative)"], 5), count, count_correct_sign] + [quantiles_reformat] + [prop_lt_reformat]
    print("error_metrics:", error_metrics)
    return error_metrics
def mf01(sdf, geolevels, queries):
    """
    Answers queries at the geolevels specified
    """
    sdf = sdf.getGeolevels(geolevels).answerQueries(queries)
    sdftools.print_item(sdf, f"Query answers for geolevels {geolevels} and queries {queries}")
    return sdf