def run(self, engine_tuple):
    block_nodes, feas_dict = engine_tuple

    # access the SparkSession (needed for aggregating geolevels)
    spark = SparkSession(SparkContext.getOrCreate())

    # transform the rdd of block-level nodes into a 'sparse histogram' spark df
    df = datatools.rdd2df(block_nodes, self.setup.schema_obj)
    sdftools.show(df, "The Block-level Geounit Nodes as Sparse Histogram DF", 1000)

    # read the geolevels from the error_metrics section of the config file
    geolevels = self.setup.config['error_metrics']['geolevels'].split(", ")
    #geolevels = self.setup.levels

    # aggregate blocks to get the different geolevels
    df = sdftools.aggregateGeolevels(spark, df, geolevels)
    sdftools.show(df, f"DF with all Geolevels in {geolevels}", 1000)

    # access the queries from the error_metrics section of the config file
    queries = self.setup.config['error_metrics']['queries'].split(", ")

    # and answer the queries
    df = sdftools.answerQueries(df, self.setup.schema_obj, queries)
    sdftools.show(df, f"DF with all Queries in {queries}", 1000)

    # compute the Geolevel 1-TVD metric
    geolevel_tvd = sdftools.getGeolevelTVD(df, groupby=[AC.GEOLEVEL, AC.QUERY])
    geolevel_tvd = geolevel_tvd.orderBy([AC.QUERY, AC.GEOLEVEL])
    sdftools.show(geolevel_tvd, "Geolevel 1-TVD per geolevel per query", 1000)

    # calculate sparsity change
    sparsity_df = sdftools.getCellSparsityByGroup(df, self.setup.schema_obj,
                                                  groupby=[AC.GEOCODE, AC.GEOLEVEL, AC.QUERY])
    sdftools.show(sparsity_df, "Query and Geolevel DF with Sparsity per group", 1000)
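# Hedged illustration (not from the original script): the [error_metrics] config section read above
# is expected to hold comma-space separated lists under the keys 'geolevels' and 'queries'. The
# values below are examples only, showing how the .split(", ") calls turn them into lists.
example_error_metrics_section = {
    'geolevels': "STATE, COUNTY, TRACT, BLOCK_GROUP, BLOCK",
    'queries': "total, hhgq, votingage * citizen",
}
assert example_error_metrics_section['geolevels'].split(", ") == ["STATE", "COUNTY", "TRACT", "BLOCK_GROUP", "BLOCK"]
assert example_error_metrics_section['queries'].split(", ") == ["total", "hhgq", "votingage * citizen"]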
def analyzeQuery(query, table_name, analysis, spark, geolevels, eps, buckets=default_buckets, schema="DHCP_HHGQ"):
    """
        Main plotting fxn.

        query       : str, name of a valid query for the target experiment's schema
        table_name  : str, name of a table (used for file-naming conventions)
        analysis    : Analysis setuptools.setup object, organizes Analysis metadata
        spark       : SparkSession object, attached to analysis object
        geolevels   : [str, ...], geolevels to compute over for the current query
        eps         : numeric, epsilon of the target experiment (used in paths and labels)
        buckets     : [(int,int), ...], list of mutually exclusive bucket boundaries for Tab(CEF) bucketing
        schema      : str, name of ../programs/schema/schemas/schemamaker.py schema associated with target data

        Note, also, major control parameters hard-coded in getPaths for setting experiment ingest locations from s3.
    """
    print(f"For table {table_name}, analyzing query {query} at geolevels {geolevels} with schema {schema}")
    schema_name = schema
    num_trials, paths, experiment_name, eps_str = getPathsAndName(schema_name, query, table_name, eps)
    print(f"Passing paths to Analysis experiment maker: {paths}")
    experiment = analysis.make_experiment(experiment_name, paths, schema_name=schema_name,
                                          dasruntype=AC.EXPERIMENT_FRAMEWORK_FLAT)
    sdftools.print_item(experiment.__dict__, "Experiment Attributes")

    spark_df = experiment.getDF()
    print("df looks like:")
    spark_df.show()
    schema = experiment.schema
    sdftools.print_item(spark_df, "Flat Experiment DF")

    queries = [query]
    spark_df = sdftools.aggregateGeolevels(spark, spark_df, geolevels)
    spark_df = sdftools.answerQueries(spark_df, schema, queries)
    spark_df = sdftools.getAvgAbsErrorByTrueCountRuns(spark_df, bins=[0, 1, 10, 100, 1000, 10000]).persist()

    missing_rows_pandas_df = sdftools.getMissingRowCounts(spark_df, schema, queries,
                                                          groupby=[AC.GEOCODE, AC.GEOLEVEL, AC.PLB, AC.BUDGET_GROUP])
    missing_rows_dict = defaultdict(int)
    for index, row in missing_rows_pandas_df.iterrows():
        #print(f"missing df row # {index} geolevel, sum(missing) = {row['geolevel']},{row['sum(missing)']}")
        missing_rows_dict[row['geolevel']] = row['sum(missing)']

    spark_df.show()
    print("^^^^ with abs error, DF looks like ^^^^")

    metric_name = "Avg( |q(MDF) - q(CEF)| )"
    x_axis_variable_name = 'CEF Count, Binned'

    pandas_df = spark_df.toPandas()
    pandas_df = pandas_df.rename(columns={"abs_error": metric_name, "orig_count_bin": x_axis_variable_name})
    plt.figure(1, figsize=(11, 8.5))
    plt.rc('axes', labelsize=8)
    print(f"pandas df before plotting has cols: {pandas_df.columns.values}")
    print(f"{x_axis_variable_name} column has distinct levels: {pandas_df[x_axis_variable_name].unique()}")

    buckets = pandas_df[x_axis_variable_name].unique()
    buckets = sorted(buckets, key=lambda bucket_name: largestIntInStr(bucket_name))
    print(f"Sorted bucket names: {buckets}")

    # Saving data frame
    csv_savepath = experiment.save_location_linux + f"{experiment_name}.csv"
    du.makePath(du.getdir(csv_savepath))
    pandas_df.to_csv(csv_savepath, index=False)

    makePlots(experiment, experiment_name, table_name, queries, x_axis_variable_name, metric_name,
              geolevels, pandas_df, buckets, schema_name, eps_str, missing_rows_dict, num_trials)
def analyzeQuery(query, analysis, spark, geolevel, schema_name, path):
    """
        Main error-metrics fxn.

        query       : str, name of a valid query for the target experiment's schema
        analysis    : Analysis setuptools.setup object, organizes Analysis metadata
        spark       : SparkSession object, attached to analysis object
        geolevel    : str, geolevel to compute over for the current query
        schema_name : str, name of ../programs/schema/schemas/schemamaker.py schema associated with target data
        path        : str, s3 ingest location of the target experiment

        Relies on module-level constants such as denom_query, denom_level, POPULATION_BIN_STARTS,
        THRESHOLD, POPULATION_CUTOFF, and QUANTILES.
    """
    # To avoid cases in which max(numerator_query_levels)/denom_query_level >= 1:
    assert query != denom_query

    experiment_name = "NA"
    experiment = analysis.make_experiment(experiment_name, [path],
                                          schema_name=schema_name,
                                          dasruntype=AC.EXPERIMENT_FRAMEWORK_FLAT,
                                          budget_group='1', run_id='run1.0')
    spark_df = experiment.getDF()
    sdftools.print_item(experiment.__dict__, "Experiment Attributes")

    schema = experiment.schema
    sdftools.print_item(spark_df, "Flat Experiment DF")

    spark_df = sdftools.aggregateGeolevels(spark, spark_df, geolevel)
    spark_df = sdftools.remove_not_in_area(spark_df, [geolevel])
    spark_df = sdftools.answerQueries(spark_df, schema, [query, denom_query])
    spark_df = sdftools.getL1Relative(spark_df, colname="L1Relative",
                                      denom_query=denom_query, denom_level=denom_level).persist()

    query_counts = spark_df.rdd.map(lambda row: (row[AC.QUERY], )).countByKey()
    query_counts_keys = list(query_counts.keys())
    assert len(query_counts_keys) == 1 and query_counts_keys[0] == query

    spark_rdd_prop_lt = spark_df.rdd.map(lambda row: (int(np.digitize(row["orig"], POPULATION_BIN_STARTS)),
                                                      1. if row["L1Relative"] <= THRESHOLD else 0.))
    spark_df_prop_lt = spark_rdd_prop_lt.toDF(["pop_bin", "prop_lt"])

    # Find the proportion of geounits that have L1Relative errors less than threshold for each bin:
    grouped_df_prop_lt = spark_df_prop_lt.groupBy("pop_bin").agg({"prop_lt": "avg", "*": "count"})
    prop_lt = grouped_df_prop_lt.collect()

    n_bins = len(POPULATION_BIN_STARTS) + 1
    prop_lt_list = [None] * n_bins
    prop_lt_counts = [0] * n_bins
    for row in prop_lt:
        prop_lt_list[int(row["pop_bin"])] = np.round(row["avg(prop_lt)"], 5)
        prop_lt_counts[int(row["pop_bin"])] = int(row["count(1)"])
    print(prop_lt_list)
    print(f"geounits counts for each bin: {[(POPULATION_BIN_STARTS[k], prop_lt_counts[k]) for k in range(len(POPULATION_BIN_STARTS))]}")

    population_bin_starts = np.concatenate(([-np.inf], POPULATION_BIN_STARTS, [np.inf]))
    ranges = list(zip(population_bin_starts[:-1], population_bin_starts[1:] - 1))
    assert len(prop_lt_list) == (len(population_bin_starts) - 1)
    prop_lt_reformat = list(zip(ranges, prop_lt_list))

    spark_df = spark_df.filter(spark_df.orig >= POPULATION_CUTOFF)
    # Count of geounits above POPULATION_CUTOFF
    count = spark_df.count()

    # For the quantiles and the avg, omit geounits that would not have had a well-defined L1Relative
    # metric due to division by zero (see the comments in the UDF used in sdftools.getL1Relative() for detail):
    spark_df = spark_df.filter(spark_df.L1Relative != 2.)
    count_correct_sign = spark_df.count()

    quantiles_df = sdftools.getGroupQuantiles(spark_df, columns=["L1Relative"],
                                              groupby=[AC.QUERY, AC.GEOLEVEL],
                                              quantiles=QUANTILES).collect()
    avg = spark_df.groupBy([AC.QUERY, AC.GEOLEVEL]).avg("L1Relative").collect()

    quantiles_dict = {}
    for row in quantiles_df:
        quantiles_dict[float(row["quantile"])] = np.round(row["L1Relative"], 5)
    quantiles_reformat = [(quant, quantiles_dict[quant]) for quant in QUANTILES]

    error_metrics = [np.round(avg[0]["avg(L1Relative)"], 5), count, count_correct_sign] \
                    + [quantiles_reformat] + [prop_lt_reformat]
    print("error_metrics:", error_metrics)
    return error_metrics
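# Hedged illustration (not from the original script): by construction, the list returned above has
# the shape [avg_L1Relative, count_above_cutoff, count_with_defined_metric, quantiles_reformat,
# prop_lt_reformat], where quantiles_reformat is [(quantile, value), ...] and prop_lt_reformat is
# [((bin_low, bin_high), proportion_below_THRESHOLD), ...]. The helper below is a hypothetical
# example of how a caller might unpack that structure.
def summarize_error_metrics(error_metrics):
    avg_rel_err, n_above_cutoff, n_defined, quantile_pairs, prop_lt_pairs = error_metrics
    print(f"avg L1Relative = {avg_rel_err} over {n_defined}/{n_above_cutoff} geounits above the population cutoff")
    for quantile, value in quantile_pairs:
        print(f"  quantile {quantile}: {value}")
    for (bin_low, bin_high), proportion in prop_lt_pairs:
        print(f"  population bin [{bin_low}, {bin_high}]: proportion within threshold = {proportion}")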
sdftools.print_item(experiment.__dict__, "Experiment Attributes")

##############################
# Work with the Experiment DF
##############################
df = experiment.getDF()
schema = experiment.schema
sdftools.print_item(df, "Experiment DF")

geolevels = [
    C.STATE, C.COUNTY, C.TRACT_GROUP, C.TRACT, C.BLOCK_GROUP, C.BLOCK, C.SLDL, C.SLDU
]

queries = [
    'total', 'hhgq', 'votingage * citizen', 'numraces * hispanic',
    'cenrace * hispanic', 'sex * age', 'detailed'
]

#####################################################
# Binning and Filtering "Large" and "Small" Geounits
#####################################################
# 0a. Aggregate Blocks to get Geographic Units at all desired Geographic Levels
geoleveldf = sdftools.aggregateGeolevels(spark, df, geolevels)

# 0b. Answer Queries
querydf = sdftools.answerQueries(geoleveldf, schema, queries, labels=True)

# TODO: Implement the binning and filtering operations
def analyzeQuery(query, analysis, spark, geolevel, schema_name, path):
    """
        Main error-metrics fxn.

        query       : str, name of a valid query for the target experiment's schema
        analysis    : Analysis setuptools.setup object, organizes Analysis metadata
        spark       : SparkSession object, attached to analysis object
        geolevel    : str, geolevel to compute over for the current query
        schema_name : str, name of ../programs/schema/schemas/schemamaker.py schema associated with target data
        path        : str, s3 ingest location of the target experiment
    """
    experiment_name = "NA"
    quantiles = [xi / 20. for xi in np.arange(20)] + [.975, .99, 1.]
    experiment = analysis.make_experiment(experiment_name, [path],
                                          schema_name=schema_name,
                                          dasruntype=AC.EXPERIMENT_FRAMEWORK_FLAT,
                                          budget_group='1', run_id='run1.0')
    spark_df = experiment.getDF()
    sdftools.print_item(experiment.__dict__, "Experiment Attributes")

    schema = experiment.schema
    sdftools.print_item(spark_df, "Flat Experiment DF")

    spark_df = sdftools.aggregateGeolevels(spark, spark_df, geolevel)

    # Filter out placeholder (all-9s) geocodes for geolevels whose geounits may be "not in area":
    if geolevel == C.PLACE:
        spark_df = spark_df.filter(spark_df.geocode[2:7] != "99999")
    elif geolevel == 'AIAN_AREAS':
        spark_df = spark_df.filter(spark_df.geocode != "9999")
    elif geolevel == 'OSE':
        spark_df = spark_df.filter(sf.col(AC.GEOCODE).substr(sf.length(sf.col(AC.GEOCODE)) - 4,
                                                             sf.length(sf.col(AC.GEOCODE))) != "99999")
    elif geolevel == 'AIANTract':
        spark_df = spark_df.filter(spark_df.geocode != "9" * 11)
    elif geolevel == 'AIANState':
        spark_df = spark_df.filter(spark_df.geocode != "99")
    elif geolevel == 'AIANBlock':
        spark_df = spark_df.filter(spark_df.geocode != "9" * 16)
    elif geolevel == 'COUNTY_NSMCD':
        spark_df = spark_df.filter(spark_df.geocode != "999")

    spark_df = sdftools.answerQueries(spark_df, schema, [query, denom_query])
    spark_df = sdftools.getL1Relative(spark_df, colname="L1Relative",
                                      denom_query=denom_query, denom_level=denom_level).persist()

    spark_rdd_prop_lt = spark_df.rdd.map(lambda row: (int(np.digitize(row["orig"], POPULATION_BIN_STARTS)),
                                                      1. if row["L1Relative"] <= THRESHOLD else 0.))
    spark_df_prop_lt = spark_rdd_prop_lt.toDF(["pop_bin", "prop_lt"])

    # Find the proportion of geounits that have L1Relative errors less than threshold for each bin:
    grouped_df_prop_lt = spark_df_prop_lt.groupBy("pop_bin").agg({"prop_lt": "avg", "*": "count"})
    # print("RCM", grouped_df_prop_lt.first())
    prop_lt = grouped_df_prop_lt.collect()

    prop_lt_dict = {}
    prop_lt_counts = {}
    for row in prop_lt:
        prop_lt_dict[int(row["pop_bin"])] = np.round(row["avg(prop_lt)"], 5)
        prop_lt_counts[int(row["pop_bin"])] = int(row["count(1)"])
    print(prop_lt_dict)

    pop_bin_indices = list(prop_lt_dict.keys())
    for k in range(len(POPULATION_BIN_STARTS)):
        if k not in pop_bin_indices:
            prop_lt_dict[k] = None
            prop_lt_counts[k] = 0
    print(f"geounits counts for each bin: {[(POPULATION_BIN_STARTS[k], prop_lt_counts[k]) for k in range(len(POPULATION_BIN_STARTS))]}")
    prop_lt_reformat = [(POPULATION_BIN_STARTS[k], prop_lt_dict[k]) for k in range(len(POPULATION_BIN_STARTS))]

    spark_df = spark_df.filter(spark_df.orig >= POPULATION_CUTOFF)
    # Count of geounits above POPULATION_CUTOFF
    count = spark_df.count()

    # For the quantiles and the avg, omit geounits that would not have had a well-defined L1Relative
    # metric due to division by zero (see the comments in the UDF used in sdftools.getL1Relative() for detail):
    spark_df = spark_df.filter(spark_df.L1Relative != 2.)
    count_correct_sign = spark_df.count()

    quantiles_df = sdftools.getGroupQuantiles(spark_df, columns=["L1Relative"],
                                              groupby=[AC.QUERY, AC.GEOLEVEL],
                                              quantiles=QUANTILES).collect()
    avg = spark_df.groupBy([AC.QUERY, AC.GEOLEVEL]).avg("L1Relative").collect()

    quantiles_dict = {}
    for row in quantiles_df:
        quantiles_dict[float(row["quantile"])] = np.round(row["L1Relative"], 5)
    quantiles_reformat = [(quant, quantiles_dict[quant]) for quant in QUANTILES]

    error_metrics = [np.round(avg[0]["avg(L1Relative)"], 5), count, count_correct_sign] \
                    + [quantiles_reformat] + [prop_lt_reformat]
    print("error_metrics:", error_metrics)
    return error_metrics
geounits = [
    testdata_random_geounit_generator(x, schema, density=0.00001, scale=10)
    for x in geocodes
]
sdftools.print_item(geounits, "Random Geounit Data")

rdd = spark.sparkContext.parallelize(geounits).persist()
sdftools.print_item(rdd, "Parallelized RDD data")

df = rdd.flatMap(lambda node: mappers.getSparseDF_mapper(node, schema)).map(lambda row: Row(**row)).toDF().persist()
sdftools.print_item(df, "DF of Random Geounit Data")

df = df.withColumn("STATE", sf.col("geocode")[0:2]).persist()
sdftools.print_item(df, "DF with STATE code")

df = sdftools.aggregateGeolevels(spark, df, 'STATE')
sdftools.print_item(df, "Aggregated to the STATE geolevel")

query = 'sex * age'
df = sdftools.answerQuery(df, schema, query, labels=False)
sdftools.print_item(df, f"Answering the '{query}' query")

groupby = ['geocode', 'geolevel']
rdd = sdftools.getRowGroupsAsRDD(df, groupby)
df = rdd.flatMapValues(prob_vector_mapper).map(lambda row: Row(**row[1])).toDF()
df = df.withColumn('age', sf.col('age').cast("int")).persist()
df = df.sort(['geocode', 'age', 'sex']).persist()
sdftools.print_item(df, f"Prob vector for {query} query", show=1000)
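# The prob_vector_mapper used above is defined elsewhere in this script. Below is a minimal sketch,
# an assumption for illustration only, of what such a mapper might look like: it turns each
# (geocode, geolevel) group of query-answer rows into rows carrying the group's empirical
# probability vector (each protected count divided by the group total), returned as dicts so that
# the downstream Row(**row[1]) call can rebuild Spark rows.
def prob_vector_mapper_sketch(rows):
    dict_rows = [row.asDict() if hasattr(row, "asDict") else dict(row) for row in rows]
    total = sum(row['priv'] for row in dict_rows)
    for row in dict_rows:
        row['prob'] = row['priv'] / total if total > 0 else 0.0
    return dict_rows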
def MattsMetrics(query, table_name, analysis, spark, geolevels, schema="DHCP_HHGQ"):
    """This function computes metrics for MAE, MALPE, CoV, RMS, MAPE, and percent thresholds."""
    print(f"For table {table_name}, analyzing query {query} at geolevels {geolevels} with schema {schema}")
    schema_name = schema
    paths, experiment_name, eps_str = getPathsAndName(schema_name)
    experiment = analysis.make_experiment(experiment_name, paths, schema_name=schema_name,
                                          dasruntype=AC.EXPERIMENT_FRAMEWORK_FLAT)
    sdftools.print_item(experiment.__dict__, "Experiment Attributes")

    spark_df = experiment.getDF()
    print("df looks like:")
    spark_df.show()
    schema = experiment.schema
    sdftools.print_item(spark_df, "Flat Experiment DF")

    queries = [query]
    spark_df = sdftools.aggregateGeolevels(spark, spark_df, geolevels)
    spark_df = sdftools.answerQueries(spark_df, schema, queries)
    spark_df = sdftools.getFullWorkloadDF(spark_df, schema, queries,
                                          groupby=[AC.GEOCODE, AC.GEOLEVEL, AC.RUN_ID, AC.PLB, AC.BUDGET_GROUP])
    #spark_df.show(spark_df.count(), False)

    # AC.PRIV means "protected via the differential privacy routines in this code base";
    # variable to be renamed after P.L. 94-171 production
    spark_df = sdftools.getL1(spark_df, colname="L1", col1=AC.PRIV, col2=AC.ORIG)
    spark_df = sdftools.getL2(spark_df, colname="L2", col1=AC.PRIV, col2=AC.ORIG)

    # apply bin functions for particular tables
    if table_name in table_bucket_list1:
        spark_df = sdftools.getCountBins(spark_df, column=AC.ORIG,
                                         bins=[0, 1000, 5000, 10000, 50000, 100000]).persist()
    if table_name in table_bucket_list2:
        spark_df = sdftools.getCountBins(spark_df, column=AC.ORIG, bins=[0, 10, 100]).persist()

    # This finds overall metrics
    spark_df.show(100, False)

    if table_name not in table_list_3_plus_list_age:
        for g in geolevels:
            spark_df1 = spark_df[spark_df['geolevel'] == g]  # Separate data for each geolevel
            if table_name in table_default_no_bucket:  # If data is not in buckets
                bucket_size = "NA"
                metrics_result = sdftools.metrics_with_popbucket(spark_df1, bucket_size, spark, key="A")
                file_name = f"{table_name}_{g}.csv"
            if table_name in table_bucket_list2:  # if data is bucketed in 3 buckets
                bucket_size = default_buckets2
                print("BUCKET SIZE IS:", bucket_size)
                metrics_result = sdftools.metrics_with_popbucket(spark_df1, bucket_size, spark, key="B")
                file_name = f"{table_name}_{g}.csv"
            if table_name in table_bucket_list1:  # Table 1 and 2, six buckets
                bucket_size = default_buckets1
                print("BUCKET SIZE IS:", bucket_size)
                metrics_result = sdftools.metrics_with_popbucket(spark_df1, bucket_size, spark, key="B")
                file_name = f"{table_name}_{g}.csv"

    if table_name in table_list_3geolevels:  # three geolevels (state, county, place), Tables 10, 14, 18, 22
        metrics_result = sdftools.metrics_with_3geolevels(spark_df, spark, geolevels)
        file_name = f"{table_name}.csv"

    if table_name in table_list_age:  # Tables 32-35
        if table_name in table_age_bracket1:
            metrics_result = sdftools.metrics_with_age(spark_df, spark, age_range_list, key="A")
        if table_name in table_age_bracket2:
            metrics_result = sdftools.metrics_with_age(spark_df, spark, age_range_list, key="B")
        if table_name in table_age_bracket3:
            metrics_result = sdftools.metrics_with_age(spark_df, spark, age_range_list2, key="A")
        if table_name in table_age_bracket4:
            metrics_result = sdftools.metrics_with_age(spark_df, spark, age_range_list2, key="B")
        file_name = f"{table_name}.csv"

    pandas_df = metrics_result.toPandas()
    csv_savepath = experiment.save_location_linux + file_name
    du.makePath(du.getdir(csv_savepath))
    pandas_df.to_csv(csv_savepath, index=False)
##############################
df = experiment.getDF()
schema = experiment.schema
sdftools.print_item(df, "Experiment DF")

##############################
# Accuracy Metrics
##############################
"""
Mean / Median Absolute Error (MAE):
1. Calculate total population at County geographic level
2. Calculate |MDF-CEF| for the total populations for each county
3. Calculate the mean or median across all county total populations
"""
# 1a. Aggregate to County geographic level
county_df = sdftools.aggregateGeolevels(spark, df, [C.COUNTY])
sdftools.show(county_df, "Counties")

# 1b. Answer the "total" query for all counties
county_totals_df = sdftools.answerQueries(county_df, schema, "total", labels=True)
sdftools.show(county_totals_df, "County total pops")

# 2. Calculate L1(MDF, CEF)
# 'priv' means "protected via the differential privacy routines in this code base";
# variable to be renamed after P.L. 94-171 production
abs_error_county_totals_df = sdftools.getL1(county_totals_df, colname="AbsError", col1=AC.PRIV, col2=AC.ORIG)
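# Step 3 of the MAE recipe above is not shown in this excerpt. A minimal sketch of one way to
# finish it with plain Spark aggregations follows; this is an assumption for illustration, not
# the repository's own helper. The mean is a groupBy average of the per-county absolute errors,
# and the median is an approximate 0.5 quantile.
mean_abs_error = abs_error_county_totals_df.groupBy(AC.QUERY, AC.GEOLEVEL).avg("AbsError")
sdftools.show(mean_abs_error, "Mean Absolute Error of county total pops")
median_abs_error = abs_error_county_totals_df.approxQuantile("AbsError", [0.5], 0.0)[0]
print(f"Median Absolute Error of county total pops: {median_abs_error}")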
rdd = spark.sparkContext.parallelize(geounits).persist()
sdftools.print_item(rdd.take(1), "One of the toy example geounits")

# use Analysis to transform the rdd of geounitnodes into a spark dataframe
df = datatools.rdd2df(rdd, schema)
sdftools.print_item(df, "Toy example DF", 300)

# aggregate geolevels
df = df.withColumn("block", sf.col(AC.GEOCODE)[0:3]).persist()
df = df.withColumn("county", sf.col(AC.GEOCODE)[0:2]).persist()
df = df.withColumn("nation", sf.col(AC.GEOCODE)[0:1]).persist()
sdftools.show(df, "df with geolevel crosswalk columns")
df = sdftools.aggregateGeolevels(spark, df, ['block', 'county', 'nation'])
sdftools.show(df, "df after geolevel aggregation", 1000)

# answer total query
qdf = sdftools.answerQuery(df, schema, "total", labels=False, merge_dims=False)
sdftools.show(qdf, "Query df with the query 'total'", 1000)

# select geounits by quantile bins
rdd = sdftools.getRowGroupsAsRDD(qdf, groupby=[AC.GEOLEVEL, AC.QUERY])
sdftools.show(rdd.collect(), "Row groups")


def row_selection_mapper(rows, selection_function, **selection_kwargs):
    pandas_df = pandas.DataFrame(rows)
    pandas_df = selection_function(pandas_df, **selection_kwargs)
    rows = pandas_df.to_dict('records')
    return rows
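# Hedged example of a selection_function that row_selection_mapper can apply; the function name,
# column, and quantile cutoff are illustrative assumptions, not part of the original script. It
# keeps only the rows of a group whose 'priv' count falls at or above the given quantile.
def select_top_quantile(pandas_df, column='priv', quantile=0.9):
    cutoff = pandas_df[column].quantile(quantile)
    return pandas_df[pandas_df[column] >= cutoff]

# e.g. rdd.flatMapValues(lambda rows: row_selection_mapper(rows, select_top_quantile, quantile=0.75))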
geounit = nm_state.take(1).pop()
print(geounit)
geocode = geounit.geocode
print(geocode)
dp_queries = geounit.dp_queries

experiment_name = "dhcp_eps4_run36"
experiment_path = f"{S3_BASE}/lecle301/dhcp_eps4/run36_of_25/full_person/"
experiment = analysis.make_experiment(experiment_name, experiment_path)
df = experiment.getDF()
sdftools.print_item(df, "Experiment DF", show=100)

geolevel_df = sdftools.aggregateGeolevels(experiment.spark, df, ['STATE'])
sdftools.print_item(geolevel_df, "Geolevel DF")

filtered_df = df.filter(df.geocode == geocode).persist()
sdftools.print_item(filtered_df, "Filtered Experiment DF", show=1000)
def analyzeQuery(query, table_name, analysis, spark, geolevels, eps, schema_name="DHCP_HHGQ"):
    """
        Main plotting fxn.

        query       : str, name of a valid query for the target experiment's schema
        table_name  : str, name of a table (used for file-naming conventions)
        analysis    : Analysis setuptools.setup object, organizes Analysis metadata
        spark       : SparkSession object, attached to analysis object
        geolevels   : [str, ...], geolevels to compute over for the current query
        eps         : numeric, epsilon of the target experiment (used in paths and plot labels)
        schema_name : str, name of ../programs/schema/schemas/schemamaker.py schema associated with target data

        Note, also, major control parameters hard-coded in getPaths for setting experiment ingest locations from s3.
    """
    geolevel = geolevels[0]
    EPT = table_name[:4] + "_" + schema_name
    graph_title = f"Error for query: {table_name}-{query}, eps: {int(eps)}, geography: {geolevels}\nDisclosure Prohibited - Title 13 U.S.C."
    plt.figure(1, figsize=(20, 40))
    sns.set(style="ticks")
    fig, axes = plt.subplots(ncols=3, nrows=2, sharey=True, sharex=True)
    axes_flat = axes.ravel()
    sns.despine(fig=fig)  #.set_title(graph_title)
    print(f"For table {table_name}, analyzing query {query} at geolevel {geolevel} with schema_name {schema_name} and eps: {eps}")
    num_trials, paths, experiment_name, eps_str, spines, mechanisms = getPathsAndName(schema_name, query, table_name, eps)
    plt.xscale('log')
    plt.yscale('symlog', linthreshy=100)

    for k, path in enumerate(paths):
        axes_flat[k].set_title(spines[k] + '_' + mechanisms[k])
        experiment = analysis.make_experiment(experiment_name, [path], schema_name=schema_name,
                                              dasruntype=AC.EXPERIMENT_FRAMEWORK_FLAT)
        spark_df = experiment.getDF()
        sdftools.print_item(experiment.__dict__, "Experiment Attributes")
        schema = experiment.schema
        sdftools.print_item(spark_df, "Flat Experiment DF")

        queries = [query]
        spark_df = sdftools.aggregateGeolevels(spark, spark_df, geolevels)
        # jitter points to make them visually distinct:
        spark_df = sdftools.answerQueries(spark_df, schema, queries) \
            .withColumn("Error", sf.col("priv") - sf.col("orig") + sf.rand() - 1/2.) \
            .withColumn("orig", sf.col("orig") + sf.rand() - 1/2.)

        if geolevel == "AIAN_AREAS":
            spark_df = spark_df.filter(spark_df.geocode != "9999")
        elif geolevel == 'OSE':
            spark_df = spark_df.filter(sf.col(AC.GEOCODE).substr(sf.length(sf.col(AC.GEOCODE)) - 4,
                                                                 sf.length(sf.col(AC.GEOCODE))) != "99999")
        elif geolevel == 'AIANTract':
            spark_df = spark_df.filter(spark_df.geocode != "9" * 11)
        elif geolevel == 'AIANState':
            spark_df = spark_df.filter(spark_df.geocode != "99")
        elif geolevel == 'AIANBlock':
            spark_df = spark_df.filter(spark_df.geocode != "9" * 16)

        # t = spark_df.filter(sf.abs(spark_df.Error) > 1000)
        spark_df = spark_df.select(["orig", "Error"])
        pandas_df = spark_df.toPandas()
        #if pandas_df.max()["Error"] == pandas_df.min()["Error"]:
        #    continue
        sns.scatterplot(x="orig", y="Error", data=pandas_df, alpha=.6, s=10, marker="+", ax=axes_flat[k])
        axes_flat[k].axhline(0., ls='--')

    filename = f"{table_name}_{query.replace(' ', '_')}_{geolevel}"
    plot_path = f"{experiment.save_location_linux}epsilon_{eps_str}/"
    du.makePath(plot_path)
    plt.savefig(plot_path + filename + ".png")
    plt.clf()
def MattsMetrics(query, table_name, analysis, spark, geolevels, key, agekey, sexkey, bucketkey, schema="DHCP_HHGQ"):
    """This function computes metrics for MAE, MALPE, CoV, RMS, MAPE, and percent thresholds."""
    print(f"For table {table_name}, analyzing query {query} at geolevels {geolevels} with schema {schema}")
    schema_name = schema
    paths, experiment_name, eps_str = getPathsAndName(schema_name)
    experiment = analysis.make_experiment(experiment_name, paths, schema_name=schema_name,
                                          dasruntype=AC.EXPERIMENT_FRAMEWORK_FLAT)
    sdftools.print_item(experiment.__dict__, "Experiment Attributes")

    spark_df = experiment.getDF()
    print("df looks like:")
    spark_df.show()
    schema = experiment.schema
    sdftools.print_item(spark_df, "Flat Experiment DF")

    queries = [query]
    spark_df = sdftools.aggregateGeolevels(spark, spark_df, geolevels)
    spark_df = sdftools.answerQueries(spark_df, schema, queries)
    spark_df = sdftools.getFullWorkloadDF(spark_df, schema, queries,
                                          groupby=[AC.GEOCODE, AC.GEOLEVEL, AC.RUN_ID, AC.PLB, AC.BUDGET_GROUP])
    #spark_df.show(spark_df.count(), False)

    # AC.PRIV means "protected via the differential privacy routines in this code base";
    # variable to be renamed after P.L. 94-171 production
    spark_df = sdftools.getL1(spark_df, colname="L1", col1=AC.PRIV, col2=AC.ORIG)
    spark_df = sdftools.getL2(spark_df, colname="L2", col1=AC.PRIV, col2=AC.ORIG)

    # apply bin functions for particular tables
    if table_name in table_bucket_list1:
        spark_df = sdftools.getCountBins(spark_df, column=AC.ORIG,
                                         bins=[0, 1000, 5000, 10000, 50000, 100000]).persist()
    if table_name in table_bucket_list2:
        spark_df = sdftools.getCountBins(spark_df, column=AC.ORIG, bins=[0, 10, 100]).persist()

    # This finds overall metrics
    spark_df.show(100, False)
    metrics_result = sdftools.combined_metrics(spark_df, spark, geolevels, agekey, sexkey, bucketkey, key)
    file_name = f"{table_name}.csv"
    pandas_df = metrics_result.toPandas()
    csv_savepath = experiment.save_location_linux + file_name
    du.makePath(du.getdir(csv_savepath))
    pandas_df.to_csv(csv_savepath, index=False)
def MattsMetrics(query, table_name, analysis, spark, geolevels, buckets=default_buckets, schema="DHCP_HHGQ"):
    """This function computes metrics for MAE, MALPE, CoV, RMS, MAPE, and percent thresholds."""
    print(f"For table {table_name}, analyzing query {query} at geolevels {geolevels} with schema {schema}")
    schema_name = schema
    paths, experiment_name, eps_str = getPathsAndName(schema_name)
    experiment = analysis.make_experiment(experiment_name, paths, schema_name=schema_name,
                                          dasruntype=AC.EXPERIMENT_FRAMEWORK_FLAT)
    sdftools.print_item(experiment.__dict__, "Experiment Attributes")

    spark_df = experiment.getDF()
    print("df looks like:")
    spark_df.show()
    schema = experiment.schema
    sdftools.print_item(spark_df, "Flat Experiment DF")

    queries = [query]
    spark_df = sdftools.aggregateGeolevels(spark, spark_df, geolevels)
    spark_df = sdftools.answerQueries(spark_df, schema, queries)
    #spark_df.show(spark_df.count(), False)

    # AC.PRIV means "protected via the differential privacy routines in this code base";
    # variable to be renamed after P.L. 94-171 production
    spark_df = sdftools.getL1(spark_df, colname="L1", col1=AC.PRIV, col2=AC.ORIG)
    spark_df = sdftools.getL2(spark_df, colname="L2", col1=AC.PRIV, col2=AC.ORIG)

    # apply bin functions for particular tables
    if table_name in table_default_bucket_list:
        spark_df = sdftools.getCountBins(spark_df, column=AC.ORIG,
                                         bins=[0, 1000, 5000, 10000, 50000, 100000]).persist()
    if table_name in table_default_bucket_list2:
        spark_df = sdftools.getCountBins(spark_df, column=AC.ORIG, bins=[0, 10, 100]).persist()

    # This finds overall metrics
    #spark_df.show(spark_df.count(), False)
    for g in geolevels:
        # filter to each geolevel without overwriting spark_df, so later iterations still see all geolevels
        spark_df1 = spark_df[spark_df['geolevel'] == g]
        print("This has all levels")
        spark_df1.show(150, False)
        metrics_dataframe = sdftools.mattsmetrics(spark_df1, spark)
        Counts = spark_df1.count()
        print("Counts are", Counts)
        newRow = spark.createDataFrame([(Counts, "Counts")])
        metrics_dataframe = metrics_dataframe.union(newRow)
        pandas_df = metrics_dataframe.toPandas()
        csv_savepath = experiment.save_location_linux + f"{table_name}_{g}.csv"
        du.makePath(du.getdir(csv_savepath))
        pandas_df.to_csv(csv_savepath, index=False)

        if table_name in table_default_bucket_list2:  # If data needs bucketing
            for b in default_buckets2:  # calculate Metrics at each bucket
                subset_sparkdf = spark_df1[spark_df1['orig_count_bin'] == b]  # subset into bins
                # Removes instances of Not Hispanic.. from dataframe
                subset_sparkdf = subset_sparkdf.subtract(subset_sparkdf.filter(subset_sparkdf.level.rlike("Not")))
                subset_sparkdf.show(100, False)
                print("Make sure it's bucketed and without 'Not' values")
                subset_metrics = sdftools.mattsmetrics(subset_sparkdf, spark)
                Counts = subset_sparkdf.count()
                newRow = spark.createDataFrame([(b, "Bucket")])
                newRow1 = spark.createDataFrame([(Counts, "Counts")])
                subset_metrics = subset_metrics.union(newRow).union(newRow1)
                pandas_df = subset_metrics.toPandas()
                csv_savepath = experiment.save_location_linux + f"{table_name}_{g}_{b}.csv"
                du.makePath(du.getdir(csv_savepath))
                pandas_df.to_csv(csv_savepath, index=False)
def MattsMetrics(query, table_name, analysis, spark, geolevels, buckets=default_buckets, schema="DHCP_HHGQ"):
    """This function computes metrics for MAE, MALPE, CoV, RMS, MAPE, and percent thresholds."""
    print(f"For table {table_name}, analyzing query {query} at geolevels {geolevels} with schema {schema}")
    schema_name = schema
    paths, experiment_name, eps_str = getPathsAndName(schema_name)
    experiment = analysis.make_experiment(experiment_name, paths, schema_name=schema_name,
                                          dasruntype=AC.EXPERIMENT_FRAMEWORK_FLAT)
    sdftools.print_item(experiment.__dict__, "Experiment Attributes")

    spark_df = experiment.getDF()
    print("df looks like:")
    spark_df.show()
    schema = experiment.schema
    sdftools.print_item(spark_df, "Flat Experiment DF")

    queries = [query]
    spark_df = sdftools.aggregateGeolevels(spark, spark_df, geolevels)
    spark_df = sdftools.answerQueries(spark_df, schema, queries)
    spark_df = sdftools.getFullWorkloadDF(spark_df, schema, queries,
                                          groupby=[AC.GEOCODE, AC.GEOLEVEL, AC.RUN_ID, AC.PLB, AC.BUDGET_GROUP])
    #spark_df.show(spark_df.count(), False)

    # AC.PRIV means "protected via the differential privacy routines in this code base";
    # variable to be renamed after P.L. 94-171 production
    spark_df = sdftools.getL1(spark_df, colname="L1", col1=AC.PRIV, col2=AC.ORIG)
    spark_df = sdftools.getL2(spark_df, colname="L2", col1=AC.PRIV, col2=AC.ORIG)

    # apply bin functions for particular tables
    if table_name in table_default_bucket_list:
        spark_df = sdftools.getCountBins(spark_df, column=AC.ORIG,
                                         bins=[0, 1000, 5000, 10000, 50000, 100000]).persist()
    if table_name in table_default_bucket_list2:
        spark_df = sdftools.getCountBins(spark_df, column=AC.ORIG, bins=[0, 10, 100]).persist()

    # This finds overall metrics
    spark_df.show(100, False)

    for g in geolevels:
        spark_df1 = spark_df[spark_df['geolevel'] == g]  # Separate data for each geolevel

        if table_name in table_default_no_bucket:  # If data is not in buckets
            if table_name in table_race_query:  # Table 17, 18, 21 and others
                print("no buckets, with race query")
                spark_df2 = spark_df1.subtract(spark_df1.filter(spark_df1.level.rlike("Not")))
                spark_df2.show(100, False)
                print("Make sure 'Not' values are removed")
                metrics_dataframe = sdftools.mattsmetrics(spark_df2, spark)
                Counts = spark_df2.count()
                print("Counts are", Counts)
                newRow = spark.createDataFrame([(Counts, "Counts")])
                metrics_dataframe = metrics_dataframe.union(newRow)
                pandas_df = metrics_dataframe.toPandas()
                csv_savepath = experiment.save_location_linux + f"{table_name}_{g}.csv"
                du.makePath(du.getdir(csv_savepath))
                pandas_df.to_csv(csv_savepath, index=False)
            else:
                print("no buckets, without race query")
                spark_df1.show(100, False)
                spark_df2 = spark_df1.subtract(spark_df1.filter(spark_df1.level.rlike("Not")))
                print("with Not removed")
                spark_df2.show(100, False)
                metrics_dataframe = sdftools.mattsmetrics(spark_df2, spark)
                Counts = spark_df2.count()
                print("Counts are", Counts)
                newRow = spark.createDataFrame([(Counts, "Counts")])
                metrics_dataframe = metrics_dataframe.union(newRow)
                pandas_df = metrics_dataframe.toPandas()
                csv_savepath = experiment.save_location_linux + f"{table_name}_{g}.csv"
                du.makePath(du.getdir(csv_savepath))
                pandas_df.to_csv(csv_savepath, index=False)

        if table_name in table_age_bracket1:
            print("Data is in age brackets, 0 to 17, 18 to 64, 65+")
            spark_df1.show(100, False)
            for age_range in age_range_list:
                subset_sparkdf1 = spark_df1.filter(spark_df1.level.rlike(age_range))
                subset_sparkdf1.show(100, False)
                metrics_dataframe = sdftools.mattsmetrics(subset_sparkdf1, spark)
                #subset_sparkdf1.show(100, False)
                Counts = subset_sparkdf1.count()
                print("Counts are", Counts)
                newRow = spark.createDataFrame([(Counts, "Counts")])
                metrics_dataframe = metrics_dataframe.union(newRow)
                pandas_df = metrics_dataframe.toPandas()
                csv_savepath = experiment.save_location_linux + f"{table_name}_{g}_{age_range}.csv"
                du.makePath(du.getdir(csv_savepath))
                pandas_df.to_csv(csv_savepath, index=False)

        if table_name in table_age_bracket2:
            print("Data is age buckets, with sex query")
            spark_df1.show(100, False)
            for sexlevel in sex_range_list:
                subset_sparkdf1 = spark_df1.filter(spark_df1.level.rlike(sexlevel))
                for age_range in age_range_list:
                    subset_sparkdf2 = subset_sparkdf1.filter(subset_sparkdf1.level.rlike(age_range))
                    subset_sparkdf2.show(100, False)
                    metrics_dataframe = sdftools.mattsmetrics(subset_sparkdf2, spark)
                    #subset_sparkdf1.show(100, False)
                    Counts = subset_sparkdf2.count()
                    print("Counts are", Counts)
                    newRow = spark.createDataFrame([(Counts, "Counts")])
                    metrics_dataframe = metrics_dataframe.union(newRow)
                    pandas_df = metrics_dataframe.toPandas()
                    csv_savepath = experiment.save_location_linux + f"{table_name}_{g}_{age_range}_{sexlevel}.csv"
                    du.makePath(du.getdir(csv_savepath))
                    pandas_df.to_csv(csv_savepath, index=False)

        if table_name in table_age_bracket3:
            print("Data is in age brackets of 5 year age groups")
            spark_df1.show(100, False)
            for age_range in age_range_list2:
                subset_sparkdf1 = spark_df1.filter(spark_df1.level.rlike(age_range))
                subset_sparkdf1.show(100, False)
                metrics_dataframe = sdftools.mattsmetrics(subset_sparkdf1, spark)
                #subset_sparkdf1.show(100, False)
                Counts = subset_sparkdf1.count()
                print("Counts are", Counts)
                newRow = spark.createDataFrame([(Counts, "Counts")])
                metrics_dataframe = metrics_dataframe.union(newRow)
                pandas_df = metrics_dataframe.toPandas()
                csv_savepath = experiment.save_location_linux + f"{table_name}_{g}_{age_range}.csv"
                du.makePath(du.getdir(csv_savepath))
                pandas_df.to_csv(csv_savepath, index=False)

        if table_name in table_age_bracket4:
            print("Data is age buckets of 5 year age groups, with sex query")
            spark_df1.show(100, False)
            for sexlevel in sex_range_list:
                subset_sparkdf1 = spark_df1.filter(spark_df1.level.rlike(sexlevel))
                for age_range in age_range_list2:
                    subset_sparkdf2 = subset_sparkdf1.filter(subset_sparkdf1.level.rlike(age_range))
                    subset_sparkdf2.show(100, False)
                    metrics_dataframe = sdftools.mattsmetrics(subset_sparkdf2, spark)
                    #subset_sparkdf1.show(100, False)
                    Counts = subset_sparkdf2.count()
                    print("Counts are", Counts)
                    newRow = spark.createDataFrame([(Counts, "Counts")])
                    metrics_dataframe = metrics_dataframe.union(newRow)
                    pandas_df = metrics_dataframe.toPandas()
                    csv_savepath = experiment.save_location_linux + f"{table_name}_{g}_{age_range}_{sexlevel}.csv"
                    du.makePath(du.getdir(csv_savepath))
                    pandas_df.to_csv(csv_savepath, index=False)

        if table_name in table_default_bucket_list2:  # If data is in buckets [0,10], [10,100), [100+)
            print("data is bucketed and treated accordingly")
            #if table_name in table_race_query:
            for b in default_buckets2:  # calculate Metrics at each bucket
                print("Bucket is:", b)
                subset_sparkdf = spark_df1[spark_df1['orig_count_bin'] == b]  # subset into bins
                subset_sparkdf.show(100, False)
                print("Bucketed data")
                subset_sparkdf1 = subset_sparkdf.subtract(subset_sparkdf.filter(subset_sparkdf.level.rlike("Not")))
                subset_sparkdf1.show(100, False)
                print("Make sure it's bucketed and 'Not' values are removed")
                subset_metrics = sdftools.mattsmetrics(subset_sparkdf1, spark)
                Counts = subset_sparkdf1.count()
                newRow = spark.createDataFrame([(b, "Bucket")])
                newRow1 = spark.createDataFrame([(Counts, "Counts")])
                subset_metrics = subset_metrics.union(newRow).union(newRow1)
                pandas_df = subset_metrics.toPandas()
                csv_savepath = experiment.save_location_linux + f"{table_name}_{g}_{b}.csv"
                du.makePath(du.getdir(csv_savepath))
                pandas_df.to_csv(csv_savepath, index=False)

        if table_name in table_default_bucket_list:  # If data is in buckets [0,1000], [1000,5000), etc. Table 1 and 2
            print("data is bucketed and treated accordingly")
            #if table_name in table_race_query:
            for b in default_buckets:  # calculate Metrics at each bucket
                print("Bucket is:", b)
                subset_sparkdf = spark_df1[spark_df1['orig_count_bin'] == b]  # subset into bins
                subset_sparkdf.show(100, False)
                print("Bucketed data")
                subset_sparkdf1 = subset_sparkdf.subtract(subset_sparkdf.filter(subset_sparkdf.level.rlike("Not")))
                subset_sparkdf1.show(100, False)
                print("Make sure it's bucketed and 'Not' values are removed")
                subset_metrics = sdftools.mattsmetrics(subset_sparkdf1, spark)
                Counts = subset_sparkdf1.count()
                newRow = spark.createDataFrame([(b, "Bucket")])
                newRow1 = spark.createDataFrame([(Counts, "Counts")])
                subset_metrics = subset_metrics.union(newRow).union(newRow1)
                pandas_df = subset_metrics.toPandas()
                csv_savepath = experiment.save_location_linux + f"{table_name}_{g}_{b}.csv"
                du.makePath(du.getdir(csv_savepath))
                pandas_df.to_csv(csv_savepath, index=False)
def MattsMetrics(query, table_name, analysis, spark, geolevels, buckets=default_buckets, schema="DHCP_HHGQ"):
    """This function computes metrics for MAE, MALPE, CoV, RMS, MAPE, and percent thresholds."""
    print(f"For table {table_name}, analyzing query {query} at geolevels {geolevels} with schema {schema}")
    schema_name = schema
    paths, experiment_name, eps_str = getPathsAndName(schema_name)
    experiment = analysis.make_experiment(experiment_name, paths, schema_name=schema_name,
                                          dasruntype=AC.EXPERIMENT_FRAMEWORK_FLAT)
    sdftools.print_item(experiment.__dict__, "Experiment Attributes")

    spark_df = experiment.getDF()
    print("df looks like:")
    spark_df.show()
    schema = experiment.schema
    sdftools.print_item(spark_df, "Flat Experiment DF")

    queries = [query]
    spark_df = sdftools.aggregateGeolevels(spark, spark_df, geolevels)
    spark_df = sdftools.answerQueries(spark_df, schema, queries)
    spark_df.show()

    # AC.PRIV means "protected via the differential privacy routines in this code base";
    # variable to be renamed after P.L. 94-171 production
    spark_df = sdftools.getL1(spark_df, colname="L1", col1=AC.PRIV, col2=AC.ORIG)
    spark_df = sdftools.getL2(spark_df, colname="L2", col1=AC.PRIV, col2=AC.ORIG)
    spark_df = sdftools.getCountBins(spark_df, column=AC.ORIG,
                                     bins=[0, 1000, 5000, 10000, 50000, 100000]).persist()

    for b in default_buckets:  # calculate Metrics
        subset_sparkdf = spark_df[spark_df['orig_count_bin'] == b]  # subset into bins
        subset_sparkdf.show()
        MAE_value = sdftools.MAE(subset_sparkdf)
        print("Bucket size is", b)
        print("MAE value is", MAE_value)
        RMS_value = sdftools.RMS(subset_sparkdf)
        CoV_value = sdftools.Coe_of_variation(subset_sparkdf, RMS_value)
        print("RMS value is", RMS_value)
        print("Coefficient of Variation is", CoV_value)
        MAPE_value = sdftools.MAPE(subset_sparkdf)
        print("MAPE value is", MAPE_value)
        MALPE_value = sdftools.MALPE(subset_sparkdf)
        print("MALPE value is", MALPE_value)

        print("Counts of percent differences between 5 and 10 percent: ")
        # count_5to10percent = sdftools.Count_percentdiff_5to10percent(subset_sparkdf)  # This function disabled for now
        greaterthan10percentCount = sdftools.Count_percentdiff_10percent(subset_sparkdf)
        #ze.groupBy().agg(F.count(F.when(F.col("abs diff div cef") > 0.05, True)), F.count(F.when(F.col("abs diff div cef") < 0.1, True))).show()
        #ze.groupBy().agg(F.count(F.when(F.col("abs diff div cef") > 0.05 and F.col("abs diff div cef") < 0.1), True)).show()
        print("Counts of percent differences greater than 10 percent: ")
        greaterthan10percentCount.show()
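# Hedged sketch (an assumption, not part of the original script): the commented-out attempts above
# try to count rows whose "abs diff div cef" falls between 5% and 10%, but Python's `and` does not
# combine Spark columns; `&` with parenthesized conditions does. One way to wrap that count up:
def count_percentdiff_5to10percent_sketch(df, colname="abs diff div cef"):
    from pyspark.sql import functions as F
    return df.agg(F.count(F.when((F.col(colname) > 0.05) & (F.col(colname) < 0.1), True)).alias("count_5to10pct"))

# e.g. count_percentdiff_5to10percent_sketch(subset_sparkdf).show()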
def analyzeQuery(query, table_name, analysis, spark, geolevels, buckets=default_buckets, schema="DHCP_HHGQ"):
    """
        Main plotting fxn.

        query       : str, name of a valid query for the target experiment's schema
        table_name  : str, name of a table (used for file-naming conventions)
        analysis    : Analysis setuptools.setup object, organizes Analysis metadata
        spark       : SparkSession object, attached to analysis object
        geolevels   : [str, ...], geolevels to compute over for the current query
        buckets     : [(int,int), ...], list of mutually exclusive bucket boundaries for Tab(CEF) bucketing
        schema      : str, name of ../programs/schema/schemas/schemamaker.py schema associated with target data

        Note, also, major control parameters hard-coded in getPaths for setting experiment ingest locations from s3.
    """
    print(f"For table {table_name}, analyzing query {query} at geolevels {geolevels} with schema {schema}")
    schema_name = schema
    paths, experiment_name, eps_str = getPathsAndName(schema_name)
    experiment = analysis.make_experiment(experiment_name, paths, schema_name=schema_name,
                                          dasruntype=AC.EXPERIMENT_FRAMEWORK_FLAT)
    sdftools.print_item(experiment.__dict__, "Experiment Attributes")

    spark_df = experiment.getDF()
    print("df looks like:")
    spark_df.show()
    schema = experiment.schema
    sdftools.print_item(spark_df, "Flat Experiment DF")

    queries = [query]
    #y = sdftools.getAnswers(spark, df, geolevels, schema, queries)

    # Old approach to computing df with abs diff, bucketed by true count:
    #sparkDFWithAbsDiff = getSparkDFWithAbsDiff(spark, spark_df, geolevels, queries, schema)
    #getSignedErrorByTrueCountRuns(spark_df, bins=[0,1,10,100,1000,10000]):
    #rdd = sdftools.getRowGroupsAsRDD(sparkDFWithAbsDiff, groupby=[AC.GEOLEVEL, AC.QUERY])
    #rdd = rdd.flatMapValues(lambda rows: sepBounds(rows, 'orig', buckets)).persist()
    #rdd = rdd.map(lambda row: Row(**row[1]))
    #spark_df = rdd.toDF().persist()

    # New (actually preexisting) approach to computing spark_df with abs diff, bucketed by true count:
    # (avoids pandas dfs inside mappers, which is RAM-hungry)
    spark_df = sdftools.aggregateGeolevels(spark, spark_df, geolevels)
    spark_df = sdftools.answerQueries(spark_df, schema, queries)
    spark_df = sdftools.getFullWorkloadDF(spark_df, schema, queries,
                                          groupby=[AC.GEOCODE, AC.GEOLEVEL, AC.RUN_ID, AC.PLB, AC.BUDGET_GROUP])
    spark_df = sdftools.getAvgAbsErrorByTrueCountRuns(spark_df, bins=[0, 1, 10, 100, 1000, 10000]).persist()
    spark_df.show()
    print("^^^^ with abs error, DF looks like ^^^^")

    metric_name = "Avg( |q(MDF) - q(CEF)| )"
    x_axis_variable_name = 'CEF Count, Binned'

    # spark_df = spark_df.groupby(['geocode','geolevel','level','Bin0','Bin1','Bin2','Bin3','Bin4','Bin5']).avg()
    # Below spark_df has cols: geocode, geolevel, run_id, plb, budget_group, query, orig_count_bin, signed_error, re
    #spark_df = spark_df.groupby(['geocode', 'geolevel', 'plb', 'budget_group', 'query', 'orig_count_bin']).avg()
    #print("^^^^ after averaging, spark_df looks like ^^^^")
    pandas_df = spark_df.toPandas()
    #pandas_df = pandas_df.rename(columns={"avg(signed_error)": metric_name, "avg(orig)": "orig"})
    #pandas_df[x_axis_variable_name] = pandas_df.apply(lambda row: binIndexToInteger(row, buckets), axis=1)
    #pandas_df = pandas_df.rename(columns={"avg(signed_error)": metric_name, "avg(orig_count_bin)": "orig"})
    pandas_df = pandas_df.rename(columns={"abs_error": metric_name, "orig_count_bin": x_axis_variable_name})

    plt.figure(1, figsize=(11, 8.5))
    plt.rc('axes', labelsize=8)
    print(f"pandas df before plotting has cols: {pandas_df.columns.values}")
    print(f"{x_axis_variable_name} column has distinct levels: {pandas_df[x_axis_variable_name].unique()}")

    buckets = pandas_df[x_axis_variable_name].unique()
    buckets = sorted(buckets, key=lambda bucket_name: largestIntInStr(bucket_name))
    print(f"Sorted bucket names: {buckets}")
    new_bucket_order = [0, 1, 2, 3, 5, 4]  # Apply ordering system to make 10000+ the last bucket
    buckets = [buckets[i] for i in new_bucket_order]
    print(f"Sorted bucket names: {buckets}")

    """
    print(pandas_df.head(30))
    print(f"pandas_df headers: {list(pandas_df.columns.values)}")
    tmpDf = pandas_df[[x_axis_variable_name, 'orig', metric_name]]
    print("tmpDf looks like:")
    with pandas.option_context('display.max_rows', None, 'display.max_columns', None):
        print(tmpDf)
    print("^^^^ pandas df looks like ^^^^")
    print("And first 3 rows:")
    print(pandas_df.iloc[:3])
    #print(df.dtypes)
    print("And first 100 rows, subset to Bins:")
    print(pandas_df.iloc[0:101, 3:9])
    print(pandas_df.iloc[0:101, -1])
    """

    # Saving data frame
    csv_savepath = experiment.save_location_linux + f"Executive_Priority_Tabulations_#1_{experiment_name}_{table_name}_{query}.csv"
    du.makePath(du.getdir(csv_savepath))
    pandas_df.to_csv(csv_savepath, index=False)

    makePlots(experiment, experiment_name, table_name, queries, x_axis_variable_name, metric_name,
              geolevels, pandas_df, buckets, schema_name, eps_str)