def create_pandas_bins(pandas_df, columns, bins, btype="quantile"):
    columns = du.aslist(columns)
    quantile_bins = du.aslist(bins)
    for column in columns:
        if btype == "quantile":
            # qcut computes the quantile-based edges itself; dropping duplicate
            # edges avoids errors when the data has ties
            pandas_df[f'{column}_bins'], retbins = pandas.qcut(pandas_df[column], bins,
                                                               duplicates='drop', retbins=True)
            print(f"Bins: {retbins}")
        else:
            # otherwise, cut against the explicit bin edges provided by the caller
            pandas_df[f'{column}_bins'] = pandas.cut(x=pandas_df[column], bins=quantile_bins)
    return pandas_df
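# Hedged usage sketch (df and the "population" column are hypothetical):
#
#     df = create_pandas_bins(df, columns="population", bins=4)                  # quartiles via qcut
#     df = create_pandas_bins(df, columns="population", bins=[0, 10, 100, 1000],
#                             btype="fixed")                                     # explicit edges via cut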
def addMetadataColumns(df, dasrun, metadata_columns=[AC.PLB, AC.RUN_ID, AC.BUDGET_GROUP]):
    """
    adds additional experiment/run metadata columns to the Spark DataFrame

    Parameters
    ==========
    df : Spark DataFrame

    dasrun : DASRun object
        A wrapper for the run path and other important metadata for an experiment run

    metadata_columns : list of strings; default is [AC.PLB, AC.RUN_ID, AC.BUDGET_GROUP]
        Add the provided columns to the data frame if the data exists in the das run

    Returns
    =======
    Spark DataFrame
    """
    for column in du.aslist(metadata_columns):
        # skip columns whose metadata is absent from the run
        value = getattr(dasrun, column, None)
        if value is not None:
            df = df.withColumn(column, sf.lit(value)).persist()
    return df
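# Hedged usage sketch: stamp each row of a run's DataFrame with its run id
# (AC.RUN_ID is the attribute/column name used by the defaults above).
#
#     df = addMetadataColumns(df, dasrun, metadata_columns=[AC.RUN_ID])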
def getGRFC(spark, columns=None):
    """
    returns the GRFC columns as a Spark DataFrame

    Parameters
    ==========
    spark : SparkSession

    columns : str or list of str
        Default: None - return all columns

    Returns
    =======
    a Spark DataFrame containing information from the GRFC file
    """
    grfc_loc = f"{DAS_S3ROOT}/2010/cefv2/pp10_grf_tab_ikeda_100219.csv"
    grfc = spark.read.option("header", "true").csv(grfc_loc)
    grfc = grfc.withColumn('BLOCK',
                           sf.concat(sf.col("TABBLKST"),
                                     sf.col("TABBLKCOU"),
                                     sf.col("TABTRACTCE"),
                                     sf.col("TABBLK")[0:1],
                                     sf.col("TABBLK")))
    grfc = grfc.withColumn('geocode', sf.col('BLOCK')).persist()
    if columns is None:
        columns = grfc.columns
    else:
        # want geocode, at least, as the join column
        columns = np.unique(du.aslist(columns) + ['geocode']).tolist()
    grfc = grfc.select(columns)
    return grfc
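# Hedged usage sketch: pull only the state and county FIPS columns; 'geocode'
# is always added back in as the join key.
#
#     grfc = getGRFC(spark, columns=["TABBLKST", "TABBLKCOU"])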
def getToyGeounitData_GeounitNode(schema,
                                  geocodes=['000', '001', '002', '003', '010', '011', '012', '020', '022'],
                                  geocode_dict={3: 'block', 2: 'county'},
                                  raw_params={'low': 0, 'high': 2},
                                  syn_params={'low': 0, 'high': 5}):
    geounits = []
    for geocode in du.aslist(geocodes):
        # default to None so the node can still be built when one of the
        # histograms is not requested (i.e. its params are None)
        raw = None
        syn = None
        if raw_params is not None:
            raw = np.random.randint(low=raw_params['low'], high=raw_params['high'],
                                    size=schema.size).reshape(schema.shape)
        if syn_params is not None:
            syn = np.random.randint(low=syn_params['low'], high=syn_params['high'],
                                    size=schema.size).reshape(schema.shape)
        geounits.append(
            GeounitNode(geocode=geocode,
                        geocode_dict=geocode_dict,
                        raw=multiSparse(raw) if raw is not None else None,
                        syn=multiSparse(syn) if syn is not None else None))
    return geounits
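# Hedged usage sketch: `schema` is any object exposing .size and .shape, as
# the toy builder above assumes; handy for unit tests.
#
#     geounits = getToyGeounitData_GeounitNode(schema, geocodes=['000', '001'])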
def getMicrodataDF_mapper(node, schema, privatized=True, mangled_names=True, recoders=None):
    # TODO: provide support for node = GeounitNode, in addition to node = dict
    GEOCODE = "geocode"
    ORIG = "raw"
    PRIV = "syn"
    DATATYPE = "data_type"
    import numpy as np
    datakey = PRIV if privatized else ORIG
    for item in [datakey, GEOCODE]:
        assert item in node, f"Cannot create microdata; '{item}' not found in the node."
    data = node[datakey].sparse_array
    all_nonzero_indices = data.indices.tolist()
    rows = []
    for ind in all_nonzero_indices:
        rowdict = {}
        rowdict[GEOCODE] = node[GEOCODE]
        rowdict[DATATYPE] = str(datakey)
        num_records = int(data[0, ind])
        # map the flat sparse index back to a histogram cell (one level per dimension)
        cell = np.unravel_index(ind, schema.shape)
        for dim, level in enumerate(cell):
            if mangled_names:
                dimname = f"{schema.mangled_dimnames[dim]}"
            else:
                dimname = f"{schema.dimnames[dim]}"
            rowdict[dimname] = str(level)
        if recoders is not None:
            for recode in du.aslist(recoders):
                rowdict = recode(rowdict)
        row = rowdict
        # emit one (identical) row per record counted in this cell
        rows += [row] * num_records
    return rows
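# Hedged usage sketch (names are illustrative): expand an RDD of node dicts
# into one row per synthetic microdata record, then build a DataFrame.
#
#     rows = nodes_rdd.flatMap(lambda node: getMicrodataDF_mapper(node, schema))
#     microdata_df = spark.createDataFrame(rows)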
def make_pandas_qcut_bin_labels(bins):
    bins = du.aslist(bins)
    # pandas.qcut intervals are right-closed, so every label takes the form '(a, b]'
    binstr = [f"'(-Inf, {bins[0]}]'"]
    if len(bins) >= 2:
        for i, b in enumerate(bins[1:]):
            binstr += [f"'({bins[i]}, {bins[i+1]}]'"]
    return binstr
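# Hedged example (illustrative input):
#
#     make_pandas_qcut_bin_labels([10, 20, 30])
#     # -> ["'(-Inf, 10]'", "'(10, 20]'", "'(20, 30]'"]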
def expandKeywords(querynames):
    """
    looks through the querynames provided and expands any keywords found that refer to workloads

    Inputs:
        querynames: list of strings

    Outputs:
        list of strings (querynames)

    Notes:
        This allows for the use of keywords as shorthand for commonly used sets of
        queries (e.g. PL94, P12, etc.)

        Note that it does not check to see if the queries themselves are valid for a
        particular schema; this function only expands valid workload keywords into
        querynames and appends those querynames to the list of querynames that will
        be returned.
    """
    expandednames = []
    querynames = das_utils.aslist(querynames)
    for name in querynames:
        try:
            keynames = getWorkload(name)
        except (AssertionError, KeyError):
            # not a workload keyword; keep the name as a plain queryname
            keynames = [name]
        expandednames += keynames
    return np.unique(expandednames).tolist()
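# Hedged usage sketch ("PL94" stands in for any valid workload keyword):
# workload keywords expand to their member queries, plain querynames pass
# through unchanged, and the result is deduplicated.
#
#     expandKeywords(["PL94", "total"])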
def getToySparseHistDF(geounit_data, schema):
    records = []
    for geounit in du.aslist(geounit_data):
        records += mappers.getSparseDF_mapper(geounit, schema)
    df = pandas.DataFrame(records)
    # put geocode and the schema dimensions first, then any remaining columns
    column_order = ['geocode'] + schema.dimnames
    df = df[column_order + [x for x in df.columns if x not in column_order]]
    return df
def getTable(data, schema, querynames):
    querynames = das_utils.aslist(querynames)
    answerdict = {}
    leveldict = {}
    for name in querynames:
        answerdict[name] = schema.getQuery(name).answerWithShape(data)
        leveldict[name] = schema.getQueryLevel(name, flatten=False)
    return Table(answerdict, leveldict)
def getCustomQuerynames(self, querynames):
    queries = []
    querynames = das_utils.aslist(querynames)
    for name in querynames:
        if name in self.tabledict:
            queries += self.tabledict[name]
        elif self.schema.isValidQuery(name):
            queries += [name]
    queries = np.unique(queries).tolist()
    return queries
def getQueries(self, querynames):
    """
    returns a dictionary of querybase objects

    Inputs:
        querynames: a single string or list of strings referring to the queries to build
    """
    queries = {}
    querynames = das_utils.aslist(querynames)
    for name in querynames:
        queries[name] = self.getQuery(name)
    return queries
def getWorkloadByTable(self, tablenames=None):
    if tablenames is None:
        tablenames = self.tablenames
    else:
        tablenames = das_utils.aslist(tablenames)
    querynames = []
    for name in tablenames:
        if name in self.tabledict:
            querynames += self.tabledict[name]
    querynames = np.unique(querynames).tolist()
    return querynames
def getCustomTableTuples(self, querynames):
    """
    returns a list of tuples (query, level) in the custom table shell

    used for comparing rows that exist to the table shell to find all missing rows

    a list of tuples has a much smaller memory footprint than using a pandas
    dataframe or even a numpy array
    """
    tuples = []
    querynames = self.getCustomQuerynames(das_utils.aslist(querynames))
    for query in querynames:
        tuples += [(query, level) for level in self.schema.getQueryLevel(query)]
    return tuples
def getCustomQuerynames(self, querynames):
    querynames = self.standardize_querynames(querynames)
    queries = []
    querynames = du.aslist(querynames)
    for name in querynames:
        if name in self.tabledict:
            queries += self.tabledict[name]
        elif self.isValidQuery(name):
            queries += [name]
        else:
            print(f"Removing '{name}' from the list of queries.")
    queries = np.unique(queries).tolist()
    return queries
def getCustomQuerynames(self, querynames):
    queries = []
    querynames = das_utils.aslist(querynames)
    for name in querynames:
        if name in self.tabledict:
            queries += self.tabledict[name]
        elif self.schema.isValidQuery(name):
            queries += [name]
        else:
            print(f"'{name}' is not a valid query for this schema\nRemoving it from the list of queries.")
    queries = np.unique(queries).tolist()
    return queries
def getGroupingLevels(self, customlevels, groupings, keepdims):
    """
    returns a dictionary of levels for groupings

    this function is primarily used in the buildRecodeQuerySeed function

    Inputs:
        customlevels (dict): levels as defined by the user (and specified in the recode's levels attribute)
        groupings (dict): the groupings defined for the recoded variable
        keepdims (list): the dimensions in dimnames to keep (marginalize the others)

    Outputs:
        a dictionary of levels
    """
    levels = {}
    baselevels = self._getBaseLevels()
    for dim in keepdims:
        if dim in groupings:
            if dim in customlevels:
                levels[dim] = customlevels[dim]
            else:
                items = []
                dimgroups = groupings[dim]
                # In order to make automatic level-generation work for groupings, each
                # group needs to be part of a list within the dim group. For example,
                # it's simpler to write { 'dim0': [1,2,3,4,5] } than
                # { 'dim0': [[1],[2],[3],[4],[5]] } for a dimension dim0 that has 6
                # levels, but where we want to ignore the first one. As such, when the
                # levels are automatically generated (i.e. no custom levels have been
                # specified), the first dictionary above is automatically translated
                # into the second here. This also works fine for other dimension
                # groupings, such as { 'dim1': [[1],[2,3,4]] }, since it is translated
                # into the exact same thing.
                dimgroups = [das_utils.aslist(x) for x in dimgroups]
                for j in range(len(dimgroups)):
                    items.append(".".join([x for i, x in enumerate(baselevels[dim])
                                           if i in dimgroups[j]]))
                levels[dim] = items
        else:
            levels[dim] = baselevels[dim]
    return levels
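# Hedged sketch of the translation described above (dimension and levels are
# illustrative): with base levels ["0-17", "18-64", "65+"] for 'age', the
# grouping { 'age': [[0], [1, 2]] } yields { 'age': ["0-17", "18-64.65+"] },
# since grouped base levels are joined by ".".
#
#     levels = schema.getGroupingLevels(customlevels={},
#                                       groupings={'age': [[0], [1, 2]]},
#                                       keepdims=['age'])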
def getQueryLevels(self, querynames, order=None, flatten=True):
    """
    returns a dictionary of arrays corresponding to the levels found in the querynames

    Inputs:
        querynames: a string or list of strings
        order: the order of the dimensions for creating crosses
               the default is None, which sets the order to be the ordering of the
               dimnames attribute
        flatten: boolean. If True, return the levels as a flattened array
                          If False, return the levels as a multidimensional numpy array
    """
    qlevels = {}
    order = self.dimnames if order is None else order
    querynames = das_utils.aslist(querynames)
    for name in querynames:
        seed = self.getQuerySeed(name)
        qlevels[name] = seed.getQueryLevels(order=order, flatten=flatten)
    return qlevels
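# Hedged usage sketch (querynames are illustrative):
#
#     qlevels = schema.getQueryLevels(["hhgq", "votingage"], flatten=True)
#     qlevels["votingage"]   # flat array of the level labels for that query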
def getDASRunsNested(paths, search_threads=20, build_threads=20, schema_name=None):
    """
    returns a list of DASRun objects that contain information about DAS Experiment data

    Parameters
    ==========
    paths : str or list of str
        List of s3 paths to DAS Experiment data

    search_threads : int (kwarg with default = 20)
        Number of `multiprocessing` threads to use during s3 search

    build_threads : int (kwarg with default = 20)
        Number of `multiprocessing` threads to use to construct DASRun objects

    schema_name : str (kwarg with default = None)
        The name of the Schema associated with all of the DAS Experiments in the path_list

        Notes:
            - If None, the DASRun class will search for the schema_name within the
              config file and will throw an error if it can't be found.
            - If not None, the DASRun class will just use the schema_name provided.

    Returns
    =======
    A list of DASRun objects
    """
    t0 = time.time()
    config_paths = []
    paths = du.aslist(paths)
    for path in paths:
        config_paths += findDASRunConfigs(path, threads=search_threads)
    dasrun_ingredients = [(config_path, schema_name) for config_path in config_paths]
    with mp.Pool(build_threads) as pool:
        runs = pool.map(makeDASRunNested, dasrun_ingredients)
    t1 = time.time()
    print(f"It took {t1 - t0} seconds to build all DASRuns from the found config.ini files")
    return runs
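# Hedged usage sketch (the path is illustrative): find config.ini files under
# the prefix, then build one DASRun per run found.
#
#     runs = getDASRunsNested(f"{DAS_S3ROOT}/experiments/example_run_set/",
#                             schema_name="PL94")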
def getQueryLevels(self, querynames: Union[Iterable[str], str], order=None, flatten=True,
                   cross_marker=" BY "):
    """
    returns a dictionary of arrays corresponding to the levels found in the querynames

    Inputs:
        querynames: a string or list of strings
        order: the order of the dimensions for creating crosses
               the default is None, which sets the order to be the ordering of the
               dimnames attribute
        flatten: boolean. If True, return the levels as a flattened array
                          If False, return the levels as a multidimensional numpy array
        cross_marker: string placed between level names when building the labels for
                      crossed queries (default " BY ")
    """
    querynames = self.standardize_querynames(querynames)
    qlevels = {}
    order = self.dimnames if order is None else order
    querynames = du.aslist(querynames)
    querynames = self.getCustomQuerynames(querynames)
    for name in querynames:
        if self.isValidQuery(name):
            seed = self.getQuerySeed(name)
            qlevels[name] = seed.getQueryLevels(order=order, flatten=flatten,
                                                cross_marker=cross_marker)
    return qlevels
def getWorkload(workload_keywords):
    """
    returns a list of unique queries based on one or more workloads

    Inputs:
        workload_keywords: a list of strings associated with the workloads desired

    Outputs:
        a list of strings/query names

    Notes:
        Since some workloads share queries (e.g. "total"), this function allows us to
        concatenate lists of queries (from multiple workloads) and remove any
        duplicates, as duplicates cause HDMM to not work as we might want/expect.
    """
    keys = das_utils.aslist(workload_keywords)
    querynames = []
    for key in keys:
        querynames += getWorkloadByKey(key)
    unique_querynames = np.unique(querynames).tolist()
    return unique_querynames
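# Hedged usage sketch (keywords are illustrative): queries shared between the
# two workloads, such as "total", appear only once in the result.
#
#     queries = getWorkload(["PL94", "P12"])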
def getQueryNames(self, nway=None, ignore=None, include=None):
    """
    returns a list of valid query names

    Inputs:
        nway: an int or list of ints that refer to the marginal querynames desired
        ignore: a str or list of strs that refer to the "crossable" dimensions to
                remove from the queryname list
        include: a str or list of strs that refer to queries that must be part of
                 the list (unless the queries are invalid)

    Outputs:
        a list of strings referring to the queries asked for

    Notes:
        Not often used in practice; used primarily for testing purposes

        'detailed' always refers to the query expressed by the crosses between all
        of the dimname variables

        Example: if dimnames = ['a', 'b', 'c'] and there are two recoded variables
        ['a1', 'c6'], then, even though 'a1_b_c', 'a_b_c6', and 'a_b_c' are all
        valid queries, only 'a_b_c' matches the crosses of the original (dimnames)
        variables, so it is the only one that will be renamed as 'detailed'
    """
    if nway is None:
        valid_names = self._getAllQueryNames()
    else:
        valid_names = []
        for n in das_utils.aslist(nway):
            if isinstance(n, int):
                combos = list(itertools.combinations(self._getCrossableQueries(), n))
                if n == 0:
                    combos = ['total']
                elif n == 1:
                    combos = [list(x)[0] for x in combos]
                else:
                    combos = [C.SCHEMA_CROSS_JOIN_DELIM.join(list(x)) for x in combos]
                valid_names += combos
    # drop any query that crosses an ignored dimension
    nonignored_names = set(valid_names)
    if ignore is not None:
        for name in das_utils.aslist(ignore):
            nonignored_names = nonignored_names.intersection(
                set([x for x in valid_names
                     if name not in re.split(C.SCHEMA_CROSS_SPLIT_DELIM, x)]))
    valid_names = list(nonignored_names)
    # force-include requested queries that aren't already present
    if include is not None:
        for x in das_utils.aslist(include):
            if not np.any([isSameQuery(x, y) for y in valid_names]):
                valid_names.append(x)
    valid_names = list(set(valid_names))
    # rename the full cross of the original dimnames to 'detailed'
    detailed_name = C.SCHEMA_CROSS_JOIN_DELIM.join(self.dimnames)
    for i, name in enumerate(valid_names):
        if isSameQuery(name, detailed_name):
            valid_names[i] = "detailed"
    valid_names = [x for x in valid_names if self._validQuerySeed(x)]
    valid_names.sort(key=lambda s: len(re.split(C.SCHEMA_CROSS_SPLIT_DELIM, s)))
    return valid_names
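# Hedged usage sketch (dimension name "hhgq" is illustrative): all 1- and
# 2-way marginals over the crossable dimensions, minus any cross involving
# "hhgq".
#
#     names = schema.getQueryNames(nway=[1, 2], ignore="hhgq")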
def getCrosswalkDF(spark=None, columns=None, strong_mcd_states=STRONG_MCD_STATES,
                   aian_areas=AIAN_AREAS, aian_ranges_path=AIAN_RANGES_PATH,
                   fed_airs=FED_AIRS):
    """
    Loads the 2010 crosswalk files that Simson generated from the 2010 GRFC into a Spark DF

    Parameters
    ==========
    spark : SparkSession

    columns : str or list of str (default is None, which will return all columns in the file)
        - This determines which columns survive from the original crosswalk data file, as
          the function will only return a Spark DF with the columns listed here

    Returns
    =======
    a Spark DF containing crosswalk columns

    Notes
    =====
    - This function also generates a number of additional columns to expand the
      ease-of-use when aggregating blocks to form geographic units in different
      geographic levels.
        - e.g. Rather than COUNTY being the 3-digit FIPS code, the COUNTY column will
          concatenate both the 2-digit STATE FIPS code and the 3-digit COUNTY FIPS
          code to create a 5-digit COUNTY code that is unique from all other 5-digit
          COUNTY codes.
    """
    crosswalk = f"{DAS_S3ROOT}/2010/geounit_crosswalks/24vars/"
    crossdf = spark.read.option("header", "true").csv(crosswalk)

    # add "geocode" column based on GEOID (which is the 16 digit block id)
    crossdf = crossdf.withColumn("geocode", crossdf['GEOID'])

    # generate unique counties
    crossdf = crossdf.withColumn("COUNTY", sf.concat(sf.col("STATE"), sf.col("COUNTY")))

    # generate unique tract groups
    crossdf = crossdf.withColumn("TRACT_GROUP", sf.concat(sf.col("COUNTY"), crossdf.TRACT[0:4]))

    # generate unique tracts
    crossdf = crossdf.withColumn("TRACT", sf.concat(sf.col("COUNTY"), sf.col("TRACT")))

    # generate block group column
    crossdf = crossdf.withColumn("BLOCK_GROUP", crossdf.BLOCK[0:1])

    # generate unique block groups
    crossdf = crossdf.withColumn("BLOCK_GROUP", sf.concat(sf.col("TRACT"), sf.col("BLOCK_GROUP")))

    # generate unique blocks
    crossdf = crossdf.withColumn("BLOCK", sf.concat(sf.col("BLOCK_GROUP"), sf.col("BLOCK")))

    # generate unique SLDLs (only unique if state fips has been prepended to the SLDL identifier)
    crossdf = crossdf.withColumn("SLDL", sf.concat(sf.col("STATE"), sf.col("SLDL")))

    # generate unique SLDUs (only unique if state fips has been prepended to the SLDU identifier)
    crossdf = crossdf.withColumn("SLDU", sf.concat(sf.col("STATE"), sf.col("SLDU")))

    # generate unique Congressional Districts (111th Congress) - only unique if state fips
    # has been prepended to the CD identifier
    crossdf = crossdf.withColumn("CD", sf.concat(sf.col("STATE"), sf.col("CD")))

    # generate unique school districts (only unique if state fips has been prepended to the identifiers)
    crossdf = crossdf.withColumn("SDELM", sf.concat(sf.col("STATE"), sf.col("SDELM")))
    crossdf = crossdf.withColumn("SDSEC", sf.concat(sf.col("STATE"), sf.col("SDSEC")))
    crossdf = crossdf.withColumn("SDUNI", sf.concat(sf.col("STATE"), sf.col("SDUNI")))

    # generate unique urban areas and urban growth areas (only unique if state prepended)
    crossdf = crossdf.withColumn("UA", sf.concat(sf.col("STATE"), sf.col("UA")))
    crossdf = crossdf.withColumn("UGA", sf.concat(sf.col("STATE"), sf.col("UGA")))

    # generate unique puma and place ids (only unique if state prepended)
    crossdf = crossdf.withColumn("PUMA", sf.concat(sf.col("STATE"), sf.col("PUMA")))
    crossdf = crossdf.withColumn("PLACE", sf.concat(sf.col("STATE"), sf.col("PLACE")))

    # generate unique county subdivisions (only unique if state and county prepended)
    crossdf = crossdf.withColumn("COUSUB", sf.concat(sf.col("COUNTY"), sf.col("COUSUB")))

    # generate unique subminor civil divisions (only unique if state, county, and county
    # subdivisions prepended)
    crossdf = crossdf.withColumn("SUBMCD", sf.concat(sf.col("COUSUB"), sf.col("SUBMCD")))

    # voting districts appear to have a floating space (" ") character in every VTD code, so
    # we'll remove them as they don't appear in the BlockAssign files for VTD
    ### Update - 2019-06-25 - The floating space is a valid character in the 6-character VTD
    #   codes; the first character isn't always a " ", so " " is just another part of the code.
    #crossdf = crossdf.withColumn("VTD1st", crossdf.VTD[0:1])

    # generate unique voting districts (only unique if state and county prepended)
    crossdf = crossdf.withColumn("VTD", sf.concat(sf.col("COUNTY"), sf.col("VTD")))

    # create a column for the nation
    crossdf = crossdf.withColumn("US", sf.lit("Nation"))

    # Note: When using any of the columns from the next block, filter out IDs composed only of "9"'s
    aian_ranges_dict = make_aian_ranges_dict(aian_ranges_path, aian_areas)
    is_fed_air_udf = udf(lambda aiannhce: in_aian_class(aiannhce, fed_airs, aian_ranges_dict), BooleanType())
    is_aian_udf = udf(lambda aiannhce: in_aian_class(aiannhce, aian_areas, aian_ranges_dict), BooleanType())
    crossdf = add_aiannhce_col(spark, crossdf)

    # aian_areas:
    crossdf = crossdf.withColumn("AIAN_AREAS", sf.when(is_aian_udf("AIANNHCE"), sf.col("AIANNHCE")).otherwise(CC.NOT_AN_AIAN_AREA))
    crossdf = crossdf.withColumn("FED_AIRS", sf.when(is_fed_air_udf("AIANNHCE"), sf.col("AIANNHCE")).otherwise(CC.NOT_AN_AIAN_AREA))

    # portions of Blocks/Tracts/States within aian_areas:
    crossdf = crossdf.withColumn("AIANBlock", sf.when(sf.col("AIAN_AREAS") != CC.NOT_AN_AIAN_AREA, sf.col("BLOCK")).otherwise(CC.NOT_AN_AIAN_BLOCK))
    crossdf = crossdf.withColumn("AIANTract", sf.col("AIANBlock")[0:11])
    crossdf = crossdf.withColumn("AIANState", sf.col("AIANTract")[0:2])

    # Define an off-spine entity (OSE) as Place in AIAN areas/non-strong-MCD states and MCD otherwise:
    crossdf = crossdf.withColumn("OSE", sf.when((sf.col("AIAN_AREAS") == CC.NOT_AN_AIAN_AREA) & (sf.col("STATE").isin(strong_mcd_states)), sf.col("COUSUB")).otherwise(sf.col("PLACE")))
    crossdf = crossdf.withColumn("COUNTY_NSMCD", sf.when(sf.col("STATE").isin(strong_mcd_states), CC.STRONG_MCD_COUNTY).otherwise(sf.col("COUNTY")))
    crossdf = crossdf.withColumn("MCD", sf.when(sf.col("STATE").isin(strong_mcd_states), sf.col("COUSUB")).otherwise(sf.lit(CC.NOT_A_MCD)))

    if columns is None:
        columns = crossdf.columns
    else:
        # always want 'geocode' (aka Block ID, GEOID) in the crosswalk dataframe
        columns = np.unique(du.aslist(columns) + ['geocode']).tolist()
    crossdf = crossdf.select(columns)
    return crossdf
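# Hedged usage sketch: keep just the county and tract mappings; 'geocode'
# (the block GEOID) is always retained as the join key.
#
#     crossdf = getCrosswalkDF(spark, columns=["COUNTY", "TRACT"])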
def getCustomTable(self, querynames, data=None):
    querynames = das_utils.aslist(querynames)
    if data is None:
        # an all-zero histogram yields the empty table shell
        data = np.zeros(self.schema.shape)
    querynames = self.getCustomQuerynames(querynames)
    return getTable(data, self.schema, querynames).toDF()
def getDASRunsFlat(data_paths, schema_name, budget_group=None, run_id=None):
    data_paths = du.aslist(data_paths)
    dasruns = [DASRunFlat(x, schema_name, budget_group, run_id) for x in data_paths]
    return dasruns