def test_makeHistRowsFromMultiSparse(self, persons, as_dict, recode):
    """Check that makeHistRowsFromMultiSparse yields one row per input person.

    Args:
        persons: input person records; each record's last element is dropped
            before comparison (presumably a non-schema field such as the
            geocode -- confirm against the fixture).
        as_dict: when true, the node is converted with ``toDict`` (keeping
            SYN, GEOCODE and INVAR) before being passed on.
        recode: when true, rows are produced through
            ``DHCPHHGQToMDFPersons2020Recoder`` and only the row count is
            checked; otherwise every output row must match a distinct input
            record.
    """
    node = self.makeNode(persons)
    if as_dict:
        node = node.toDict(keep_attrs=(SYN, GEOCODE, INVAR))

    if recode:
        # Recoded rows no longer carry the schema variables, so only the
        # count can be compared against the input.
        rows = makeHistRowsFromMultiSparse(
            node, self.schema, row_recoder=DHCPHHGQToMDFPersons2020Recoder)
        assert len(rows) == len(persons)
        return

    rows = makeHistRowsFromMultiSparse(node, self.schema, add_schema_name=False)

    # Multiset comparison: each output row consumes at most one equal input
    # record, so duplicated records must appear the same number of times.
    unmatched = ["|".join(map(str, person[:-1])) for person in persons]
    match_cnt = 0
    for row in rows:
        row_str = "|".join([row[var] for var in self.schema.dimnames])
        if row_str in unmatched:
            unmatched.remove(row_str)
            match_cnt += 1
    assert match_cnt == len(rows) == len(persons)
def test_makeHistRowsFromMultiSparse(self, hholds, units, as_dict, recode):
    """Check row counts/contents produced from a household node.

    Args:
        hholds: input household records; each record's last element is
            dropped before comparison (presumably a non-schema field such as
            the geocode -- confirm against the fixture).
        units: input unit records; only their count is used here.
        as_dict: when true, the node is converted with ``toDict`` (keeping
            SYN, GEOCODE and INVAR) before being passed on.
        recode: when true, rows are produced through
            ``Household2010ToMDFUnit2020Recoder``, then extended with
            ``addEmptyAndGQ``, and only counts are checked; otherwise every
            output row must match a distinct input household record.
    """
    node = self.makeNode(hholds, units)
    if as_dict:
        node = node.toDict(keep_attrs=(SYN, GEOCODE, INVAR))

    if recode:
        rows = makeHistRowsFromMultiSparse(
            node, self.schema, row_recoder=Household2010ToMDFUnit2020Recoder)
        assert len(rows) == len(hholds)
        # After addEmptyAndGQ the row count is expected to equal the number
        # of units rather than the number of households.
        rows = addEmptyAndGQ(
            node, self.schema, rows, row_recoder=Household2010ToMDFUnit2020Recoder)
        assert len(rows) == len(units)
        return

    rows = makeHistRowsFromMultiSparse(node, self.schema, add_schema_name=False)

    # Multiset comparison: each output row consumes at most one equal input
    # record, so duplicated records must appear the same number of times.
    unmatched = ["|".join(map(str, hhold[:-1])) for hhold in hholds]
    match_cnt = 0
    for row in rows:
        row_str = "|".join([row[var] for var in self.schema.dimnames])
        if row_str in unmatched:
            unmatched.remove(row_str)
            match_cnt += 1
    assert match_cnt == len(rows) == len(hholds)
def node2SparkRows(node: dict):
    """Build recoded person rows from a node that is already a dict.

    Only the SYN and GEOCODE entries of ``node`` are forwarded; ``schema``
    and ``self.row_recoder`` come from the enclosing scope.
    """
    # The node arrives as a plain dict, so no toDict() conversion is needed;
    # keep just the two entries that are passed along.
    trimmed_node = {key: node[key] for key in (SYN, GEOCODE)}
    return makeHistRowsFromMultiSparse(
        trimmed_node, schema, row_recoder=self.row_recoder)
def node2SparkRows(node: GeounitNode):
    """Build recoded person rows from a GeounitNode.

    The node is reduced to its SYN, INVAR and GEOCODE attributes via
    ``toDict``; ``schema``, ``self.row_recoder`` and ``inverted_geodict``
    come from the enclosing scope.
    """
    node_attrs = node.toDict((SYN, INVAR, GEOCODE))
    return makeHistRowsFromMultiSparse(
        node_attrs,
        schema,
        row_recoder=self.row_recoder,
        geocode_dict=inverted_geodict,
    )
def test_makeHistRowsFromMultiSparseRecode(self, persons):
    """Recoding a person node must yield exactly one row per input person."""
    node_attrs = self.makeNode(persons).toDict(keep_attrs=(SYN, GEOCODE, INVAR))
    recoded_rows = makeHistRowsFromMultiSparse(
        node_attrs,
        self.schema,
        row_recoder=DHCPHHGQToMDFPersons2020Recoder,
    )
    # TODO: Some testing of the MDF spec output should probably be done here,
    # maybe on just one case. Or maybe not, and it is only tested within the
    # writer test below.
    assert len(recoded_rows) == len(persons)
def node2SparkRows(node: GeounitNode):
    """Build unit rows (households plus what addEmptyAndGQ adds) for a node.

    The node is reduced to its SYN, INVAR and GEOCODE attributes via
    ``toDict``; ``schema``, ``self.row_recoder`` and ``inverted_geodict``
    come from the enclosing scope.
    """
    node_attrs = node.toDict((SYN, INVAR, GEOCODE))

    # Step 1: recoded household rows from the node.
    household_rows = makeHistRowsFromMultiSparse(
        node_attrs, schema, row_recoder=self.row_recoder)

    # Step 2: pass the household rows through addEmptyAndGQ and return its
    # result as the node's unit rows.
    return addEmptyAndGQ(
        node_attrs,
        schema,
        household_rows,
        row_recoder=self.row_recoder,
        gqtype_recoder=HHGQUnitDemoProductAttr.das2mdf,
        geocode_dict=inverted_geodict,
    )
def node2SparkRows(node: dict):
    """Build unit rows from a node that is already a dict.

    ``schema``, ``self.row_recoder`` and ``gqtype_recoder`` come from the
    enclosing scope.
    """
    # The node already comes as a dict, but everything except SYN, GEOCODE
    # and INVAR is still dropped. The invariants are read from the INVAR key
    # when present and from '_invar' otherwise.
    invar_key = INVAR if INVAR in node else '_invar'
    trimmed_node = {
        SYN: node[SYN],
        GEOCODE: node[GEOCODE],
        INVAR: node[invar_key],
    }

    household_rows = makeHistRowsFromMultiSparse(
        trimmed_node, schema, row_recoder=self.row_recoder)
    return addEmptyAndGQ(
        trimmed_node,
        schema,
        household_rows,
        row_recoder=self.row_recoder,
        gqtype_recoder=gqtype_recoder,
    )
def test_makeHistRowsFromMultiSparseRecode(self, hholds, units):
    """Recoded rows must number len(hholds), then len(units) after addEmptyAndGQ."""
    node = self.makeNode(hholds, units)
    node_attrs = node.toDict(keep_attrs=(SYN, GEOCODE, INVAR))

    household_rows = makeHistRowsFromMultiSparse(
        node_attrs,
        self.schema,
        row_recoder=Household2010ToMDFUnit2020Recoder,
    )
    # TODO: Some testing of the MDF spec output should probably be done here,
    # maybe on just one case. Or maybe not, and it is only tested within the
    # writer test below.
    assert len(household_rows) == len(hholds)

    # Note: the node object itself (not its dict form) is handed to
    # addEmptyAndGQ here.
    unit_rows = addEmptyAndGQ(
        node,
        self.schema,
        household_rows,
        row_recoder=Household2010ToMDFUnit2020Recoder,
    )
    assert len(unit_rows) == len(units)
def node2SparkRows(node: GeounitNode):
    """Convert a GeounitNode into a list of ``Row`` objects, one per unit.

    The node is reduced to its SYN, INVAR and GEOCODE attributes via
    ``toDict``, household rows are produced with
    ``makeHistRowsFromMultiSparse`` and then passed through
    ``addGroupQuarters``. ``schema``, ``self.row_recoder``,
    ``self.var_list`` and ``inverted_geodict`` come from the enclosing scope.

    Returns:
        A list of rows whose fields are ``self.var_list`` followed by
        ``'priv'``, in that order.
    """
    nodedict = node.toDict((SYN, INVAR, GEOCODE))
    households = makeHistRowsFromMultiSparse(
        nodedict,
        schema,
        row_recoder=self.row_recoder,
        geocode_dict=inverted_geodict,
        microdata_field=None,
    )
    units = addGroupQuarters(
        nodedict,
        schema,
        households,
        row_recoder=self.row_recoder,
        geocode_dict=inverted_geodict,
        to_microdata=False,
    )

    # 'priv' means "protected via the differential privacy routines in this
    # code base"; the variable is to be renamed after P.L.94-171 production.
    ordered_cols = self.var_list + ['priv']

    # Row(*field_names) yields a reusable row factory (assuming this is
    # pyspark.sql.Row -- confirm), so build it once instead of once per unit.
    make_row = Row(*ordered_cols)
    return [make_row(*[unit[col] for col in ordered_cols]) for unit in units]