def test_configReaderValidate(s_config, remove_option, msg, errtype, dd_das_stub):
    import programs.reader.table_reader as tr_spark
    import programs.das_setup as ds

    config = ConfigParser()
    config.read_string(s_config)
    config.remove_option(READER, remove_option)
    setup_instance = ds.DASDecennialSetup(config=config, name='setup', das=dd_das_stub)

    with pytest.raises(errtype) as err:
        tr_spark.DASDecennialReader(config=setup_instance.config, setup=setup_instance, name='reader', das=dd_das_stub)

    if errtype == KeyError:
        assert msg.lower() in err.value.args[0].lower()
    else:
        assert msg.lower() in err.value.message.lower()
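# Note (illustrative, not from the original source): the s_config, remove_option, msg and
# errtype arguments above are presumably supplied by pytest fixtures/parametrization defined
# elsewhere. A decorator roughly like the sketch below would sit directly above the test;
# the option name and expected message shown are hypothetical placeholders, not the
# project's actual reader keys.
#
# @pytest.mark.parametrize("remove_option, msg, errtype", [
#     ("PersonData.path", "path", KeyError),   # hypothetical: removing a table path option
# ])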
def test_transformRDDForSaving(self, spark, dd_das_stub):
    dd_das_stub.reader = get_reader_stub()
    config = ConfigParser()
    config.read_string(self.config)
    import programs.das_setup as ds
    setup_instance = ds.DASDecennialSetup(config=config, name='setup', das=dd_das_stub)
    w = MDF2020HouseholdWriter(config=setup_instance.config, setup=setup_instance, name='writer', das=dd_das_stub)

    hholds = hhdata['households']
    units = hhdata['units']
    node1 = self.makeNode(hholds[:4], units[:4], geocode='0')
    node2 = self.makeNode(hholds[4:], units[4:], geocode='1')

    spark = SparkSession.builder.getOrCreate()
    node_rdd = spark.sparkContext.parallelize([node1, node2])
    df = w.transformRDDForSaving(node_rdd)
    df.show()

    assert df.count() == len(units)

    for val in df.select('P18').collect():
        assert val['P18'] == 9

    for val in df.select('PAC').collect():
        assert val['PAC'] == '9'

    def len_cond(cond):
        return len(np.where(cond)[0])

    num_gq = len_cond(np.array(units)[:, 0] > 1)
    rtype = np.array(df.select('RTYPE').collect())
    assert len_cond(rtype[:, 0] == '4') == num_gq
    assert len_cond(rtype[:, 0] == '2') == len(units) - num_gq
def test_transformRDDForSaving(self, spark, dd_das_stub):
    dd_das_stub.reader = get_reader_stub()
    config = ConfigParser()
    config.read_string(self.config)
    import programs.das_setup as ds
    setup_instance = ds.DASDecennialSetup(config=config, name='setup', das=dd_das_stub)
    w = MDF2020PersonWriter(config=setup_instance.config, setup=setup_instance, name='writer', das=dd_das_stub)

    persons = pdata['persons']
    node1 = self.makeNode(persons[:2], geocode='0123456789abcdef')
    node2 = self.makeNode(persons[2:], geocode='0123456789abcdeg')

    spark = SparkSession.builder.getOrCreate()
    node_rdd = spark.sparkContext.parallelize([node1, node2])
    df = w.transformRDDForSaving(node_rdd)
    df.show()

    assert df.count() == len(persons)

    for val in df.select('EPNUM').collect():
        assert val['EPNUM'] == 999999999

    for val in df.select('RELSHIP').collect():
        assert val['RELSHIP'] == '99'

    def len_cond(cond):
        return len(np.where(cond)[0])

    num_gq = len_cond(np.array(persons)[:, 0] > 0)
    rtype = np.array(df.select('RTYPE').collect())
    assert len_cond(rtype[:, 0] == '5') == num_gq
    assert len_cond(rtype[:, 0] == '3') == len(persons) - num_gq
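# Aside (illustrative, not part of the test suite): both writer tests above count the rows
# that satisfy a boolean condition via np.where. The same count can be obtained with
# np.count_nonzero; a self-contained toy example, assuming only numpy:
#
# import numpy as np
# rtype = np.array([['2'], ['4'], ['2'], ['4']])      # toy RTYPE column
# assert len(np.where(rtype[:, 0] == '4')[0]) == 2    # pattern used in the tests
# assert np.count_nonzero(rtype[:, 0] == '4') == 2    # equivalent, more direct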
print(f'section:schema: {str(list(config.items(section=CC.SCHEMA)))}')
print(f'Converting experiment at (full path) {full_path} to {output_path}')
# print(f'str(nodes_dict_rdd.take(1)): {str(nodes_dict_rdd.take(1))}')

das_stub = DASStub()
das_stub.t0 = time.time()
das_stub.output_paths = []
setup_instance = ds.DASDecennialSetup(config=config, name='setup', das=das_stub)
if not files_shipped:
    setup_instance = setup_instance.setup_func()  # This ships files to spark
    files_shipped = True

print(f"Reading pickled data: {full_path}")
nodes_dict_rdd = spark.sparkContext.pickleFile(full_path)
a_node_dict = nodes_dict_rdd.take(1)[0]

# if not (experiment.type is PERSON):
# if INVAR not in a_node_dict and '_invar' not in a_node_dict:
#     if not invar_loaded:
#         invar_rdd = spark\
def reader_instance(spark, config, dd_das_stub):
    import programs.reader.table_reader as tr_spark
    import programs.das_setup as ds
    setup_instance = ds.DASDecennialSetup(config=config, name='setup', das=dd_das_stub)
    return tr_spark.DASDecennialReader(config=setup_instance.config, setup=setup_instance, name='reader', das=dd_das_stub)
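# Usage sketch (assumption, not from the original file): reader_instance is presumably
# registered as, or wrapped by, a pytest fixture and consumed by reader tests, e.g.:
#
# @pytest.fixture
# def reader(spark, config, dd_das_stub):
#     return reader_instance(spark, config, dd_das_stub)
#
# def test_reader_builds(reader):
#     assert reader is not None   # placeholder assertion, for illustration only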
def old_main(s3path, config_path):
    print('Beginning of pickle picker')
    logging.info("Beginning of pickle picker")
    spark = SparkSession.builder.getOrCreate()
    files_shipped = False
    logging.basicConfig(filename="convert.log",
                        format="%(asctime)s %(filename)s:%(lineno)d (%(funcName)s) %(message)s")
    invar_loaded = False

    print(f'Source data: {s3path}')
    print(f'Config file located at: {config_path}')

    config = ConfigParser()
    config.read_string(s3open(config_path).read())

    """
    print(f'existing writer section: {str(list(config.items(section=CC.WRITER_SECTION)))}')
    output_datafile_name = config.get(CC.WRITER_SECTION, CC.OUTPUT_DATAFILE_NAME)
    print(f'section:writer, output_datafile_name: {output_datafile_name}')
    output_path = f'{experiment.folder}_unpickled/{sub_folder}/run_000{str(run_number)}'
    config.set(CC.WRITER_SECTION, CC.OUTPUT_PATH, output_path)
    config.set(CC.WRITER_SECTION, CC.S3CAT, '1')
    config.set(CC.WRITER_SECTION, CC.S3CAT_SUFFIX, '.csv')
    config.set(CC.WRITER_SECTION, CC.OVERWRITE_FLAG, '0')
    config.set(CC.WRITER_SECTION, CC.WRITE_METADATA, '1')
    config.set(CC.WRITER_SECTION, CC.CLASSIFICATION_LEVEL, 'C_U_I//CENS')
    config.set(CC.WRITER_SECTION, CC.NUM_PARTS, '5000')
    print(f'modified writer section: {str(list(config.items(section=CC.WRITER_SECTION)))}')
    print(f'section:schema: {str(list(config.items(section=CC.SCHEMA)))}')
    # print(f'str(nodes_dict_rdd.take(1)): {str(nodes_dict_rdd.take(1))}')
    print(f"Reading pickled data: {s3path}")
    """

    # Ship the files to spark and get the setup object
    das_stub = DASStub()
    das_stub.t0 = time.time()
    das_stub.output_paths = []
    setup = ds.DASDecennialSetup(config=config, name='setup', das=das_stub)
    setup_data = setup.setup_func()

    nodes_dict_rdd = spark.sparkContext.pickleFile(s3path)

    """
    a_node_dict = nodes_dict_rdd.take(1)[0]
    if not (experiment.type is PERSON):
        if INVAR not in a_node_dict and '_invar' not in a_node_dict:
            if not invar_loaded:
                invar_rdd = spark\
                    .sparkContext\
                    .pickleFile('s3://uscb-decennial-ite-das/users/sexto015/experiments/full_household/Sept12_TestMUD_VA_PLB_Experiment/td001/run_0000/data') \
                    .map(lambda nd: (nd[GEOCODE], nd['_invar']))
                invar_loaded = True
            nodes_dict_rdd = nodes_dict_rdd\
                .map(lambda nd: (nd[GEOCODE], nd[SYN]))\
                .join(invar_rdd)\
                .map(lambda g_sk: {GEOCODE: g_sk[0], SYN: g_sk[1][0], INVAR: g_sk[1][1]})

    # print(nodes_dict_rdd.count())
    # from rdd_like_list import RDDLikeList
    # nodes_dict_rdd = RDDLikeList(nodes_dict_rdd.take(10))

    if experiment.type is PERSON:
        print('Using Person Writer')
        w = NonConvertingMDF2020PersonWriter(config=config, setup=setup_data, name='writer', das=das_stub)
    else:
        print('Using Household Writer')
        w = NonConvertingMDF2020HouseholdWriter(config=config, setup=setup_data, name='writer', das=das_stub)

    print('Writing')
    """

    # calls programs.writer.write() which takes an engine_tuple
    # engine_tuple is (blocknoderdd, feas_dict)
    # w.write((nodes_dict_rdd, None))

    # For testing, just take the first record and print it
    record = nodes_dict_rdd.take(1)
    print("record:", record)
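# Entry-point sketch (assumption, not part of the original script): old_main expects an S3
# path to the pickled node RDD and a path to the DAS config file, so a minimal command-line
# wrapper could look like the following. Argument names are illustrative.
#
# if __name__ == '__main__':
#     import argparse
#     parser = argparse.ArgumentParser(description="Unpickle DAS experiment data and inspect it")
#     parser.add_argument('s3path', help="s3:// path of the pickled node RDD")
#     parser.add_argument('config_path', help="path to the DAS config file")
#     args = parser.parse_args()
#     old_main(args.s3path, args.config_path)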