def test_missing_table_list(data_dir):
    """Reading any table without an input_table_list in settings must fail loudly."""
    current_settings = inject.get_injectable('settings')
    assert isinstance(current_settings, dict)

    with pytest.raises(AssertionError) as excinfo:
        input.read_input_table('households')

    assert 'no input_table_list found' in str(excinfo.value)
def test_missing_filename(seed_households, data_dir):
    """A table entry that omits its filename must raise an assertion error."""
    settings_yaml = """
input_table_list:
  - tablename: households
    index_col: household_id
    rename_columns:
      HHID: household_id
"""
    parsed_settings = yaml.load(settings_yaml, Loader=yaml.SafeLoader)
    inject.add_injectable('settings', parsed_settings)

    with pytest.raises(AssertionError) as excinfo:
        input.read_input_table('households')

    assert 'no input file provided' in str(excinfo.value)
def test_create_input_store(seed_households, data_dir):
    """With create_input_store enabled, the csv input is mirrored into input_data.h5."""
    settings_yaml = """
create_input_store: True
input_table_list:
  - tablename: households
    h5_tablename: seed_households
    filename: households.csv
    index_col: household_id
    rename_columns:
      HHID: household_id
"""
    inject.add_injectable('settings', yaml.load(settings_yaml, Loader=yaml.SafeLoader))

    # write the csv fixture the reader will consume
    hh_file = os.path.join(data_dir, 'households.csv')
    seed_households.to_csv(hh_file, index=False)
    assert os.path.isfile(hh_file)

    df = input.read_input_table('households')
    assert df.index.name == 'household_id'

    # the reader should have copied the table into the output h5 store
    output_store = os.path.join(inject.get_injectable('output_dir'), 'input_data.h5')
    assert os.path.exists(output_store)

    store_df = pd.read_hdf(output_store, 'seed_households')
    assert store_df.equals(seed_households)
def accessibility(land_use):
    """
    If 'accessibility' is in input_tables list, then read it in,
    otherwise create skeleton table with same index as landuse.

    This allows loading of pre-computed accessibility table, which is particularly useful
    for single-process small household sample runs when there are many zones in landuse.

    The skeleton table is only required if multiprocessing wants to slice accessibility;
    otherwise it will simply be replaced when the accessibility model is run.
    """

    accessibility_df = read_input_table("accessibility", required=False)

    if accessibility_df is None:
        # placeholder with the land_use index so slicing machinery has something to slice
        accessibility_df = pd.DataFrame(index=land_use.index)
        logger.info("created placeholder accessibility table %s" % (accessibility_df.shape, ))
    else:
        # a pre-computed table must cover exactly the same zones as land_use
        assert accessibility_df.sort_index().index.equals(land_use.to_frame().sort_index().index), \
            "loaded accessibility table index does not match index of land_use table"
        # BUG FIX: message previously said "loaded land_use" (copy-paste from the land_use loader)
        logger.info("loaded accessibility %s" % (accessibility_df.shape, ))

    # replace table function with dataframe
    inject.add_table('accessibility', accessibility_df)

    return accessibility_df
def read_raw_persons(households):
    """Load the persons input table, restricted to sampled households when sliced."""
    persons_df = read_input_table("persons")

    sliced = inject.get_injectable('households_sliced', False)
    if sliced:
        # keep only persons in the sampled households
        persons_df = persons_df[persons_df.household_id.isin(households.index)]

    return persons_df
def land_use():
    """Read the land_use input table and register it as a pipeline table."""
    land_use_df = read_input_table("land_use")

    logger.info("loaded land_use %s" % (land_use_df.shape, ))

    # replace table function with dataframe
    inject.add_table('land_use', land_use_df)

    return land_use_df
def initialize_tours(network_los, households, persons, trace_hh_id):
    """
    Read the tours input table, slice it to the sampled households' persons when a
    household sample is in effect, annotate it, optionally patch tour ids, and
    register it as a pipeline table (with rng channel and tracing).
    """
    trace_label = 'initialize_tours'

    tours = read_input_table("tours")

    # FIXME can't use households_sliced injectable as flag like persons table does in case of resume_after.
    # FIXME could just always slice...
    # BUG FIX: the original expression or'd households_sample_size > 0 with an identical
    # duplicate clause (a no-op); collapsing to one test preserves behavior.
    # NOTE(review): the duplicate was presumably meant to test override_hh_ids — confirm.
    slice_happened = inject.get_injectable('households_sample_size', 0) > 0
    if slice_happened:
        logger.info("slicing tours %s" % (tours.shape,))
        # keep only tours of persons in the sampled households
        tours = tours[tours.person_id.isin(persons.index)]

    # annotate before patching tour_id to allow addition of REQUIRED_TOUR_COLUMNS defined above
    model_settings = config.read_model_settings('initialize_tours.yaml', mandatory=True)
    expressions.assign_columns(
        df=tours,
        model_settings=model_settings.get('annotate_tours'),
        trace_label=tracing.extend_trace_label(trace_label, 'annotate_tours'))

    skip_patch_tour_ids = model_settings.get('skip_patch_tour_ids', False)
    if not skip_patch_tour_ids:
        tours = patch_tour_ids(tours)
    assert tours.index.name == 'tour_id'

    # replace table function with dataframe
    inject.add_table('tours', tours)

    pipeline.get_rn_generator().add_channel('tours', tours)
    tracing.register_traceable_table('tours', tours)

    logger.debug(f"{len(tours.household_id.unique())} unique household_ids in tours")
    logger.debug(f"{len(households.index.unique())} unique household_ids in households")
    assert not tours.index.duplicated().any()

    tours_without_persons = ~tours.person_id.isin(persons.index)
    if tours_without_persons.any():
        # BUG FIX: error previously reported len(persons) as the total tour count and
        # dumped the entire tour index; report the actual tour total and the bad person_ids.
        logger.error(f"{tours_without_persons.sum()} tours out of {len(tours)} without persons\n"
                     f"{tours.person_id[tours_without_persons]}")
        raise RuntimeError(f"{tours_without_persons.sum()} tours with bad person_id")

    if trace_hh_id:
        tracing.trace_df(tours, label='initialize_tours', warn_if_empty=True)
def zone_data():
    """
    Pipeline table containing zone info. Specify with 'input_table_list'
    in settings.yaml. Must contain columns for at least zone id, latitude, and longitude.
    """
    zones = read_input_table('zone_data')

    logger.info('loaded zone data %s' % (zones.shape,))

    # swap the table function for the concrete dataframe
    inject.add_table('zone_data', zones)

    return zones
def land_use():
    """Load land_use, sort it into canonical index order, and register it as a pipeline table."""
    land_use_df = read_input_table("land_use")

    # try to make life easy for everybody by keeping everything in canonical order
    # but as long as coalesce_pipeline doesn't sort tables it coalesces, it might not stay in order
    # so even though we do this, anyone downstream who depends on it, should look out for themselves...
    if not land_use_df.index.is_monotonic_increasing:
        land_use_df = land_use_df.sort_index()

    logger.info("loaded land_use %s" % (land_use_df.shape, ))

    # replace table function with dataframe
    inject.add_table('land_use', land_use_df)

    return land_use_df
def test_hdf_reader1(seed_households, data_dir):
    """households can be read from an hdf5 input file with renamed index column."""
    settings_yaml = """
input_table_list:
  - tablename: households
    filename: households.h5
    index_col: household_id
    rename_columns:
      HHID: household_id
"""
    inject.add_injectable('settings', yaml.load(settings_yaml, Loader=yaml.SafeLoader))

    # write the h5 fixture the reader will consume
    hh_file = os.path.join(data_dir, 'households.h5')
    seed_households.to_hdf(hh_file, key='households', mode='w')
    assert os.path.isfile(hh_file)

    df = input.read_input_table('households')
    assert df.index.name == 'household_id'
def test_csv_reader(seed_households, data_dir):
    """
    households can be read from a csv input file with renamed index column.

    CONSISTENCY FIX: this test used the legacy 'column_map' key while every sibling
    test uses 'rename_columns'; the resulting rename is identical either way.
    """
    settings_yaml = """
input_table_list:
  - tablename: households
    filename: households.csv
    index_col: household_id
    rename_columns:
      HHID: household_id
"""
    settings = yaml.load(settings_yaml, Loader=yaml.SafeLoader)
    inject.add_injectable('settings', settings)

    # write the csv fixture the reader will consume
    hh_file = os.path.join(data_dir, 'households.csv')
    seed_households.to_csv(hh_file, index=False)
    assert os.path.isfile(hh_file)

    df = input.read_input_table('households')
    assert df.index.name == 'household_id'
def households(households_sample_size, override_hh_ids, trace_hh_id):
    """
    Read the households input table and register it as the pipeline 'households' table,
    optionally restricted to an explicit override list, a single traced household, or a
    repeatable random sample of households_sample_size rows.
    """

    df_full = read_input_table("households")
    # flag published via injectable so downstream tables (e.g. persons) know to slice too
    households_sliced = False

    logger.info("full household list contains %s households" % df_full.shape[0])

    # only using households listed in override_hh_ids
    if override_hh_ids is not None:

        # trace_hh_id will not be used if it is not in list of override_hh_ids
        logger.info("override household list containing %s households" % len(override_hh_ids))

        df = df_full[df_full.index.isin(override_hh_ids)]
        households_sliced = True

        if df.shape[0] < len(override_hh_ids):
            # some requested ids were absent from the store - warn but continue
            logger.info("found %s of %s households in override household list" %
                        (df.shape[0], len(override_hh_ids)))

        if df.shape[0] == 0:
            raise RuntimeError('No override households found in store')

    # if we are tracing hh exclusively
    elif trace_hh_id and households_sample_size == 1:

        # df contains only trace_hh (or empty if not in full store)
        df = tracing.slice_ids(df_full, trace_hh_id)
        households_sliced = True

    # if we need a subset of full store
    elif households_sample_size > 0 and df_full.shape[0] > households_sample_size:

        logger.info("sampling %s of %s households" % (households_sample_size, df_full.shape[0]))

        """
        Because random seed is set differently for each step, sampling of households using
        Random.global_rng would sample differently depending upon which step it was called from.
        We use a one-off rng seeded with the pseudo step name 'sample_households' to provide
        repeatable sampling no matter when the table is loaded.

        Note that the external_rng is also seeded with base_seed so the sample will
        (rightly) change if the pipeline rng's base_seed is changed
        """

        prng = pipeline.get_rn_generator().get_external_rng('sample_households')
        # sample without replacement so each household appears at most once
        df = df_full.take(prng.choice(len(df_full), size=households_sample_size, replace=False))
        households_sliced = True

        # if tracing and we missed trace_hh in sample, but it is in full store
        if trace_hh_id and trace_hh_id not in df.index and trace_hh_id in df_full.index:
            # replace first hh in sample with trace_hh (keeps the sample size unchanged)
            logger.debug("replacing household %s with %s in household sample" %
                         (df.index[0], trace_hh_id))
            df_hh = df_full.loc[[trace_hh_id]]
            df = pd.concat([df_hh, df[1:]])

    else:
        # no override, no trace-only run, no sampling - use the full table
        df = df_full

    # let downstream tables (e.g. persons) know whether the household list was sliced
    inject.add_injectable('households_sliced', households_sliced)

    logger.info("loaded households %s" % (df.shape,))

    # FIXME - pathological knowledge of name of chunk_id column used by chunked_choosers_by_chunk_id
    assert 'chunk_id' not in df.columns
    df['chunk_id'] = pd.Series(list(range(len(df))), df.index)

    # replace table function with dataframe
    inject.add_table('households', df)

    pipeline.get_rn_generator().add_channel('households', df)

    if trace_hh_id:
        tracing.register_traceable_table('households', df)
        tracing.trace_df(df, "raw.households", warn_if_empty=True)

    return df