Example #1
def write_data_dictionary(output_dir):
    """
    Write table_name, number of rows, columns, and bytes for each checkpointed table

    Parameters
    ----------
    output_dir: str

    """
    pd.options.display.max_columns = 500
    pd.options.display.max_rows = 100

    output_tables = pipeline.checkpointed_tables()

    records = []

    # write data dictionary for all checkpointed_tables
    with open(os.path.join(output_dir, 'data_dict.txt'), 'w') as file:
        for table_name in output_tables:
            df = inject.get_table(table_name, None).to_frame()

            print("\n### %s %s" % (table_name, df.shape), file=file)
            print(df.dtypes, file=file)

            rows, columns = df.shape
            num_bytes = df.memory_usage(index=True).sum()
            records.append((table_name, rows, columns, num_bytes))

    df = pd.DataFrame.from_records(
        records, columns=['table_name', 'rows', 'columns', 'bytes'])
    df.sort_values(by='table_name', inplace=True)
    df.to_csv(os.path.join(output_dir, 'data_dict.csv'), index=False)
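Pipeline plumbing aside, the technique here is plain pandas: df.shape, df.dtypes, and df.memory_usage(index=True).sum(). A self-contained sketch, with toy frames standing in for the checkpointed tables and an io.StringIO standing in for the data_dict.txt file handle:

import io
import pandas as pd

# toy stand-ins for checkpointed pipeline tables
tables = {
    'households': pd.DataFrame({'income': [50000, 72000], 'size': [2, 4]}),
    'persons': pd.DataFrame({'age': [34, 8, 41], 'sex': ['f', 'm', 'm']}),
}

records = []
report = io.StringIO()  # stands in for the open data_dict.txt file
for table_name, df in tables.items():
    print("\n### %s %s" % (table_name, df.shape), file=report)
    print(df.dtypes, file=report)
    rows, columns = df.shape
    num_bytes = df.memory_usage(index=True).sum()
    records.append((table_name, rows, columns, num_bytes))

summary = pd.DataFrame.from_records(
    records, columns=['table_name', 'rows', 'columns', 'bytes'])
print(summary.sort_values(by='table_name'))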
Example #2
def previous_write_data_dictionary(output_dir):
    """
    Write table_name, number of rows, columns, and bytes for each checkpointed table

    Parameters
    ----------
    output_dir: str

    """

    model_settings = config.read_model_settings('write_data_dictionary')
    txt_format = model_settings.get('txt_format', 'data_dict.txt')
    csv_format = model_settings.get('csv_format', 'data_dict.csv')

    if txt_format:

        output_file_path = config.output_file_path(txt_format)

        pd.options.display.max_columns = 500
        pd.options.display.max_rows = 100

        output_tables = pipeline.checkpointed_tables()

        # write data dictionary for all checkpointed_tables

        with open(output_file_path, 'w') as output_file:
            for table_name in output_tables:
                df = inject.get_table(table_name, None).to_frame()

                print("\n### %s %s" % (table_name, df.shape), file=output_file)
                print('index:',
                      df.index.name,
                      df.index.dtype,
                      file=output_file)
                print(df.dtypes, file=output_file)
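For reference, a sketch of the settings dict this step reads. The keys and defaults are taken from the snippet above; the empty-value-suppresses-output behavior is documented in the fuller variant (Example #6) below:

# hypothetical dict that config.read_model_settings('write_data_dictionary')
# would return; both keys are optional and default to the names shown
model_settings = {
    'txt_format': 'data_dict.txt',  # '' suppresses the text report
    'csv_format': 'data_dict.csv',  # '' suppresses the csv report
}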
Example #3
def write_data_dictionary(output_dir):

    output_tables = pipeline.checkpointed_tables()

    # write data dictionary for all checkpointed_tables
    with open(os.path.join(output_dir, 'data_dict.csv'), 'a') as file:
        for table_name in output_tables:
            df = inject.get_table(table_name, None).to_frame()
            print("\n### %s (%s)\n" % (table_name, df.shape),
                  df.dtypes, file=file)
Example #4
def write_tables(output_dir):

    output_tables_settings_name = 'output_tables'

    output_tables_settings = setting(output_tables_settings_name)

    output_tables = pipeline.checkpointed_tables()

    if output_tables_settings is not None:

        action = output_tables_settings.get('action')
        tables = output_tables_settings.get('tables')

        if action not in ['include', 'skip']:
            raise "expected %s action '%s' to be either 'include' or 'skip'" % \
                  (output_tables_settings_name, action)

        if action == 'include':
            output_tables = tables
        elif action == 'skip':
            output_tables = [t for t in output_tables if t not in tables]

    # should provide option to also write checkpoints?
    # output_tables.append("checkpoints.csv")

    for table_name in output_tables:
        table = inject.get_table(table_name, None)

        if table is None:
            logger.warning("Skipping '%s': Table not found." % table_name)
            continue

        df = table.to_frame()
        file_name = "%s.csv" % table_name
        logger.info("writing output file %s" % file_name)
        file_path = os.path.join(output_dir, file_name)
        write_index = df.index.name is not None
        df.to_csv(file_path, index=write_index)
Example #5
def write_data_dictionary(output_dir):
    """
    Write table_name, number of rows, columns, and bytes for each checkpointed table

    Parameters
    ----------
    output_dir: str

    """
    pd.options.display.max_columns = 500
    pd.options.display.max_rows = 100

    output_tables = pipeline.checkpointed_tables()

    # write data dictionary for all checkpointed_tables

    mode = 'wb' if sys.version_info < (3,) else 'w'
    with open(config.output_file_path('data_dict.txt'), mode) as output_file:
        for table_name in output_tables:
            df = inject.get_table(table_name, None).to_frame()

            print("\n### %s %s" % (table_name, df.shape), file=output_file)
            print('index:', df.index.name, df.index.dtype, file=output_file)
            print(df.dtypes, file=output_file)
Example #6
def write_data_dictionary(output_dir):
    """
    Write table schema for all tables

    model settings
        txt_format: output text file name (default data_dict.txt) or empty to suppress txt output
        csv_format: output csv file name (default data_dict.csv) or empty to suppress csv output

        schema_tables: list of tables to include in output (defaults to all checkpointed tables)

    for each table, write column names, dtypes, and the checkpoint at which each column was added

    text format writes individual table schemas to a single text file
    csv format writes all tables together with an additional table_name column

    Parameters
    ----------
    output_dir: str

    """

    model_settings = config.read_model_settings('write_data_dictionary')
    txt_format = model_settings.get('txt_format', 'data_dict.txt')
    csv_format = model_settings.get('csv_format', 'data_dict.csv')

    if not (csv_format or txt_format):
        logger.warning(
            "write_data_dictionary step invoked but neither 'txt_format' nor 'csv_format' specified"
        )
        return

    table_names = pipeline.checkpointed_tables()

    # use table_names list from model_settings, if provided
    schema_tables = model_settings.get('tables', None)
    if schema_tables:
        table_names = [c for c in schema_tables if c in table_names]

    # initialize schema as dict of dataframe[table_name, column_name, dtype, checkpoint]
    schema = dict()
    final_shapes = dict()
    for table_name in table_names:
        df = pipeline.get_table(table_name)

        final_shapes[table_name] = df.shape

        if df.index.name and df.index.name not in df.columns:
            df = df.reset_index()
        info = df.dtypes.astype(str).to_frame('dtype').reset_index().rename(
            columns={'index': 'column_name'})
        info['checkpoint'] = ''

        info.insert(loc=0, column='table_name', value=table_name)
        schema[table_name] = info

    # annotate each table's schema info with the name of the checkpoint in which each column was first seen
    for _, row in pipeline.get_checkpoints().iterrows():

        checkpoint_name = row[pipeline.CHECKPOINT_NAME]

        for table_name in table_names:

            # no change to table in this checkpoint
            if row[table_name] != checkpoint_name:
                continue

            # get the checkpointed version of the table
            df = pipeline.get_table(table_name, checkpoint_name)

            if df.index.name and df.index.name not in df.columns:
                df = df.reset_index()

            info = schema.get(table_name, None)

            # tag any new columns with checkpoint name
            prev_columns = info[info.checkpoint != ''].column_name.values
            new_cols = [c for c in df.columns.values if c not in prev_columns]
            is_new_column_this_checkpoint = info.column_name.isin(new_cols)
            info.checkpoint = np.where(is_new_column_this_checkpoint,
                                       checkpoint_name, info.checkpoint)

            schema[table_name] = info

    schema_df = pd.concat(schema.values())

    if csv_format:
        schema_df.to_csv(config.output_file_path(csv_format),
                         header=True,
                         index=False)

    if txt_format:
        with open(config.output_file_path(txt_format), 'w') as output_file:

            # get max schema column widths from omnibus table
            col_width = {
                c: schema_df[c].str.len().max() + 2
                for c in schema_df
            }

            for table_name in table_names:
                info = schema.get(table_name, None)

                columns_to_print = ['column_name', 'dtype', 'checkpoint']
                info = info[columns_to_print].copy()

                # normalize schema column widths across all table schemas for unified output formatting
                for c in info:
                    info[c] = info[c].str.pad(col_width[c], side='right')
                info.columns = [c.ljust(col_width[c]) for c in info.columns]

                info = info.to_string(index=False)

                print(
                    f"###\n### {table_name} {final_shapes[table_name]}\n###\n",
                    file=output_file)
                print(f"{info}\n", file=output_file)
Example #7
def write_tables(output_dir):
    """
    Write pipeline tables as csv files (in output directory) as specified by output_tables list
    in settings file.

    'output_tables' can specify either a list of output tables to include or to skip
    if no output_tables list is specified, then no checkpointed tables will be written

    To write all output tables EXCEPT the households and persons tables:

    ::

      output_tables:
        action: skip
        tables:
          - households
          - persons

    To write ONLY the households table:

    ::

      output_tables:
        action: include
        tables:
           - households

    To write tables into a single HDF5 store instead of individual CSVs, use the h5_store flag:

    ::

      output_tables:
        h5_store: True
        action: include
        tables:
           - households

    Parameters
    ----------
    output_dir: str

    """

    output_tables_settings_name = 'output_tables'

    output_tables_settings = setting(output_tables_settings_name)

    if output_tables_settings is None:
        logger.info(
            "No output_tables specified in settings file. Nothing to write.")
        return

    action = output_tables_settings.get('action')
    tables = output_tables_settings.get('tables')
    prefix = output_tables_settings.get('prefix', 'final_')
    h5_store = output_tables_settings.get('h5_store', False)
    sort = output_tables_settings.get('sort', False)

    checkpointed_tables = pipeline.checkpointed_tables()
    if action == 'include':
        output_tables_list = tables
    elif action == 'skip':
        output_tables_list = [
            t for t in checkpointed_tables if t not in tables
        ]
    else:
        raise "expected %s action '%s' to be either 'include' or 'skip'" % \
              (output_tables_settings_name, action)

    for table_name in output_tables_list:

        if table_name == 'checkpoints':
            df = pipeline.get_checkpoints()
        else:
            if table_name not in checkpointed_tables:
                logger.warning("Skipping '%s': Table not found." % table_name)
                continue
            df = pipeline.get_table(table_name)

            if sort:
                traceable_table_indexes = inject.get_injectable(
                    'traceable_table_indexes', {})

                if df.index.name in traceable_table_indexes:
                    df = df.sort_index()
                    logger.debug(
                        f"write_tables sorting {table_name} on index {df.index.name}"
                    )
                else:
                    # find all registered columns we can use to sort this table
                    # (they are ordered appropriately in traceable_table_indexes)
                    sort_columns = [
                        c for c in traceable_table_indexes if c in df.columns
                    ]
                    if len(sort_columns) > 0:
                        df = df.sort_values(by=sort_columns)
                        logger.debug(
                            f"write_tables sorting {table_name} on columns {sort_columns}"
                        )
                    else:
                        logger.debug(
                            f"write_tables sorting {table_name} on unrecognized index {df.index.name}"
                        )
                        df = df.sort_index()

        if h5_store:
            file_path = config.output_file_path('%soutput_tables.h5' % prefix)
            df.to_hdf(file_path, key=table_name, mode='a', format='fixed')
        else:
            file_name = "%s%s.csv" % (prefix, table_name)
            file_path = config.output_file_path(file_name)

            # include the index if it has a name or is a MultiIndex
            write_index = df.index.name is not None or isinstance(
                df.index, pd.MultiIndex)

            df.to_csv(file_path, index=write_index)
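The h5_store branch works because mode='a' appends each table to the same store under its own key, so one file accumulates every output table. A minimal sketch on toy frames (pandas' to_hdf relies on the optional pytables package being installed):

import pandas as pd

households = pd.DataFrame({'income': [50000, 72000]})
persons = pd.DataFrame({'age': [34, 8, 41]})

# each table lands in the same file under its own key
for table_name, df in [('households', households), ('persons', persons)]:
    df.to_hdf('final_output_tables.h5', key=table_name, mode='a', format='fixed')

# tables can be read back individually by key
print(pd.read_hdf('final_output_tables.h5', key='households'))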
Example #8
def write_tables(output_dir):
    """
    Write pipeline tables as csv files (in output directory) as specified by output_tables list
    in settings file.

    'output_tables' can specify either a list of output tables to include or to skip
    if no output_tables list is specified, then no checkpointed tables will be written

    To write all output tables EXCEPT the households and persons tables:

    ::

      output_tables:
        action: skip
        tables:
          - households
          - persons

    To write ONLY the households table:

    ::

      output_tables:
        action: include
        tables:
           - households

    To write tables into a single HDF5 store instead of individual CSVs, use the h5_store flag:

    ::

      output_tables:
        h5_store: True
        action: include
        tables:
           - households

    Parameters
    ----------
    output_dir: str

    """

    output_tables_settings_name = 'output_tables'

    output_tables_settings = setting(output_tables_settings_name)

    if output_tables_settings is None:
        logger.info(
            "No output_tables specified in settings file. Nothing to write.")
        return

    action = output_tables_settings.get('action')
    tables = output_tables_settings.get('tables')
    prefix = output_tables_settings.get('prefix', 'final_')
    h5_store = output_tables_settings.get('h5_store', False)

    if action not in ['include', 'skip']:
        raise "expected %s action '%s' to be either 'include' or 'skip'" % \
              (output_tables_settings_name, action)

    checkpointed_tables = pipeline.checkpointed_tables()
    if action == 'include':
        output_tables_list = tables
    elif action == 'skip':
        output_tables_list = [
            t for t in checkpointed_tables if t not in tables
        ]

    for table_name in output_tables_list:

        if table_name == 'checkpoints':
            df = pipeline.get_checkpoints()
        else:
            if table_name not in checkpointed_tables:
                logger.warning("Skipping '%s': Table not found." % table_name)
                continue
            df = pipeline.get_table(table_name)

        if h5_store:
            file_path = config.output_file_path('%soutput_tables.h5' % prefix)
            df.to_hdf(file_path, key=table_name, mode='a', format='fixed')
        else:
            file_name = "%s%s.csv" % (prefix, table_name)
            file_path = config.output_file_path(file_name)

            # include the index if it has a name or is a MultiIndex
            write_index = df.index.name is not None or isinstance(
                df.index, pd.MultiIndex)

            df.to_csv(file_path, index=write_index)
Example #9
MODELS = setting('models')


# If you provide a resume_after argument to pipeline.run
# the pipeline manager will attempt to load checkpointed tables from the checkpoint store
# and resume pipeline processing on the next submodel step after the specified checkpoint
resume_after = setting('resume_after', None)

if resume_after:
    print "resume_after", resume_after

pipeline.run(models=MODELS, resume_after=resume_after)

print "\n#### run completed"

# write final versions of all checkpointed dataframes to CSV files to review results
for table_name in pipeline.checkpointed_tables():
    file_name = "final_%s_table.csv" % table_name
    file_path = os.path.join(orca.get_injectable("output_dir"), file_name)
    pipeline.get_table(table_name).to_csv(file_path)

# write checkpoints
file_path = os.path.join(orca.get_injectable("output_dir"), "checkpoints.csv")
pipeline.get_checkpoints().to_csv(file_path)

# tables will no longer be available after pipeline is closed
pipeline.close_pipeline()

t0 = print_elapsed_time("all models", t0)
Example #10
def write_tables(output_dir):
    """
    Write pipeline tables as csv files (in output directory) as specified by output_tables list
    in settings file.

    Pipeline tables are intermediate computational tables, not to be confused with the
    synthetic population tables written by the write_synthetic_population step.

    'output_tables' can specify either a list of output tables to include or to skip
    if no output_tables list is specified, then no checkpointed tables will be written

    Intermediate tables likely to be of particular interest or utility are the controls and weights
    tables for the various geographies. For example, if one of your geographies is TRACT, then:
    TRACT_controls has control totals for every TRACT (and aggregated subzone) controls.
    TRACT_weights has balanced_weight and integer_weight for every TRACT.

    To write all output tables EXCEPT the households and persons tables:

    ::

      output_tables:
        action: skip
        tables:
          - households
          - persons

    To write ONLY the expanded_household_ids table:

    ::

      output_tables:
        action: include
        tables:
           - expanded_household_ids

    Parameters
    ----------
    output_dir: str

    """

    output_tables_settings_name = 'output_tables'

    output_tables_settings = setting(output_tables_settings_name)

    output_tables_list = pipeline.checkpointed_tables()

    if output_tables_settings is None:
        logger.info(
            "No output_tables specified in settings file. Nothing to write.")
        return

    action = output_tables_settings.get('action')
    tables = output_tables_settings.get('tables')

    if action not in ['include', 'skip']:
        raise RuntimeError(
            "expected %s action '%s' to be either 'include' or 'skip'" %
            (output_tables_settings_name, action))

    if action == 'include':
        output_tables_list = tables
    elif action == 'skip':
        output_tables_list = [t for t in output_tables_list if t not in tables]

    # should provide option to also write checkpoints?
    # output_tables_list.append("checkpoints.csv")

    for table_name in output_tables_list:
        table = inject.get_table(table_name, None)

        if table is None:
            logger.warning("Skipping '%s': Table not found." % table_name)
            continue

        df = table.to_frame()
        file_name = "%s.csv" % table_name
        logger.info("writing output file %s" % file_name)
        file_path = os.path.join(output_dir, file_name)
        write_index = df.index.name is not None
        df.to_csv(file_path, index=write_index)
Example #11
def write_tables(output_dir):
    """
    Write pipeline tables as csv files (in output directory) as specified by output_tables list
    in settings file.

    'output_tables' can specify either a list of output tables to include or to skip
    if no output_tables list is specified, then no checkpointed tables will be written

    To write all output tables EXCEPT the households and persons tables:

    ::

      output_tables:
        action: skip
        tables:
          - households
          - persons

    To write ONLY the households table:

    ::

      output_tables:
        action: include
        tables:
           - households

    Parameters
    ----------
    output_dir: str

    """

    output_tables_settings_name = 'output_tables'

    output_tables_settings = setting(output_tables_settings_name)

    if output_tables_settings is None:
        logger.info("No output_tables specified in settings file. Nothing to write.")
        return

    action = output_tables_settings.get('action')
    tables = output_tables_settings.get('tables')
    prefix = output_tables_settings.get('prefix', 'final_')

    if action not in ['include', 'skip']:
        raise "expected %s action '%s' to be either 'include' or 'skip'" % \
              (output_tables_settings_name, action)

    checkpointed_tables = pipeline.checkpointed_tables()
    if action == 'include':
        output_tables_list = tables
    elif action == 'skip':
        output_tables_list = [t for t in checkpointed_tables if t not in tables]

    for table_name in output_tables_list:

        if table_name == 'checkpoints':
            df = pipeline.get_checkpoints()
        else:
            if table_name not in checkpointed_tables:
                logger.warning("Skipping '%s': Table not found." % table_name)
                continue
            df = pipeline.get_table(table_name)

        file_name = "%s%s.csv" % (prefix, table_name)
        file_path = config.output_file_path(file_name)

        # include the index if it has a name or is a MultiIndex
        write_index = df.index.name is not None or isinstance(df.index, pd.MultiIndex)

        df.to_csv(file_path, index=write_index)
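The write_index logic exists because to_csv would otherwise emit a meaningless unnamed column for a default RangeIndex, while a named index (or a MultiIndex) carries real identifiers worth keeping. A quick demonstration on toy frames:

import pandas as pd

anonymous = pd.DataFrame({'x': [1, 2]})  # default RangeIndex
named = pd.DataFrame({'x': [1, 2]}, index=pd.Index([10, 11], name='zone_id'))

for df in (anonymous, named):
    write_index = df.index.name is not None or isinstance(df.index, pd.MultiIndex)
    print(df.to_csv(index=write_index))
# the anonymous frame omits its index; the named frame keeps zone_id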
Example #12
def write_tables(output_dir):
    """
    Write pipeline tables as csv files (in output directory) as specified by output_tables list
    in settings file.

    Pipeline tables are intermediate computational tables, not to be confused with the
    synthetic population tables written by the write_synthetic_population step.

    'output_tables' can specify either a list of output tables to include or to skip
    if no output_tables list is specified, then no checkpointed tables will be written

    Intermediate tables likely to be of particular interest or utility are the controls and weights
    tables for the various geographies. For example, if one of your geographies is TRACT, then:
    TRACT_controls has control totals for every TRACT (and aggregated subzone) controls.
    TRACT_weights has balanced_weight and integer_weight for every TRACT.

    To write all output tables EXCEPT the households and persons tables:

    ::

      output_tables:
        action: skip
        tables:
          - households
          - persons

    To write ONLY the expanded_household_ids table:

    ::

      output_tables:
        action: include
        tables:
           - expanded_household_ids

    Parameters
    ----------
    output_dir: str

    """

    output_tables_settings_name = 'output_tables'

    output_tables_settings = setting(output_tables_settings_name)

    output_tables_list = pipeline.checkpointed_tables()

    if output_tables_settings is None:
        logger.info("No output_tables specified in settings file. Nothing to write.")
        return

    action = output_tables_settings.get('action')
    tables = output_tables_settings.get('tables')

    if action not in ['include', 'skip']:
        raise RuntimeError("expected %s action '%s' to be either 'include' or 'skip'" %
                           (output_tables_settings_name, action))

    if action == 'include':
        output_tables_list = tables
    elif action == 'skip':
        output_tables_list = [t for t in output_tables_list if t not in tables]

    logger.debug("output_tables_list: %s" % str(output_tables_list))
    # should provide option to also write checkpoints?
    # output_tables_list.append("checkpoints.csv")

    # columns: geography, id, variable, control, result, diff
    summary_melt_df = pd.DataFrame()

    for table_name in output_tables_list:
        table = inject.get_table(table_name, None)

        if table is None:
            logger.warning("Skipping '%s': Table not found." % table_name)
            continue

        df = table.to_frame()
        file_name = "%s.csv" % table_name
        logger.info("writing output file %s" % file_name)
        file_path = os.path.join(output_dir, file_name)
        write_index = df.index.name is not None
        df.to_csv(file_path, index=write_index)

        try:
            # create the melt
            # find the control variables
            control_vars = []
            for column in list(df.columns.values):
                if column[-8:] == "_control": control_vars.append(column[:-8])
            logger.debug("control variables for melt %s" % str(control_vars))

            control_col_names = list("%s_control" % cv for cv in control_vars)
            result_col_names  = list("%s_result"  % cv for cv in control_vars)
            diff_col_names    = list("%s_diff"    % cv for cv in control_vars)

            control_melt_df = (df.melt(id_vars=["geography", "id"], value_vars=control_col_names,
                                       value_name="control")
                                 .replace(to_replace=dict(zip(control_col_names, control_vars))))
            result_melt_df = (df.melt(id_vars=["geography", "id"], value_vars=result_col_names,
                                      value_name="result")
                                .replace(to_replace=dict(zip(result_col_names, control_vars))))
            diff_melt_df = (df.melt(id_vars=["geography", "id"], value_vars=diff_col_names,
                                    value_name="diff")
                              .replace(to_replace=dict(zip(diff_col_names, control_vars))))

            melt_df = pd.merge(left=control_melt_df, right=result_melt_df, how="left",
                               on=["geography", "id", "variable"])
            melt_df = pd.merge(left=melt_df, right=diff_melt_df, how="left",
                               on=["geography", "id", "variable"])
            # DataFrame.append was removed in pandas 2.0; use pd.concat instead
            summary_melt_df = pd.concat([summary_melt_df, melt_df])

        except Exception:
            # skip the melt if the table lacks the expected control/result/diff columns
            pass

    if len(summary_melt_df) > 0:
        file_name = "summary_melt.csv"
        logger.info("writing output file %s" % file_name)
        file_path = os.path.join(output_dir, file_name)
        # the melted summary table has a meaningless RangeIndex, so don't write it
        summary_melt_df.to_csv(file_path, index=False)
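The melt logic above reshapes wide <var>_control / <var>_result / <var>_diff columns into one long table keyed by (geography, id, variable). A self-contained sketch of the same reshape on a toy summary table; the melt_suffix helper is introduced here just for brevity:

import pandas as pd

df = pd.DataFrame({
    'geography': ['TRACT', 'TRACT'],
    'id': [1, 2],
    'num_hh_control': [100, 200],
    'num_hh_result': [98, 203],
    'num_hh_diff': [-2, 3],
})

control_vars = [c[:-8] for c in df.columns if c.endswith('_control')]

def melt_suffix(suffix, value_name):
    # melt one family of columns and rename variables to their bare control name
    cols = ['%s_%s' % (cv, suffix) for cv in control_vars]
    return (df.melt(id_vars=['geography', 'id'], value_vars=cols,
                    value_name=value_name)
              .replace(to_replace=dict(zip(cols, control_vars))))

melt_df = melt_suffix('control', 'control')
for suffix in ('result', 'diff'):
    melt_df = melt_df.merge(melt_suffix(suffix, suffix),
                            on=['geography', 'id', 'variable'])

print(melt_df)  # columns: geography, id, variable, control, result, diff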
Example #13
def write_tables(output_dir):
    """
    Write pipeline tables as csv files (in output directory) as specified by output_tables list
    in settings file.

    'output_tables' can specify either a list of output tables to include or to skip
    if no output_tables list is specified, then no checkpointed tables will be written

    To write all output tables EXCEPT the households and persons tables:

    ::

      output_tables:
        action: skip
        tables:
          - households
          - persons

    To write ONLY the households table:

    ::

      output_tables:
        action: include
        tables:
           - households

    Parameters
    ----------
    output_dir: str

    """

    output_tables_settings_name = 'output_tables'

    output_tables_settings = setting(output_tables_settings_name)

    output_tables_list = pipeline.checkpointed_tables()

    if output_tables_settings is None:
        logger.info(
            "No output_tables specified in settings file. Nothing to write.")
        return

    action = output_tables_settings.get('action')
    tables = output_tables_settings.get('tables')
    prefix = output_tables_settings.get('prefix', 'final_')

    if action not in ['include', 'skip']:
        raise "expected %s action '%s' to be either 'include' or 'skip'" % \
              (output_tables_settings_name, action)

    if action == 'include':
        output_tables_list = tables
    elif action == 'skip':
        output_tables_list = [t for t in output_tables_list if t not in tables]

    # should provide option to also write checkpoints?
    # output_tables_list.append("checkpoints.csv")

    for table_name in output_tables_list:
        table = inject.get_table(table_name, None)

        if table is None:
            logger.warning("Skipping '%s': Table not found." % table_name)
            continue

        df = table.to_frame()
        file_name = "%s%s.csv" % (prefix, table_name)
        logger.info("writing output file %s" % file_name)
        file_path = os.path.join(output_dir, file_name)
        write_index = df.index.name is not None
        df.to_csv(file_path, index=write_index)

    # boolean equality (XNOR): write checkpoints when explicitly included,
    # or when action is 'skip' and checkpoints is not in the skip list
    if (action == 'include') == ('checkpoints' in tables):
        # write checkpoints
        file_name = "%s%s.csv" % (prefix, 'checkpoints')
        pipeline.get_checkpoints().to_csv(os.path.join(output_dir, file_name))
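The final condition deserves a closer look: (action == 'include') == ('checkpoints' in tables) is a boolean equality, so checkpoints are written when they are explicitly included, or when the action is 'skip' and they are not explicitly skipped. Enumerating the four cases makes that concrete:

for action in ('include', 'skip'):
    for tables in (['households', 'checkpoints'], ['households']):
        writes = (action == 'include') == ('checkpoints' in tables)
        listed = 'checkpoints' in tables
        print("%-7s listed=%-5s -> %s" % (action, listed, 'write' if writes else 'no write'))
# include, listed     -> write
# include, not listed -> no write
# skip, listed        -> no write
# skip, not listed    -> write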