Пример #1
0
def create_df(data_type,
              db_table,
              pivot=False,
              index=['age', 'race_ethn', 'sex']):
    """
    Run the configured SQL query for one data type and return the result
    as a pandas DataFrame.

    The query template is the attribute of the ``sql`` module named by
    ``data_type``; it is filled in with the database table name and the
    rate version read from 'model_config.yml'.

    Args:
        data_type : string
            name of the query template in ``sql`` and key into the
            'rate_versions' config section
            (e.g. birth, migration, population)
        db_table : string
            key into the 'db_tables' config section giving the table name
        pivot : boolean, optional (default False)
            when True the query result is passed through util.apply_pivot
        index : list of strings or None, optional
            column(s) handed to DataFrame.set_index; None leaves the
            index of the query result untouched

    Returns:
        pandas DataFrame
            SQL query result, optionally pivoted and re-indexed
    """
    config_file = 'model_config.yml'

    # SQLAlchemy engine for the input database
    engine = create_engine(
        database.get_connection_string(config_file, 'in_db'))

    # rate versions and table names configured for the current model
    versions = util.yaml_to_dict(config_file, 'rate_versions')
    table_names = util.yaml_to_dict(config_file, 'db_tables')

    # query template lives in sql.py; fill in table name and rate version
    query_template = getattr(sql, data_type)
    query = query_template % (table_names[db_table], versions[data_type])

    frame = pd.read_sql(query, engine)

    # migration rates arrive as 4 rates in columns (domestic in/out,
    # foreign in/out), so callers ask for a pivot in that case
    if pivot:
        frame = util.apply_pivot(frame)

    # no index requested: hand back the frame as queried
    if index is None:
        return frame

    # MultiIndex on the cohort attributes
    return frame.set_index(index)
Пример #2
0
def create_df(data_type,db_table,pivot=False):
    """
    Run the configured SQL query for one data type and return the result
    as a pandas DataFrame indexed by cohort.

    The query template is the attribute of the ``sql`` module named by
    ``data_type``; it is filled in with the database table name and the
    rate version read from 'model_config.yml'.

    Args:
        data_type : string
            name of the query template in ``sql`` and key into the
            'rate_versions' config section
            (e.g. birth, migration, population)
        db_table : string
            key into the 'db_tables' config section giving the table name
        pivot : boolean, optional (default False)
            when True the query result is passed through util.apply_pivot

    Returns:
        pandas DataFrame
            SQL query result indexed on 'age', 'race_ethn' and 'sex'
    """
    config_file = 'model_config.yml'

    # SQLAlchemy engine for the input database
    engine = create_engine(
        database.get_connection_string(config_file, 'in_db'))

    # rate versions and table names configured for the current model
    versions = util.yaml_to_dict(config_file, 'rate_versions')
    table_names = util.yaml_to_dict(config_file, 'db_tables')

    # query template lives in sql.py; fill in table name and rate version
    query_template = getattr(sql, data_type)
    query = query_template % (table_names[db_table], versions[data_type])

    frame = pd.read_sql(query, engine)

    # migration rates arrive as 4 rates in columns (domestic in/out,
    # foreign in/out), so callers ask for a pivot in that case
    if pivot:
        frame = util.apply_pivot(frame)

    # MultiIndex on the cohort attributes
    cohort_columns = ['age', 'race_ethn', 'sex']
    return frame.set_index(cohort_columns)
Пример #3
0
    def run(self):
        """
        Store the latest run id and per-year 'mil_gc_pop' / 'mil_hh_pop'
        totals in 'temp/data.h5'.

        Steps:
          * read the 'max_run_id' query from the output database and save
            its 'max' value under the 'run_id' key;
          * load the 'inc_mil_gc_pop' query (output database, filtered by
            run id) and the 'inc_mil_hh_pop' query (input database,
            filtered by the configured population version), both indexed
            by age / race_ethn / sex;
          * keep only the second frame's rows where 'mildep' equals 'Y',
            join them onto the first, and sum both columns by 'yr';
          * save the result under the 'mil_pop' key.
        """
        config_file = 'model_config.yml'
        store_path = 'temp/data.h5'
        cohort_cols = ('age', 'race_ethn', 'sex')

        out_engine = create_engine(
            get_connection_string("model_config.yml", 'output_database'))
        in_engine = create_engine(
            database.get_connection_string(config_file, 'in_db'))

        # most recent run id recorded in the output database
        latest_run = pd.read_sql(
            getattr(sql, 'max_run_id'), out_engine, index_col=None)
        run_id = pd.Series([latest_run['max'].iloc[0]])
        run_id.to_hdf(store_path, 'run_id', mode='a')

        versions = util.yaml_to_dict(config_file, 'rate_versions')
        table_names = util.yaml_to_dict(config_file, 'db_tables')

        gc_query = getattr(sql, 'inc_mil_gc_pop') % (
            table_names['inc_pop_table'], run_id[0])
        hh_query = getattr(sql, 'inc_mil_hh_pop') % (
            table_names['population_table'], versions['population'])

        gc_pop = pd.read_sql(
            gc_query, out_engine, index_col=list(cohort_cols))
        hh_pop = pd.read_sql(
            hh_query, in_engine, index_col=list(cohort_cols))

        # only rows flagged mildep == 'Y' take part in the join
        flagged = hh_pop.loc[hh_pop['mildep'] == 'Y']
        combined = gc_pop.join(flagged)

        combined = combined.rename(columns={'persons': 'mil_gc_pop',
                                            'mil_mildep': 'mil_hh_pop'})
        combined = combined.reset_index(drop=False)

        # yearly totals of both columns
        value_cols = combined[['mil_gc_pop', 'mil_hh_pop']]
        yearly = pd.DataFrame(value_cols.groupby([combined['yr']]).sum())
        yearly.to_hdf(store_path, 'mil_pop', mode='a')
Пример #4
0
    def run(self):
        """
        Build the per-year, per-age-category population table and store it
        in 'temp/data.h5', unless that file already exists.

        When 'temp/data.h5' is already present the method only prints
        'File exists'. Otherwise it:
          * registers a new run via log.new_run(name='inc_run_log') and
            saves the returned id under the 'run_id' key;
          * loads the 'inc_pop' query from the output database and the
            'inc_pop_mil' query from the input database, both indexed by
            age / race_ethn / sex / mildep;
          * subtracts 'mil_mildep' from 'persons', keeps ages >= 18, and
            labels each row with an age category;
          * sums 'persons' by 'yr' and 'age_cat' and saves the result
            under the 'pop' key.
        """
        my_file = Path('temp/data.h5')
        if my_file.is_file():
            # print() with one argument prints the same text on Python 2
            # and Python 3; the bare print statement only parses on 2
            print('File exists')
            return

        db_run_id = log.new_run(name='inc_run_log')
        run_id = pd.Series([db_run_id])
        run_id.to_hdf('temp/data.h5', 'run_id', mode='a')

        engine = create_engine(
            get_connection_string("model_config.yml", 'output_database'))
        db_connection_string = database.get_connection_string(
            'model_config.yml', 'in_db')
        sql_in_engine = create_engine(db_connection_string)

        rate_versions = util.yaml_to_dict('model_config.yml', 'rate_versions')
        tables = util.yaml_to_dict('model_config.yml', 'db_tables')
        in_query = getattr(sql, 'inc_pop') % (
            tables['inc_pop_table'], rate_versions['inc_pop'])
        in_query2 = getattr(sql, 'inc_pop_mil') % (
            tables['population_table'], rate_versions['population'])

        index_cols = ['age', 'race_ethn', 'sex', 'mildep']
        pop = pd.read_sql(in_query, engine, index_col=list(index_cols))
        pop_mil = pd.read_sql(in_query2, sql_in_engine,
                              index_col=list(index_cols))

        pop = pop.join(pop_mil)
        pop['persons'] = (pop['persons'] - pop['mil_mildep'])
        pop = pop.reset_index(drop=False)
        pop = pop[pop['age'] >= 18]

        # (first age, stop age exclusive, label); ages outside every range
        # keep the empty-string category
        age_bins = (
            (18, 25, '18_24'),
            (25, 35, '25_34'),
            (35, 45, '35_44'),
            (45, 55, '45_54'),
            (55, 60, '55_59'),
            (60, 65, '60_64'),
            (65, 75, '65_74'),
            (75, 103, '75_99'),
        )
        pop['age_cat'] = ''
        for first_age, stop_age, label in age_bins:
            in_bin = pop['age'].isin(list(range(first_age, stop_age)))
            pop.loc[in_bin, ['age_cat']] = label

        pop = pd.DataFrame(
            pop['persons'].groupby([pop['yr'], pop['age_cat']]).sum())

        pop.to_hdf('temp/data.h5', 'pop', mode='a')
Пример #5
0
def new_run(name='runs'):
    """
    Insert the configured rate versions as a new row of a run-log table
    and return that row's primary key.

    The table lives in the 'defm' schema of the database named by the
    'output_database' entry of 'model_config.yml'; it is created first if
    it does not exist. The 'results/' directory is also created when
    missing.

    Args:
        name : string, optional (default 'runs')
            name of the run-log table

    Returns:
        run_id : value of the new row's ``id`` column
    """
    Base = declarative_base()
    table_name = name

    class Run(Base):
        __tablename__ = table_name
        __table_args__ = {'schema': 'defm'}
        # define columns for the table
        id = Column(Integer, primary_key=True)
        base_rate_version = Column(Integer)
        birth_rate_version = Column(Integer)
        death_rate_version = Column(Integer)
        migration_rate_version = Column(Integer)
        householder_rate_version = Column(Integer)

    db_dir = 'results/'
    if not os.path.exists(db_dir):
        os.makedirs(db_dir)

    engine = create_engine(
        get_connection_string(
            "model_config.yml",
            'output_database')).execution_options(schema_translate_map={
                None: "defm",  # no schema name -> "defm"
            })
    Base.metadata.schema = 'defm'
    if not engine.has_table(table_name, schema='defm'):
        Base.metadata.create_all(engine)

    # Rate versions from yml file
    rate_versions = util.yaml_to_dict('model_config.yml', 'rate_versions')

    # Row holding the versions used by this run
    model_run = Run(base_rate_version=rate_versions['population'],
                    birth_rate_version=rate_versions['birth'],
                    death_rate_version=rate_versions['death'],
                    migration_rate_version=rate_versions['migration'],
                    householder_rate_version=rate_versions['householder'])

    db_session = sessionmaker(bind=engine)
    session = db_session()
    try:
        session.add(model_run)
        session.commit()
        # read the generated key while the session is still open: commit
        # expires the instance, so the attribute is reloaded on access
        run_id = model_run.id
    except Exception:
        # leave no half-finished transaction behind, then re-raise
        session.rollback()
        raise
    finally:
        # always hand the connection back to the pool
        session.close()
    return run_id
Пример #6
0
    def run(self):
        """
        Store the latest run id, the simulation rate tables and the
        per-year, per-age-category population table in 'temp/data.h5'.

        Steps:
          * read the 'max_run_id' query from the output database and save
            its 'id' value under the 'run_id' key;
          * fetch 'dem_sim_rates' and 'econ_sim_rates' through
            extract.create_df (using self.dem_id / self.econ_id) and save
            each under its own key;
          * load the 'inc_pop' query (output database) and the
            'inc_mil_hh_pop' query (input database), both indexed by
            age / race_ethn / sex / mildep;
          * subtract 'mil_mildep' from 'persons', keep ages >= 18, label
            each row with an age category, sum 'persons' by 'yr' and
            'age_cat', and save the result under the 'pop' key.
        """
        config_file = 'model_config.yml'
        store_path = 'temp/data.h5'
        index_cols = ('age', 'race_ethn', 'sex', 'mildep')

        out_engine = create_engine(
            get_connection_string("model_config.yml", 'output_database'))
        in_engine = create_engine(
            database.get_connection_string(config_file, 'in_db'))

        # most recent run id recorded in the output database
        latest_run = pd.read_sql(
            getattr(sql, 'max_run_id'), out_engine, index_col=None)
        run_id = pd.Series([latest_run['id'].iloc[0]])
        run_id.to_hdf(store_path, 'run_id', mode='a')

        dem_sim_rates = extract.create_df(
            'dem_sim_rates', 'dem_sim_rates_table',
            rate_id=self.dem_id, index=None)
        dem_sim_rates.to_hdf(store_path, 'dem_sim_rates', mode='a')

        econ_sim_rates = extract.create_df(
            'econ_sim_rates', 'econ_sim_rates_table',
            rate_id=self.econ_id, index=None)
        econ_sim_rates.to_hdf(store_path, 'econ_sim_rates', mode='a')

        table_names = util.yaml_to_dict(config_file, 'db_tables')

        pop_query = getattr(sql, 'inc_pop') % (
            table_names['inc_pop_table'], run_id[0])
        mil_query = getattr(sql, 'inc_mil_hh_pop') % (
            table_names['population_table'],
            dem_sim_rates.base_population_id[0])

        pop = pd.read_sql(pop_query, out_engine, index_col=list(index_cols))
        pop_mil = pd.read_sql(mil_query, in_engine,
                              index_col=list(index_cols))

        pop = pop.join(pop_mil)
        pop['persons'] = (pop['persons'] - pop['mil_mildep'])
        pop = pop.reset_index(drop=False)
        pop = pop[pop['age'] >= 18]

        # (first age, stop age exclusive, label); ages outside every range
        # keep the empty-string category
        age_bins = (
            (18, 25, '18_24'),
            (25, 35, '25_34'),
            (35, 45, '35_44'),
            (45, 55, '45_54'),
            (55, 60, '55_59'),
            (60, 65, '60_64'),
            (65, 75, '65_74'),
            (75, 103, '75_99'),
        )
        pop['age_cat'] = ''
        for first_age, stop_age, label in age_bins:
            in_bin = pop['age'].isin(list(range(first_age, stop_age)))
            pop.loc[in_bin, ['age_cat']] = label

        totals = pop['persons'].groupby([pop['yr'], pop['age_cat']]).sum()
        pop = pd.DataFrame(totals)

        pop.to_hdf(store_path, 'pop', mode='a')
Пример #7
0
def new_run(db_name):
    """
    Insert the configured rate versions as a new row of the 'run_log'
    table of a SQLite database and return that row's primary key.

    The database file is 'results/' + db_name; the 'results/' directory
    and the 'run_log' table are created first when missing.

    Args:
        db_name : string
            file name of the SQLite database inside 'results/'

    Returns:
        run_id : value of the new row's ``id`` column
    """
    Base = declarative_base()
    table_name = 'run_log'

    class Run(Base):
        __tablename__ = table_name
        # define columns for the table
        id = Column(Integer, primary_key=True)
        base_rate_version = Column(Integer)
        birth_rate_version = Column(Integer)
        death_rate_version = Column(Integer)
        migration_rate_version = Column(Integer)
        householder_rate_version = Column(Integer)

    db_dir = 'results/'
    if not os.path.exists(db_dir):
        os.makedirs(db_dir)

    engine = create_engine('sqlite:///' + db_dir + db_name)

    if not engine.has_table(table_name):
        Base.metadata.create_all(engine)

    # Rate versions from yml file
    rate_versions = util.yaml_to_dict('model_config.yml', 'rate_versions')

    # Row holding the versions used by this run
    model_run = Run(
        base_rate_version=rate_versions['population'],
        birth_rate_version=rate_versions['birth'],
        death_rate_version=rate_versions['death'],
        migration_rate_version=rate_versions['migration'],
        householder_rate_version=rate_versions['householder'])

    db_session = sessionmaker(bind=engine)
    session = db_session()
    try:
        session.add(model_run)
        session.commit()
        # read the generated key while the session is still open: commit
        # expires the instance, so the attribute is reloaded on access
        run_id = model_run.id
    except Exception:
        # leave no half-finished transaction behind, then re-raise
        session.rollback()
        raise
    finally:
        # always release the connection held by the session
        session.close()
    return run_id
Пример #8
0
    def run(self):
        """
        Compute newborns for ``self.year``, apply death rates to them, and
        update the 'dead_pop' and 'new_born' tables in 'temp/data.h5'.

        Reads 'birth_rates', 'non_mig_pop', 'death_rates' and 'dead_pop'
        from 'temp/data.h5'. Only population rows with type 'HHP' and
        mildep 'N' are used for births. Newborn deaths are added to the
        'deaths_hhp_non_mil' column of 'dead_pop'; the surviving newborn
        table is written under the 'new_born' key.
        """
        birth_rates = pd.read_hdf('temp/data.h5', 'birth_rates')
        pop = pd.read_hdf('temp/data.h5', 'non_mig_pop')
        pop = pop[(pop['type'] == 'HHP') & (pop['mildep'] == 'N')]
        birth_rates = utils.rates_for_yr(pop, birth_rates, self.year)
        birth_rates = birth_rates[(birth_rates['yr'] == self.year)]

        rate_versions = util.yaml_to_dict('model_config.yml', 'rate_versions')

        random_numbers = extract.create_df(
            'random_numbers', 'random_numbers_table',
            rate_id=rate_versions['random_numbers'])

        # keep only this year's random numbers, as a one-column frame
        random_numbers = random_numbers[(random_numbers['yr'] == self.year)]
        random_numbers = random_numbers[['random_number']]
        births_per_cohort = cp.births_all(
            birth_rates, pop_col='non_mig_pop', rand_df=random_numbers)

        death_rates = pd.read_hdf('temp/data.h5', 'death_rates')
        death_rates = death_rates[(death_rates['yr'] == self.year)]
        # sum newborn population across cohorts
        newborn = cp.births_sum(births_per_cohort, self.year)

        # deaths among newborns, and the newborns that survive
        newborn = newborn.join(death_rates)
        newborn['new_deaths'] = (
            newborn['new_born'] * newborn['death_rate']).round()
        newborn['new_born_survived'] = (
            newborn['new_born'] - newborn['new_deaths']).round()

        dead_pop = pd.read_hdf('temp/data.h5', 'dead_pop')
        dead_pop = dead_pop.join(newborn['new_deaths'])

        # rows without a newborn match contribute zero extra deaths
        dead_pop = dead_pop.fillna(0)
        dead_pop['deaths_hhp_non_mil'] = (
            dead_pop['deaths_hhp_non_mil'] + dead_pop['new_deaths']).round()

        # axis must be passed by keyword: positional use is rejected by
        # pandas 2.x and the keyword form works on every version
        dead_pop = dead_pop.drop(['new_deaths'], axis=1)

        dead_pop.to_hdf('temp/data.h5', 'dead_pop', mode='a')

        newborn = newborn.drop(['new_deaths', 'death_rate'], axis=1)
        newborn.to_hdf('temp/data.h5', 'new_born', mode='a')
Пример #9
0
# Record the wall-clock start time so the script's run time can be measured
start_time = time.time()

# Change the working directory to the directory containing this file, so
# the relative path 'model_config.yml' used below resolves next to it
full_path = os.path.abspath(inspect.getfile(inspect.currentframe()))
os.chdir(os.path.dirname(full_path))

# Set console display to show the full MultiIndex on every row
pd.set_option('display.multi_sparse', False)

# Write rate versions to the result database; the return value is the
# primary key of the new row in the run table
# db_run_id = log.new_run('model_summary.db')
db_run_id = log.new_run('defm.db')

# 'years' section of the config file
years = util.yaml_to_dict('model_config.yml', 'years')

# Load rates for all years: SQL query to pandas DataFrame
#   columns:  'age', 'race_ethn', 'sex' (cohort), 'rate', 'year'
#   the migration DataFrame is pivoted because it carries 4 rates:
#   domestic in & out, foreign in & out

birth_rates = extract.create_df('birth', 'rate_table')
death_rates = extract.create_df('death', 'rate_table')
mig_rates = extract.create_df('migration', 'rate_table', pivot=True)


# Load base population: SQL query to pandas DataFrame
#   columns:  'age', 'race_ethn', 'sex' (cohort),
#   'gq.type', 'mildep', 'persons', 'households'

population = extract.create_df('population', 'population_table')
Пример #10
0
    def run(self):
        """
        Store the latest run id and the population table grouped by year,
        age category, sex and race/ethnicity in 'temp/data.h5'.

        Steps:
          * read the 'max_run_id' query from the output database and save
            its 'max' value under the 'run_id' key;
          * load the 'inc_pop' query (output database, filtered by run id)
            and the 'inc_mil_hh_pop' query (input database, filtered by
            the configured population version), both indexed by
            age / race_ethn / sex / mildep;
          * subtract 'mil_mildep' from 'persons' and label each row with
            an age category;
          * sum 'persons' by 'yr', 'age_cat', 'sex' and 'race_ethn' and
            save the result under the 'pop' key.
        """
        config_file = 'model_config.yml'
        store_path = 'temp/data.h5'
        index_cols = ('age', 'race_ethn', 'sex', 'mildep')

        out_engine = create_engine(
            get_connection_string("model_config.yml", 'output_database'))
        in_engine = create_engine(
            database.get_connection_string(config_file, 'in_db'))

        # most recent run id recorded in the output database
        latest_run = pd.read_sql(
            getattr(sql, 'max_run_id'), out_engine, index_col=None)
        run_id = pd.Series([latest_run['max'].iloc[0]])
        run_id.to_hdf(store_path, 'run_id', mode='a')

        versions = util.yaml_to_dict(config_file, 'rate_versions')
        table_names = util.yaml_to_dict(config_file, 'db_tables')

        pop_query = getattr(sql, 'inc_pop') % (
            table_names['inc_pop_table'], run_id[0])
        mil_query = getattr(sql, 'inc_mil_hh_pop') % (
            table_names['population_table'], versions['population'])

        pop = pd.read_sql(pop_query, out_engine, index_col=list(index_cols))
        pop_mil = pd.read_sql(mil_query, in_engine,
                              index_col=list(index_cols))

        pop = pop.join(pop_mil)
        pop['persons'] = (pop['persons'] - pop['mil_mildep'])
        pop = pop.reset_index(drop=False)

        # (first age, stop age exclusive, label); ages outside every range
        # keep the empty-string category
        age_bins = (
            (0, 5, '00_04'),
            (5, 10, '05_09'),
            (10, 15, '10_14'),
            (15, 18, '15_17'),
            (18, 20, '18_19'),
            (20, 21, '20_20'),
            (21, 22, '21_21'),
            (22, 25, '22_24'),
            (25, 30, '25_29'),
            (30, 35, '30_34'),
            (35, 40, '35_39'),
            (40, 45, '40_44'),
            (45, 50, '45_49'),
            (50, 55, '50_54'),
            (55, 60, '55_59'),
            (60, 62, '60_61'),
            (62, 65, '62_64'),
            (65, 67, '65_66'),
            (67, 70, '67_69'),
            (70, 75, '70_74'),
            (75, 80, '75_79'),
            (80, 85, '80_84'),
            (85, 103, '85_99'),
        )
        pop['age_cat'] = ''
        for first_age, stop_age, label in age_bins:
            in_bin = pop['age'].isin(list(range(first_age, stop_age)))
            pop.loc[in_bin, ['age_cat']] = label

        group_keys = [pop['yr'], pop['age_cat'], pop['sex'], pop['race_ethn']]
        pop = pd.DataFrame(pop['persons'].groupby(group_keys).sum())
        pop.to_hdf(store_path, 'pop', mode='a')
Пример #11
0
# Record the wall-clock start time so the script's run time can be measured
start_time = time.time()

# Change the working directory to the directory containing this file, so
# the relative path 'model_config.yml' used below resolves next to it
full_path = os.path.abspath(inspect.getfile(inspect.currentframe()))
os.chdir(os.path.dirname(full_path))

# Set console display to show the full MultiIndex on every row
pd.set_option('display.multi_sparse', False)

# Write rate versions to the result database; the return value is the
# primary key of the new row in the run table
# db_run_id = log.new_run('model_summary.db')
db_run_id = log.new_run()

# 'years' section of the config file
years = util.yaml_to_dict('model_config.yml', 'years')

# Load rates for all years: SQL query to pandas DataFrame
#   columns:  'age', 'race_ethn', 'sex' (cohort), 'rate', 'year'
#   the migration DataFrame is pivoted because it carries 4 rates:
#   domestic in & out, foreign in & out

birth_rates = extract.create_df('birth', 'rate_table')
death_rates = extract.create_df('death', 'rate_table')
mig_rates = extract.create_df('migration', 'rate_table', pivot=True)

# Load base population: SQL query to pandas DataFrame
#   columns:  'age', 'race_ethn', 'sex' (cohort),
#   'gq.type', 'mildep', 'persons', 'households'

population = extract.create_df('population', 'population_table')
Пример #12
0
 def requires(self):
     """
     Return one ExportTables task per year in the configured range.

     The range runs from the 'y1' to the 'yf' entry (inclusive) of the
     'years' section of 'model_config.yml'.
     """
     span = util.yaml_to_dict('model_config.yml', 'years')
     first_year = span['y1']
     final_year = span['yf']
     return [ExportTables(year)
             for year in range(first_year, final_year + 1)]