Пример #1
0
def get_icd_to_bundle_map(map_version):
    """Return the ICD cause_code -> bundle_id mapping for one map version.

    Pulls the cause_code->ICG and ICG->bundle tables, restricts to ICD9/10
    code systems, attaches bundle durations, and returns a deduplicated
    cause-code-to-bundle lookup with sanitized cause codes.

    Params:
        map_version: (int) version of the mapping tables to read

    Returns:
        pd.DataFrame with columns bundle_id, cause_code, icg_measure,
        bundle_duration, code_system_id, map_version
    """
    cc = clinical_mapping.get_clinical_process_data("cause_code_icg",
                                                    map_version=map_version)

    # code_system_id 1 and 2 are ICD9/ICD10; drop the special maps (>= 3)
    cc = cc[cc.code_system_id < 3].copy()

    dur = clinical_mapping.create_bundle_durations(map_version=map_version)

    bun = clinical_mapping.get_clinical_process_data("icg_bundle",
                                                     map_version=map_version)

    df = bun.merge(cc, how='left', on=['icg_id', 'icg_name', 'map_version'])
    df = df.merge(dur, how='left', on=['bundle_id', 'map_version'])

    keeps = [
        'bundle_id', 'cause_code', 'icg_measure', 'bundle_duration',
        'code_system_id', 'map_version'
    ]
    df = df[keeps].drop_duplicates()
    # Strip all non-word characters (e.g. dots) from the ICD codes.
    # BUG FIX: regex=True is required — pandas >= 2.0 treats the pattern as a
    # literal string by default, so the original "\W" pattern stripped nothing.
    df['cause_code'] = df['cause_code'].str.replace(r"\W", "", regex=True)

    return df
Пример #2
0
    def create_bundle_file(self):
        """Write the bundle_id -> measure stata file for this run.

        Builds a bundle/measure lookup from the icg_bundle and icg_durations
        tables, forces a small set of bundles to measure 'prev', verifies each
        bundle has exactly one measure, and writes the result to `FILEPATH`.

        Returns
        -------
        None
        """
        bundles = clinical_mapping.get_clinical_process_data('icg_bundle', prod=True)
        measures = clinical_mapping.get_clinical_process_data('icg_durations', prod=True)

        # reduce to an icg_id -> measure lookup, then attach it to the bundles
        measures = measures.drop(['icg_name', 'icg_duration', 'map_version'], axis=1).drop_duplicates()
        merged = bundles.merge(measures, how='left', on='icg_id')

        # modeler request: these bundles are always treated as prevalence
        prev_bundles = [545, 543, 544]
        warnings.warn("We're forcing the bundles {} to have measure prevalence for the modeler".format(prev_bundles))
        merged.loc[merged.bundle_id.isin(prev_bundles), 'icg_measure'] = 'prev'

        merged = merged[['bundle_id', 'icg_measure']].drop_duplicates()
        merged = merged.drop_duplicates()
        # every bundle must map to exactly one measure
        assert merged[merged.bundle_id.duplicated(keep=False)].shape[0] == 0,\
            "Bundles have duped icg measures"

        merged = merged.rename(columns={'icg_measure': 'measure'})
        merged['measure'] = merged['measure'].astype(str)

        stata_path = "FILEPATH"\
                     "FILEPATH".format(self.run_id)
        merged.to_stata(stata_path, write_index=False)
        return
Пример #3
0
def get_map_data(map_version, append_unmapped_bundles):
    """Summarize which bundles a given map version contains.

    Params:
        map_version: (int) used in the sql where clause to pull one map version
        append_unmapped_bundles: (bool) when True, append the parent injury
            and maternal-ratio bundles that have no direct ICD mapping

    Returns:
        df: pd.DataFrame of bundle_ids present in the map (one row per
            bundle/map_version) with a present_in_map flag
        bundle_icg_detail: pd.DataFrame at the bundle/icg/code-system level,
            including the minimum ICD9/10 cause_code length each icg maps to.
            Useful for determining when bundles aren't expected by code system.
    """
    bundle_df = clinical_mapping.get_clinical_process_data(
        'icg_bundle', map_version=map_version)
    cc_icg_df = clinical_mapping.get_clinical_process_data(
        'cause_code_icg', map_version=map_version)

    # shortest ICD9/10 cause code each icg maps to
    lengths = cc_icg_df.copy()
    lengths['min_cc_length'] = lengths['cause_code'].apply(len)
    lengths = (lengths.query("code_system_id in (1, 2)")
               .groupby('icg_id')
               .agg({'min_cc_length': 'min'})
               .reset_index())

    rows_before = cc_icg_df.shape[0]
    cc_icg_df = cc_icg_df.merge(lengths, how='left', on=['icg_id'])
    assert rows_before == cc_icg_df.shape[0]

    cc_icg_df = cc_icg_df[[
        'code_system_id', 'icg_id', 'min_cc_length', 'map_version'
    ]].drop_duplicates()

    df = bundle_df.merge(cc_icg_df, how='left', on=['icg_id', 'map_version'])

    if append_unmapped_bundles:
        # parent injuries and maternal ratio bundles with no ICD mapping
        unmapped = [
            264, 269, 270, 272, 275, 276, 362, 6113, 6116, 6119, 6122, 6125
        ]
        df = pd.concat([df, pd.DataFrame({'bundle_id': unmapped})],
                       sort=False,
                       ignore_index=True)

    bundle_icg_detail = df.copy()

    # one row per bundle/map_version
    df = df[['bundle_id', 'map_version']].drop_duplicates()
    expected_rows = bundle_df.bundle_id.unique().size
    if append_unmapped_bundles:
        expected_rows += len(unmapped)
    assert df.shape[0] == expected_rows

    df['present_in_map'] = True

    confirm_bundles_exist(df)
    confirm_bundles_exist(bundle_icg_detail)

    return df, bundle_icg_detail
Пример #4
0
def update_map_version(dev=True, tables=None):
    '''
    Pulls the latest version of each desired table, bumps the map version id
    and appends the result back to that table. Rows with a map_version of -1
    (a dummy value signalling a new or updated row) are folded in, promoted
    to the next map version, and the -1 rows are then deleted from the
    database. If there are no new / updated rows the table is simply
    re-appended with the version incremented by 1.

    Args:
        dev (bool): when True and the default/'all' table set is used,
            operate on the '_dev' copies of the tables. Defaults to True.
        tables (list of str, 'all', or None): which table(s) to update. The
            keyword 'all' (or None) updates every map related table in the
            database. Defaults to None.

    Raises:
        TypeError: when tables is not a list. This excludes the usage of the
            'all' / None shortcut.
    '''
    engine = create_engine()

    all_tables = [
        'cause_code_icg', 'icg_bundle', 'icg_durations', 'age_sex_restrictions'
    ]

    # BUG FIX: the original referenced an undefined name `table` here and
    # unconditionally clobbered the `tables` argument with the full list,
    # making the parameter useless.
    if tables is None or tables == 'all':
        tables = [e + '_dev' for e in all_tables] if dev else list(all_tables)

    if type(tables) is not list:
        raise TypeError('tables must be in list format')

    for table in tables:
        tbl = cm.get_clinical_process_data(table)
        try:
            tbl_new = cm.get_clinical_process_data(table, map_version=-1)
        except AssertionError:
            # no pending (-1) rows for this table
            tbl_new = None

        if type(tbl_new) == pd.core.frame.DataFrame:
            # prefer the -1 (new/updated) rows over existing ones, matching
            # on every column except map_version; sorting ascending puts the
            # -1 rows first so keep='first' retains them
            cols = list(tbl.columns)
            cols.remove('map_version')
            df = pd.concat([tbl, tbl_new])
            df.sort_values(by=['map_version'], inplace=True)
            # BUG FIX: the original used the undefined name `col` here
            tbl = df.drop_duplicates(subset=cols, keep='first')
            q = '''
                DELETE FROM {}
                WHERE map_version = -1
                '''.format(table)
            engine.execute(q)

        # promote everything to the next map version and write it back
        tbl['map_version'] = max(tbl.map_version.unique()) + 1
        tbl = remove_nulls(tbl)
        tbl.to_sql(table, con=engine, if_exists='append', index=False)
def write_acause_rei_to_bundle_map(run_id):
    """Build and write the bundle_id -> acause/rei lookup for a run.

    Joins the bundle map to cause and rei metadata, labels each bundle with
    its acause (or its rei, which takes precedence when both are present),
    validates the cause/rei id-name pairings, and writes the result to the
    run folder.

    Params:
        run_id: (int) clinical run id used to build the output path
    """
    # NOTE(review): old_file is read but never used below — confirm whether
    # it can be removed or was meant to be compared against new_map
    old_file = pd.read_csv("FILEPATH")

    bun = clinical_mapping.get_clinical_process_data('icg_bundle',
                                                     prod=True,
                                                     map_version='current')
    bun = bun[['bundle_id', 'map_version']].drop_duplicates()

    bun_map = query("SQL")

    acause_df = query("SQL")
    rei_df = query("SQL")

    new_map = bun_map[['bundle_id', 'cause_id', 'rei_id']].copy()
    new_map = new_map.merge(acause_df, how='outer', on='cause_id')
    new_map = new_map.merge(rei_df, how='outer', on='rei_id')

    new_map['bundle_acause_rei'] = "Missing"

    # BUG FIX / cleanup: the original built mask strings and ran them through
    # eval(); plain boolean masks have identical behavior without eval.
    cause_mask = new_map['cause_id'].notnull()
    new_map.loc[cause_mask, 'bundle_acause_rei'] = new_map.loc[cause_mask,
                                                               'acause']
    rei_mask = new_map['rei_id'].notnull()
    new_map.loc[rei_mask, 'bundle_acause_rei'] = new_map.loc[rei_mask, 'rei']

    print("beginning a few tests")
    # each acause must map to exactly one cause_id and back
    for acause in acause_df.acause.unique():
        cid = new_map.query("acause == @acause").cause_id.drop_duplicates()
        assert cid.size == 1
        cid = cid.iloc[0]
        assert acause == new_map.query("cause_id == @cid").acause.iloc[0]
    # each rei must map to exactly one rei_id and back
    for rei in rei_df.rei.unique():
        rid = new_map.query("rei == @rei").rei_id.drop_duplicates()
        assert rid.size == 1
        rid = rid.iloc[0]
        assert rei == new_map.query("rei_id == @rid").rei.iloc[0]

    new_map.drop(['cause_id', 'rei_id', 'acause', 'rei'], axis=1, inplace=True)

    assert new_map.shape[0] == new_map.bundle_id.unique(
    ).size, "duplicated bundles"
    assert (new_map['bundle_acause_rei'] == "Missing").sum() == 0

    print("tests passed, writing to run {}".format(run_id))
    write_path = "FILEPATH".format(run_id)
    new_map.to_csv(write_path, index=False)
    return
Пример #6
0
    def convert_obsolete_special_map_icgs(self, df):
        """Converts the out of use ICGs from special maps to ones currently in ICD 9/10 mapping

        Each obsolete icg_name (a key of fix_dict) is rewritten to its current
        replacement (the value), and the replacement's icg_id is looked up
        from the icg_durations table for map version `self.map`.

        Parameters
        ----------
        df : pd.DataFrame
            A DataFrame of inpatient data

        Returns
        -------
        pd.DataFrame
            The `df` after it has had old icgs removed

        Raises
        ------
        AssertionError
            If the looked-up icg_ids drift from the hardcoded expected set,
            or a replacement name does not resolve to exactly one icg_id.
        """
        # obsolete icg_name -> current replacement icg_name
        fix_dict = {

            'e-code, (inj_poisoning_other)': 'poisoning_other',
            'lower respiratory infection(all)': 'lower respiratory infection(unspecified)',
            'neoplasm non invasive other': 'other benign and in situ neoplasms',

            'e-code, (inj_trans)': 'z-code, (inj_trans)',

            'digest, gastritis and duodenitis': '_none',


            'resp, copd, bronchiectasis': 'resp, other'
            }

        # current name -> id lookup for this map version
        chk_df = clinical_mapping.get_clinical_process_data("icg_durations", map_version=self.map)
        chk_df = chk_df[['icg_name', 'icg_id']]

        # keep only the replacement icgs we expect to remap onto
        chk_df = chk_df[chk_df['icg_name'].isin(list(fix_dict.values()))]


        # expected icg_id values for the replacement names above
        # NOTE(review): only the *values* of this dict are checked below; the
        # keys appear informational — confirm they are old-id references
        exp_id_dict = {1267: 864, 1266: 1116, 1265: 159, 1264: 1241, 1263: 1, 1262: 434}

        # guard against the mapping tables drifting from this hardcoding
        sdiff = set(chk_df['icg_id']).symmetric_difference(set(exp_id_dict.values()))
        assert not sdiff, "The icg_ids don't match our hardcoding. This is unexpected"


        for key, value in list(fix_dict.items()):

            # each replacement name must resolve to exactly one icg_id
            an_id = chk_df.query("icg_name == @value")
            assert len(an_id) == 1
            an_id = an_id.icg_id.iloc[0]
            # overwrite both name and id on rows still using the obsolete icg
            df.loc[df['icg_name'] == key, ['icg_name', 'icg_id']] = (value, an_id)

        return df
Пример #7
0
def simple_sex_split(df, map_version='current'):
    """
    Perform a super simple sex split: rows with unknown/both sex (sex_id 3)
    whose icg is restricted to a single sex are reassigned to that sex.
    The age-sex restricted weights should have a value of zero.

    Params:
        df : (pandas.DataFrame) icg level hospital data that's being fed into
             age-sex splitting. Must contain icg_id, icg_name, sex_id and val.
        map_version : (str or int) Can be a specific int, but defaults to
             'current' which matches our process

    Returns:
        pd.DataFrame with the same shape and total cases as the input

    Raises:
        ValueError: when the icg columns are missing, when unexpected sex_id
            values are present, or when the split changes df's shape
    """
    pre = df.shape
    cases = df['val'].sum()
    # BUG FIX: the original tested `'icg_id' and 'icg_name' not in df.columns`,
    # which only ever checked icg_name ('icg_id' is just a truthy string)
    if 'icg_id' not in df.columns or 'icg_name' not in df.columns:
        raise ValueError("We need icg id and name to merge on restricts")
    sex_diff = set(df.sex_id.unique()) - set([1, 2, 3])
    if sex_diff:
        raise ValueError(
            f"There are some unexpected sex values here {sex_diff}")

    restrict = clinical_mapping.get_clinical_process_data(
        'age_sex_restrictions', map_version=map_version, prod=True)

    # only single-sex icgs are usable for this simple split
    restrict = restrict.query("male == 0 or female == 0").copy()
    assert len(
        restrict.query("male == 1 and female == 1")) == 0, "logic was wrong"

    restrict = restrict[['icg_id', 'icg_name', 'male', 'female']]

    df = df.merge(restrict,
                  how='left',
                  on=['icg_id', 'icg_name'],
                  validate='m:1')
    assert len(df) == pre[0]

    # male-restricted -> female (2); female-restricted -> male (1)
    df.loc[(df['male'] == 0) & (df['sex_id'] == 3), 'sex_id'] = 2
    df.loc[(df['female'] == 0) & (df['sex_id'] == 3), 'sex_id'] = 1

    df.drop(['male', 'female'], axis=1, inplace=True)
    post = df.shape

    if pre[0] != post[0] or pre[1] != post[1]:
        raise ValueError(f"This df changed shape!! from {pre} to {post}")
    assert cases == df['val'].sum(), "CASES changed!!!"

    return df
Пример #8
0
def add_icg_ids(df, prod=True):
    '''
    The nfc maps are inconsistent for bs_ids, that is different ids are
    assigned to the same icg across different versions of the maps. We enforce
    consistent ids by merging on the icg_durations table, and assign fresh ids
    to icgs that are not in that table yet.

    Args:
        df (pd.DataFrame): must have an icg_name column
        prod (bool): passed through to the mapping-table reader. New
            backward-compatible parameter — the original referenced an
            undefined global `prod`. Defaults to True.

    Return:
        pd.DataFrame
    '''
    maps = df.copy()
    dur = cm.get_clinical_process_data('icg_durations', prod=prod)

    dur.drop(['map_version', 'icg_measure'], axis=1, inplace=True)

    df = maps.merge(dur, on="icg_name", how='left')
    msg = 'Some rows went missing'
    # BUG FIX: the original compared against an undefined name `temp`;
    # the merge must not change the row count of the incoming map
    assert maps.shape[0] == df.shape[0], msg

    m_icgs = maps.icg_name.unique().tolist()
    # BUG FIX: the original referenced an undefined name `d_cp`;
    # the durations table is `dur`
    d_icgs = dur.icg_name.unique().tolist()

    # icgs present in the map but absent from the durations table need new ids
    m = set(m_icgs) - set(d_icgs)
    print(len(m))
    msg = 'Mismatch of new ICGs'
    assert len(m) == df.loc[df.icg_id.isnull(), 'icg_name'].unique().size, msg

    # BUG FIX: `new_icg` was used before it was assigned; define it first
    new_icg = list(m)
    start = dur.icg_id.max() + 1
    stop = start + len(new_icg)
    new_icg_id = np.arange(start, stop, 1)

    # assign each brand new icg a fresh sequential id
    for name, new_id in zip(new_icg, new_icg_id):
        df.loc[df.icg_name == name, 'icg_id'] = new_id

    msg = 'There are still ICGs without an ID'
    assert df.icg_id.isnull().sum() == 0, msg

    return df
Пример #9
0
def insert_icg_helper(tbl_name, df):
    '''
    Helper function which assists in adding an icg to the database: builds
    the candidate table (existing rows + the new wide-pivoted rows) and runs
    the table-specific validation test on it.

    Args:
        tbl_name (str): mirrors the name of the table that is to be updated.

        df (pd.DataFrame): DataFrame of the new values, saved in a long
            format with icg_id / col_name / value columns.

    Return:
        result: Either a string or a list based on the results of the testing
        df_sub (pd.DataFrame): the wide-format rows to insert, cast to int

    Raises:
        ValueError: when no validation test exists for tbl_name
    '''
    tbl = cm.get_clinical_process_data(tbl_name)
    tbl_cols = tbl.columns.tolist()
    tbl_cols.remove('icg_id')
    tbl_v = tbl['map_version'].unique().item()

    # keep only the long rows belonging to this table, then pivot them wide
    df_sub = df[df.col_name.isin(tbl_cols)]
    df_sub = pd.pivot_table(df_sub,
                            index='icg_id',
                            values='value',
                            columns='col_name',
                            aggfunc=np.sum).reset_index()

    # candidate table = current rows + new rows, all at the current version
    tbl = pd.concat([tbl, df_sub], sort=False).reset_index(drop=True)
    tbl['map_version'] = tbl_v
    tbl = cast_to_int(tbl, tbl_name)

    if tbl_name.startswith('cause_code'):
        result = cm.test_cc_icg_map(tbl)
    elif tbl_name.startswith('icg_bundle'):
        result = cm.test_icg_bundle(tbl)
    elif tbl_name.startswith('icg_durations'):
        result = cm.test_icg_vals(tbl, tbl_v)
    else:
        # BUG FIX: the original left `result` unbound for unrecognized tables
        # (e.g. age_sex_restrictions), raising a confusing NameError below
        raise ValueError(
            "No validation test available for table '{}'".format(tbl_name))

    df_sub = cast_to_int(df_sub, tbl_name)
    return result, df_sub
Пример #10
0
def test_df(df, tbl_name):
    '''
    Applies the requested value updates to a copy of a database table so the
    result can be tested for data quality and mapping rules. A second
    dataframe containing only the updated / new table rows is also produced.

    Args:
        df (pd.DataFrame): long frame with icg_id, col_name, old_val and
            new_val columns describing each update

        tbl_name (str): Name of the table that is being updated

    Returns:
        tbl: pd.DataFrame, the full table with updates applied
        update_df: pd.DataFrame, only the rows that were updated
    '''
    tbl = cm.get_clinical_process_data(tbl_name)
    ids = df.icg_id.unique().tolist()
    # BUG FIX: the original asserted on a raw DataFrame (ambiguous truth
    # value -> ValueError) and misspelled the column as `icg_ids`.
    # Require every requested id to exist in the database table.
    missing = set(ids) - set(tbl.icg_id)
    assert not missing, 'ICG IDs are not present in the database table'

    # BUG FIX: df.replace(None, -1) is not valid pandas usage; use fillna
    # to mark nulls with the -1 sentinel
    df = df.fillna(-1)
    tbl['update'] = 0
    for _, r in df.iterrows():
        col = r['col_name']
        # only update rows whose current value matches the declared old value
        tbl.loc[(tbl.icg_id == r['icg_id']) & (tbl[col] == r['old_val']),
                [col, 'update']] = [r['new_val'], 1]

    tbl = cast_to_int(tbl, tbl_name)

    # convert the -1 sentinel back to NaN
    tbl = tbl.replace(-1, np.nan)

    # BUG FIX: `tbl.update` resolves to the DataFrame.update *method*, not
    # the 'update' column — bracket indexing is required. Copy so the
    # in-place drop below doesn't operate on a view.
    update_df = tbl[tbl['update'] == 1].copy()
    update_df.drop(['update'], axis=1, inplace=True)
    tbl.drop(['update'], axis=1, inplace=True)
    tbl = remove_nulls(tbl)

    return tbl, update_df
Пример #11
0

# this is what we need our data to look like. each file is a cf type
# NOTE(review): FILEPATH is a scrubbed placeholder (a bare name, not a
# string) — this script cannot run until the real paths are restored
example_df = pd.read_csv(FILEPATH)
# cols = ['cf_location_id', 'sex_id', 'age_start', 'mean_incidence', 'icg_name', 'icg_id']

files = glob.glob(FILEPATH)

# get the draw data: concatenate every correction-factor csv into one frame
back = pd.concat([pd.read_csv(f) for f in files],
                 sort=False,
                 ignore_index=True)

# attach uncertainty intervals (add_UI is defined elsewhere in the project)
df = add_UI(back.copy())

# icg_id -> icg_name lookup from map version 20
icg_df = clinical_mapping.get_clinical_process_data(
    "icg_durations", map_version=20)[['icg_id', 'icg_name']]

df = df.merge(icg_df, how='left', on='icg_id')
df.drop(['upper', 'lower', 'age_group_id'], axis=1, inplace=True)
# compact integer dtypes for the id-like columns
for col in ['age_start', 'sex_id']:
    df[col] = pd.to_numeric(df[col], downcast='integer')

# map cf type codes to human readable names
# NOTE(review): cfname is assigned but never used within this excerpt —
# presumably the loop body continues beyond this chunk; confirm upstream
for cft in df.cf_type.unique():
    if cft == 'cf1':
        cfname = 'indvcf'
    elif cft == 'cf2':
        cfname = 'incidence'
    elif cft == 'cf3':
        cfname = 'prevalence'
    else:
        assert False, 'the hell?!'
Пример #12
0
def insert_icg(df, dev=True, update_version=False):
    '''
    Inserts a new ICG to the map. By default the map version is NOT updated;
    several inserts may be applied to the map before the user chooses to bump
    the version. The ICG is used as the key to bind the new mappings.

    Args:
        df (pd.DataFrame): long-format frame containing the complete mapping
            of the icg — at least one associated cause_code and bundle_id,
            plus measure / duration and age-sex restriction values

        dev (bool): operate on the '_dev' copies of the tables. Defaults to
            True.

        update_version (bool): update the entire map version to the next
            iteration for all tables. Defaults to False.
    '''
    count = 0
    # every one of these columns must be supplied for each new icg.
    # BUG FIX: the original listed ' yld_age_end' with a stray leading space,
    # which could never match a real column name
    cols = [
        'cause_code', 'code_system_id', 'icg_name', 'icg_measure',
        'icg_duration', 'bundle_id', 'male', 'female', 'yld_age_start',
        'yld_age_end'
    ]
    ids = df.icg_id.unique().tolist()

    for icg_id in ids:
        present = df.loc[df.icg_id == icg_id, 'col_name'].unique().tolist()
        # BUG FIX: list equality depends on row order; compare as sets
        assert set(cols) == set(present), \
            'Missing columns for icg_id {}'.format(icg_id)

    tbl_cur = cm.get_clinical_process_data('icg_durations_dev')
    try:
        tbl_new = cm.get_clinical_process_data('icg_durations_dev',
                                               map_version=-1)
        # BUG FIX: original had a redundant `tbl = tbl = ...` double assignment
        tbl = pd.concat([tbl_cur, tbl_new])
    except AssertionError:
        print('There are no pending icg inserts')
        tbl = tbl_cur

    icg_new = df.icg_id.unique().tolist()
    assert tbl[tbl.icg_id.isin(
        icg_new)].empty, 'You are adding ICGs that already exisit'

    tbls = [
        'cause_code_icg', 'icg_bundle', 'icg_durations', 'age_sex_restrictions'
    ]
    if dev:
        tbls = [e + '_dev' for e in tbls]

    insert_dict = {}
    for tbl_name in tbls:
        result, insert_df = insert_icg_helper(tbl_name, df)
        if type(result) == str:
            insert_dict[tbl_name] = insert_df
            count += 1
        else:
            # a table failed its validation; surface the failures to caller
            return result

    # NOTE(review): four tables are processed but success requires count == 3
    # — confirm whether age_sex_restrictions is expected to count toward this
    if count == 3:
        for k, v in list(insert_dict.items()):
            update_table(df=v, tbl_name=k, map_version=-1)

    if update_version and count == 3:
        # NOTE(review): passes update_version (a bool) as the `dev` argument
        # of update_map_version — confirm this is intended
        update_map_version(update_version)