def aggregate_output(run_id, groups=350, agg_to_national=True, cause_type='icg', cf_prep=True):
    """
    MS is processed by enrolid now, which results in very large output files
    This function waits until all jobs are finished then aggregates them all together.

    Params:
        groups: (int) the number of groups that were used to process the MS claims data. This
                      is dependent on the ms db helpers module, so do not change it unless you're certain
        agg_to_national (bool) if True, re-assign all location IDs to the national level in order to save space
    """

    job_holder()
    print("All jobs have finished, aggregating data to the national level and writing the condensed CSV and H5 files.")

    files = glob.glob("FILEPATH".format(run_id, cause_type))
    if len(files) != groups:
        warnings.warn("The number of files we read back in doesn't match the number of jobs sent out")
        assert False, "Look to the logs for failed jobs"

    # read in 1 file to append all groups to
    df = pd.read_hdf(files[0])
    df = hosp_prep.age_binning(df, terminal_age_in_data=False, drop_age=True)

    if agg_to_national:
        df['location_id'] = 102

    # case sum to check aggregation
    pre = df[(df['estimate_type'] == 'inp_any_indv_cases')].val.sum()

    # make first otp_cf_df outside of loop
    if cf_prep:
        otp_cf_df = outpatient_aggregations(df)

    counter = 1
    for f in files[1:]:
        print("starting {}".format(f))
        tmp = pd.read_hdf(f)
        if len(tmp) == 0:
            continue

        tmp = hosp_prep.age_binning(tmp, terminal_age_in_data=False, drop_age=True)
        if agg_to_national:
            tmp['location_id'] = 102

        if cf_prep:
            otp_tmp = outpatient_aggregations(tmp)
            otp_cf_df = pd.concat([otp_cf_df, otp_tmp],
                                  ignore_index=True, sort=False)

        pre += tmp[(tmp['estimate_type'] == 'inp_any_indv_cases')].val.sum()

        df = pd.concat([df, tmp], ignore_index=True, sort=False)

        # drop inp_otp claims cases because they're unused
        df = df[df['estimate_type'] != 'inp_otp_any_adjusted_otp_only_claims_cases']

        df = df.fillna(0)
        # periodically collapse duplicated rows so the frame doesn't grow unbounded
        # (see the sketch after this function)
        if counter % 25 == 0 or counter == groups:
            df = df.groupby(df.columns.drop('val').tolist()).agg({'val': 'sum'}).reset_index()
            if cf_prep:
                otp_cf_df = otp_cf_df.groupby(otp_cf_df.drop(['val'], axis=1).columns.tolist()).agg({'val': 'sum'}).reset_index()

        print("There are {} rows in df object".format(df.shape[0]))
        print("{}% done".format(round(counter/float(groups), 3) * 100))
        counter += 1

    if cf_prep:
        # one last groupby
        otp_cf_df = otp_cf_df.groupby(otp_cf_df.drop(['val'], axis=1).columns.tolist()).agg({'val': 'sum'}).reset_index()

        # save
        otp_cf_base_dir = "FILEPATH".format(run_id)
        otp_cf_filepath = "{}/bundle_condensed_otp_repreped_claims_process.csv".format(otp_cf_base_dir)
        otp_cf_df.to_csv(otp_cf_filepath, index=False)

    # One final groupby: because the loop iterates over files[1:], the counter never
    # reaches the groups value, so the in-loop collapse above may not have fired on
    # the last pass, leaving a somewhat larger dataset than necessary.
    df = df.groupby(df.columns.drop('val').tolist()).agg({'val': 'sum'}).reset_index()

    df_pri = df[(df['estimate_type'] == 'inp_any_indv_cases')].val.sum()
    print("pre-aggregation case sum is {} and post-aggregation case sum is {}. Equal: {}".format(pre, df_pri, pre == df_pri))
    assert df_pri == pre, "we expect primary inpatient individual case counts to be equal and they're not"

    # write to a csv for use with R scripts
    filepath = "FILEPATH".format(run_id, cause_type)
    filepath = filepath.replace("\r", "")
    df.to_csv(filepath, index=False)

    # Saving the file to H5 as well and back it up
    write_path = "FILEPATH".format(run_id, cause_type)
    write_path = write_path.replace("\r", "")
    hosp_prep.write_hosp_file(df, write_path, backup=False)

    if cause_type == 'icg':
        prep_for_upload(df=df, run_id=run_id)

    return
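A minimal, self-contained sketch of the concat-then-collapse pattern the loop above relies on; the chunk frames and column set below are toy stand-ins, not the real claims output.

import pandas as pd

# toy chunks standing in for the per-group HDF files
chunks = [
    pd.DataFrame({"location_id": [102, 102], "estimate_type": ["a", "b"], "val": [1.0, 2.0]}),
    pd.DataFrame({"location_id": [102], "estimate_type": ["a"], "val": [3.0]}),
    pd.DataFrame({"location_id": [102], "estimate_type": ["b"], "val": [4.0]}),
]

agg = chunks[0]
for counter, chunk in enumerate(chunks[1:], start=1):
    agg = pd.concat([agg, chunk], ignore_index=True, sort=False)
    # collapse duplicate keys on a fixed cadence and once at the end
    if counter % 25 == 0 or counter == len(chunks) - 1:
        agg = agg.groupby(agg.columns.drop("val").tolist()).agg({"val": "sum"}).reset_index()

print(agg)  # one row per (location_id, estimate_type), with val summed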
Example #2
# columns to coerce below; the int_cols list is assumed to match the one used in the
# similar formatting script shown in Example #5
int_cols = [
    'location_id', 'year_start', 'year_end', 'age_group_unit', 'age_start',
    'age_end', 'sex_id', 'nid', 'representative_id', 'metric_id'
]
str_cols = ['source', 'facility_id', 'outcome_id']

if df[str_cols].isnull().any().any():
    warnings.warn("\n\n There are NaNs in the column(s) {}".format(
        df[str_cols].columns[df[str_cols].isnull().any()]) +
                  "\n These NaNs will be converted to the string 'nan' \n")

# coerce dtypes: downcast the numeric columns and cast the string columns to str
for col in int_cols:
    df[col] = pd.to_numeric(df[col], errors='raise', downcast='integer')
for col in str_cols:
    df[col] = df[col].astype(str)

# cap the terminal age at 95
df.loc[df['age'] > 95, 'age'] = 95

df = hosp_prep.age_binning(df)

# anything not coded male or female becomes sex_id 3 (unknown)
df.loc[(df['sex_id'] != 1) & (df['sex_id'] != 2), 'sex_id'] = 3

# identify the wide diagnosis columns
diagnosis_feats = df.columns[df.columns.str.startswith('dx_')]

# clean the raw diagnosis codes
for feat in diagnosis_feats:
    df[feat] = hosp_prep.sanitize_diagnoses(df[feat])

# multiple diagnosis columns are handled by stack_merger; a single diagnosis column
# is simply renamed to cause_code (a rough standalone reshape sketch follows below)
if len(diagnosis_feats) > 1:

    df = hosp_prep.stack_merger(df)

elif len(diagnosis_feats) == 1:
    df.rename(columns={'dx_1': 'cause_code'}, inplace=True)
    df['diagnosis_id'] = 1
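hosp_prep.stack_merger itself is not shown in this snippet; as a rough, hypothetical illustration only, one way to take wide dx_* columns to long diagnosis rows with plain pandas is sketched below (toy data; the real helper's output columns may differ).

import pandas as pd

wide = pd.DataFrame({
    "sex_id": [1, 2],
    "dx_1": ["A01.0", "B02.1"],
    "dx_2": ["C03", None],
})
id_cols = [c for c in wide.columns if not c.startswith("dx_")]
# melt the wide diagnosis columns into one cause_code column and drop empty slots
long = wide.melt(id_vars=id_cols, var_name="dx_position", value_name="cause_code")
long = long[long["cause_code"].notnull()]
print(long)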
Example #3
def prep_2014():
    """
    Funtion to read in and prepare data from GEO for the years 2012-2013.

    Returns:
        Pandas DataFrame with data for the years 2012-2013-2011 in a format
        consistent with the other years of data.
    """

    df = pd.read_excel(r"FILEPATH/"
                       r"GEO_HOSPITAL_DISCHARGE_DATA_2014_Y2015M06D29.XLSX")

    # Select features from raw data to keep
    keep = [
        'Sex',
        'Age (years)',
        'Main Diagnosis (ICD10)',
        'External causes (ICD10)',
        'Disharge status',  # 'Disharge' is a typo in the source file; kept to match the raw column
        'Complication (ICD 10)',
        'Comorbidity (ICD 10)',
        'Beddays'
    ]
    df = df[keep].copy()

    # We need to provide 2014 wide on diagnoses for the EN matrix, but long on
    # diagnoses for the final product. To avoid formatting twice, format all years
    # so they can be stuck together, save 2014 wide, then make the whole thing long
    # (see the sketch after this function).

    # rename the diagnosis columns
    df = df.rename(
        columns={
            'Main Diagnosis (ICD10)': 'dx_1',
            'External causes (ICD10)': 'dx_2',
            'Complication (ICD 10)': 'dx_3',
            'Comorbidity (ICD 10)': 'dx_4'
        })

    # make outcome id
    # this is from the codebook on the second sheet of the file
    outcome_dict = {1: 'discharge', 2: 'discharge', 3: 'discharge', 4: 'death'}
    df['outcome_id'] = df["Disharge status"].map(outcome_dict)

    # now can drop "Disharge status"
    df.drop("Disharge status", axis=1, inplace=True)

    # We need to drop bed days, but use them first: a stay can't have 0 bed days
    # unless the patient died.
    df = df[(df.outcome_id != "discharge") |
            (df.Beddays >= 1)]  # keep deaths and stays of at least one bed day

    # now can drop Beddays
    df = df.drop("Beddays", axis=1)

    # rename and make columns
    df = df.rename(columns={"Sex": "sex_id", "Age (years)": "age"})
    df['year_start'] = 2014
    df['year_end'] = 2014

    # make a val column. every row is an admission
    df["val"] = 1

    df['age'] = pd.to_numeric(df['age'], errors="coerce")

    # bin the ages
    df.loc[df.age > 99, 'age'] = 99  # ages 100 and up were not binning, so cap at 99
    df = hosp_prep.age_binning(df)

    # don't need age after binning.
    df = df.drop("age", axis=1)

    # take care of some nulls
    df.loc[df.sex_id.isnull(), "sex_id"] = 3
    df = df[df.dx_1.notnull()]
    df = df[df.outcome_id.notnull()]  # just 4 rows

    return df
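A hedged sketch of the wide-then-long plan described in the comments above, with toy frames standing in for the prepared years; the prior-year frame and its columns are invented for illustration, not real GEO data.

import pandas as pd

geo_2014 = pd.DataFrame({"sex_id": [1], "dx_1": ["A01"], "dx_2": ["B02"], "val": [1]})
geo_prior = pd.DataFrame({"sex_id": [2], "dx_1": ["C03"], "dx_2": [None], "val": [1]})

# 2014 could be written out here while still wide on diagnoses (for the EN matrix)

# because every year shares one format, the years can simply be stuck together
# before the combined frame is reshaped long on diagnoses for the final product
geo = pd.concat([geo_2014, geo_prior], ignore_index=True, sort=False)
print(geo)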
Example #4
def apply_restrictions(df,
                       age_set,
                       cause_type,
                       map_version='current',
                       prod=True):
    """
    Apply age and sex restrictions by ICG or bundle to a dataframe of clinical data

    Params:
        df: (pd.DataFrame) clinical data
        age_set: (str) is the data in indv year ages, binned age groups with start/end or
                       age_group_ids
                        acceptable params are "indv", "binned", "age_group_id"
        cause_type: (str) do we want icg restricts or bundle restricts

    Returns:
        df: (pd.DataFrame) with rows that fall outside of age-sex restrictions dropped
    """
    warnings.warn("apply_restrictions needs a testing suite!!")
    sex_diff = set(df.sex_id.unique()).symmetric_difference([1, 2])
    if sex_diff:
        warnings.warn(
            f"There are sex_id values that won't have restrictions applied to them. These are {sex_diff}"
        )

    assert age_set in ['indv', 'binned', 'age_group_id'
                       ], "{} is not an acceptable age set".format(age_set)

    check_map_version(map_version)

    start_cols = df.columns

    if age_set == "age_group_id":
        import gbd_hosp_prep

        df = gbd_hosp_prep.all_group_id_start_end_switcher(df)
    elif age_set == 'indv':
        df = hosp_prep.age_binning(df,
                                   drop_age=False,
                                   terminal_age_in_data=False)

    # flag every row as a keeper, then null the flag for rows violating a restriction
    df['to_keep'] = 1

    if cause_type == 'icg':
        restrict = get_clinical_process_data('age_sex_restrictions',
                                             map_version,
                                             prod=prod)
    elif cause_type == 'bundle':
        restrict = create_bundle_restrictions(map_version)
    else:
        assert False, "pick an acceptable restriction type"

    # any yld_age_start under 1 should be exactly 0
    assert set(restrict.loc[restrict['yld_age_start'] < 1,
                            'yld_age_start'].unique()) == {0}

    keep_cols = [
        cause_type + '_id', 'male', 'female', 'yld_age_start', 'yld_age_end'
    ]

    pre = df.shape[0]
    df = df.merge(restrict[keep_cols], how='left', on=cause_type + '_id')
    assert pre == df.shape[0], ("merge made more rows, there's something wrong"
                                " in the restrictions file")

    # null the flag for restricted sexes
    df.loc[(df['male'] == 0) & (df['sex_id'] == 1), 'to_keep'] = np.nan
    df.loc[(df['female'] == 0) & (df['sex_id'] == 2), 'to_keep'] = np.nan

    # null the flag for ages entirely outside the restriction window
    df.loc[df['age_end'] <= df['yld_age_start'], 'to_keep'] = np.nan
    df.loc[df['age_start'] > df['yld_age_end'], 'to_keep'] = np.nan

    # drop the restricted rows
    df = df[df['to_keep'].notnull()]

    df.drop(['male', 'female', 'yld_age_start', 'yld_age_end', 'to_keep'],
            axis=1,
            inplace=True)

    if age_set == "age_group_id":

        df = gbd_hosp_prep.all_group_id_start_end_switcher(df)
    elif age_set == "indv":
        df.drop(['age_start', 'age_end'], axis=1, inplace=True)

    diff_cols = set(start_cols).symmetric_difference(set(df.columns))
    assert not diff_cols, "The diff columns are {}".format(diff_cols)

    return df
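A hedged usage sketch for apply_restrictions: the toy frame and bundle_id below are invented, and the call itself is left commented out because it requires access to the clinical mapping and restriction data.

import pandas as pd

toy = pd.DataFrame({
    "bundle_id": [999, 999],      # hypothetical bundle
    "sex_id": [1, 2],
    "age_start": [0, 15],
    "age_end": [1, 20],
    "val": [10, 12],
})
# toy = apply_restrictions(toy, age_set="binned", cause_type="bundle",
#                          map_version="current")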
Example #5
# NID for each year of data
nid_dict = {
    2012: 121282,
    2013: 150449,
    2014: 220205,
    2015: 281773
}
df = hosp_prep.fill_nid(df, nid_dict)

# inspect length of stay before filtering
df.los.value_counts(dropna=False).head()
df.los.isnull().sum()

# keep only admissions with a positive length of stay (rows with null los drop out too)
df = df[df['los'] > 0]
final_admits = len(df)

df['facility_id'] = 'hospital'

df = hosp_prep.age_binning(df, drop_age=True)

int_cols = [
    'location_id', 'year_start', 'year_end', 'age_group_unit', 'age_start',
    'age_end', 'sex_id', 'nid', 'representative_id', 'metric_id'
]
str_cols = ['source', 'facility_id', 'outcome_id']

for col in int_cols:
    df[col] = pd.to_numeric(df[col], errors='raise', downcast='integer')
for col in str_cols:
    df[col] = df[col].astype(str)

df.loc[~df['sex_id'].isin([1, 2]), 'sex_id'] = 3

diagnosis_feats = df.columns[df.columns.str.startswith('dx_')]
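A small illustration, on a toy series, of the downcast behavior used in the type-casting loop above.

import pandas as pd

s = pd.Series(["1", "2", "3"])
out = pd.to_numeric(s, errors="raise", downcast="integer")
print(out.dtype)  # the smallest integer dtype that fits the values (int8 here)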