def create_neural_network(output_nodes, hidden_layers, hidden_nodes_1,
                          hidden_nodes_2):
    """Build and compile a feed-forward Keras classifier.

    Arguments:
        output_nodes: number of units in the final softmax layer
        hidden_layers: number of hidden (relu) layers to add
        hidden_nodes_1: number of units in the first hidden layer
        hidden_nodes_2: number of units in the second hidden layer;
            a falsy value means only one layer width is available

    Returns:
        The compiled Sequential model (adam optimizer,
        categorical cross-entropy loss).
    """
    layer_widths = [hidden_nodes_1]
    if hidden_nodes_2:
        layer_widths.append(hidden_nodes_2)
    # layer index -> width; zip stops at the shorter of the two inputs
    width_by_layer = dict(zip(range(hidden_layers), layer_widths))

    network = Sequential()
    for layer_idx in range(hidden_layers):
        print_log_message(f"adding {layer_idx} layer")
        network.add(Dense(width_by_layer[layer_idx], activation="relu"))
    network.add(Dense(output_nodes, activation="softmax"))

    print_log_message("compiling model")
    network.compile(optimizer="adam", loss="categorical_crossentropy")
    return network
def get_computed_dataframe(self):
    """Return the data with location splitting applied, if needed.

    When ``self.needs_splitting()`` is falsy the incoming data is
    returned untouched and ``self.diag_df`` is set to None. Otherwise
    the data is split (the "UKR" path or the urban/rural path), the
    total deaths (sample_size * cf) are checked against the incoming
    total, and only ``self.orig_cols`` are returned.
    """
    def total_deaths(frame):
        # deaths are reconstructed as sample size times cause fraction
        return (frame.sample_size * frame.cf).sum()

    split_type = self.needs_splitting()
    if not split_type:
        print_log_message("No location splitting required.")
        self.diag_df = None
        return self.df

    assert 'sample_size' in self.df.columns
    start_deaths = total_deaths(self.df)

    if split_type == "UKR":
        envelope = self.prep_envelope(split_type)
        split_df = self.adjust_ukr(envelope, split_type)
        # split rows are appended to the unmodified incoming rows,
        # so the death total is expected to roughly double
        result = pd.concat([split_df, self.df], ignore_index=True)
        end_deaths = total_deaths(result)
        assert np.isclose((end_deaths / start_deaths), 2, atol=0.05)
    else:
        pieces = []
        for area in ["urban", "rural"]:
            envelope = self.prep_envelope(area)
            orig_id = self.split_ids[area][0]
            new_id = self.split_ids[area][1]
            to_split = self.df.loc[self.df['location_id'] == orig_id]
            pieces.append(
                self.adjust_ap_telangana(orig_id, new_id, envelope,
                                         to_split))
        # keep every row that was not one of the split locations
        split_sources = [self.split_ids[area][0]
                         for area in ("urban", "rural")]
        untouched = self.df.loc[
            ~(self.df['location_id'].isin(split_sources))]
        pieces.append(untouched)
        result = pd.concat(pieces, ignore_index=True)
        end_deaths = total_deaths(result)
        assert np.isclose(start_deaths, end_deaths, rtol=0.001)

    return result[self.orig_cols]
def create_train_test(df, test, int_cause, icd_feature, age_group_id,
                      most_detailed, test_sample_size=7000):
    """Create train/test datasets.

    If running tests, randomly sample from all locations so models
    don't take forever to run.

    Arguments:
        df: the formatted multiple cause data
        test: (Bool) - if truthy, subsample rows per (country-level)
            location before splitting
        int_cause: name of the intermediate cause column (used in the
            queries that separate garbage from non-garbage rows)
        icd_feature: prefix of the "<icd_feature>_cause_info" column
        age_group_id: if truthy, subset to this age group only
        most_detailed: if falsy, aggregate to country level
        test_sample_size: maximum number of rows sampled per location
            when ``test`` is truthy (defaults to the previously
            hard-coded 7000)

    Returns:
        (train_df, test_df, garbage_df) where garbage_df holds the rows
        with cause_id 743 and ``int_cause`` equal to 1.
    """
    locs = get_location_metadata(gbd_round_id=6, location_set_id=35)
    # identify column corresponding to ICD attributes of interest
    icd_col = f"{icd_feature}_cause_info"
    keep_cols = DEM_COLS + [icd_col, int_cause] + [
        x for x in list(df) if "multiple_cause" in x]
    df = df.loc[(df.age_group_id != 283) & (df.age_group_id != 160)]
    df = df[keep_cols]
    df = create_age_bins(df, AGG_AGES)
    df = drop_age_restricted_cols(df)
    if not most_detailed:
        print_log_message("aggregating to country level")
        df = get_country_names(df)
    if age_group_id:
        print_log_message(
            f"subsetting to just age group id {age_group_id}")
        df = df.loc[df["age_group_id"] == age_group_id]
        print_log_message(f"resulting df is {len(df)} rows")
    # space-separated feature strings consumed by the bag-of-words models
    df["cause_age_info"] = df[[icd_col, "age_group_id"]].astype(
        str).apply(lambda x: " ".join(x), axis=1)
    df["dem_info"] = df[[
        icd_col, "location_id", "sex_id", "year_id", "age_group_id"
    ]].astype(str).apply(lambda x: " ".join(x), axis=1)
    garbage_df = df.query(f"cause_id==743 & {int_cause}==1")
    # NOTE(review): this keeps only rows where BOTH conditions hold, so
    # rows with exactly one of (cause_id==743, int_cause==1) land in
    # neither frame - confirm that is intended
    df = df.query(f"cause_id!=743 & {int_cause}!=1")
    if test:
        print_log_message(
            f"THIS IS A TEST.. only using up to {test_sample_size} "
            "rows from each loc")
        df = df.merge(locs[["location_id", "parent_id", "level"]],
                      on="location_id", how="left")
        # map subnationals to parent so
        # random sampling will be at country level
        df["location_id"] = np.where(df["level"] > 3, df["parent_id"],
                                     df["location_id"])
        df.drop(columns=["parent_id", "level"], inplace=True)
        # get a random sample from each location
        # bc full dataset takes forever to run
        dfs = []
        for loc in list(df.location_id.unique()):
            subdf = df.query(f"location_id=={loc}")
            # sampling without replacement cannot draw more rows than
            # exist, so cap the sample at the location's row count
            n_rows = min(len(subdf), test_sample_size)
            dfs.append(subdf.sample(n=n_rows, replace=False))
        df = pd.concat(dfs, ignore_index=True, sort=True)
    # split train 75%, test 25%
    train_df, test_df = train_test_split(df, test_size=0.25)
    return train_df, test_df, garbage_df
def create_training_data(self, df, age_group_id=None):
    """Create the train/test/intermediate-cause splits and write them.

    The three frames returned by ``create_train_test`` are written as
    CSVs under ``self.model_dir`` (in an ``age_group_id`` subdirectory
    when one is given).
    """
    write_dir = (f"{self.model_dir}/{age_group_id}"
                 if age_group_id else f"{self.model_dir}")
    makedirs_safely(write_dir)

    train_df, test_df, int_cause_df = create_train_test(
        df,
        test=self.test,
        int_cause=self.int_cause,
        icd_feature=self.icd_features,
        age_group_id=age_group_id,
        most_detailed=self.most_detailed_locs)

    print_log_message(f"writing train/test to df for {age_group_id}")
    outputs = (
        (train_df, "train_df"),
        (test_df, "test_df"),
        (int_cause_df, "int_cause_df"),
    )
    for frame, stem in outputs:
        frame.to_csv(f"{write_dir}/{stem}.csv", index=False)
def create_test_datatsets(test_df, dirichlet_dict, write_dir, dataset_num,
                          df_size, age_feature, dem_feature):
    """Generate a test dataset of same length as the original test dataset

    Arguments:
        test_df: the actual test dataframe
        dirichlet_dict: dictionary mapping each cause id in actual
            test data to its respective proportion in the test data
            (generated from a Dirichlet distribution)
        write_dir: a directory to write each dataset to
        dataset_num: which dataset (of the 500) to create
        df_size: from the ModelLauncher, the desired size of the
            generated test data (should be same size as the actual
            test data)
        age_feature: (Bool) - Do you want to include age as a feature?
        dem_feature: (Bool) - Do you want to include all demographic
            cols (age, sex, year, and location) as features?

    Writes ``dataset_<dataset_num>.csv`` and
    ``dataset_<dataset_num>_dirichlet_distribution.pkl`` to
    ``write_dir``; returns nothing.
    """
    # create df of desired length
    # (np.nan: the np.NaN alias no longer exists in NumPy 2.0)
    df = pd.DataFrame({"cause_id": [np.nan] * df_size})
    dfs = []
    # loop through each cause and generate rows with
    # multiple cause and demographic information
    for cause, proportion in dirichlet_dict.items():
        # proportion from dirichlet dictates how many
        # rows are assigned to a given cause
        # NOTE(review): each cause's row count is derived from
        # proportion * df_size on its own, so the total may differ
        # slightly from df_size - confirm this is acceptable
        subdf = df.sample(frac=proportion,
                          replace=True).assign(cause_id=cause)
        print_log_message(f"generating multiple cause rows for {cause}")
        mcause_df = generate_multiple_cause_rows(subdf, test_df, cause,
                                                 age_feature, dem_feature)
        dfs.append(mcause_df)
    # if rerunning, remove previous dataset information
    remove_if_output_exists(write_dir, f"dataset_{dataset_num}.csv")
    remove_if_output_exists(
        write_dir, f"dataset_{dataset_num}_dirichlet_distribution.pkl")
    dfs = pd.concat(dfs, sort=True, ignore_index=True)
    print_log_message(f"writing dataset {dataset_num} to a df")
    # write generated test dataset to csv
    dfs.to_csv(f"{write_dir}/dataset_{dataset_num}.csv", index=False)
    # save randomly generated dirichlet distribution
    # in case need to exactly replicate
    joblib.dump(
        dirichlet_dict,
        f"{write_dir}/dataset_{dataset_num}_dirichlet_distribution.pkl")
def __init__(self, code_system_id, cause_set_version_id):
    """Load metadata and build the age restriction target mapping.

    Arguments:
        code_system_id: the code system being processed; the mapping
            dataframe is only built when it is one of
            ``self.allowed_code_system_ids``, otherwise
            ``self.art_mapping_df`` is set to None
        cause_set_version_id: passed to the cause hierarchy lookup
    """
    self.code_system_id = code_system_id
    misc.print_log_message("Getting metadata")
    configurator = Configurator()
    self.art_path = configurator.get_resource("age_restriction_targets")
    self.age_df = ages.get_cod_ages(**self.standard_cache_options)
    self.cause_meta_df = causes.get_current_cause_hierarchy(
        cause_set_version_id=cause_set_version_id,
        **self.standard_cache_options)

    if code_system_id not in self.allowed_code_system_ids:
        self.art_mapping_df = None
        return

    misc.print_log_message("Creating age restriction mapping dataframe")
    invalid_ages_df = self.get_invalid_ages_df()
    # only these two code systems have a display name here
    code_system_name = {1: "ICD10", 6: "ICD9_detail"}[code_system_id]
    misc.print_log_message(f"Creating for {code_system_name}")
    art_df = self.read_age_restriction_targets_df(code_system_id)
    self.art_mapping_df = self.get_age_restriction_target_mapping_df(
        art_df, invalid_ages_df)
def drop_age_restricted_cols(df):
    """Drop rows whose age group violates the cause's age restrictions.

    Despite the function name, rows (not columns) are removed: a row is
    dropped when its age group ends at or before the cause's
    ``age_start_group`` or starts after the cause's ``age_end_group``,
    as read from the injuries overrides file.

    Arguments:
        df: dataframe with a ``cause_id`` column and whatever
            ``add_age_metadata`` needs to attach age group bounds

    Returns:
        The filtered dataframe with the same columns as the input.
    """
    start = len(df)
    age_meta_df = get_ages(force_rerun=False, block_rerun=True)
    # secret causes in restrictions
    cause_meta_df = get_current_cause_hierarchy(
        cause_set_id=4, block_rerun=True, force_rerun=False)
    restrict_df = pd.read_csv(
        "/homes/agesak/thesis/maps/injuries_overrides.csv")
    restrict_df = add_cause_metadata(restrict_df, add_cols='cause_id',
                                     merge_col='acause',
                                     cause_meta_df=cause_meta_df)
    # a missing lower bound means the cause is allowed from age 0
    restrict_df["age_start_group"] = restrict_df[
        "age_start_group"].fillna(0)
    orig_cols = df.columns
    df = add_age_metadata(
        df, add_cols=['age_group_years_start', 'age_group_years_end'],
        age_meta_df=age_meta_df)
    df = df.merge(restrict_df, on='cause_id', how='left')
    # age_group_years_end is weird, 0-14 means age_group_years_end 15
    too_young = df["age_group_years_end"] <= df["age_start_group"]
    # comparisons against a missing bound are False, so causes without
    # a restriction are kept
    too_old = df["age_group_years_start"] > df["age_end_group"]
    df = df[~(too_young | too_old)]
    df = df[orig_cols]
    end = len(df)
    # rows are what gets filtered here, so report rows
    print_log_message(
        f"dropping {start - end} rows that violate age restrictions")
    return df
def read_in_data(int_cause, inj_garbage=False, code_system_id=None):
    """Read in and append all MCoD data

    Arguments:
        int_cause: the intermediate cause of interest; selects the
            subdirectory the formatted data is read from
        inj_garbage: (Bool) - read the formatted data that only has
            non-X59/Y34 garbage codes as the underlying cause
        code_system_id: restrict to one code system; for 6 the
            non limited use sources are skipped

    Returns:
        One dataframe with the non limited use and limited use data
        appended.

    Raises:
        ValueError: if no limited use csv matches the requested
            datasets (previously surfaced as pandas'
            "No objects to concatenate").
    """
    print_log_message("reading in not limited use data")
    if inj_garbage:
        print_log_message(
            "reading in formatted df with only nonX59/Y34 garbage codes as UCOD"
        )
        subdirs = f"{int_cause}/thesis/inj_garbage"
    else:
        subdirs = f"{int_cause}/thesis"
    # it's not good the sources are hard-coded
    if code_system_id != 6:
        # col and ita dont have icd 9
        udf = get_mcause_data(phase="format_map",
                              source=["COL_DANE", "ITA_ISTAT"],
                              sub_dirs=subdirs,
                              data_type_id=9,
                              code_system_id=code_system_id,
                              assert_all_available=True,
                              verbose=True,
                              **BLOCK_RERUN)
    else:
        udf = pd.DataFrame()
    print_log_message("reading in limited use data")
    datasets = get_datasets(
        force_rerun=True,
        block_rerun=False,
        source=MCauseLauncher.limited_sources,
        code_system_id=code_system_id)
    # "<nid>_<extract_type_id>" tags used to pick out matching files
    limited_metadata = datasets.apply(
        lambda x: str(x['nid']) + "_" + str(x['extract_type_id']),
        axis=1).values
    dfs = []
    for source in MCauseLauncher.limited_sources:
        limited_dir = get_limited_use_directory(source, int_cause,
                                                inj_garbage)
        csvfiles = glob.glob(os.path.join(limited_dir, "*.csv"))
        for csv_path in csvfiles:
            # substring match of the tag anywhere in the file path
            if any(meta in csv_path for meta in limited_metadata):
                dfs.append(pd.read_csv(csv_path))
    if not dfs:
        # fail with an actionable message instead of pandas' generic
        # "No objects to concatenate"
        raise ValueError(
            "No limited use csv files matched the requested datasets "
            f"for int_cause={int_cause}, code_system_id={code_system_id}")
    ldf = pd.concat(dfs, ignore_index=True, sort=True)
    df = pd.concat([udf, ldf], sort=True, ignore_index=True)
    return df
def main(model_param, model_name, write_dir, train_dir, int_cause,
         short_name, age_feature, dem_feature):
    """Run gridsearch pipeline for a given classifier

    * Note this script is parallelized by parameter set (to allow for
      feasible run times), so each gridsearch object is fed only 1 set
      of model parameters, but this is done over a range of parameters

    Arguments:
        model_param: (str) - a single set of model parameters
            for a given classifier
        model_name: the classifier name as defined by SciKit Learn
        write_dir: a directory to write the model object and summary to
        train_dir: a directory where the training dataset lives
        int_cause: the injuries garbage code of interest
        short_name: the abbreviated name for each classifier
            defined in the ModelLauncher
        age_feature: (Bool) - Do you want to include age as a feature?
        dem_feature: (Bool) - Do you want to include all demographic
            cols (age, sex, year, and location) as features?
    """
    # determine the model's feature vector (age wins over demographics)
    if age_feature:
        x_col = "cause_age_info"
    else:
        x_col = "dem_info" if dem_feature else "cause_info"

    print_log_message("reading in data")
    training = pd.read_csv(f"{train_dir}/train_df.csv")
    model_df = training[["cause_id", x_col, f"{int_cause}"]]

    print_log_message("formatting parameters")
    model_params = format_gridsearch_params(short_name, model_param)

    print_log_message("running pipeline")
    results, grid_results = run_pipeline(
        model_name, short_name, model_df, model_params, write_dir,
        int_cause, age_feature, dem_feature)

    # summary table first, then the fitted gridsearch object
    results.to_csv(f"{write_dir}/summary_stats.csv", index=False)
    joblib.dump(grid_results, f"{write_dir}/grid_results.pkl")
def get_computed_dataframe(self):
    """Main method to execute computations and return result.

    Notes:
        UNDECIDED HOW TO DO THIS WITHOUT ALL YEARS IN MEMORY LIKE
        STATA HAD

        Potential solutions:
        1. Don't do this at all, just correct ANY
           cause-age-sex-location-year that exceeds the global
           reference rate
           - this would potentially change results slightly, but does
             not seem unreasonable, and in fact seems more correct
        2. Prime HIV correction by assembling the list ahead of time
           - might take a long time and need to be rerun every time,
             which would essentially double the required time for
             this step
           - advantage is that it mimics last year's results without
             needing any additional years of data
           - could eliminate some of the problems with this method by
             running it very infrequently instead of every time the
             data changes
        3. Take a 'source' argument in the class and pull the other
           data that we pulled last year to pool years necessary to
           generate this list
        4. Run HIV correction with all the data for a 'source'
           altogether, like the stata code did, but still update
           versions based on nid-year

        FOR NOW: Follow method 1 and expect to test the similarity
        later
    """
    keep_cols = self.df.columns

    if not self.country_needs_correction():
        print_log_message("Country doesn't need hiv correction")
        self.diag_df = None
        return self.df

    print_log_message("Getting rates df")
    rates_df = self.get_rates_df(self.cause_meta_df)

    if self.correct_garbage:
        coded = add_code_metadata(self.df,
                                  add_cols=['value'],
                                  code_system_id=self.code_system_id,
                                  force_rerun=False,
                                  block_rerun=True,
                                  cache_dir=self.cache_dir)
        coded = self.identify_sepsis_gc(coded, self.code_system_id)
        coded = self.identify_injury_gc(coded, self.code_system_id)
        coded = self.identify_hivrd_gc(coded, self.code_system_id)
        # do a groupby to collapse down to cause_id level for next steps
        group_cols = [
            col for col in keep_cols if col not in ('code_id', 'deaths')
        ]
        df_by_code = coded.copy()
        df_by_cause = coded.groupby(group_cols,
                                    as_index=False)['deaths'].sum()
    else:
        df_by_cause = self.df

    df = add_population(df_by_cause, pop_df=self.pop_df)
    print_log_message(
        f"Flagging correct dem groups for {len(df)} rows of data")
    df = flag_correct_dem_groups(df,
                                 self.code_system_id,
                                 self.cause_meta_df,
                                 self.loc_meta_df,
                                 self.age_meta_df,
                                 rates_df,
                                 self.reference_ages,
                                 self.move_gc_age_restrictions,
                                 self.value_cols,
                                 self.pop_col,
                                 self.cause_selections_path,
                                 correct_garbage=self.correct_garbage)
    cause_to_targets_map = self.get_cause_to_targets_map(
        self.cause_meta_df)

    print_log_message("Identifying positive excess")
    df = identify_positive_excess(df, rates_df, cause_to_targets_map,
                                  self.reference_ages, self.loc_meta_df,
                                  self.cause_meta_df, self.value_cols,
                                  self.pop_col, self.correct_garbage)

    # the garbage path adds a step before and after the shared
    # "move excess" step
    if self.correct_garbage:
        df = self.calculate_garbage_positive_excess(
            df, df_by_code, group_cols)
    print_log_message("Moving excess to target")
    computed_df = move_excess_to_target(df, self.value_cols,
                                        cause_to_targets_map,
                                        self.correct_garbage)
    if self.correct_garbage:
        computed_df = assign_code_to_created_target_deaths(
            computed_df, self.code_system_id, self.cause_meta_df)

    self.diag_df = computed_df
    return computed_df[keep_cols]
def launch(self):
    """Dispatch the work for ``self.phase``.

    Handles the phases "train_test", "launch_training_model",
    "create_test_datasets", "launch_testing_models" and
    "launch_int_cause_predictions"; any other phase does nothing.
    When ``self.by_age`` is set the work is repeated for every age
    group in ``ModelLauncher.agg_ages``.
    """
    phase = self.phase

    if phase == "train_test":
        df = read_in_data(int_cause=self.int_cause,
                          code_system_id=self.code_system_id)
        if not self.by_age:
            self.create_training_data(df)
        else:
            for age_group_id in ModelLauncher.agg_ages:
                print_log_message(f"working on age: {age_group_id}")
                self.create_training_data(df, age_group_id)

    elif phase == "launch_training_model":
        naive_bayes_models = ("MultinomialNB", "BernoulliNB",
                              "ComplementNB")
        for short_name in self.model_types:
            model_name = ModelLauncher.model_dict[short_name]
            if model_name in naive_bayes_models:
                params = naive_bayes_params(short_name)
            else:
                # each classifier has a "<short_name>_params" function
                modeling = import_module(f"thesis_utils.modeling")
                params = getattr(modeling,
                                 f"{short_name}_params")(short_name)
            print_log_message(f"launching {model_name}")
            print_log_message(f"{len(params)} sets of model parameters")
            for parameter in params:
                param = format_argparse_params(
                    parameter, ModelLauncher.param_dict[short_name])
                if not self.by_age:
                    self.launch_training_models(model_name, short_name,
                                                param)
                    continue
                for age_group_id in ModelLauncher.agg_ages:
                    print_log_message(
                        f"launching models for age: {age_group_id}")
                    self.launch_training_models(
                        model_name, short_name, param, age_group_id)

    elif phase == "create_test_datasets":
        if not self.by_age:
            self.launch_create_testing_datasets()
        else:
            for age_group_id in ModelLauncher.agg_ages:
                print_log_message(f"working on age: {age_group_id}")
                self.launch_create_testing_datasets(age_group_id)

    elif phase == "launch_testing_models":
        for short_name in self.model_types:
            model_name = ModelLauncher.model_dict[short_name]
            # get parameters of best model fit for given model
            if not self.by_age:
                best_model_params = self.get_best_model(
                    short_name, age_group_id=None)
                self.launch_testing_models(model_name, short_name,
                                           best_model_params,
                                           age_group_id=None)
                continue
            for age_group_id in ModelLauncher.agg_ages:
                best_model_params = self.get_best_model(
                    short_name, age_group_id)
                self.launch_testing_models(model_name, short_name,
                                           best_model_params,
                                           age_group_id)

    elif phase == "launch_int_cause_predictions":
        for short_name in self.model_types:
            # a single pass with no age group when not running by age
            age_groups = ModelLauncher.agg_ages if self.by_age else [None]
            for age_group_id in age_groups:
                self.launch_int_cause_predictions(
                    short_name=short_name, age_group_id=age_group_id)
# will only need to run this once ever tbh # for int_cause in ["x59", "y34"]: # rd = format_gbd_results(int_cause) # rd = pretty_print(rd) # rd.to_csv(f"/home/j/temp/agesak/thesis/model_results/{int_cause}_gbd_2019.csv", index=False) model_dict = {"x59": "", "y34": ""} # inconsistency here with short name for naive bayes # here short name is "nb" until the best naive bayes # is identified (then short name will be either # multi_nb, bernoulli_nb, or complement_nb like normal) for int_cause in ["x59", "y34"]: print_log_message(f"working on {int_cause}") for short_name in ["rf", "nb", "xgb", "nn"]: if short_name == "nb": update_model_dict(int_cause) # get the short name associated with the best naive bayes model short_name = model_dict[int_cause] print_log_message(f"working on {short_name}") df = format_classifier_results(int_cause, short_name) rd = format_gbd_results(int_cause) rd.rename(columns={ "prop": "prop_GBD2019", f"{int_cause}": f"{int_cause}_deaths_GBD2019" }, inplace=True) # merge on 2019 results # df = df.merge(rd, on=["age_group_id", "sex_id", "location_id", "year_id", "cause_id"], how="left")
def main(data_dir, predicted_test_dir, int_cause, short_name, model_name,
         age_feature, dem_feature):
    """Refit the best classifier on all observed data and predict.

    Summarize evaluation metrics across 500 test datasets (currently
    commented out for quick runs)
    Refit the classifier on all observed data
    Predict on the unobserved data

    Writes ``model_predictions.csv`` and ``model_fit.pkl`` to
    ``predicted_test_dir``.
    """
    # determine the model's feature vector (age wins over demographics)
    if age_feature:
        x_col = "cause_age_info"
    else:
        x_col = "dem_info" if dem_feature else "cause_info"
    wanted_cols = DEM_COLS + ["cause_id", x_col, f"{int_cause}"]

    ## comment out for quick run
    ## summaries = read_in_summary_stats(predicted_test_dir)
    ## comment out for quick run
    ## summarize evaluation metrics across the datasets
    ## aggregate_evaluation_metrics(summaries, predicted_test_dir)

    # read in test df, then train df
    test_df = pd.read_csv(f"{data_dir}/test_df.csv")[wanted_cols]
    train_df = pd.read_csv(f"{data_dir}/train_df.csv")[wanted_cols]
    print_log_message("read in train and test")
    # concat train/test to refit a model on all the observed data
    df = pd.concat([train_df, test_df], sort=True, ignore_index=True)

    print_log_message("reading in params df")
    param_df = pd.read_csv("/homes/agesak/thesis/maps/parameters.csv")
    # keep only this classifier's columns (name and dtype)
    param_df = param_df[[x for x in list(param_df) if short_name in x]]
    param_df[short_name] = param_df[short_name].str.replace(
        "clf__estimator__", "")

    ## comment out for quick run
    ## params = summaries.best_model_params.iloc[0]
    ## add for quick run
    params = get_best_fit(data_dir, short_name)

    # format best params to feed to classifier
    best_params = (params.split("_")
                   if isinstance(params, six.string_types) else [params])
    param_kwargs = dict(zip(param_df.iloc[:, 0], best_params))

    is_neural_net = short_name == "nn"
    if is_neural_net:
        # these feed into create_neural_network
        hidden_nodes_1 = int(param_kwargs["hidden_nodes_1"])
        hidden_layers = int(param_kwargs["hidden_layers"])
        hidden_nodes_2 = int(param_kwargs["hidden_nodes_2"])
        # parameters with clf__ are only fed to keras classifier
        param_kwargs = {
            name: val for name, val in param_kwargs.items()
            if "clf__" in name
        }

    # ensure column dtypes are correct
    casters = {"int": int, "float": float, "str": str}
    for name in list(param_kwargs):
        dtype = param_df.loc[param_df[short_name] == name,
                             f"{short_name}_dtype"].iloc[0]
        param_kwargs[name] = casters[dtype](param_kwargs[name])

    # run Neural network separately because classifier
    # takes secondary arguments related to build
    if is_neural_net:
        param_kwargs = {
            name.replace("clf__", ""): val
            for name, val in param_kwargs.items() if "clf__" in name
        }
        # token pattern also accepts single-character tokens
        cv = CountVectorizer(lowercase=False,
                             token_pattern=r"(?u)\b\w+\b")
        tf = cv.fit_transform(df[x_col])
        print_log_message("converting to dense matrix")
        tf = tf.todense()
        # just hard code classifer name because this only works for keras
        model = KerasClassifier(build_fn=create_neural_network,
                                output_nodes=len(df.cause_id.unique()),
                                hidden_layers=hidden_layers,
                                hidden_nodes_1=hidden_nodes_1,
                                hidden_nodes_2=hidden_nodes_2,
                                **param_kwargs)
        print_log_message("fitting KerasClassifier")
        model.fit(tf, df["cause_id"].values, **param_kwargs)
    else:
        # refit all other classifiers
        cv = CountVectorizer(lowercase=False)
        tf = cv.fit_transform(df[x_col])
        print_log_message(f"fitting {model_name}")
        # NOTE(review): eval on model_name executes arbitrary text;
        # only safe while model_name comes from trusted launcher code
        estimator_cls = eval(model_name)
        model = estimator_cls(**param_kwargs).fit(tf, df["cause_id"])

    # now predict on the unobserved data
    print_log_message("reading in unobserved_df")
    unobserved_df = pd.read_csv(
        f"{data_dir}/int_cause_df.csv")[wanted_cols]
    new_counts = cv.transform(unobserved_df[x_col])
    if is_neural_net:
        print_log_message("converting unobserved data to dense matrix")
        new_counts = new_counts.todense()
    unobserved_df["predictions"] = model.predict(new_counts)

    ## add for quick run
    makedirs_safely(predicted_test_dir)
    print_log_message("writing to df")
    unobserved_df.to_csv(f"{predicted_test_dir}/model_predictions.csv")
    joblib.dump(model, f"{predicted_test_dir}/model_fit.pkl")
    print_log_message("wrote model fit")
def main(best_model_dir, dataset_dir, testing_model_dir, best_model_params,
         int_cause, dataset_num, age_feature, dem_feature):
    """Predict on each test dataset

    Arguments:
        best_model_dir: directory that houses model object of best model
        dataset_dir: directory that houses the generated test datasets
            (used for all classifiers)
        testing_model_dir: classifier-specific directory to write
            predictions
        best_model_params: (str) - the best model parameters
            ex. formatted as param_param_param
        int_cause: the injuries garbage code of interest
        dataset_num: which dataset (of the 500) to work on
        age_feature: (Bool) - Do you want to include age as a feature?
        dem_feature: (Bool) - Do you want to include all demographic
            cols (age, sex, year, and location) as features?
    """
    # determine the model's feature vector (age wins over demographics)
    if age_feature:
        x_col = "cause_age_info"
    else:
        x_col = "dem_info" if dem_feature else "cause_info"

    # read in model object of best models
    print_log_message("reading in grid results object")
    grid_results = joblib.load(f"{best_model_dir}/grid_results.pkl")

    # read in test dataset
    print_log_message("reading in data")
    dataset = pd.read_csv(f"{dataset_dir}/dataset_{dataset_num}.csv")

    # predict on test dataset
    print_log_message("predicting")
    dataset["predicted"] = grid_results.predict(dataset[x_col])

    # determine values of evaluation metrics
    observed = dataset.cause_id
    predicted = dataset.predicted
    macro_precision = precision_score(
        y_true=observed, y_pred=predicted, average="macro")
    micro_precision = precision_score(
        y_true=observed, y_pred=predicted, average="micro")
    macro_recall = recall_score(
        y_true=observed, y_pred=predicted, average="macro")
    micro_recall = recall_score(
        y_true=observed, y_pred=predicted, average="micro")
    accuracy = accuracy_score(y_true=observed, y_pred=predicted)
    cccsmfa = calculate_cccsmfa(y_true=observed, y_pred=predicted)
    concordance = calculate_concordance(
        y_true=observed, y_pred=predicted, int_cause=int_cause)

    # save information about each prediction as a one-row summary
    summary = pd.DataFrame({
        "Concordance": [concordance],
        "CCCSMFA": [cccsmfa],
        "Macro Recall": [macro_recall],
        "Micro Recall": [micro_recall],
        "Macro Precision": [macro_precision],
        "Micro Precision": [micro_precision],
        "Accuracy": [accuracy],
        "best_model_params": [best_model_params],
    })
    print_log_message("writing dfs")
    summary.to_csv(
        f"{testing_model_dir}/dataset_{dataset_num}_summary_stats.csv",
        index=False)
def get_computed_dataframe(self):
    """Run the HIV correction and return the corrected dataframe.

    Returns ``self.df`` unchanged (and sets ``self.diag_df`` to None)
    when the country does not need correction. Otherwise excess deaths
    are identified against the rates dataframe and moved to their
    targets; the full result is stored on ``self.diag_df`` and the
    original columns are returned.
    """
    original_columns = self.df.columns

    needs_correction = self.country_needs_correction()
    if not needs_correction:
        print_log_message("Country doesn't need hiv correction")
        self.diag_df = None
        return self.df

    print_log_message("Getting rates df")
    rates_df = self.get_rates_df(self.cause_meta_df)

    if not self.correct_garbage:
        df_by_cause = self.df
    else:
        flagged = add_code_metadata(self.df,
                                    add_cols=['value'],
                                    code_system_id=self.code_system_id,
                                    force_rerun=False,
                                    block_rerun=True,
                                    cache_dir=self.cache_dir)
        flagged = self.identify_sepsis_gc(flagged, self.code_system_id)
        flagged = self.identify_injury_gc(flagged, self.code_system_id)
        flagged = self.identify_hivrd_gc(flagged, self.code_system_id)
        # collapse to everything except the code and the death count
        excluded = ['code_id', 'deaths']
        group_cols = [c for c in original_columns if c not in excluded]
        df_by_code = flagged.copy()
        grouped = flagged.groupby(group_cols, as_index=False)
        df_by_cause = grouped['deaths'].sum()

    df = add_population(df_by_cause, pop_df=self.pop_df)
    print_log_message(
        f"Flagging correct dem groups for {len(df)} rows of data")
    df = flag_correct_dem_groups(
        df, self.code_system_id, self.cause_meta_df, self.loc_meta_df,
        self.age_meta_df, rates_df, self.reference_ages,
        self.move_gc_age_restrictions, self.value_cols, self.pop_col,
        self.cause_selections_path,
        correct_garbage=self.correct_garbage)

    targets_by_cause = self.get_cause_to_targets_map(self.cause_meta_df)

    print_log_message("Identifying positive excess")
    df = identify_positive_excess(
        df, rates_df, targets_by_cause, self.reference_ages,
        self.loc_meta_df, self.cause_meta_df, self.value_cols,
        self.pop_col, self.correct_garbage)

    if not self.correct_garbage:
        print_log_message("Moving excess to target")
        computed_df = move_excess_to_target(
            df, self.value_cols, targets_by_cause, self.correct_garbage)
    else:
        df = self.calculate_garbage_positive_excess(
            df, df_by_code, group_cols)
        print_log_message("Moving excess to target")
        df = move_excess_to_target(
            df, self.value_cols, targets_by_cause, self.correct_garbage)
        computed_df = assign_code_to_created_target_deaths(
            df, self.code_system_id, self.cause_meta_df)

    self.diag_df = computed_df
    return computed_df[original_columns]
def run_pipeline(model, short_name, model_df, model_params, write_dir,
                 int_cause, age_feature, dem_feature):
    """Build the bag-of-words pipeline and run a 5-fold gridsearch.

    Arguments:
        model: the classifier name; evaluated to obtain the class
        short_name: abbreviated classifier name, selects how the
            pipeline is built ("svm_bag", "nn", or anything else)
        model_df: training data with "cause_id" and the feature column
        model_params: gridsearch parameters; NOTE this dict is modified
            in place (keys are deleted / added)
        write_dir: not used in this function
        int_cause: passed to ``create_custom_scorers``
        age_feature: (Bool) - fit on "cause_age_info"
        dem_feature: (Bool) - fit on "dem_info" (when age_feature is
            falsy); otherwise "cause_info" is used

    Returns:
        (results, grid_results): the cv results as a dataframe and the
        fitted GridSearchCV object (refit on "concordance").
    """
    # "svm_bag" was missing here although it has its own branch below,
    # which made GridSearchCV construction fail with a KeyError
    n_jobs_dict = {
        "nn": 2,
        "rf": -1,
        "xgb": -1,
        "bernoulli_nb": -1,
        "multi_nb": -1,
        "complement_nb": -1,
        "svm_bag": -1
    }
    if short_name == "svm_bag":
        # NOTE(review): eval executes arbitrary text; only safe while
        # `model` comes from trusted launcher code.
        # NOTE(review): newer scikit-learn releases appear to have
        # renamed `base_estimator` to `estimator` - confirm against
        # the installed version
        model = {
            'model': BaggingClassifier,
            'kwargs': {
                'base_estimator': eval(model)()
            },
            'parameters': model_params
        }
        # create pipeline with bagging classifier
        pipeline = Pipeline([
            # token pattern allows pattern of length 1 character
            ("bow",
             CountVectorizer(lowercase=False,
                             token_pattern=r"(?u)\b\w+\b")),
            ('name', model['model'](**model['kwargs']))
        ])
        cv_params = model['parameters']
    elif short_name == "nn":
        # the network-shape settings go to the build function, not to
        # the gridsearch, so they are pulled out of model_params
        hidden_layers = int(model_params["hidden_layers"])
        print_log_message(
            "deleting hidden layers from keras gridsearch params")
        del model_params["hidden_layers"]
        hidden_nodes_1 = int(model_params["hidden_nodes_1"])
        print_log_message(
            "deleting hidden nodes 1 from keras gridsearch params")
        del model_params["hidden_nodes_1"]
        if hidden_layers > 1:
            hidden_nodes_2 = int(model_params["hidden_nodes_2"])
            print_log_message(
                "deleting hidden nodes 2 from keras gridsearch params")
        else:
            hidden_nodes_2 = None
        # removed in both cases: it is not a gridsearch parameter
        del model_params["hidden_nodes_2"]
        pipeline = Pipeline([
            ("bow", CountVectorizer(lowercase=False)),
            ("dense",
             FunctionTransformer(lambda x: x.todense(),
                                 accept_sparse=True)),
            ("clf",
             KerasClassifier(build_fn=create_neural_network,
                             output_nodes=len(model_df.cause_id.unique()),
                             hidden_layers=hidden_layers,
                             hidden_nodes_1=hidden_nodes_1,
                             hidden_nodes_2=hidden_nodes_2))
        ])
        cv_params = model_params.copy()
        print_log_message(f"cv_params are {cv_params}")
    else:
        pipeline = Pipeline([("bow", CountVectorizer(lowercase=False)),
                             ("clf", ClfSwitcher())])
        # NOTE(review): eval on `model`, see the note above
        model_params.update({"clf__estimator": [eval(model)()]})
        cv_params = model_params.copy()

    scorer_list = create_custom_scorers(int_cause)
    scoring = {
        "macro_precision": scorer_list[0],
        "micro_precision": scorer_list[1],
        "macro_recall": scorer_list[2],
        "micro_recall": scorer_list[3],
        "accuracy": scorer_list[4],
        "cccsmfa": scorer_list[5],
        "concordance": scorer_list[6]
    }
    print_log_message("creating gridsearch object")
    gscv = GridSearchCV(pipeline,
                        cv_params,
                        cv=5,
                        scoring=scoring,
                        n_jobs=n_jobs_dict[short_name],
                        pre_dispatch=6,
                        refit="concordance",
                        verbose=6)
    print_log_message("fitting model")
    # pick the feature column once (age wins over demographics)
    if age_feature:
        x_col = "cause_age_info"
    elif dem_feature:
        x_col = "dem_info"
    else:
        x_col = "cause_info"
    grid_results = gscv.fit(model_df[x_col], model_df["cause_id"])
    print_log_message("saving model results")
    results = pd.DataFrame.from_dict(grid_results.cv_results_)
    return results, grid_results
def get_computed_dataframe(self):
    """Return computations.

    Splits locations when ``self.needs_splitting()`` says so and
    returns only ``self.orig_cols``; otherwise returns ``self.df``
    unchanged and sets ``self.diag_df`` to None.
    """
    split_type = self.needs_splitting()
    if not split_type:
        print_log_message("No location splitting required.")
        self.diag_df = None
        return self.df

    incoming = self.df
    # quick check that sample size has been created
    assert 'sample_size' in incoming.columns
    # grab total deaths of incoming data
    start_deaths = (incoming.sample_size * incoming.cf).sum()

    if split_type == "UKR":
        # prep envelope df, then split data
        env_wide = self.prep_envelope(split_type)
        ukr_split = self.adjust_ukr(env_wide, split_type)
        # append ukraine w/o crimea/sevastopol to incoming data
        # national data not modeled, but needed for CodViz
        combined = pd.concat([ukr_split, self.df], ignore_index=True)
        # expected result is national data + national w/o
        # crimea/sevastopol plus crimea, plus sevastopol
        end_deaths = (combined.sample_size * combined.cf).sum()
        assert np.isclose((end_deaths / start_deaths), 2, atol=0.05)
        # make sure not to add any new columns
        return combined[self.orig_cols]

    frames = []
    for kind in ["urban", "rural"]:
        # prep envelope df
        env_wide = self.prep_envelope(kind)
        # get location_ids to split
        id_pair = self.split_ids[kind]
        orig_id = id_pair[0]
        new_id = id_pair[1]
        # separate out andhra pradesh data
        is_orig = self.df['location_id'] == orig_id
        df_ap = self.df.loc[is_orig]
        # do location splitting
        frames.append(
            self.adjust_ap_telangana(orig_id, new_id, env_wide, df_ap))
    # remove ap from incoming data
    ap_ids = [self.split_ids["urban"][0], self.split_ids["rural"][0]]
    is_ap = self.df['location_id'].isin(ap_ids)
    frames.append(self.df.loc[~is_ap])
    combined = pd.concat(frames, ignore_index=True)
    # compare start and end deaths
    end_deaths = (combined.sample_size * combined.cf).sum()
    assert np.isclose(start_deaths, end_deaths, rtol=0.001)
    # make sure not to add any new columns
    return combined[self.orig_cols]