def multiple_imputation(train_df, test_df, add_indicator=False, estimator=None,
                        imputation_order='ascending', initial_strategy='mean',
                        max_iter=10, missing_values=np.nan,
                        n_nearest_features=None, random_state=None,
                        sample_posterior=False):
    '''
    Fill missing values with a multivariate (iterative) imputer.

    The imputer is fitted on ``train_df`` only and then applied to both
    frames, so no information from ``test_df`` influences the fitted model.

    Inputs:
        train_df: DataFrame used to fit the imputer
        test_df: DataFrame transformed with the imputer fitted on train_df
        add_indicator, estimator, imputation_order, initial_strategy,
        max_iter, missing_values, n_nearest_features, random_state,
        sample_posterior: forwarded unchanged to impute.IterativeImputer

    Returns:
        (train_df, test_df) as new DataFrames with missing values filled.
        The row index of each input frame is kept.
    '''
    # Forward every argument: the previous version hard-coded most of them,
    # so callers' settings were silently ignored.
    imp_model = impute.IterativeImputer(
        add_indicator=add_indicator,
        estimator=estimator,
        imputation_order=imputation_order,
        initial_strategy=initial_strategy,
        max_iter=max_iter,
        missing_values=missing_values,
        n_nearest_features=n_nearest_features,
        random_state=random_state,
        sample_posterior=sample_posterior)

    # Column names come from the training frame (the old code read an
    # undefined name `df` here).
    columns = list(train_df.columns)
    train_values = imp_model.fit_transform(train_df)
    test_values = imp_model.transform(test_df)

    # The imputer can change the column count (e.g. add_indicator=True
    # appends indicator columns); ask it for the resulting names then.
    if train_values.shape[1] != len(columns):
        columns = list(imp_model.get_feature_names_out(columns))

    imputed_train = pd.DataFrame(train_values, columns=columns,
                                 index=train_df.index)
    imputed_test = pd.DataFrame(test_values, columns=columns,
                                index=test_df.index)
    return imputed_train, imputed_test
def data_pipeline(train: pd.DataFrame,
                  test: pd.DataFrame,
                  random_state=42,
                  **kwargs) -> tuple:
    """Impute and standardise train/test data with a train-fitted pipeline.

    The pipeline (iterative imputation followed by standard scaling) is
    fitted on ``train`` only and then applied to ``test``, so the test data
    never contributes to the fitted parameters.

    Args:
        train (pd.DataFrame): Data the pipeline is fitted on.
        test (pd.DataFrame): Data that is only transformed.
        random_state (int, optional): Passed to the IterativeImputer.
            Defaults to 42.
        **kwargs: Extra keyword arguments for the IterativeImputer.

    Returns:
        tuple: ``(train, test)`` as new DataFrames built from the
        transformed arrays (default index and column labels).
    """
    imputation_step = impute.IterativeImputer(random_state=random_state,
                                              **kwargs)
    scaling_step = StandardScaler()
    pipeline = make_pipeline(imputation_step, scaling_step)

    fitted_train = pipeline.fit_transform(train)
    transformed_test = pipeline.transform(test)
    return pd.DataFrame(fitted_train), pd.DataFrame(transformed_test)
def multiple_imputation(train_df, test_df, continuous_columns, estimator=None,
                        max_iter=10, n_nearest_features=None):
    '''
    Fill missing values in the continuous columns with a multivariate
    (iterative) imputer fitted on the training data only.

    Inputs:
        train_df: DataFrame used to fit the imputer
        test_df: DataFrame transformed with the imputer fitted on train_df
        continuous_columns: iterable of column labels to impute
        estimator: forwarded to impute.IterativeImputer
        max_iter: forwarded to impute.IterativeImputer
        n_nearest_features: forwarded to impute.IterativeImputer

    Returns:
        (train_df, test_df): new DataFrames in which the listed columns have
        been replaced by their imputed versions. The imputed columns are
        placed after the untouched ones; the row index is preserved.
    '''
    imp_model = impute.IterativeImputer(estimator=estimator,
                                        max_iter=max_iter,
                                        missing_values=np.nan,
                                        n_nearest_features=n_nearest_features)
    columns = list(continuous_columns)

    new_train = imp_model.fit_transform(train_df[columns])
    new_test = imp_model.transform(test_df[columns])

    # Carry the original index over so rows line up when the frames are
    # recombined; a default RangeIndex would misalign any frame whose index
    # is not 0..n-1 (e.g. after a train/test split).
    imputed_train = pd.DataFrame(data=new_train, columns=columns,
                                 index=train_df.index)
    imputed_test = pd.DataFrame(data=new_test, columns=columns,
                                index=test_df.index)

    # drop() needs the flat list of labels; the old `[columns]` wrapped it
    # in a second list.
    train_df = pd.concat([train_df.drop(columns, axis=1), imputed_train],
                         axis=1)
    test_df = pd.concat([test_df.drop(columns, axis=1), imputed_test],
                        axis=1)
    return train_df, test_df
def fill_missing_values(train_data, test_data):
    """Fill missing values in the train and test frames.

    Every imputer is fitted on the training data and only *applied* to the
    test data, so both frames are filled with the same learned values and
    nothing is learned from the test set.

    Steps:
      * ``embarked``: most frequent value.
      * ``cabin``: column dropped.
      * ``fare``: iterative imputation from ``fare`` and ``TicketClass``.
      * ``age``: iterative imputation from ``age``, ``parch``, ``sibsp``,
        ``fare`` and ``TicketClass``.

    Returns:
        (train_data_filled, test_data_filled): filled copies; the inputs are
        not modified.
    """
    train_data_filled = train_data.copy()
    test_data_filled = test_data.copy()

    # Embarked column: most frequent value seen in the training data.
    embarked_imputer = impute.SimpleImputer(missing_values=np.nan,
                                            strategy='most_frequent')
    train_data_filled[['embarked']] = embarked_imputer.fit_transform(
        train_data[['embarked']])
    test_data_filled[['embarked']] = embarked_imputer.transform(
        test_data[['embarked']])

    # Cabin column is removed entirely.
    train_data_filled = train_data_filled.drop(['cabin'], axis=1)
    test_data_filled = test_data_filled.drop(['cabin'], axis=1)

    # Fare column.
    fare_columns = ['fare', 'TicketClass']
    fare_imputer = impute.IterativeImputer(missing_values=np.nan,
                                           random_state=42)
    train_data_filled[fare_columns] = fare_imputer.fit_transform(
        train_data[fare_columns])
    test_data_filled[fare_columns] = fare_imputer.transform(
        test_data[fare_columns])

    # Age column. The imputer reads the raw input frames (as before), so it
    # re-imputes fare/TicketClass itself and overwrites the values above.
    age_columns = ['age', 'parch', 'sibsp', 'fare', 'TicketClass']
    age_imputer = impute.IterativeImputer(missing_values=np.nan,
                                          random_state=42)
    train_data_filled[age_columns] = age_imputer.fit_transform(
        train_data[age_columns])
    test_data_filled[age_columns] = age_imputer.transform(
        test_data[age_columns])

    return train_data_filled, test_data_filled
def _model_impute(self):
    """Impute each entry of ``self.target`` with an IterativeImputer.

    For every ``col`` in ``self.target`` a fresh imputer (using
    ``self.estimator``, ``random_state=42``) is fitted on ``self.df[col]``
    and the imputed values are written to the same column(s) of
    ``self.output_df``.

    Returns:
        ``self.output_df`` after all targets have been written.
    """
    for col in self.target:
        m_impute = impute.IterativeImputer(estimator=self.estimator,
                                           random_state=42)
        values = self.df[col].values

        # A single column label yields a 1-D array, but scikit-learn
        # estimators require 2-D input (n_samples, n_features).
        single_column = values.ndim == 1
        if single_column:
            values = values.reshape(-1, 1)

        # fit_transform performs the fit itself; the separate fit() call
        # that used to precede it only doubled the work.
        imputed = m_impute.fit_transform(values)

        if single_column:
            imputed = imputed.ravel()
        self.output_df.loc[:, col] = imputed
    return self.output_df
def fixData(trainFileName,
            testFileName,
            features,
            imputer="simple",
            strategy="mean"):
    """Load train/test CSVs, one-hot encode and impute missing values.

    Args:
        trainFileName: path of the training CSV (must contain "Survived").
        testFileName: path of the test CSV (must contain "PassengerId").
        features: column names to keep; "Embarked" and "Sex" are one-hot
            encoded.
        imputer: "simple", "knn" or "iterative" (case-insensitive).
        strategy: strategy for the simple imputer, or initial strategy for
            the iterative one.

    Returns:
        (dummied_test, dummied_train, trainSurvived, passengerID): the
        imputed test and train arrays, the training labels and the test
        passenger ids.

    Raises:
        ValueError: if ``imputer`` is not one of the supported names.
    """
    print("Fixing Data\n")

    # Read files into pandas data frames
    training_data = pd.read_csv(trainFileName)
    testing_data = pd.read_csv(testFileName)

    featuresForDummies = ["Embarked", "Sex"]
    trainSurvived = training_data["Survived"]
    passengerID = testing_data["PassengerId"]

    selected_features = list(features)
    training_data = training_data[selected_features]
    testing_data = testing_data[selected_features]

    # One-hot encode the categorical features
    tr_data = pd.get_dummies(training_data, columns=featuresForDummies)
    te_data = pd.get_dummies(testing_data, columns=featuresForDummies)
    # A category present in only one file would give the two frames
    # different columns; align the test frame to the training layout.
    te_data = te_data.reindex(columns=tr_data.columns, fill_value=0)

    method = imputer.lower()
    if method == "simple":
        imp = impute.SimpleImputer(missing_values=np.nan, strategy=strategy)
    elif method == "knn":
        imp = impute.KNNImputer(missing_values=np.nan)
    elif method == "iterative":
        imp = impute.IterativeImputer(missing_values=np.nan,
                                      initial_strategy=strategy)
    else:
        print("You did not enter a correct imputation method.")
        print(
            "Correct imputation methods include: \"Simple\", \"KNN\", \"Iterative\""
        )
        # Previously execution continued and failed on the unbound `imp`.
        raise ValueError("Unknown imputation method: {!r}".format(imputer))

    # Fit on the training data only, then apply the same fitted imputer to
    # both sets so the test data is filled with training statistics.
    imp.fit(tr_data)
    dummied_train = imp.transform(tr_data)
    dummied_test = imp.transform(te_data)

    return (dummied_test, dummied_train, trainSurvived, passengerID)
def fit(self, X, y=None):
    """Fit the ordinal encoder and the configured imputer on ``X``.

    ``X`` is copied, ordinal-encoded (missing values stay NaN) and the
    encoded frame is used to fit either a KNN or an iterative imputer,
    depending on ``self.imputer_type``. ``y`` is ignored.

    Returns:
        self
    """
    frame = X.copy()
    column_names = frame.columns.values

    # We already dealt with this in the preprocessing notebook: null values
    # must not bump the encoding value somewhere in the middle of the
    # dataset; we want a contiguous range of integer codes, without gaps,
    # for the imputer to work with.
    # This matters for both imputers. The iterative imputer works with
    # numeric values that may well be fractional, so its result is rounded
    # in the end. The KNN imputer works with integers, but it imputes the
    # mean of the n neighbours, which can again be fractional.
    # So, to make sure rounding never lands on the code that belongs to the
    # null value, the ordinal encoder is fed the null values first.
    nan_row = pd.DataFrame(index=pd.Index([-1]),
                           columns=column_names,
                           data=[[np.nan] * len(column_names)])
    frame = pd.concat([nan_row, frame])

    self.ordinal_encoder = ce.ordinal.OrdinalEncoder(
        handle_missing="return_nan", handle_unknown="return_nan")
    encoded = self.ordinal_encoder.fit_transform(frame)
    # Drop the artificial all-NaN row again.
    encoded = encoded[1:]

    if self.imputer_type == "knn":
        self.imputer = impute.KNNImputer()
        self.imputer.fit(encoded)
    elif self.imputer_type == "iterative":
        self.imputer = impute.IterativeImputer(
            max_iter=20,
            random_state=42,
            initial_strategy="most_frequent",
            min_value=encoded.min(),
            max_value=encoded.max())
        try:
            self.imputer.fit(encoded)
        except (ValueError, np.linalg.LinAlgError):
            # The first attempt occasionally fails; a single retry follows.
            print(
                "Jeden error bol trapnuty, kedy funkcii vadili NaNs. Tento error je ale divny, lebo mu to vadi",
                "len prvy krat, a potom to uz ide...")
            self.imputer.fit(encoded)

    return self
def impute_fit_transform(train: pd.DataFrame,
                         test: pd.DataFrame,
                         random_state=42,
                         **kwargs) -> tuple:
    """Fit an IterativeImputer on the train data and apply it to both sets.

    Args:
        train (pd.DataFrame): Data the imputer is fitted on, then transformed.
        test (pd.DataFrame): Data that is only transformed.
        random_state (int, optional): Passed to the imputer. Defaults to 42.
        **kwargs: Extra keyword arguments for the IterativeImputer.

    Returns:
        tuple: ``(train, test)`` as new DataFrames built from the imputed
        arrays (default index and column labels).
    """
    fitted = impute.IterativeImputer(random_state=random_state,
                                     **kwargs).fit(train)
    imputed_train = pd.DataFrame(fitted.transform(train))
    imputed_test = pd.DataFrame(fitted.transform(test))
    return imputed_train, imputed_test
def train(dname, mname, rseed, shuffle_params=None):
    """Load a dataset, tune or load model parameters, then fit and score one method.

    Args:
        dname: key into ``LOADERS``; also compared against 'icu',
            'outpatient' and 'trauma' to select dataset-specific handling.
        mname: method name. Most branches test for *substrings* of this name
            ('gbm', 'linear', 'nn', 'node', 'sage', 'shap', 'cegb', 'cwcf',
            ...) rather than for equality, so branch order matters.
        rseed: forwarded as ``split_seed`` to the loaders/baseline helpers
            and used in the pickle file names.
        shuffle_params: optional two-element sequence for the robustness
            experiments. Two negative entries select ``cost_pair`` (called
            with the negated values); anything else selects ``cost_swaps``.

    Returns:
        ``GRP``: the fitted and scored optimizer object. For the baseline
        branches it is a function object that merely carries
        ``model_costs``, ``model_scores`` and ``test_preds`` attributes.

    Raises:
        ValueError: if ``mname`` matches none of the train/test branches.
    """
    # ICU preprocessing is now in its own function
    # mtype = MTYPES[mname]
    # kwargs = {}
    # if dname=='icu' and ('linear' in mname or 'nn' in mname or 'cwcf' in mname): kwargs['onehot']=True

    # CWCF now runs in parallel across several GPUs
    # if mname=='cwcf' and 'CUDA_VISIBLE_DEVICES' not in os.environ:
    #     ngpu = len(tf.config.list_physical_devices('GPU'))
    #     cur_gpu = rseed%ngpu if rseed is not None else 0
    #     os.environ["CUDA_DEVICE_ORDER"]="PCI_BUS_ID"   # see issue #152
    #     os.environ["CUDA_VISIBLE_DEVICES"]=str(cur_gpu)

    ##################
    # DATA
    ##################
    # Load data we're using
    (Xtrain, ytrain), (Xvalid, yvalid), (Xtest, ytest), costs, groups, extras = LOADERS[dname](
        split_seed=rseed)  #,**kwargs)

    # If we're using PACT we need some of the extra (redundant) features that were unused in our study
    if mname == 'pact':
        (Xtrain, ytrain), (Xvalid, yvalid), (Xtest, ytest), costs, groups, extras = load_ed(
            name=config.ED_NAME,
            costtype=config.ED_COSTTYPE,
            drop_redundant=False,
            split_seed=rseed)
    # print([(n,c) for n,c in zip(Xtrain.columns,costs) if c>0.01])
    # Xtrain_raw,Xvalid_raw,Xtest_raw = Xtrain,Xvalid,Xtest

    # For bootstrapping, we don't do this anymore and do train/test splits instead
    # Xtrain_raw, ytrain = bootstrap_set(Xtrain,ytrain,rseed=rseed)
    # Xvalid_raw, yvalid = bootstrap_set(Xvalid,yvalid,rseed=rseed)
    # Xtest_raw, ytest = bootstrap_set(Xtest,ytest,rseed=rseed)

    # If we're using a non-GBM AI method, we need to impute NaNs and scale
    # Don't do this if using ICU data because we're using a Pipeline in that case
    # that handles this stuff
    if ('linear' in mname or 'nn' in mname or 'cwcf' in mname
            or 'node' in mname) and (dname != 'icu'):
        # Imputer and scaler are fitted on the training split only and then
        # applied to the validation and test splits.
        imputer = impute.SimpleImputer()
        scaler = preprocessing.StandardScaler()
        Xtrain_np = scaler.fit_transform(imputer.fit_transform(Xtrain))
        Xvalid_np = scaler.transform(imputer.transform(Xvalid))
        Xtest_np = scaler.transform(imputer.transform(Xtest))
        # Write the processed values back into the existing frames in place
        for df, npy in zip([Xtrain, Xvalid, Xtest],
                           [Xtrain_np, Xvalid_np, Xtest_np]):
            df.iloc[:] = npy
    # Hackier code for preprocessing features, can probably remove
    # Xtrain,Xvalid,Xtest = [pd.DataFrame(data=npy,columns=df.columns,index=df.index) for df,npy in zip(
    #     [Xtrain_raw,Xvalid_raw,Xtest_raw],[Xtrain_np,Xvalid_np,Xtest_np])]
    # else:
    #     (Xtrain,Xvalid,Xtest) = Xtrain_raw,Xvalid_raw,Xtest_raw

    # Concatenated data for training cost-aware models after tuning
    Xtv = pd.concat([Xtrain, Xvalid])
    ytv = np.hstack((ytrain, yvalid))

    # Grouped costs for datasets that feature it
    # Outpatient dataset
    # Or linear/NN on ICU (one-hot encoding of admission dx)
    # (one mean cost per distinct group value; otherwise the raw costs)
    unique_costs = np.array([
        costs[groups == g].mean() for g in np.unique(groups)
    ]) if (dname == 'outpatient') or (
        dname == 'icu' and mname in ('linear', 'linearh', 'nn')) else costs

    ##################
    # PARAMETER TUNING
    ##################
    # If we've precomputed best parameters, just load those
    if TUNING == 'LOAD' and (('gbm' in mname) or
                             (mname in ('fixedmodel', 'imputemodel')) or
                             ('linear' in mname) or ('nn' in mname) or
                             ('cegb' in mname)):
        # fixedmodel / imputemodel / cegb reuse the parameters saved for 'gbmsage'
        loadname = 'gbmsage' if ((mname in ('fixedmodel', 'imputemodel')) or
                                 ('cegb' in mname) or
                                 ('gbmsage' in mname)) else mname
        # NOTE(review): pickle.load executes arbitrary code from the file;
        # only load parameter files this project wrote itself.
        with open(f'{OUTPATH}/{loadname}-{dname}-{rseed}.pkl', 'rb') as w:
            model = pickle.load(w)
    # Otherwise do some parameter tuning
    else:
        # Tune GBM
        if ('gbm' in mname) or (mname in ('cegb', 'fixedmodel', 'imputemodel')):
            model = tune(Xtrain, Xvalid, ytrain, yvalid)
        # Linear model needs onehotencoding pipeline if we're doing ICU
        elif ('linear' in mname):
            if (dname == 'icu'):
                model = lintune(Xtrain,
                                Xvalid,
                                ytrain,
                                yvalid,
                                mfunc=icu_preprocessing(get_linear_model))
            else:
                model = lintune(Xtrain, Xvalid, ytrain, yvalid)
        # NN model needs onehotencoding pipeline if we're doing ICU
        elif 'nn' in mname:
            if (dname == 'icu'):
                model = tftune(Xtrain,
                               Xvalid,
                               ytrain,
                               yvalid,
                               mfunc=icu_preprocessing(get_tf_model),
                               return_extras=False)
            else:
                model = tftune(Xtrain,
                               Xvalid,
                               ytrain,
                               yvalid,
                               return_extras=False)
        # NODE model doesn't need tuning
        elif 'node' in mname:
            model = {}

    # If we indicated we want to save the model, do so
    # print(model)
    if TUNING == 'SAVE' and (('gbm' in mname) or
                             (mname in ('cegb', 'fixedmodel', 'imputemodel')) or
                             ('linear' in mname) or ('nn' in mname)):
        with open(f'{OUTPATH}/{mname}-{dname}-{rseed}.pkl', 'wb') as w:
            pickle.dump(model, w)
        # In SAVE mode the run ends here: exit() stops the interpreter, so
        # none of the fitting/scoring code below is reached.
        exit()

    # Limit number of jobs for processor-hungry models
    print(mname)
    if mname not in ('qsofa', 'aps', 'apacheiii', 'apacheiva'):
        if (('gbm' in mname) or ('cegb' in mname) or ('linear' in mname)
                or ('imputemodel' in mname)):
            model['n_jobs'] = 4 if dname == 'trauma' else 2
        # else: model['n_jobs']=10

    ##################
    # Setup for CoAI
    ##################
    # Instantiate predictive models
    if ('gbm' in mname) or ('cegb' in mname) or (mname in ('fixedmodel',
                                                           'imputemodel')):
        bst = lgb.LGBMClassifier(**model)
    elif 'linear' in mname:
        bst = icu_preprocessing(FastLinearClassifier)(
            **model) if dname == 'icu' else FastLinearClassifier(**model)
    elif 'nn' in mname:
        bst = icu_preprocessing(get_fast_keras)(
            **model) if dname == 'icu' else get_fast_keras(**model)
    elif 'node' in mname:
        bst = icu_preprocessing(NodeClassifier)(
            experiment_name=f'trauma{rseed}', **
            model) if dname == 'icu' else NodeClassifier(
                experiment_name=f'trauma{rseed}', **model)

    # Get our explainer (using SAGE entirely now, shap code is old & may not work perfectly)
    if ('sage' in mname) or (mname in ('cegb', 'fixedmodel', 'imputemodel')):
        #sage_params={'imputetype':'default'}
        #if 'gbm' in mname: sage_params={'imputetype':'marginal'}
        # SAGE explainer. N_permutations set super low for NODE bc we're
        # just testing it right now
        exp = labelless_sage_wrapper(
            imputetype='marginal',
            refsize=64,
            batch_size=32,
            wrap_categorical=(dname == 'icu'),
            n_permutations=(128 if 'node' in mname else None))
        # NODE debugging line
        # print(dict(imputetype=('default' if 'node' in mname else 'marginal'),refsize=(1 if 'node' in mname else 64)))
    # Mostly deprecated
    elif mname == 'gbmshap':
        exp = OneDimExplainer
    elif mname == 'linearshap':
        exp = get_pipeline_explainer(LinearExplainer)

    # Prepare to perturb costs if required (robustness experiments)
    if shuffle_params is not None:
        # Negative numbers indicate individual robustness
        if ((shuffle_params[0] < 0) and (shuffle_params[1] < 0)):
            # Note: this branch also replaces `costs` itself
            costs, shuffle_costs = cost_pair(-shuffle_params[0],
                                             -shuffle_params[1], Xtrain)
        # Positive indicate swap robustness - # swaps and seed
        else:
            shuffle_costs = cost_swaps(costs, shuffle_params[0],
                                       shuffle_params[1])

    # Pick thresholds for CoAI
    dthresh = np.linspace(0, np.sum(unique_costs) + 1, 100)

    #####################
    # Actually train/test
    #####################
    if 'sage' in mname or 'shap' in mname:
        # Wrap model with CoAI
        if 'greedy' in mname:
            GRP = knapsack.GroupGreedy(bst, exp)
        else:
            GRP = knapsack.GroupOptimizer(
                bst, exp, scale_ints=1000 * 100 if ('sage' in mname) else 1000)
        # NN needs preprocessing pipeline if ICU, also pass
        # epochs, verbosity
        if 'nn' in mname:
            if dname == 'icu':
                GRP.fit(Xtv,
                        ytv,
                        costs,
                        groups,
                        dthresh,
                        model__epochs=10,
                        model__verbose=False)
            else:
                GRP.fit(Xtv,
                        ytv,
                        costs,
                        groups,
                        dthresh,
                        epochs=10,
                        verbose=False)
        # NODE needs preprocessing for ICU.
        # Also requires eval set for stopping time
        # Current max_iter is short for prototyping
        elif 'node' in mname:
            # NODE uses a coarser threshold grid (10 points instead of 100)
            # and fits on the training split only, keeping the validation
            # split as the eval set.
            dthresh = np.linspace(0, np.sum(unique_costs) + 1, 10)
            if dname == 'icu':
                GRP.fit(Xtrain,
                        ytrain,
                        costs,
                        groups,
                        dthresh,
                        model__eval_set=(Xvalid, yvalid),
                        model__max_iter=15)
            else:
                GRP.fit(Xtrain,
                        ytrain,
                        costs,
                        groups,
                        dthresh,
                        eval_set=(Xvalid, yvalid),
                        max_iter=15)
        # All other CoAI methods get a standardized fit process
        else:
            GRP.fit(Xtv, ytv, costs, groups, dthresh)
        # Evaluate CoAI models
        GRP.score_models_proba(Xtest, ytest, roc_auc_score)
        # If costs get shuffled, each model's deployment cost will change
        if shuffle_params:
            GRP.recalculate_costs(shuffle_costs)

    # Impute-CoAI with mean imputation
    elif 'fixed' in mname:
        bst = bst.fit(Xtv, ytv)
        GRP = knapsack.FixedModelExactRetainer(bst, exp)
        GRP.fit(Xtv, ytv, costs, dthresh)
        if shuffle_params:
            GRP.refit(Xtv, ytv, shuffle_costs)
        GRP.score_models_proba(Xtest, ytest, roc_auc_score)

    # Impute-CoAI with model-based imputation (IterativeImputer)
    elif 'impute' in mname:
        imputer = impute.IterativeImputer(random_state=0,
                                          estimator=linear_model.RidgeCV())
        bst = bst.fit(Xtv, ytv)
        imputer.fit(Xtv)
        GRP = knapsack.FixedModelImputer(bst, exp, imputer)
        GRP.fit(Xtv, ytv, costs, dthresh)
        if shuffle_params:
            GRP.refit(Xtv, ytv, shuffle_costs)
        GRP.score_models_proba(Xtest, ytest, roc_auc_score)
        # GRP.fit(Xtv,ytv,costs,groups,dthresh) if mname=='default' else GRP.fit(Xtv,ytv,costs,dthresh)

    # CEGB doesn't use an explainer
    elif ('cegb' in mname):
        GRP = cegb.CEGBOptimizer(model=bst, lambdas=np.logspace(-5, 5, 101))
        GRP.fit(Xtv,
                ytv,
                costs,
                groups=(groups if 'group' in mname else None))
        GRP.score_models_proba(Xtest, ytest, roc_auc_score)
        # Account for grouped costs if in outpatient data
        if (dname == 'outpatient'):
            GRP.recalculate_costs(costs, groups)
        # Account for any cost perturbations
        if shuffle_params:
            GRP.recalculate_costs(shuffle_costs)

    elif ('cwcf' in mname):
        # Lots of preprocessing if using ICU data to encode categoricals
        # as ordinal ints (save memory, handle groups, etc)
        if dname == 'icu':
            types = Xtrain.dtypes
            for col in Xtrain.columns:
                if str(types[col]) == 'category':
                    l_enc = preprocessing.OrdinalEncoder(
                        handle_unknown='use_encoded_value',
                        unknown_value=np.nan)
                    # Missing categoricals become an explicit 'UNK' category
                    # NOTE(review): these calls rely on inplace=True on a
                    # column selection; confirm the pandas version in use
                    # still supports that.
                    for df in [Xtrain, Xvalid, Xtest]:
                        if 'UNK' not in df[col].cat.categories:
                            df[col].cat.add_categories(['UNK'], inplace=True)
                        df[col].fillna('UNK', inplace=True)
                    # Encoder is fitted on train and applied to valid/test
                    Xtrain[col] = l_enc.fit_transform(
                        np.array(Xtrain[col]).reshape(-1, 1))
                    Xvalid[col] = l_enc.transform(
                        np.array(Xvalid[col]).reshape(-1, 1))
                    Xtest[col] = l_enc.transform(
                        np.array(Xtest[col]).reshape(-1, 1))
                    # Old mode imputation code, better now (broken by dtype)
                    # for df in [Xtrain,Xvalid,Xtest]:
                    #     if df[col].isna().any():
                    #         df[col][df[col].isna()] = Xtrain[col].mode().iloc[0]
                    # Xtrain[col] = Xtrain[col].fillna(Xtrain[col].mode().iloc[0])
                    # Xvalid[col] = Xvalid[col].fillna(Xtrain[col].mode().iloc[0])
                    # Xtest[col] = Xtest[col].fillna(Xtrain[col].mode().iloc[0])
                elif str(types[col]) == 'int64':
                    # NOTE(review): mode() is passed without selecting an
                    # element (the commented-out code above used .iloc[0]);
                    # verify this fills with the training mode as intended.
                    Xtrain[col].fillna(Xtrain[col].mode(), inplace=True)
                    Xvalid[col].fillna(Xtrain[col].mode(), inplace=True)
                    Xtest[col].fillna(Xtrain[col].mode(), inplace=True)
                else:
                    # Remaining columns: fill with the training mean
                    Xtrain[col].fillna(Xtrain[col].mean(), inplace=True)
                    Xvalid[col].fillna(Xtrain[col].mean(), inplace=True)
                    Xtest[col].fillna(Xtrain[col].mean(), inplace=True)

        # CWCF only takes nparrays for labels
        ytrain, yvalid, ytest = [np.array(x) for x in (ytrain, yvalid, ytest)]
        print('Training CWCF...'
              )  # So we know when jobs get farmed out to other processes

        # Used to turn "groups" down to 6 for outpatient just to prototype group support
        # (all three datasets currently get the same lambda grid)
        if 'lagrange' in mname:
            data_lmbds = {
                'trauma': np.linspace(0, np.sum(unique_costs), 17)[1:],
                'icu': np.linspace(0, np.sum(unique_costs), 17)[1:],
                'outpatient': np.linspace(0, np.sum(unique_costs), 17)[1:]
            }
        else:
            data_lmbds = {
                'trauma': np.logspace(-14, 1, 16),
                'icu': np.logspace(-14, 1, 16),
                'outpatient': np.logspace(-14, 1, 16)
            }
        # This is usually range(2) to get some stability over reps -- doesn't matter as much for outpatient
        # Can turn down to 1 when prototyping
        lmbds = np.hstack([data_lmbds[dname] for _ in range(2)])

        # Old single threaded mode
        # GRP = cwcf.CWCFClassifier(costs=costs,dirname=config.CWCF_TMPDIR)
        # GRP.fit(Xtrain,Xvalid,Xtest,ytrain,yvalid,ytest)
        # print([x.shape for x in (Xtrain,Xvalid,Xtest,ytrain,yvalid,ytest,costs,lmbds)])

        # Run CWCF - groups argument does experimental groups handling (not working yet)
        # More jobs (even more than GPUs) can be used - gets you through the lambda list faster
        # Set up right now for L3 gpus 1-6.
        GRP = cwcf.get_cwcf(Xtrain,
                            Xvalid,
                            Xtest,
                            ytrain,
                            yvalid,
                            ytest,
                            costs,
                            lmbds,
                            gpus=np.random.permutation(8),
                            njobs=16,
                            dirname=config.CWCF_TMPDIR,
                            lagrange=('lagrange' in mname),
                            metric=roc_auc_score,
                            difficulty=1000,
                            groups=(groups if 'group' in mname else None))
        print('Done')  # Done with external process run

    # ICU baselines
    elif mname in ('aps', 'apacheiii', 'apacheiva'):
        strain, svalid, stest = aps_baselines(split_seed=rseed)
        mpreds = stest
        # mpreds = bootstrap_set(mpreds,rseed=rseed)
        preds = mpreds[mname]
        score = roc_auc_score(ytest, preds)
        cost = config.EICU_SCORE_COSTS[mname]
        # A function object is used as a plain attribute container so the
        # baselines expose the same attributes as the optimizer objects.
        GRP = lambda x: x
        GRP.model_costs, GRP.model_scores = np.array([cost]), np.array([score])
        GRP.test_preds = np.array(preds)
    # NOTE(review): ('qsofa') is a plain string, not a one-element tuple, so
    # this is a substring test: any substring of 'qsofa' matches here.
    elif mname in ('qsofa'):
        qtest = qsofa_score(split_seed=rseed)
        qpreds = qtest  #bootstrap_set(qtest,rseed=rseed)
        score = roc_auc_score(ytest, qpreds)
        cost = config.EICU_SCORE_COSTS[mname]
        GRP = lambda x: x
        GRP.model_costs, GRP.model_scores = np.array([cost]), np.array([score])
        GRP.test_preds = np.array(qpreds)
    # Trauma baseline (PACT)
    # Should ignore the resulting cost for now and just use
    # the hand-calculated one
    # NOTE(review): same as above, ('pact') is a string, so this is a
    # substring test rather than tuple membership.
    elif mname in ('pact'):
        cost, score, _, _, _, _, preds = pact_score(Xtrain, Xvalid, Xtest,
                                                    ytrain, yvalid, ytest,
                                                    costs)
        GRP = lambda x: x
        GRP.model_costs, GRP.model_scores = np.array(cost), np.array(score)
        GRP.test_preds = np.array(preds)
    else:
        raise ValueError("Model name not found!")

    # Done
    return GRP  #(GRP.model_costs, GRP.model_scores)
def iterative_imputer(self, col, how="mean"):
    """Impute ``col`` with an IterativeImputer seeded by strategy ``how``.

    Nothing happens when ``how`` is not one of ``self.strategies``;
    otherwise the imputer is handed to ``self.impute`` together with ``col``.
    """
    if how not in self.strategies:
        return
    iterative = impute.IterativeImputer(initial_strategy=how)
    self.impute(col, iterative)
def _main_build_imputer(kind):
    """Return the unfitted imputer selected by the ``--imp`` option."""
    if kind == 'simple':
        return impute.SimpleImputer(missing_values=np.nan, strategy='mean')
    if kind == 'iter':
        return impute.IterativeImputer(missing_values=np.nan,
                                       max_iter=50,
                                       n_nearest_features=20)
    # Previously an unknown value left `imp` unbound and failed later with
    # a NameError.
    raise ValueError(
        "unknown imputer {!r}; expected 'simple' or 'iter'".format(kind))


def main():
    """Run the feature-selection experiment on the X/y CSV files.

    Pipeline: impute -> drop low/high variance features -> mask outliers and
    re-impute -> scale -> drop highly correlated features -> select features
    with ElasticNet and report cross-validated R^2 scores.

    Command-line options: ``--uc`` (upper correlation), ``--df`` (outlier
    factor), ``--imp`` ('simple' or 'iter'), ``--of`` (output name),
    ``--norm`` ('robust' or 'standard'), ``-h/--help``.

    Raises:
        ValueError: if ``--imp`` is neither 'simple' nor 'iter'.
    """
    # read in data
    xtrain = pd.read_csv('../X_train.csv',
                         index_col='id',
                         dtype={'id': np.int32})
    ytrain = pd.read_csv('../y_train.csv',
                         index_col='id',
                         dtype={'id': np.int32})
    xtest = pd.read_csv('../X_test.csv',
                        index_col='id',
                        dtype={'id': np.int32})

    lowerVar = 1e-8
    upperVar = 1e100
    upperCorr = 0.7
    stdFactor = 2.5
    imputer = 'simple'
    outfile = "prediction"
    norm = 'robust'
    # Iteration cap for the ElasticNet solvers. Must be an int: scikit-learn
    # validates the parameter type and rejects the float 5e3.
    net_max_iter = 5000

    # read parameters
    try:
        opts, _ = getopt.getopt(
            sys.argv[1:], "h",
            ["uc=", "df=", "imp=", "of=", "help", "norm="])
    except getopt.GetoptError:
        help()
        sys.exit(2)
    for opt, arg in opts:
        if opt in {'-h', '--help'}:
            help()
            sys.exit()
        elif opt == '--uc':
            upperCorr = float(arg)
        elif opt == '--df':
            stdFactor = float(arg)
        elif opt == '--imp':
            imputer = arg
        elif opt == '--of':
            outfile = arg
        elif opt == '--norm':
            norm = arg

    print("Selected parameters:")
    print(" lower variance: {:e}".format(lowerVar))
    print(" upper variance: {:e}".format(upperVar))
    print(" upper covariance: {:e}".format(upperCorr))
    print(" std deviation factor: {:f}".format(stdFactor))
    print(" imputer: ", imputer)
    print(" output: ", outfile)
    print(" norm: ", norm)

    #------------------------------------------------------------------
    # impute (fitted on the training data, applied to both sets)
    imp = _main_build_imputer(imputer)
    imp.fit(xtrain)
    xtrain.loc[:, :] = imp.transform(xtrain)
    xtest.loc[:, :] = imp.transform(xtest)

    #--------------------------------------------------------------------
    # drop features because of variance
    thresholder = VarianceThreshold(threshold=lowerVar)
    thresholder.fit(xtrain)
    print("Will drop because of variance < {:e}: ".format(lowerVar))
    print(xtrain.columns[np.invert(thresholder.get_support())].tolist())
    xtrain2 = pd.DataFrame(
        data=thresholder.transform(xtrain),
        index=xtrain.index.tolist(),
        columns=xtrain.columns[thresholder.get_support()].tolist())
    xtest2 = pd.DataFrame(
        data=thresholder.transform(xtest),
        index=xtest.index.tolist(),
        columns=xtest.columns[thresholder.get_support()].tolist())

    # drop features with absurdly high variance
    var = xtrain2.var()
    tooHigh = var[var > upperVar].index.tolist()
    print("dropping because of variance > {:e}: ".format(upperVar))
    print(tooHigh)
    xtrain3 = xtrain2.drop(columns=tooHigh)
    xtest3 = xtest2.drop(columns=tooHigh)

    #----------------------------------------------------------------------
    # remove outliers: values further than stdFactor robust scales from the
    # training center become NaN and are imputed again below
    scaler = RobustScaler()
    scaler.fit(xtrain3)
    upperBound = scaler.center_ + stdFactor * scaler.scale_
    lowerBound = scaler.center_ - stdFactor * scaler.scale_
    xtrain3[xtrain3 > upperBound] = np.nan
    xtrain3[xtrain3 < lowerBound] = np.nan
    xtest3[xtest3 > upperBound] = np.nan
    xtest3[xtest3 < lowerBound] = np.nan

    imp = _main_build_imputer(imputer)
    imp.fit(xtrain3)
    xtrain3.loc[:, :] = imp.transform(xtrain3)
    xtest3.loc[:, :] = imp.transform(xtest3)

    #----------------------------------------------------------------------
    # normalize data (anything other than 'standard' uses the robust scaler)
    if norm == 'standard':
        scaler = StandardScaler()
    else:
        scaler = RobustScaler()
    scaler.fit(xtrain3)
    xtrain3.loc[:, :] = scaler.transform(xtrain3)
    xtest3.loc[:, :] = scaler.transform(xtest3)

    # drop highly correlated features: inspect only the strict upper
    # triangle so each pair is considered once
    corr = xtrain3.corr().abs()
    # `np.bool` was removed from NumPy; the builtin `bool` is the equivalent.
    corr_triu = corr.where(np.triu(np.ones(corr.shape), k=1).astype(bool))
    to_drop = [
        column for column in corr_triu.columns
        if any(corr_triu[column] > upperCorr)
    ]
    print("Will drop due to covariance > {:e}: ".format(upperCorr))
    print(to_drop)
    xtrain4 = xtrain3.drop(columns=to_drop)
    xtest4 = xtest3.drop(columns=to_drop)

    print("Using ElasticNet to determine features:")
    netCV = linear_model.ElasticNetCV(l1_ratio=[0.1, 0.3, 0.5, 0.75, 0.85],
                                      alphas=(0., 0.1, 0.2, 0.25, 0.3, 0.35,
                                              0.4),
                                      cv=5,
                                      n_jobs=2,
                                      max_iter=net_max_iter)
    netCV.fit(xtrain4, ytrain.values.ravel())
    print("Selected nr of featrues: ", np.count_nonzero(netCV.coef_))
    print("Selected alpha: ", netCV.alpha_)
    print("Selected l1_ratio: ", netCV.l1_ratio_)

    net = linear_model.ElasticNet(l1_ratio=netCV.l1_ratio_,
                                  alpha=netCV.alpha_,
                                  max_iter=net_max_iter)
    score = cross_val_score(net,
                            xtrain4,
                            ytrain.values.ravel(),
                            cv=5,
                            scoring='r2')
    print("Score of ElasticNet: ", score)
    net.fit(xtrain4, ytrain.values.ravel())
    print("Selected nr of features after cv: ", np.count_nonzero(net.coef_))

    # keep only the features with a non-zero coefficient
    xtrain5 = xtrain4.loc[:, np.abs(net.coef_) > 0]
    xtest5 = xtest4.loc[:, np.abs(net.coef_) > 0]
    print("Retained features: ", len(xtest5.columns.tolist()))
    print(xtest5.columns.tolist())

    print("Testing new elasticNet:")
    netCV2 = linear_model.ElasticNetCV(
        l1_ratio=[0., 0.1, 0.3, 0.5, 0.75, 0.85, 0.9, 1.],
        alphas=(0., 0.1, 0.2, 0.25, 0.3, 0.35, 0.4),
        cv=5,
        n_jobs=2,
        max_iter=net_max_iter)
    netCV2.fit(xtrain5, ytrain.values.ravel())
    print("Selected nr of features: ", np.count_nonzero(netCV2.coef_))
    print("Selected alpha: ", netCV2.alpha_)
    print("Selected l1_ration: ", netCV2.l1_ratio_)

    net2 = linear_model.ElasticNet(l1_ratio=netCV2.l1_ratio_,
                                   alpha=netCV2.alpha_,
                                   max_iter=net_max_iter)
    score = cross_val_score(net2,
                            xtrain5,
                            ytrain.values.ravel(),
                            cv=5,
                            scoring='r2')
    print("Score of new elasticNet: ", score)
    net2.fit(xtrain5, ytrain.values.ravel())
    print("Would use: ", np.count_nonzero(net2.coef_), " features")
def ex_1():
    """Explore imputation, outlier detection and classification on 'diabetes'.

    Downloads the OpenML 'diabetes' dataset, treats 0.0 in ``mass`` and
    ``skin`` as missing, compares mean and iterative imputation visually,
    runs an IsolationForest on two columns, trains an SVC and a random
    forest, prints their classification reports and plots the forest's
    feature importances. Produces only plots and printed output.
    """
    X, y = datasets.fetch_openml('diabetes', as_frame=True, return_X_y=True)
    # print(X)
    # print(X.info())
    # print(X.describe())
    X_train, X_test, y_train, y_test = model_selection.train_test_split(
        X, y, test_size=0.2, random_state=42)
    # Untouched copy of the training data for the iterative-imputer comparison
    X_train_2 = X_train.copy()

    plt.figure()
    X_train.boxplot()
    X_train.hist(bins=20)
    plt.figure()
    sns.boxplot(x=X_train['mass'])

    # Zeros in these columns stand for missing measurements
    imputer_mass = impute.SimpleImputer(missing_values=0.0, strategy='mean')
    imputer_skin = impute.SimpleImputer(missing_values=0.0, strategy='mean')
    X_train[['mass']] = imputer_mass.fit_transform(X_train[['mass']])
    X_train[['skin']] = imputer_skin.fit_transform(X_train[['skin']])
    X_test[['mass']] = imputer_mass.transform(X_test[['mass']])
    # Use the imputer fitted on 'skin'; the previous code applied the
    # 'mass' imputer here and filled 'skin' with the mean of 'mass'.
    X_test[['skin']] = imputer_skin.transform(X_test[['skin']])

    # Anomaly detection, i.e. finding outlying data points
    X_train_isolation = X_train.values[:, [1, 5]]
    X_test_isolation = X_test.values[:, [1, 5]]
    isolation_forest = ensemble.IsolationForest(contamination=0.05)
    isolation_forest.fit(X_train_isolation)
    y_predicted_outliers = isolation_forest.predict(X_test_isolation)
    print(y_predicted_outliers)
    plot_iris2d(X_test_isolation, y_predicted_outliers)

    clf = svm.SVC(random_state=42)
    clf.fit(X_train, y_train)
    y_predicted = clf.predict(X_test)
    print(metrics.classification_report(y_test, y_predicted))
    X_train.hist()

    # Same columns filled by an iterative imputer, for visual comparison
    imputer_it = impute.IterativeImputer(missing_values=0.0)
    X_train_2[['mass']] = imputer_it.fit_transform(X_train_2[['mass']])
    X_train_2[['skin']] = imputer_it.fit_transform(X_train_2[['skin']])
    X_train_2.hist(bins=20)
    plt.figure()
    X_train_2.boxplot()

    clf_rf = ensemble.RandomForestClassifier(random_state=42)
    clf_rf.fit(X_train, y_train)
    y_predicted = clf_rf.predict(X_test)
    print(metrics.classification_report(y_test, y_predicted))

    importances = clf_rf.feature_importances_
    # Spread of each feature's importance across the individual trees
    std = np.std([tree.feature_importances_ for tree in clf_rf.estimators_],
                 axis=0)
    indices = np.argsort(importances)[::-1]

    # Print the feature ranking
    print("Feature ranking:")
    for f in range(X.shape[1]):
        print("%d. feature %d (%f)" %
              (f + 1, indices[f], importances[indices[f]]))

    # Plot the impurity-based feature importances of the forest
    plt.figure()
    plt.title("Feature importances")
    plt.bar(range(X.shape[1]),
            importances[indices],
            color="r",
            yerr=std[indices],
            align="center")
    plt.xticks(range(X.shape[1]), indices)
    plt.xlim([-1, X.shape[1]])
    plt.show()
# Tu uz mame jednotlive skupiny atributov, pre ktore patria rozlicne sposoby aplikacie imputovania missing values. oxygen_attr = [ "mean_oxygen", "std_oxygen", "kurtosis_oxygen", "skewness_oxygen" ] glucose_attr = [ "mean_glucose", "std_glucose", "kurtosis_glucose", "skewness_glucose" ] vztahy_attr = ["relationship", "marital-status"] work_attr = ["workclass", "occupation", "hours-per-week-cat", "income"] edu_attr = ["education", "education-num"] impute_col_transf = compose.ColumnTransformer(transformers=[ ("oxygen_n_glucose_impute", KeepDataFrame(impute.IterativeImputer(max_iter=50)), oxygen_attr + glucose_attr ), ("vztahy_impute", CustomCatImputing(imputer_type="knn"), vztahy_attr), ("work_impute", CustomCatImputing(imputer_type="knn"), work_attr), ("edu_impute", CustomCatImputing(imputer_type="knn"), edu_attr), ("sex_impute", KeepDataFrame(impute.SimpleImputer(strategy="most_frequent")), ["sex"]), ("age_impute", KeepDataFrame(impute.SimpleImputer()), ["age"]) ]) #tento column transformer sa bude pouzivat v pripade, kedy chceme pouzit v ramci celeho datasetu cisto len simpleimputer most_freq_attr = ["sex"] + edu_attr + work_attr + vztahy_attr mean_attr = ["age"] + oxygen_attr + glucose_attr simple_impute_col_transf = compose.ColumnTransformer(transformers=[(
def train(dname, mname, rseed, shuffle_params=None):
    """Tune (or load) a model for one dataset, then run the cost-aware train/test step.

    Args:
        dname: Dataset key; used to index ``LOADERS`` and to switch on the
            dataset-specific paths ('icu', 'outpatient', ...).
        mname: Model/method name. Substrings such as 'gbm', 'linear', 'nn',
            'tab', 'node', 'sage', 'shap' and exact names such as 'cegb',
            'fixedmodel', 'imputemodel' select the branches below.
        rseed: Seed passed to the data loader as ``split_seed`` and used in
            the parameter file name.
        shuffle_params: Optional two-element sequence. When both entries are
            negative, ``cost_pair`` builds a new ``costs`` / ``shuffle_costs``
            pair from their negations; otherwise ``cost_swaps`` derives
            ``shuffle_costs`` from the loaded costs.

    Returns:
        The fitted optimizer object (``GRP``). For the baseline-score
        branches this is a placeholder callable carrying ``model_costs``,
        ``model_scores`` and ``test_preds`` attributes.

    Raises:
        AssertionError: If ``mname`` is not one of the tunable model families.
        ValueError: If no train/test branch matches ``mname``.

    Side effects:
        With ``TUNING == 'LOAD'`` tuned parameters are unpickled from
        ``OUTPATH``. With ``TUNING == 'SAVE'`` they are pickled to
        ``OUTPATH`` and the process then terminates via ``exit()``.
    """
    # Model families that share the GBM tuning / LightGBM instantiation path.
    is_gbm_like = ('gbm' in mname) or (mname in ('cegb', 'fixedmodel',
                                                 'imputemodel'))

    # NOTE(review): this check rejects 'cwcf', 'aps', 'apacheiii',
    # 'apacheiva', 'qsofa' and 'pact', so the branches for those names
    # further down cannot be reached while assertions are enabled.
    assert (is_gbm_like or ('linear' in mname) or ('nn' in mname)
            or ('tab' in mname) or ('node' in mname))

    ##################
    # DATA
    ##################
    # Load data we're using
    ((Xtrain, ytrain), (Xvalid, yvalid), (Xtest, ytest),
     costs, groups, extras) = LOADERS[dname](split_seed=rseed)

    # If we're using PACT we need some of the extra (redundant) features
    # that were unused in our study
    if mname == 'pact':
        ((Xtrain, ytrain), (Xvalid, yvalid), (Xtest, ytest),
         costs, groups, extras) = load_ed(name=config.ED_NAME,
                                          costtype=config.ED_COSTTYPE,
                                          drop_redundant=False,
                                          split_seed=rseed)

    # If we're using a non-GBM AI method, we need to impute NaNs and scale.
    # Don't do this if using ICU data because we're using a Pipeline in that
    # case that handles this stuff.
    needs_scaling = ('linear' in mname or 'nn' in mname or 'cwcf' in mname
                     or 'tab' in mname or 'node' in mname)
    if needs_scaling and (dname != 'icu'):
        imputer = impute.SimpleImputer()
        scaler = preprocessing.StandardScaler()
        # Fit on the training split only; apply to validation and test.
        Xtrain_np = scaler.fit_transform(imputer.fit_transform(Xtrain))
        Xvalid_np = scaler.transform(imputer.transform(Xvalid))
        Xtest_np = scaler.transform(imputer.transform(Xtest))
        # Write the arrays back in place so index and columns are kept.
        for df, npy in zip([Xtrain, Xvalid, Xtest],
                           [Xtrain_np, Xvalid_np, Xtest_np]):
            df.iloc[:] = npy

    # Concatenated data for post-tuning
    Xtv = pd.concat([Xtrain, Xvalid])
    ytv = np.hstack((ytrain, yvalid))

    # Grouped costs for datasets that feature it: one mean cost per group.
    if (dname == 'outpatient') or (dname == 'icu'
                                   and mname in ('linear', 'linearh', 'nn')):
        unique_costs = np.array(
            [costs[groups == g].mean() for g in np.unique(groups)])
    else:
        unique_costs = costs

    ##################
    # PARAMETER TUNING
    ##################
    # If we've precomputed best parameters, just load those
    # ('node' models are not part of the load path and are always tuned).
    if TUNING == 'LOAD' and (is_gbm_like or ('linear' in mname)
                             or ('nn' in mname) or ('tab' in mname)):
        loadname = 'gbmsage' if mname == 'cegb' else mname
        # NOTE: pickle.load can execute arbitrary code; this must only ever
        # read parameter files written by the SAVE step of this function.
        with open(f'{OUTPATH}/{loadname}-{dname}-{rseed}.pkl', 'rb') as w:
            model = pickle.load(w)
    # Otherwise do some parameter tuning
    else:
        # Tune GBM
        if is_gbm_like:
            model = tune(Xtrain, Xvalid, ytrain, yvalid)
        # Linear model needs onehotencoding pipeline if we're doing ICU
        elif 'linear' in mname:
            if dname == 'icu':
                model = lintune(Xtrain, Xvalid, ytrain, yvalid,
                                mfunc=icu_preprocessing(get_linear_model))
            else:
                model = lintune(Xtrain, Xvalid, ytrain, yvalid)
        # NN model needs onehotencoding pipeline if we're doing ICU
        elif 'nn' in mname:
            if dname == 'icu':
                model = tftune(Xtrain, Xvalid, ytrain, yvalid,
                               mfunc=icu_preprocessing(get_tf_model),
                               return_extras=False)
            else:
                model = tftune(Xtrain, Xvalid, ytrain, yvalid,
                               return_extras=False)
        elif 'node' in mname:
            model = nodetune(Xtrain, Xvalid, ytrain, yvalid,
                             mfunc=(icu_preprocessing(NodeClassifier)
                                    if dname == 'icu' else NodeClassifier))
            # Outside ICU, also record the tuned model's test AUC alongside
            # the tuned parameters.
            if dname != 'icu':
                bst = NodeClassifier(**model)
                bst.fit(Xtrain, ytrain, eval_set=(Xvalid, yvalid))
                iXtest = bst.dataset.transform(Xtest)
                preds = bst.predict_proba(iXtest)[:, 1]
                score = roc_auc_score(ytest, preds)
                model['test_score'] = score
        elif 'tab' in mname:
            # Categorical columns per dataset; datasets without an entry
            # have no categorical features.
            cat_name_map = {
                'trauma': [
                    'agencylevelfromscene', 'agencymodefromscene', 'ageunits',
                    'causecode', 'ethnicity', 'formfromscene', 'race',
                    'residencestate', 'scenedestinationreason',
                    'scenerespassisted', 'sex'
                ]
            }
            cat_names = cat_name_map.get(dname, [])
            # Positions and cardinalities of those columns in Xtrain.
            cat_idxs = [i for i, c in enumerate(Xtrain.columns)
                        if c in cat_names]
            cat_dims = [Xtrain[c].unique().shape[0] for c in Xtrain.columns
                        if c in cat_names]
            if dname == 'icu':
                # NOTE(review): mfunc is the linear-model factory here; it
                # looks carried over from the linear branch - confirm.
                model = tabtune(Xtrain.values, Xvalid.values, ytrain, yvalid,
                                mfunc=icu_preprocessing(get_linear_model))
            else:
                # Fix: pass column *indices* as cat_idxs. The previous code
                # passed the column names and left the index list unused,
                # although the data is handed over as plain arrays.
                model = tabtune(Xtrain.values, Xvalid.values, ytrain, yvalid,
                                cat_idxs=cat_idxs,
                                cat_dims=cat_dims,
                                cat_emb_dim=2,
                                return_score=True)

    # If we indicated we want to save the model, do so
    if TUNING == 'SAVE' and (is_gbm_like or ('linear' in mname)
                             or ('nn' in mname) or ('tab' in mname)
                             or ('node' in mname)):
        with open(f'{OUTPATH}/{mname}-{dname}-{rseed}.pkl', 'wb') as w:
            pickle.dump(model, w)
        # A SAVE run only produces the parameter file; stop here.
        exit()

    ##################
    # Setup for CoAI
    ##################
    # Instantiate predictive models
    if is_gbm_like:
        bst = lgb.LGBMClassifier(**model)
    elif 'linear' in mname:
        bst = (icu_preprocessing(FastLinearClassifier)(**model)
               if dname == 'icu' else FastLinearClassifier(**model))
    elif 'nn' in mname:
        bst = (icu_preprocessing(get_fast_keras)(**model)
               if dname == 'icu' else get_fast_keras(**model))

    # Get our explainer (using SAGE entirely now, shap is old & may not work
    # perfectly)
    if ('sage' in mname) or (mname in ('cegb', 'fixedmodel', 'imputemodel')):
        exp = labelless_sage_wrapper(imputetype='marginal',
                                     refsize=64,
                                     batch_size=32,
                                     wrap_categorical=(dname == 'icu'))
    elif mname == 'gbmshap':
        exp = OneDimExplainer
    elif mname == 'linearshap':
        exp = get_pipeline_explainer(LinearExplainer)

    # Prepare to shuffle costs if required
    if shuffle_params is not None:
        if (shuffle_params[0] < 0) and (shuffle_params[1] < 0):
            # Two negative entries: build a fresh cost pair (this also
            # replaces the costs used for fitting below).
            costs, shuffle_costs = cost_pair(-shuffle_params[0],
                                             -shuffle_params[1], Xtrain)
        else:
            shuffle_costs = cost_swaps(costs, shuffle_params[0],
                                       shuffle_params[1])

    # Pick thresholds for CoAI: 100 budgets from 0 to just above the total
    # of the (grouped) costs computed before any shuffling.
    dthresh = np.linspace(0, np.sum(unique_costs) + 1, 100)

    #####################
    # Actually train/test
    #####################
    if 'sage' in mname or 'shap' in mname:
        GRP = knapsack.GroupOptimizer(
            bst, exp, scale_ints=1000 * 100 if ('sage' in mname) else 1000)
        if 'nn' in mname:
            if dname == 'icu':
                # Fit kwargs carry the 'model__' prefix on the ICU path.
                GRP.fit(Xtv, ytv, costs, groups, dthresh,
                        model__epochs=10, model__verbose=False)
            else:
                GRP.fit(Xtv, ytv, costs, groups, dthresh,
                        epochs=10, verbose=False)
        else:
            GRP.fit(Xtv, ytv, costs, groups, dthresh)
        GRP.score_models_proba(Xtest, ytest, roc_auc_score)
        if shuffle_params:
            GRP.recalculate_costs(shuffle_costs)
    elif 'fixed' in mname:
        bst = bst.fit(Xtv, ytv)
        GRP = knapsack.FixedModelExactRetainer(bst, exp)
        GRP.fit(Xtv, ytv, costs, dthresh)
        if shuffle_params:
            GRP.refit(Xtv, ytv, shuffle_costs)
        GRP.score_models_proba(Xtest, ytest, roc_auc_score)
    elif 'impute' in mname:
        imputer = impute.IterativeImputer(random_state=0,
                                          estimator=linear_model.RidgeCV())
        bst = bst.fit(Xtv, ytv)
        imputer.fit(Xtv)
        GRP = knapsack.FixedModelImputer(bst, exp, imputer)
        GRP.fit(Xtv, ytv, costs, dthresh)
        if shuffle_params:
            GRP.refit(Xtv, ytv, shuffle_costs)
        GRP.score_models_proba(Xtest, ytest, roc_auc_score)
    elif mname == 'cegb':
        GRP = cegb.CEGBOptimizer(model=bst, lambdas=np.logspace(-5, 5, 101))
        GRP.fit(Xtv, ytv, costs)
        GRP.score_models_proba(Xtest, ytest, roc_auc_score)
        if dname == 'outpatient':
            GRP.recalculate_costs(costs, groups)
        if shuffle_params:
            GRP.recalculate_costs(shuffle_costs)
    elif mname == 'cwcf':
        ytrain, yvalid, ytest = [np.array(x) for x in (ytrain, yvalid, ytest)]
        print('Training CWCF...')
        lmbds = np.hstack([np.logspace(-14, 1, 16) for _ in range(2)])
        GRP = cwcf.get_cwcf(Xtrain, Xvalid, Xtest, ytrain, yvalid, ytest,
                            costs, lmbds,
                            gpus=list(range(8)),
                            njobs=32,
                            dirname=config.CWCF_TMPDIR,
                            metric=roc_auc_score,
                            difficulty=1000)
        print('Done')
    elif mname in ('aps', 'apacheiii', 'apacheiva'):
        strain, svalid, stest = aps_baselines()
        mpreds = stest
        mpreds = bootstrap_set(mpreds, rseed=rseed)
        preds = mpreds[mname]
        score = roc_auc_score(ytest, preds)
        cost = config.EICU_SCORE_COSTS[mname]
        # A function object is used as a bare attribute holder so the
        # result exposes the same attributes as the optimizer objects.
        GRP = lambda x: x
        GRP.model_costs, GRP.model_scores = np.array([cost]), np.array([score])
        GRP.test_preds = np.array(preds)
    # Fix: `mname in ('qsofa')` was a substring test against the string
    # 'qsofa' (the parentheses do not make a tuple); compare for equality.
    elif mname == 'qsofa':
        qtest = qsofa_score()
        qpreds = bootstrap_set(qtest, rseed=rseed)
        score = roc_auc_score(ytest, qpreds)
        cost = config.EICU_SCORE_COSTS[mname]
        GRP = lambda x: x
        GRP.model_costs, GRP.model_scores = np.array([cost]), np.array([score])
        GRP.test_preds = np.array(qpreds)
    # Fix: same substring-vs-membership problem as for 'qsofa'.
    elif mname == 'pact':
        cost, score, _, _, _, _, preds = pact_score(Xtrain, Xvalid, Xtest,
                                                    ytrain, yvalid, ytest,
                                                    costs)
        GRP = lambda x: x
        GRP.model_costs, GRP.model_scores = np.array(cost), np.array(score)
        GRP.test_preds = np.array(preds)
    else:
        raise ValueError("Model name not found!")

    # Done
    return GRP