def plotOneTrainingSetDifferentTestSets(results_analyzer, dirData, dirModelsBase, dirResultsBase):
    """Compare one random-forest configuration across several test subsets.

    The training options describe the unfiltered patrec '20122015' dataset.
    The test options describe patrec '20162017', once unfiltered and once for
    each of the 'EntlassBereich' filters Med, SaO and Gyn. One
    ``ResultsSingleConfigAnalyzer`` per subset is handed to
    ``results_analyzer.plotROCcurveMulitpleConfigs``.

    Args:
        results_analyzer: Object providing ``plotROCcurveMulitpleConfigs``.
        dirData: Value stored under 'dir_data' in every dataset options dict.
        dirModelsBase: First argument passed to ``OptionsRF``.
        dirResultsBase: First argument passed to ``Results``.

    Note:
        The plot filename is built from ``dirPlotsBase``, which is not a
        parameter of this function and has to exist at module level.
    """
    prefix = 'patrec'

    def make_options(dataset, filtering):
        # Fresh dict per call, same key order as everywhere else in this function.
        return DatasetOptions({
            'dir_data': dirData,
            'data_prefix': prefix,
            'dataset': dataset,
            'options_filtering': filtering
        })

    options_training = make_options('20122015', None)

    # Compare different subsets of data: EntlassBereich (only with RandomForest).
    options_rf = OptionsRF(dirModelsBase, options_training.getFilenameOptions(filteroptions=True))

    # (label shown in the plot, value of 'options_filtering')
    subsets = [
        ('All', None),
        ('Med', 'EntlassBereich_Med'),
        ('SaO', 'EntlassBereich_SaO'),
        ('Gyn', 'EntlassBereich_Gyn'),
    ]

    results_per_subset = []
    for _, filtering in subsets:
        options_testing = make_options('20162017', filtering)
        results_per_subset.append(
            Results(dirResultsBase, options_training, options_rf, 'test', options_testing)
        )

    # The second argument is 10 for every subset (presumably the number of runs — TODO confirm).
    analyzers = [ResultsSingleConfigAnalyzer(res, 10) for res in results_per_subset]
    labels = [label for label, _ in subsets]

    plot_title = 'classifier (rf): trained on patrec 2012-2015, tested on subsets of patrec 2016-2017'
    plot_file = dirPlotsBase + 'rf_training_all_testing_EntlassBereich.png'
    results_analyzer.plotROCcurveMulitpleConfigs(
        analyzers,
        labels,
        f_plot=plot_file,
        titlePlot=plot_title,
    )
def plotDifferentTrainingSetSingleTestSetNZ(results_analyzer, dirData, dirModelsBase, dirResultsBase):
    """Compare random-forest configurations for single nz training years on nz 2017.

    For each training year 2012..2016 a ``DatasetOptions``/``OptionsRF``/
    ``Results`` triple is built against the same nz '2017' test options, and
    the resulting analyzers are plotted together via
    ``results_analyzer.plotROCcurveMulitpleConfigs``.

    Args:
        results_analyzer: Object providing ``plotROCcurveMulitpleConfigs``.
        dirData: Value stored under 'dir_data' in every dataset options dict.
        dirModelsBase: First argument passed to ``OptionsRF``.
        dirResultsBase: First argument passed to ``Results``.

    Note:
        The plot filename is built from ``dirPlotsBase``, which is not a
        parameter of this function and has to exist at module level.
    """
    print('plotDifferentTrainingSetSingleTestSetNZ')
    prefix = 'nz'

    def make_options(dataset):
        return DatasetOptions({
            'dir_data': dirData,
            'data_prefix': prefix,
            'dataset': dataset,
            'options_filtering': None
        })

    options_testing = make_options('2017')

    labels = []
    analyzers = []
    for training_year in (2012, 2013, 2014, 2015, 2016):
        print(training_year)
        options_training = make_options(str(training_year))
        options_rf = OptionsRF(
            dirModelsBase,
            options_training.getFilenameOptions(filteroptions=True),
        )
        results_year = Results(dirResultsBase, options_training, options_rf, 'test', options_testing)
        labels.append(str(training_year))
        analyzers.append(ResultsSingleConfigAnalyzer(results_year, 10))

    plot_title = 'classifier (rf): trained on subsets of nz 2012-2016, tested on subset of nz 2017'
    plot_file = dirPlotsBase + 'rf_training_nz_years_20122016_testing_nz_year_2017.png'
    print('plot ROC curve...')
    results_analyzer.plotROCcurveMulitpleConfigs(
        analyzers,
        labels,
        f_plot=plot_file,
        titlePlot=plot_title,
    )
def simulation(models: dict, ts_data: SimulationDataset, retrain_frequency: int):
    """Run one portfolio simulation over the out-of-sample part of ``ts_data``.

    Every model predicts each out-of-sample item in turn. All models are
    trained before the first prediction; afterwards a model is re-trained on
    every ``retrain_frequency``-th item if ``model.needs_retraining()`` is
    true.

    :models: Dictionary of ("Model Name", "Model Object") key-value pairs.
        Each model must provide ``train``, ``predict`` and
        ``needs_retraining``.
    :ts_data: Dataset providing ``get_out_of_sample()``, ``get_training(i)``
        and a ``lag`` attribute.
    :retrain_frequency: Re-training is considered whenever the index of the
        current out-of-sample item is a multiple of this value. Must be
        non-zero (it is used as a modulus).
    :returns: Tuple ``(results, mean_series)``. ``results`` maps each model
        name to the ``Results`` object collecting its predictions;
        ``mean_series`` is the row-wise mean of the raw out-of-sample data
        with the first ``lag`` rows dropped.
    """
    oos_ds = ts_data.get_out_of_sample().dataset.raw
    lag = ts_data.lag

    results = {name: Results(oos_ds, model, lag) for name, model in models.items()}

    print('============== Beginning predictions! ==============')
    for oos_sample, (oos_data, _target) in enumerate(ts_data.get_out_of_sample()):
        # Grab new data if retrain is necessary. Index 0 always satisfies the
        # modulus test, so `ds` is guaranteed to exist for the initial training.
        retrain = (oos_sample % retrain_frequency == 0)
        if retrain:
            ds = ts_data.get_training(oos_sample)
            print(
                f'~~~~~ Retraining models for {oos_sample}\'th prediction ~~~~~'
            )

        for name, model in models.items():
            # Re-train if necessary; the very first sample trains unconditionally.
            if (retrain and model.needs_retraining()) or (oos_sample == 0):
                tic = time.perf_counter()
                model.train(ds)
                toc = time.perf_counter()
                print(f"\tTrained {name} in {toc - tic:0.4f} seconds")

            # Perform and store the prediction!
            # NOTE: Assuming predicting 1-at-a-time given hard-coded reshape
            prediction = model.predict(oos_data)
            results[name].add_prediction(prediction.reshape(1, -1))

    print('=============== Predicted everything! ===============')
    # Computed once and reused; the original computed the same mean twice.
    mean_series = oos_ds.mean(axis=1)
    return results, mean_series.iloc[lag:]
def plotSingleConfiguration(results_analyzer, dirData, dirModelsBase, dirResultsBase):
    """Plot a single random-forest configuration (patrec 20122015 -> 20162017).

    Builds training and testing ``DatasetOptions``, the matching ``OptionsRF``
    and ``Results`` objects, and passes one ``ResultsSingleConfigAnalyzer`` to
    ``results_analyzer.plotROCcurveSingleConfig`` with the label 'rf'.

    Args:
        results_analyzer: Object providing ``plotROCcurveSingleConfig``.
        dirData: Value stored under 'dir_data' in both dataset options dicts.
        dirModelsBase: First argument passed to ``OptionsRF``.
        dirResultsBase: First argument passed to ``Results``.
    """
    options_training = DatasetOptions({
        'dir_data': dirData,
        'data_prefix': 'patrec',
        'dataset': '20122015'
    })
    options_testing = DatasetOptions({
        'dir_data': dirData,
        'data_prefix': 'patrec',
        'dataset': '20162017'
    })

    model_filename_options = options_training.getFilenameOptions(filteroptions=True)
    options_rf = OptionsRF(dirModelsBase, model_filename_options)

    results_test = Results(dirResultsBase, options_training, options_rf, 'test', options_testing)
    single_analyzer = ResultsSingleConfigAnalyzer(results_test, 10)
    results_analyzer.plotROCcurveSingleConfig(single_analyzer, 'rf')
def plotSGDClassifierPerformance(results_analyzer, dirData, dirModelsBase, dirResultsBase):
    """Plot SGD (log loss, l1 penalty) configurations for nz years 2012-2015 on nz 2016.

    One ``OptionsSGD``/``Results`` pair is built per training year against the
    same nz '2016' test options; the analyzers are plotted together via
    ``results_analyzer.plotROCcurveMulitpleConfigs``.

    Args:
        results_analyzer: Object providing ``plotROCcurveMulitpleConfigs``.
        dirData: Value stored under 'dir_data' in every dataset options dict.
        dirModelsBase: First argument passed to ``OptionsSGD``.
        dirResultsBase: First argument passed to ``Results``.

    Note:
        The plot filename is built from ``dirPlotsBase``, which is not a
        parameter of this function and has to exist at module level.
    """
    def make_nz_options(dataset):
        return DatasetOptions({
            'dir_data': dirData,
            'data_prefix': 'nz',
            'dataset': dataset,
            'encoding': 'categorical',
            'newfeatures': {'names': constantsNZ.NEW_FEATURES},
            'featurereduction': None,
            'grouping': 'grouping'
        })

    options_testing = make_nz_options('2016')

    training_years = [2012, 2013, 2014, 2015]
    analyzers = []
    for training_year in training_years:
        options_training = make_nz_options(str(training_year))
        sgd_settings = {'loss': 'log', 'penalty': 'l1'}
        options_sgd = OptionsSGD(
            dirModelsBase,
            options_training.getFilenameOptions(filteroptions=True),
            options_clf=sgd_settings,
        )
        results_year = Results(dirResultsBase, options_training, options_sgd, 'test', options_testing)
        # Unlike the RF plots, the analyzer is created with 1 instead of 10.
        analyzers.append(ResultsSingleConfigAnalyzer(results_year, 1))

    labels = [str(training_year) for training_year in training_years]
    plot_title = 'performance of batch-based logistic regression'
    plot_file = dirPlotsBase + 'sgd_nz_performance_years_training20122015_test2016.png'
    results_analyzer.plotROCcurveMulitpleConfigs(analyzers, labels, f_plot=plot_file, titlePlot=plot_title)
def predict(flags_obj):
    """Run repeated test-set predictions with a trained neural network.

    Builds dataset options for patrec '20122015' (training) and '20162017'
    (testing), both filtered by 'EntlassBereich_Gyn', wraps them in a
    ``NeuralNetModel`` and calls ``nn.predict()`` ten times. For every run the
    AUC and average precision are printed and the run is added to a
    ``Results`` object, which is written out once all runs are done.

    Args:
        flags_obj: An object containing parsed flag values.
    """
    # Hard-coded project location (a Windows alternative was "Z:\\projects\\PATREC").
    dirProject = '/home/thomas/fusessh/scicore/projects/patrec'
    dirResultsBase = os.path.join(dirProject, 'results/')
    dirData = os.path.join(dirProject, 'data')

    def make_config(dataset):
        # Training and testing configurations differ only in the dataset name.
        return {
            'dir_data': dirData,
            'data_prefix': 'patrec',
            'dataset': dataset,
            'grouping': 'verylightgrouping',
            'encoding': 'embedding',
            'newfeatures': None,
            'featurereduction': None,
            'filtering': 'EntlassBereich_Gyn',
            'balanced': False,
            'resample': False
        }

    training_config = make_config('20122015')
    dataset_options_training = DatasetOptions(training_config)
    testing_config = make_config('20162017')
    dataset_options_testing = DatasetOptions(testing_config)

    # Pick the feature-column builder matching the data source. Fusion variants
    # (FeatureColumnsNZFusion / FeatureColumnsPatrecFusion) are not used here.
    prefix = testing_config['data_prefix']
    if prefix == 'nz':
        feature_columns = FeatureColumnsNZ(dataset_options=dataset_options_testing)
    elif prefix == 'patrec':
        feature_columns = FeatureColumnsPatrec(dataset_options=dataset_options_testing)
    else:
        print('unknown data prefix..exit')
        sys.exit()

    nn = NeuralNetModel(
        'test',
        {'train': dataset_options_training, 'eval': None, 'test': dataset_options_testing},
        feature_columns,
        flags_obj,
    )
    model_flags = nn.getFlags()

    # Last path component of the model directory, ignoring one trailing '/'.
    path_parts = model_flags.model_dir.split('/')
    if model_flags.model_dir.endswith('/'):
        trained_model = path_parts[-2]
    else:
        trained_model = path_parts[-1]

    is_warmstart = trained_model.startswith('warmstart')
    pretrained = 'pretrained' if is_warmstart else None

    print('warmstart: ' + str(is_warmstart))
    print('hidden units: ' + str(model_flags.hidden_units))

    nn_settings = {
        'hidden_units': model_flags.hidden_units,
        'learningrate': model_flags.learningrate,
        'dropout': model_flags.dropout,
        'batchnorm': model_flags.batchnorm,
        'batch_size': model_flags.batch_size,
        'training_epochs': model_flags.train_epochs,
        'pretrained': pretrained,
    }
    options_nn = OptionsNN(model_flags.model_dir, dataset_options_training, options_clf=nn_settings)
    classifier_nn = ClassifierNN(options_nn)
    results_all_runs_test = Results(
        dirResultsBase, dataset_options_training, options_nn, 'test', dataset_options_testing
    )

    num_runs = 10
    auc_per_run = []
    avgprecision_per_run = []
    for _ in range(num_runs):
        raw_predictions = nn.predict()
        testing_file = nn.getFilenameDatasetBalanced()
        # The CSV is read before the predictions are consumed, as in the
        # original statement order (raw_predictions may be lazy — TODO confirm).
        testing_frame = pd.read_csv(testing_file)
        probabilities = np.array([item['probabilities'] for item in raw_predictions])
        print('get labels...: ' + str(testing_file))
        label_column = dataset_options_testing.getEarlyReadmissionFlagname()
        labels = testing_frame[label_column].values

        run_result = classifier_nn.setResults(probabilities, labels)
        results_all_runs_test.addResultsSingleRun(run_result)

        auc = run_result.getAUC()
        avgprecision = run_result.getAvgPrecision()
        print('')
        print('AUC: ' + str(auc))
        print('avg precision: ' + str(avgprecision))
        print('')
        auc_per_run.append(auc)
        avgprecision_per_run.append(avgprecision)

    print('')
    print('mean test auc: ' + str(np.mean(np.array(auc_per_run))))
    print('mean test avg precision: ' + str(np.mean(np.array(avgprecision_per_run))))
    print('')
    results_all_runs_test.writeResultsToFileDataset()
def plotDiseasePerformances(results_analyzer, dirData, dirModelsBase, dirResultsBase, classifier='lr'):
    """Plot one classifier type on all patients and on three disease groups.

    Options, ``Results`` and analyzers are built for random forest ('rf'),
    logistic regression ('lr') and neural network ('nn'), each on the
    unfiltered patrec data and on the 'chronic_lung', 'oncology' and
    'cardiovascular' filters (training '20122015', testing '20162017'). Only
    the analyzers of the selected ``classifier`` are passed to
    ``results_analyzer.plotROCcurveMulitpleConfigs``.

    Args:
        results_analyzer: Object providing ``plotROCcurveMulitpleConfigs``.
        dirData: Value stored under 'dir_data' in every dataset options dict.
        dirModelsBase: First argument passed to ``OptionsLogisticRegression``
            and ``OptionsRF``.
        dirResultsBase: First argument passed to ``Results``.
        classifier: Which classifier to plot: 'rf', 'lr' or 'nn'. Defaults to
            'lr', which is what the function plotted before this parameter
            existed.

    Raises:
        ValueError: If ``classifier`` is not one of 'rf', 'lr', 'nn'.

    Note:
        The plot filename is built from ``dirPlotsBase``, which is not a
        parameter of this function and has to exist at module level.
    """
    classifier_labels = {'rf': 'RF', 'lr': 'LR', 'nn': 'NN'}
    if classifier not in classifier_labels:
        raise ValueError("classifier must be one of 'rf', 'lr' or 'nn', got " + repr(classifier))

    dict_opt_lr = {'penalty': 'l1', 'C': 0.5}
    dict_opt_rf = {'n_estimators': 500, 'max_depth': 50}
    dict_options_nn = {
        'hidden_units': [60, 40, 40, 20],
        'learningrate': 0.001,
        'dropout': 0.5,
        'batchnorm': True,
        'batch_size': 64,
        'training_epochs': 1000,
        'pretrained': None,
    }

    # Hard-coded location of the neural-network model directory.
    DIRPROJECT = '/home/thomas/projects/patrec'
    model_dir = os.path.join(DIRPROJECT, "patients_model")

    dict_options_dataset_training = {
        'dir_data': dirData,
        'data_prefix': 'patrec',
        'dataset': '20122015',
        'grouping': 'verylightgrouping',
        'encoding': 'categorical',
        'newfeatures': {'names': constantsPATREC.NEW_FEATURES},
        'featurereduction': None,
        'filtering': None,
        'balanced': False,
        'resample': False
    }
    dict_options_dataset_testing = {
        'dir_data': dirData,
        'data_prefix': 'patrec',
        'dataset': '20162017',
        'encoding': 'categorical',
        'newfeatures': {'names': constantsPATREC.NEW_FEATURES},
        'grouping': 'verylightgrouping',
        'featurereduction': None,
        'filtering': None,
        'balanced': False,
        'resample': False
    }

    # (label shown in the plot, value of the 'filtering' option)
    disease_groups = [
        ('all', None),
        ('chronic lung', 'chronic_lung'),
        ('oncology', 'oncology'),
        ('cardiovascular', 'cardiovascular'),
    ]

    # Per group: dataset options first, then classifier options in the order
    # lr, rf, nn (the order the objects were originally constructed in).
    group_configs = []
    for _, filtering in disease_groups:
        dict_training = dict_options_dataset_training.copy()
        dict_testing = dict_options_dataset_testing.copy()
        dict_training['filtering'] = filtering
        dict_testing['filtering'] = filtering
        options_training = DatasetOptions(dict_training)
        options_testing = DatasetOptions(dict_testing)
        options_lr = OptionsLogisticRegression(
            dirModelsBase,
            options_training.getFilenameOptions(filteroptions=True),
            options_clf=dict_opt_lr,
        )
        options_rf = OptionsRF(
            dirModelsBase,
            options_training.getFilenameOptions(filteroptions=True),
            options_clf=dict_opt_rf,
        )
        options_nn = OptionsNN(model_dir, options_training, options_clf=dict_options_nn)
        group_configs.append((
            options_training,
            options_testing,
            {'lr': options_lr, 'rf': options_rf, 'nn': options_nn},
        ))

    # Results and analyzers are still created for every classifier type, not
    # only the plotted one, so any effect of constructing them is unchanged.
    construction_order = ('rf', 'lr', 'nn')
    results = {}
    for clf in construction_order:
        results[clf] = [
            Results(dirResultsBase, options_training, options_by_clf[clf], 'test', options_testing)
            for options_training, options_testing, options_by_clf in group_configs
        ]
    analyzers = {}
    for clf in construction_order:
        analyzers[clf] = [ResultsSingleConfigAnalyzer(res, 10) for res in results[clf]]

    analyzer = analyzers[classifier]
    names = [classifier_labels[classifier] + ' - ' + label for label, _ in disease_groups]
    title_plot = ''
    filename_plot = os.path.join(
        dirPlotsBase, 'diseases_' + classifier + '_classification_performance.png'
    )
    results_analyzer.plotROCcurveMulitpleConfigs(analyzer, names, titlePlot=title_plot, f_plot=filename_plot)
def plotOEPerformances(results_analyzer, dirData, dirModelsBase, dirResultsBase, classifier='nn'):
    """Plot one classifier type on all patients and on three 'EntlassBereich' subsets.

    Options, classifier objects, ``Results`` and analyzers are built for
    neural network ('nn'), logistic regression ('lr') and random forest
    ('rf'), each on the unfiltered patrec data and on the filters
    'EntlassBereich_SaO', 'EntlassBereich_Med' and 'EntlassBereich_Gyn'
    (training '20122015', testing '20162017'). Only the analyzers of the
    selected ``classifier`` are passed to
    ``results_analyzer.plotROCcurveMulitpleConfigs``, ordered all, Med, SaO,
    Gyn.

    Args:
        results_analyzer: Object providing ``plotROCcurveMulitpleConfigs``.
        dirData: Value stored under 'dir_data' in every dataset options dict.
        dirModelsBase: First argument passed to ``OptionsLogisticRegression``
            and ``OptionsRF``.
        dirResultsBase: First argument passed to ``Results``.
        classifier: Which classifier to plot: 'nn', 'lr' or 'rf'. Defaults to
            'nn', which is what the function plotted before this parameter
            existed.

    Raises:
        ValueError: If ``classifier`` is not one of 'nn', 'lr', 'rf'.

    Note:
        The plot filename is built from ``dirPlotsBase``, which is not a
        parameter of this function and has to exist at module level.
    """
    classifier_labels = {'nn': 'NN', 'lr': 'LR', 'rf': 'RF'}
    if classifier not in classifier_labels:
        raise ValueError("classifier must be one of 'nn', 'lr' or 'rf', got " + repr(classifier))

    dict_opt_lr = {'penalty': 'l1', 'C': 0.5}
    dict_opt_rf = {'n_estimators': 500, 'max_depth': 50}
    dict_options_nn = {
        'hidden_units': [60, 40, 40, 20],
        'learningrate': 0.001,
        'dropout': 0.5,
        'batchnorm': True,
        'batch_size': 64,
        'training_epochs': 1000,
        'pretrained': None,
    }

    # Hard-coded location of the neural-network model directory.
    DIRPROJECT = '/home/thomas/projects/patrec'
    model_dir = os.path.join(DIRPROJECT, "patients_model")

    def make_dataset_options(dataset, filtering):
        return DatasetOptions({
            'dir_data': dirData,
            'data_prefix': 'patrec',
            'dataset': dataset,
            'grouping': 'verylightgrouping',
            'encoding': 'embedding',
            'newfeatures': None,
            'featurereduction': None,
            'filtering': filtering,
            'balanced': False,
            'resample': False
        })

    # (label, value of the 'filtering' option), in original construction order.
    subsets = [
        ('all', None),
        ('SaO', 'EntlassBereich_SaO'),
        ('Med', 'EntlassBereich_Med'),
        ('Gyn', 'EntlassBereich_Gyn'),
    ]
    # Order in which the subsets appear in the plot.
    plot_order = ['all', 'Med', 'SaO', 'Gyn']

    subset_configs = []
    for label, filtering in subsets:
        options_training = make_dataset_options('20122015', filtering)
        options_testing = make_dataset_options('20162017', filtering)
        options_nn = OptionsNN(model_dir, options_training, options_clf=dict_options_nn)
        options_lr = OptionsLogisticRegression(
            dirModelsBase,
            options_training.getFilenameOptions(filteroptions=True),
            options_clf=dict_opt_lr,
        )
        options_rf = OptionsRF(
            dirModelsBase,
            options_training.getFilenameOptions(filteroptions=True),
            options_clf=dict_opt_rf,
        )
        # NOTE(review): the classifier objects are never used afterwards; they
        # are still created in case their construction has side effects.
        ClassifierNN(options_nn)
        ClassifierLogisticRegression(options_lr)
        ClassifierRF(options_rf)
        subset_configs.append((
            label,
            options_training,
            options_testing,
            {'nn': options_nn, 'lr': options_lr, 'rf': options_rf},
        ))

    # Results and analyzers are still created for every classifier type, not
    # only the plotted one, so any effect of constructing them is unchanged.
    construction_order = ('nn', 'lr', 'rf')
    results = {}
    for clf in construction_order:
        results[clf] = [
            (label, Results(dirResultsBase, options_training, options_by_clf[clf], 'test', options_testing))
            for label, options_training, options_testing, options_by_clf in subset_configs
        ]
    analyzers = {}
    for clf in construction_order:
        analyzers[clf] = {
            label: ResultsSingleConfigAnalyzer(res, 10) for label, res in results[clf]
        }

    analyzer = [analyzers[classifier][label] for label in plot_order]
    names = [classifier_labels[classifier] + ' - ' + label for label in plot_order]
    title_plot = ''
    filename_plot = os.path.join(
        dirPlotsBase, 'oes_' + classifier + '_classification_performance.png'
    )
    results_analyzer.plotROCcurveMulitpleConfigs(analyzer, names, titlePlot=title_plot, f_plot=filename_plot)
def plotNNPerformance(results_analyzer, dirData, dirModelsBase, dirResultsBase):
    """Plot neural networks with and without pre-training next to a LASSO baseline.

    Four configurations are compared via
    ``results_analyzer.plotROCcurveMulitpleConfigs``:

    * 'NZ': NN on nz '20122016' -> '2017', with 'pretrained' set,
    * 'Basel': NN on patrec '20122015' -> '20162017', no 'pretrained' key,
    * 'Basel (pretrained NZ)': same patrec data, with 'pretrained' set,
    * 'LASSO': l1 logistic regression on patrec with subgroups ['DK'].

    Args:
        results_analyzer: Object providing ``plotROCcurveMulitpleConfigs``.
        dirData: Value stored under 'dir_data' in every dataset options dict.
        dirModelsBase: First argument passed to ``OptionsNN`` and
            ``OptionsLogisticRegression``.
        dirResultsBase: First argument passed to ``Results``.

    Note:
        The plot filename is built from ``dirPlotsBase``, which is not a
        parameter of this function and has to exist at module level.
    """
    # compare different trainings of NNs
    def make_fusion_options(data_prefix, dataset):
        return DatasetOptions({
            'dir_data': dirData,
            'data_prefix': data_prefix,
            'dataset': dataset,
            'encoding': 'embedding',
            'newfeatures': None,
            'featurereduction': {'method': 'FUSION'},
            'grouping': 'verylightgrouping'
        })

    # NOTE(review): OptionsNN receives the filename-options string here, while
    # other functions in this file pass the DatasetOptions object itself —
    # confirm which form OptionsNN expects.

    # --- NN trained on nz ---
    options_training_nn = make_fusion_options('nz', '20122016')
    options_testing_nn = make_fusion_options('nz', '2017')
    dict_options_nn = {
        'hidden_units': [60, 40, 20, 10, 10],
        'learningrate': 0.05,
        'dropout': 0.25,
        'batch_size': 640,
        'training_epochs': 250,
        'pretrained': 'pretrained'
    }
    options_nn_nz = OptionsNN(
        dirModelsBase,
        options_training_nn.getFilenameOptions(filteroptions=True),
        options_clf=dict_options_nn,
    )
    results_nn_nz = Results(dirResultsBase, options_training_nn, options_nn_nz, 'test', options_testing_nn)

    # --- NN trained on patrec, without pre-training ---
    options_training_nn = make_fusion_options('patrec', '20122015')
    options_testing_nn = make_fusion_options('patrec', '20162017')
    dict_options_nn = {
        'hidden_units': [20, 10, 10],
        'learningrate': 0.01,
        'dropout': 0.15,
        'batch_size': 80,
        'training_epochs': 500,
    }
    options_nn_patrec = OptionsNN(
        dirModelsBase,
        options_training_nn.getFilenameOptions(filteroptions=True),
        options_clf=dict_options_nn,
    )
    results_nn_patrec = Results(
        dirResultsBase, options_training_nn, options_nn_patrec, 'test', options_testing_nn
    )

    # --- NN trained on patrec, pre-trained (note the different dropout) ---
    dict_options_nn = {
        'hidden_units': [20, 10, 10],
        'learningrate': 0.01,
        'dropout': 0.25,
        'batch_size': 80,
        'training_epochs': 500,
        'pretrained': 'pretrained'
    }
    options_nn_patrec_pretrained = OptionsNN(
        dirModelsBase,
        options_training_nn.getFilenameOptions(filteroptions=True),
        options_clf=dict_options_nn,
    )
    results_nn_patrec_pretrained = Results(
        dirResultsBase, options_training_nn, options_nn_patrec_pretrained, 'test', options_testing_nn
    )

    # --- l1 logistic regression baseline on patrec, subgroups ['DK'] ---
    dict_options_dataset_training = {
        'dir_data': dirData,
        'data_prefix': 'patrec',
        'dataset': '20122015',
        'subgroups': ['DK'],
        'encoding': 'categorical',
        'newfeatures': None,
        'featurereduction': {'method': 'FUSION'},
        'grouping': 'verylightgrouping'
    }
    dict_options_dataset_testing = {
        'dir_data': dirData,
        'data_prefix': 'patrec',
        'dataset': '20162017',
        'subgroups': ['DK'],
        'encoding': 'categorical',
        'newfeatures': None,
        'featurereduction': {'method': 'FUSION'},
        'grouping': 'verylightgrouping'
    }
    dict_opt_lr = {'penalty': 'l1', 'C': 0.075}
    options_training = DatasetOptions(dict_options_dataset_training)
    options_testing = DatasetOptions(dict_options_dataset_testing)
    options_lr = OptionsLogisticRegression(
        dirModelsBase,
        options_training.getFilenameOptions(filteroptions=True),
        options_clf=dict_opt_lr,
    )
    results_lr = Results(dirResultsBase, options_training, options_lr, 'test', options_testing)

    # The NN analyzers are created with 1, the LR analyzer with 10.
    analyzer_nn_nz = ResultsSingleConfigAnalyzer(results_nn_nz, 1)
    analyzer_nn_patrec = ResultsSingleConfigAnalyzer(results_nn_patrec, 1)
    analyzer_nn_patrec_pretrained = ResultsSingleConfigAnalyzer(results_nn_patrec_pretrained, 1)
    analyzer_lr = ResultsSingleConfigAnalyzer(results_lr, 10)

    analyzer = [analyzer_nn_nz, analyzer_nn_patrec, analyzer_nn_patrec_pretrained, analyzer_lr]
    names = ['NZ', 'Basel', 'Basel (pretrained NZ)', 'LASSO']
    title_plot = 'neural network performance: with and without pre-training'
    filename_plot = dirPlotsBase + 'nn_pretraining_nz_plus_lasso.png'
    # The title used to be computed but never handed to the plot call; pass it
    # like the other plot functions in this file do.
    results_analyzer.plotROCcurveMulitpleConfigs(analyzer, names, f_plot=filename_plot, titlePlot=title_plot)
def plotDifferentClassifiers(results_analyzer, dirData, dirModelsBase, dirResultsBase):
    """Plot the test ROC curves of RF, l1 logistic regression and a neural network.

    Every configuration uses the patrec dataset '20122015' as training options
    and '20162017' as testing options. RF and logistic regression use the
    'categorical' encoding with the additional features listed in
    ``constantsPATREC.NEW_FEATURES``; the neural network uses the 'embedding'
    encoding without additional features. The curves are drawn through
    ``results_analyzer.plotROCcurveMulitpleConfigs``.

    Args:
        results_analyzer: object providing ``plotROCcurveMulitpleConfigs``.
        dirData: value stored under ``'dir_data'`` in every dataset options dict.
        dirModelsBase: first argument of the RF / logistic regression options.
        dirResultsBase: first argument of every ``Results`` object.
    """

    def _dataset_options(dataset, encoding, newfeatures):
        # Build the DatasetOptions of one configuration; the four
        # configurations only differ in these three values.
        return DatasetOptions({
            'dir_data': dirData,
            'data_prefix': 'patrec',
            'dataset': dataset,
            'grouping': 'verylightgrouping',
            'encoding': encoding,
            'newfeatures': newfeatures,
            'featurereduction': None,
            'filtering': None,
            'balanced': False,
            'resample': False,
        })

    dict_options_nn = {
        'hidden_units': [60, 40, 40, 20],
        'learningrate': 0.001,
        'dropout': 0.5,
        'batchnorm': True,
        'batch_size': 64,
        'training_epochs': 1000,
        'pretrained': None,
    }

    # NOTE(review): hard-coded, machine-specific project path; it is only used
    # to locate the neural network model directory below.
    DIRPROJECT = '/home/thomas/projects/patrec'
    model_dir = os.path.join(DIRPROJECT, "patients_model")

    # --- categorical encoding: random forest and logistic regression ---
    options_training = _dataset_options(
        '20122015', 'categorical', {'names': constantsPATREC.NEW_FEATURES})
    options_testing = _dataset_options(
        '20162017', 'categorical', {'names': constantsPATREC.NEW_FEATURES})

    dict_opt_rf = {'n_estimators': 500, 'max_depth': 50}
    options_rf = OptionsRF(
        dirModelsBase,
        options_training.getFilenameOptions(filteroptions=True),
        options_clf=dict_opt_rf)
    results_test_rf = Results(
        dirResultsBase, options_training, options_rf, 'test', options_testing)

    # The l2 results are still constructed (the constructor may have side
    # effects that are not visible here) but they are not part of the plot.
    dict_opt_lr_l2 = {'penalty': 'l2', 'C': 0.01}
    options_lr_l2 = OptionsLogisticRegression(
        dirModelsBase,
        options_training.getFilenameOptions(filteroptions=True),
        options_clf=dict_opt_lr_l2)
    results_test_lr_l2 = Results(
        dirResultsBase, options_training, options_lr_l2, 'test', options_testing)

    dict_opt_lr_l1 = {'penalty': 'l1', 'C': 0.5}
    options_lr_l1 = OptionsLogisticRegression(
        dirModelsBase,
        options_training.getFilenameOptions(filteroptions=True),
        options_clf=dict_opt_lr_l1)
    results_test_lr_l1 = Results(
        dirResultsBase, options_training, options_lr_l1, 'test', options_testing)

    # --- embedding encoding: neural network ---
    options_training_nn = _dataset_options('20122015', 'embedding', None)
    options_testing_nn = _dataset_options('20162017', 'embedding', None)

    # NOTE(review): unlike the RF/LR options above, OptionsNN receives the
    # hard-coded model_dir and the DatasetOptions object itself instead of
    # dirModelsBase and getFilenameOptions(filteroptions=True); confirm that
    # this is intended.
    options_nn = OptionsNN(model_dir, options_training_nn, options_clf=dict_options_nn)
    results_test_nn = Results(
        dirResultsBase, options_training_nn, options_nn, 'test', options_testing_nn)

    analyzer = [
        ResultsSingleConfigAnalyzer(results_test_rf, 10),
        ResultsSingleConfigAnalyzer(results_test_lr_l1, 10),
        ResultsSingleConfigAnalyzer(results_test_nn, 10),
    ]
    names = ['RF', 'Logistic Regression (l1)', 'Neural Network']
    title_plot = ''
    # dirPlotsBase is not a parameter of this function; presumably a
    # module-level global defined elsewhere in the file -- TODO confirm.
    filename_plot = os.path.join(
        dirPlotsBase,
        'different_classifiers_train_patrec_20122015_test_patrec_20162017.png')
    results_analyzer.plotROCcurveMulitpleConfigs(
        analyzer, names, f_plot=filename_plot, titlePlot=title_plot)
'data_prefix': 'nz', 'dataset': str(year), 'newfeatures': { 'names': constantsNZ.NEW_FEATURES }, 'featurereduction': None } options_training = DatasetOptions(dict_options_dataset_training) dataset_training = Dataset(dataset_options=options_training) early_readmission_flagname = options_training.getEarlyReadmissionFlagname( ) print('dataset filename: ' + str(dataset_training.getFilename())) results_all_runs_train = Results(dirResultsBase, options_training, options_sgd, 'train') results_all_runs_eval = Results(dirResultsBase, options_training, options_sgd, 'eval') df_balanced_test = dataset_testing.getBalancedSubSet() num_runs = 1 eval_aucs = [] for run in range(0, num_runs): print('') [df_balanced_train, df_balanced_eval ] = dataset_training.getBalancedSubsetTrainingAndTesting() print('train...') clf_sgd.train_partial(df_balanced_train, early_readmission_flagname) results_train = clf_sgd.predict(df_balanced_train,
clf_lr = ClassifierLogisticRegression(options_lr) dict_options_svm = { 'kernel': 'rbf', 'C': 1.0 } options_svm = OptionsSVM( dirModelsBase, options_training.getFilenameOptions(filteroptions=True), options_clf=dict_options_svm) clf_svm = ClassifierSVM(options_svm) options_clf = options_lr clf = clf_lr results_all_runs_train = Results(dirResultsBase, options_training, options_clf, 'train') results_all_runs_eval = Results(dirResultsBase, options_training, options_clf, 'eval') num_runs = 10 eval_aucs = [] for run in range(0, num_runs): print('') [df_balanced_train, df_balanced_eval ] = dataset_training.getBalancedSubsetTrainingAndTesting() clf.train(df_balanced_train, early_readmission_flagname) results_train = clf.predict(df_balanced_train, early_readmission_flagname) results_eval = clf.predict(df_balanced_eval, early_readmission_flagname)
dict_opt_lr = { 'penalty': 'l1', 'C': 0.5 } options_lr = OptionsLogisticRegression( dirModelsBase, options_training.getFilenameOptions(filteroptions=True), options_clf=dict_opt_lr) clf_lr = ClassifierLogisticRegression(options_lr) options_clf = options_lr clf = clf_lr options_testing = DatasetOptions(dict_options_dataset_testing) dataset_testing = Dataset(dataset_options=options_testing) results_all_runs_test = Results(dirResultsBase, options_training, options_clf, 'test', options_testing) early_readmission_flagname = options_testing.getEarlyReadmissionFlagname() test_aucs = [] num_runs = 10 for k in range(0, num_runs): df_balanced_test = dataset_testing.getBalancedSubSet() clf.loadFromFile(k) results_test = clf.predict(df_balanced_test, early_readmission_flagname) auc_test = results_test.getAUC() test_aucs.append(auc_test) print('test auc: ' + str(auc_test)) results_all_runs_test.addResultsSingleRun(results_test)