class NeuralNetDatasetMaker:

    def __init__(self, mode, dir_model, dataset_options, balanced_datasets=True):
        self.dir_model = dir_model
        self.mode = mode
        self.dataset_options = dataset_options
        self.dataset = Dataset(self.dataset_options)
        self.balanced_datasets = balanced_datasets

    def createDatasets(self):
        # Write the CSV files for this model (train/eval in 'traineval' mode,
        # a single balanced file otherwise) into dir_model.
        print('_getFilenameDatasetBalanced: ' + str(self.mode))
        filename_dataset_base = self.dataset_options.getFilename()
        filename_prefix = self.dir_model + os.sep + filename_dataset_base.split(os.sep)[-1][:-4]
        if self.mode == 'traineval':
            if self.balanced_datasets:
                [df_training, df_testing] = self.dataset.getBalancedSubsetTrainingAndTesting()
                self.num_samples_train = df_training.shape[0]
                self.num_samples_validation = df_testing.shape[0]
                filename_train = filename_prefix + '_balanced_train.csv'
                filename_eval = filename_prefix + '_balanced_eval.csv'
                df_training.to_csv(filename_train, line_terminator='\n', index=False)
                df_testing.to_csv(filename_eval, line_terminator='\n', index=False)
                print(filename_train)
                print(filename_eval)
            else:
                # unbalanced case: positive and negative samples are kept in separate files
                [training, testing] = self.dataset.getTrainingAndTestingSet()
                df_training_pos = training[0]
                df_training_neg = training[1]
                df_eval_pos = testing[0]
                df_eval_neg = testing[1]
                self.num_samples_train = 2 * int(df_training_neg.shape[0])
                self.num_samples_validation = 2 * int(df_eval_neg.shape[0])
                filename_train_pos = filename_prefix + '_train_pos.csv'
                filename_train_neg = filename_prefix + '_train_neg.csv'
                filename_eval_pos = filename_prefix + '_eval_pos.csv'
                filename_eval_neg = filename_prefix + '_eval_neg.csv'
                df_training_pos.to_csv(filename_train_pos, line_terminator='\n', index=False)
                df_training_neg.to_csv(filename_train_neg, line_terminator='\n', index=False)
                df_eval_pos.to_csv(filename_eval_pos, line_terminator='\n', index=False)
                df_eval_neg.to_csv(filename_eval_neg, line_terminator='\n', index=False)
        else:
            if self.balanced_datasets:
                df_balanced = self.dataset.getBalancedSubSet()
                filename_dataset = filename_prefix + '_balanced_' + self.mode + '.csv'
                df_balanced.to_csv(filename_dataset, line_terminator='\n', index=False)
                print(filename_dataset)
            else:
                print('no valid configuration of datasets and mode..exit')
                sys.exit()

    def removeDatasets(self):
        filename_dataset_base = self.dataset_options.getFilename()
        filename_prefix = self.dir_model + os.sep + filename_dataset_base.split(os.sep)[-1][:-4]
        if self.balanced_datasets:
            filename_dataset = filename_prefix + '_balanced_' + self.mode + '.csv'
            print('remove: ' + str(filename_dataset))
            os.remove(filename_dataset)
        else:
            print('no valid configuration of datasets and mode..exit')
            sys.exit()

    def _dfToFile(self, df, filename):
        # write the frame in chunks of 10000 rows; only the first chunk carries the header
        list_df = [df[i:i + 10000] for i in range(0, df.shape[0], 10000)]
        list_df[0].to_csv(filename, index=False, line_terminator='\n')
        for l in list_df[1:]:
            l.to_csv(filename, index=False, line_terminator='\n', header=False, mode='a')

    def createDatasetsAutoEncoder(self):
        # Shuffle the full dataset and split it into train/eval by the configured ratio
        # (autoencoder training does not need class balancing).
        print('_getFilenameDatasetBalanced: ' + str(self.mode))
        filename_dataset_base = self.dataset_options.getFilename()
        filename_prefix = self.dir_model + os.sep + filename_dataset_base.split(os.sep)[-1][:-4]
        if self.mode == 'traineval':
            df = self.dataset.getData()
            df = df.sample(frac=1)
            print('num samples: ' + str(df.shape[0]))
            print('df.shape: ' + str(df.shape))
            num_samples = df.shape[0]
            ratio_train_test = self.dataset_options.getRatioTrainingSamples()
            df_train = df[:int(round(ratio_train_test * num_samples))]
            df_eval = df[int(round(ratio_train_test * num_samples)):]
            filename_train = filename_prefix + '_balanced_train.csv'
            filename_eval = filename_prefix + '_balanced_eval.csv'
            self._dfToFile(df_train, filename_train)
            self._dfToFile(df_eval, filename_eval)
        else:
            filename_test = filename_prefix + '_test.csv'
            df = self.dataset.getData()
            df = df.sample(frac=1)
            self._dfToFile(df, filename_test)
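# ---------------------------------------------------------------------------
# Minimal usage sketch (not part of the original module). Assumptions: a
# DatasetOptions built from the dictionary keys used by the scripts in this
# repo, and placeholder paths for the data and model directories. Mode 'test'
# with balanced datasets is chosen so that removeDatasets() targets exactly
# the file createDatasets() wrote.
# ---------------------------------------------------------------------------
if __name__ == '__main__':
    options = DatasetOptions({
        'dir_data': '/path/to/data',          # placeholder path
        'data_prefix': 'nz',
        'dataset': '2016',
        'encoding': 'embedding',
        'grouping': 'verylightgrouping',
        'newfeatures': None,
        'featurereduction': None,
    })
    maker = NeuralNetDatasetMaker('test', '/path/to/model_dir', options)
    maker.createDatasets()    # writes <prefix>_balanced_test.csv into the model dir
    maker.removeDatasets()    # removes that file again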
    'featurereduction': None
}

options_training = DatasetOptions(dict_options_dataset_training)
dataset_training = Dataset(dataset_options=options_training)
early_readmission_flagname = options_training.getEarlyReadmissionFlagname()
print('dataset filename: ' + str(dataset_training.getFilename()))

results_all_runs_train = Results(dirResultsBase, options_training, options_sgd, 'train')
results_all_runs_eval = Results(dirResultsBase, options_training, options_sgd, 'eval')

df_balanced_test = dataset_testing.getBalancedSubSet()

num_runs = 1
eval_aucs = []
for run in range(0, num_runs):
    print('')
    [df_balanced_train, df_balanced_eval] = dataset_training.getBalancedSubsetTrainingAndTesting()

    print('train...')
    clf_sgd.train_partial(df_balanced_train, early_readmission_flagname)

    results_train = clf_sgd.predict(df_balanced_train, early_readmission_flagname)
    results_eval = clf_sgd.predict(df_balanced_eval, early_readmission_flagname)
    results_all_runs_train.addResultsSingleRun(results_train)
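# ---------------------------------------------------------------------------
# Hedged illustration (not part of the original script): the loop above uses
# the project-specific clf_sgd / Results wrappers, whose interfaces are not
# shown in this excerpt. The standalone snippet below sketches the same
# incremental-training / AUC-evaluation cycle with plain scikit-learn on
# synthetic data; every name in it is illustrative only.
# ---------------------------------------------------------------------------
import numpy as np
from sklearn.linear_model import SGDClassifier
from sklearn.metrics import roc_auc_score

rng = np.random.RandomState(0)
X_train, y_train = rng.randn(1000, 20), rng.randint(0, 2, 1000)
X_eval, y_eval = rng.randn(200, 20), rng.randint(0, 2, 200)

clf = SGDClassifier(random_state=0)
for _ in range(5):                                   # several incremental passes
    clf.partial_fit(X_train, y_train, classes=[0, 1])

auc = roc_auc_score(y_eval, clf.decision_function(X_eval))
print('eval AUC: %.3f' % auc)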
for year in years:
    dict_options_dataset = {
        'dir_data': dirData,
        'data_prefix': 'nz',
        'dataset': str(year),
        'encoding': 'embedding',
        'grouping': 'verylightgrouping',
        'newfeatures': None,
        'featurereduction': {'method': 'FUSION'}
    }
    options_dataset_year = DatasetOptions(dict_options_dataset)
    dataset_year = Dataset(options_dataset_year)
    if balanced:
        df_year = dataset_year.getBalancedSubSet()
    else:
        df_year = dataset_year.getDf()
    # df_year['main_diag'] = df_year['main_diag'].apply(convertDiagToInd)
    print(df_year.shape)
    df_all_years = df_all_years.append(df_year)

print('df balanced all years: ' + str(df_all_years.shape))

encoding = options_dataset_year.getEncodingScheme()
grouping = options_dataset_year.getGroupingName()
featureset = options_dataset_year.getFeatureSetStr()
filename_data_years = (dirData + 'data_nz_' + str(min(years)) + str(max(years))
                       + '_' + featureset + '_' + encoding + '_' + grouping + '.csv')
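# ---------------------------------------------------------------------------
# Hedged sketch (assumption: the combined frame is ultimately written to
# filename_data_years; the excerpt above ends before that step). Writing in
# chunks mirrors the _dfToFile helper of NeuralNetDatasetMaker.
# ---------------------------------------------------------------------------
chunk_size = 10000
chunks = [df_all_years[i:i + chunk_size] for i in range(0, df_all_years.shape[0], chunk_size)]
chunks[0].to_csv(filename_data_years, index=False, line_terminator='\n')
for chunk in chunks[1:]:
    chunk.to_csv(filename_data_years, index=False, line_terminator='\n', header=False, mode='a')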