def get_raw_matrix(self, data_tag):
    """Load the raw feature matrix stored at ``self.raw_matrix_filepath``.

    Args:
        data_tag: Label of the requested data set (e.g. "src"). It is
            accepted for interface compatibility but has no effect: the
            original code read the same file in both branches of its
            ``if data_tag == "src"`` test, so the dead conditional was
            collapsed into a single read.

    Returns:
        The result of ``FeatureMatrixIO.read_file_to_data_frame`` for the
        raw matrix file.
    """
    # NOTE(review): if a non-"src" tag was meant to read a different file,
    # that path still has to be wired in here -- confirm with the callers.
    return FeatureMatrixIO().read_file_to_data_frame(self.raw_matrix_filepath)
def _process_raw_feature_matrix(self):
    """Transform the raw matrix file into the processed matrix file.

    Reads ``self._cmm_name_raw``, runs the add / impute / remove /
    de-duplicate steps through a FeatureMatrixTransform, splits the result
    into train and test sets, runs feature selection, and finally writes
    the re-joined train + test rows to ``self._cmm_name_processed``.
    """
    # Load the raw matrix from disk.
    self._fm_io = FeatureMatrixIO()
    print('Reading raw matrix...')
    self._cmm_raw = self._fm_io.read_file_to_data_frame(self._cmm_name_raw)

    # Feature engineering happens inside the transform object.
    self._fmt = FeatureMatrixTransform()
    self._fmt.set_input_matrix(self._cmm_raw)
    print('Adding features...')
    self._add_features()
    print('Imputing data...')
    self._impute_data()
    self._remove_features()
    self._fmt.drop_duplicate_rows()
    self._cmm_processed = self._fmt.fetch_matrix()

    # The split has to precede feature selection, otherwise the selector
    # would learn from rows that later serve as test data.
    self._train_test_split()
    print('Selecting features...')
    self._select_features()

    # Stitch outcome and features back together, train rows first.
    labelled_train = self._y_train.join(self._X_train)
    labelled_test = self._y_test.join(self._X_test)
    self._cmm_processed = labelled_train.append(labelled_test)

    matrix_header = self._build_processed_matrix_header()
    self._fm_io.write_data_frame_to_file(
        self._cmm_processed, self._cmm_name_processed, matrix_header)
def load_imputation_template(lab, dataset_folderpath, lab_type='panel'):
    """Load the feature -> imputed-value mapping saved for one lab.

    The mapping is unpickled from
    ``<dataset_folderpath>/<lab>/feat2imputed_dict.pkl``.

    Args:
        lab: Lab name; used as the sub-folder and as the file-name prefix.
        dataset_folderpath: Folder that holds one sub-folder per lab.
        lab_type: 'panel' selects the outcome column
            'all_components_normal'; any other value selects
            'component_normal'.

    Returns:
        * If the pickled dict has fewer than 200 entries it is returned
          unchanged (it is taken to hold only the selected features).
        * Otherwise a new dict keyed by the columns of the processed
          matrix (minus 'pat_id' and the outcome column), mapping each
          column to ``(column_position, imputed_value)``.

    Raises:
        KeyError: if a processed-matrix column is missing from the pickle.
    """
    data_lab_folderpath = os.path.join(dataset_folderpath, lab)
    pickle_path = data_lab_folderpath + '/' + "feat2imputed_dict.pkl"

    # Pickles are binary data: open in 'rb' (text mode fails on Python 3 and
    # can corrupt data on Windows) and close the handle deterministically.
    # NOTE: pickle.load executes arbitrary code; only use on trusted files.
    with open(pickle_path, 'rb') as pickle_file:
        imputations = pickle.load(pickle_file)

    if len(imputations) < 200:
        # Small dict: only includes selected features.
        return imputations

    if lab_type == 'panel':
        ylabel = 'all_components_normal'
    else:
        ylabel = 'component_normal'

    # Large dict: all raw matrix columns are included, so the final
    # features have to be extracted from the processed matrix.
    fm_io = FeatureMatrixIO()
    df_processed = fm_io.read_file_to_data_frame(
        data_lab_folderpath + '/' + '%s-normality-matrix-processed.tab' % lab)
    df_processed.pop('pat_id')
    df_processed.pop(ylabel)  # TODO?!

    selected_columns = df_processed.columns.values.tolist()
    return {
        column: (position, imputations[column])
        for position, column in enumerate(selected_columns)
    }
def load_raw_matrix(lab, dataset_folderpath):
    """Read the raw feature matrix of one lab.

    Args:
        lab: Lab name; used as the sub-folder name and substituted into
            the module-level ``raw_matrix_template`` file-name pattern.
        dataset_folderpath: Folder that holds one sub-folder per lab.

    Returns:
        The result of ``FeatureMatrixIO.read_file_to_data_frame`` for the
        raw matrix file.
    """
    lab_folder = os.path.join(dataset_folderpath, lab)
    matrix_path = os.path.join(lab_folder, raw_matrix_template % lab)

    # TODO: check if raw matrix exists
    return FeatureMatrixIO().read_file_to_data_frame(matrix_path)
def _build_raw_feature_matrix(self):
    """Build the raw matrix and, unless in hold-out mode, record patient ids.

    Delegates the build to ``SupervisedLearningPipeline`` using
    ``LabNormalityMatrix`` as the matrix class. When ``self._holdOut`` is
    falsy the freshly built file is read back and the set of its 'pat_id'
    values is stored in ``self.usedPatIds``.
    """
    matrix_path = self._build_raw_matrix_path()
    SupervisedLearningPipeline._build_raw_feature_matrix(
        self, LabNormalityMatrix, matrix_path)

    if self._holdOut:
        return

    raw_frame = FeatureMatrixIO().read_file_to_data_frame(matrix_path)
    self.usedPatIds = set(raw_frame['pat_id'].values)
def __init__(self, lab_panel, num_episodes, use_cache=None, random_state=None,
             timeLimit=None, notUsePatIds=None, holdOut=False,
             pat_batch_ind=None, includeLastNormality=True):
    """Build the raw matrix and run either the training or hold-out flow.

    Args:
        lab_panel: Lab identifier; forwarded to the base pipeline and used
            in the 'data/<lab_panel>/...' raw matrix path.
        num_episodes, use_cache, random_state, timeLimit, notUsePatIds:
            Forwarded unchanged to ``SupervisedLearningPipeline.__init__``.
        holdOut: If true, load the saved feature -> imputed-value dict and
            evaluate previously dumped models on the hold-out matrix.
            Otherwise record the used patient ids, build the processed
            matrix, save the imputation dict and train the predictors.
        pat_batch_ind: Stored on the instance as ``self.pat_batch_ind``.
        includeLastNormality: If true, add a 'last_normality' column to
            the raw matrix file (the outcome of the same patient's
            previous order) and rewrite the file in place.
    """
    self.pat_batch_ind = pat_batch_ind
    self._holdOut = holdOut
    self.usedPatIds = []
    SupervisedLearningPipeline.__init__(self, lab_panel, num_episodes,
                                        use_cache, random_state, timeLimit,
                                        notUsePatIds)
    # TODO: naming of lab_panel

    self._factory = FeatureMatrixFactory()
    self._build_raw_feature_matrix()

    if LAB_TYPE == 'panel':
        self.ylabel = 'all_components_normal'
    else:
        self.ylabel = 'component_normal'

    self.includeLastNormality = includeLastNormality
    if self.includeLastNormality:
        raw_matrix_path = ('data/' + lab_panel +
                           '/%s-normality-matrix-raw.tab' % lab_panel)
        fm_io = FeatureMatrixIO()
        df = fm_io.read_file_to_data_frame(raw_matrix_path)
        df = df.sort_values(['pat_id', 'order_time']).reset_index(drop=True)
        # After sorting, a patient's rows are contiguous, so shifting the
        # outcome by one row within each patient gives "outcome of the
        # previous order" and NaN for a patient's first order. This
        # replaces a per-row loop over the removed ``DataFrame.ix``
        # accessor; the float cast keeps the column dtype that loop
        # produced.
        df['last_normality'] = (
            df.groupby('pat_id')[self.ylabel].shift(1).astype(float))
        df.to_csv(raw_matrix_path, index=False, sep='\t')

    data_lab_folder = self._fetch_data_dir_path(
        inspect.getfile(inspect.currentframe()))
    feat2imputed_dict_path = data_lab_folder + '/feat2imputed_dict.pkl'

    # Pickle files are always opened in binary mode and through ``with``:
    # HIGHEST_PROTOCOL is a binary format, and text-mode handles break on
    # Python 3 / Windows and were previously never closed.
    if holdOut:
        # For holdOut evaluation data, produce the raw matrix and pick
        # features according to the saved feat2imputed_dict.
        with open(feat2imputed_dict_path, 'rb') as dict_file:
            self.feat2imputed_dict = pickle.load(dict_file)
        self._build_processed_feature_matrix_holdout()
        self._analyze_predictors_on_holdout()
    else:
        # For training/validation data, record the pat_ids, the selected
        # features and their imputed values.
        used_ids_path = 'data/used_patient_set_%s.pkl' % self._var
        with open(used_ids_path, 'wb') as ids_file:
            pickle.dump(self.usedPatIds, ids_file, pickle.HIGHEST_PROTOCOL)

        self._build_processed_feature_matrix()
        self._build_baseline_results()  # TODO: prototype in SLPP

        # TODO: find better place to put the dict.pkl
        with open(feat2imputed_dict_path, 'wb') as dict_file:
            pickle.dump(self.feat2imputed_dict, dict_file,
                        pickle.HIGHEST_PROTOCOL)

        self._train_and_analyze_predictors()
def _analyze_predictor_holdoutset(self, dest_dir, pipeline_prefix):
    """Evaluate ``self._predictor`` on the processed hold-out matrix.

    The hold-out matrix is read from the parent of ``dest_dir``; the
    comparison CSV, the three plots and the report are written into
    ``dest_dir`` with names derived from ``pipeline_prefix``.

    Args:
        dest_dir: Output folder for the analysis artefacts.
        pipeline_prefix: Prefix of every output file name and the text
            shown in parentheses in the plot titles.
    """
    slug = '-'.join(self._var.split())
    holdout_file = '%s-normality-matrix-%d-episodes-processed-holdout.tab' % (
        slug, self._num_rows)
    holdout_path = dest_dir + '/../' + holdout_file
    holdout_matrix = FeatureMatrixIO().read_file_to_data_frame(holdout_path)

    # Split off the outcome column; what is left is the feature matrix.
    if self._isLabPanel:
        outcome_column = 'all_components_normal'
    else:
        outcome_column = 'component_normal'
    y_holdout = pd.DataFrame(holdout_matrix.pop(outcome_column))
    X_holdout = holdout_matrix

    analyzer = ClassifierAnalyzer(self._predictor, X_holdout, y_holdout)
    train_label = 'holdoutset'

    # (debug label, file-name pattern) for each artefact, in logging order.
    output_specs = (
        ('direct_comparisons_path', '%s-direct-compare-results-%s.csv'),
        ('precision_at_k_plot_path', '%s-precision-at-k-plot-%s.png'),
        ('precision_recall_plot_path', '%s-precision-recall-plot-%s.png'),
        ('roc_plot_path', '%s-roc-plot-%s.png'),
        ('report_path', '%s-report-%s.tab'),
    )
    out_paths = {}
    for label, pattern in output_specs:
        file_name = pattern % (pipeline_prefix, train_label)
        out_paths[label] = '/'.join([dest_dir, file_name])
        log.debug('%s: %s' % (label, out_paths[label]))

    # Plot titles.
    roc_title = 'ROC (%s)' % pipeline_prefix
    precision_recall_title = 'Precision-Recall (%s)' % pipeline_prefix
    precision_at_k_title = 'Precision @K (%s)' % pipeline_prefix

    # Write output.
    analyzer.output_direct_comparisons(out_paths['direct_comparisons_path'])
    analyzer.plot_roc_curve(roc_title, out_paths['roc_plot_path'])
    analyzer.plot_precision_recall_curve(
        precision_recall_title, out_paths['precision_recall_plot_path'])
    analyzer.plot_precision_at_k_curve(
        precision_at_k_title, out_paths['precision_at_k_plot_path'])
    analyzer.write_report(out_paths['report_path'], ci=0.95)
def read_lab_meta_report(lab_panel):
    """Read the normality prediction report for ``lab_panel``.

    Prefers the lab-level meta report. If that file does not exist, the
    report of the REGRESS_AND_ROUND algorithm is read instead (used to
    fetch the data on class counts).

    Args:
        lab_panel: Lab identifier used for the folder and file names.

    Returns:
        The report as read by ``FeatureMatrixIO.read_file_to_data_frame``.
    """
    fm_io = FeatureMatrixIO()
    data_dir = LabNormalityReport.fetch_data_dir_path()

    report_path = data_dir + '/%s/%s-normality-prediction-report.tab' % (
        lab_panel, lab_panel)
    if not os.path.exists(report_path):
        # No meta report: fall back to a single algorithm's report.
        fallback_algorithm = SupervisedClassifier.REGRESS_AND_ROUND
        report_path = data_dir + '/%s/%s/%s-normality-prediction-report.tab' % (
            lab_panel, fallback_algorithm, lab_panel)

    return fm_io.read_file_to_data_frame(report_path)
def _build_processed_feature_matrix_holdout(self):
    """Build the processed hold-out matrix from the saved training recipe.

    Reads the raw hold-out matrix, keeps the columns named in
    ``self.feat2imputed_dict``, re-orders / restricts them to the columns
    of the previously processed training matrix, fills missing values with
    the saved imputation values and writes the result (without header) to
    ``self._build_processed_matrix_path()``.
    """
    fm_io = FeatureMatrixIO()
    raw_matrix = fm_io.read_file_to_data_frame(self._build_raw_matrix_path())

    # TODO: feat2imputed_dict includes the outcome label.
    # list(...) keeps the column selection valid on Python 3, where
    # dict.keys() is a view that pandas does not accept as an indexer.
    processed_matrix = raw_matrix[list(self.feat2imputed_dict.keys())].copy()

    # TODO: tmp solution! The training-time processed matrix is located by
    # rewriting the hold-out path ("2000" -> "10000", drop "-holdout").
    tmp_path = self._build_processed_matrix_path().replace(
        "2000", "10000").replace("-holdout", "")
    processed_matrix_previous = fm_io.read_file_to_data_frame(tmp_path)
    processed_matrix = processed_matrix[processed_matrix_previous.columns]

    # Fill every column with its saved imputation value in one call. This
    # returns a new frame (no chained assignment on a slice), and dict
    # entries for columns dropped by the restriction above are ignored
    # instead of raising KeyError.
    processed_matrix = processed_matrix.fillna(value=self.feat2imputed_dict)

    fm_io.write_data_frame_to_file(processed_matrix,
                                   self._build_processed_matrix_path(), None)
def _analyze_predictors_on_holdout(self):
    """Evaluate every previously dumped model on the hold-out set.

    For each algorithm in ``SupervisedClassifier.SUPPORTED_ALGORITHMS`` the
    dumped model is loaded into ``self._predictor`` and analysed with
    ``SupervisedLearningPipeline._analyze_predictor_holdoutset``; results
    go to ``<data_dir>/<algorithm>``.

    Algorithms without a loadable model dump are skipped with a warning.
    Previously the analysis ran anyway, using whatever ``self._predictor``
    was left over from the preceding algorithm (or failing if none was
    set), so results could be reported under the wrong algorithm name.
    """
    algorithms_to_test = list(SupervisedClassifier.SUPPORTED_ALGORITHMS)

    pipeline_file_name = inspect.getfile(inspect.currentframe())
    data_dir = SupervisedLearningPipeline._fetch_data_dir_path(
        self, pipeline_file_name)

    log.debug('algorithms_to_test: %s' % algorithms_to_test)
    for algorithm in algorithms_to_test:
        log.info('analyzing %s...' % algorithm)
        # NOTE: report_dir is not created here; it is expected to exist.
        report_dir = '/'.join([data_dir, algorithm])
        pipeline_prefix = '%s-normality-prediction-%s' % (self._var,
                                                          algorithm)

        predictor_path = self._build_model_dump_path(algorithm)
        # TODO(sbala): Fix joblib.load so that it works for bifurcated
        # supervised classifiers.
        if 'bifurcated' in algorithm or not os.path.exists(predictor_path):
            log.warning('No loadable model for %s at %s; skipping.'
                        % (algorithm, predictor_path))
            continue

        log.debug('Loading model from disk...')
        self._predictor = joblib.load(predictor_path)

        SupervisedLearningPipeline._analyze_predictor_holdoutset(
            self, report_dir, pipeline_prefix)
def test_SupervisedLearner():
    """Smoke-run a minimal SupervisedLearner subclass on the LABA1C matrix.

    Reads the processed LABA1C training matrix, drops its 'pat_id' column
    and calls ``run()`` on a learner whose outcome label is
    'all_components_normal'.
    """
    from medinfo.ml.SupervisedLearner import SupervisedLearner
    import inspect
    from medinfo.dataconversion.FeatureMatrixIO import FeatureMatrixIO

    class LabNormalityLearner(SupervisedLearner):
        def __init__(self, input_matrix, ylabel):
            # The working folder is the folder of the file defining this class.
            defining_file = inspect.getfile(inspect.currentframe())
            self.working_folderpath = '/'.join(defining_file.split('/')[:-1])
            self.input_matrix = input_matrix
            self.ylabel = ylabel

    matrix_path = ('data-testingSupervisedLearner-panel-10000-episodes/'
                   'LABA1C/LABA1C-normality-train-matrix-processed.tab')
    processed_matrix = FeatureMatrixIO().read_file_to_data_frame(matrix_path)
    processed_matrix.pop('pat_id')

    learner = LabNormalityLearner(processed_matrix, 'all_components_normal')
    learner.run()
def test_read_file_to_data_frame(self):
    """Reading a matrix with or without a header yields the same frame.

    Both fixture files next to this test module must load into the frame
    stored under MANUAL_TEST_CASE['matrix_no_header'].
    """
    fm_io = FeatureMatrixIO()

    # Fixture files live in the same folder as this module.
    here = os.path.dirname(
        os.path.abspath(inspect.getfile(inspect.currentframe())))
    header_fixture = os.path.join(here, 'test-matrix-with-header.tab')
    plain_fixture = os.path.join(here, 'test-matrix-no-header.tab')

    frame_from_header_file = fm_io.read_file_to_data_frame(header_fixture)
    frame_from_plain_file = fm_io.read_file_to_data_frame(plain_fixture)

    # The header must have been stripped, leaving identical data.
    expected = MANUAL_TEST_CASE['matrix_no_header']
    for actual in (frame_from_header_file, frame_from_plain_file):
        assert_frame_equal(expected, actual)
def test_strip_header(self):
    """``strip_header`` yields a file that reads back as the bare matrix.

    The path returned by ``strip_header`` is kept in
    ``self._stripped_header_file_path``.
    """
    fm_io = FeatureMatrixIO()

    # The fixture lives in the same folder as this module.
    here = os.path.dirname(
        os.path.abspath(inspect.getfile(inspect.currentframe())))
    header_fixture = os.path.join(here, 'test-matrix-with-header.tab')

    # The fixture is read once before stripping, as in the original test;
    # the resulting frame itself is not inspected.
    fm_io.read_file_to_data_frame(header_fixture)
    self._stripped_header_file_path = fm_io.strip_header(header_fixture)

    # Validate matrix data.
    stripped_frame = fm_io.read_file_to_data_frame(
        self._stripped_header_file_path, datetime_col_index=1)
    assert_frame_equal(MANUAL_TEST_CASE['matrix_no_header'], stripped_frame)
def jitter_processed_matrix(lab, pat_num_limit=100):
    """Print a small, re-numbered sample of a lab's processed matrix.

    Keeps the rows of the first ``pat_num_limit`` patient ids (in sorted
    order), replaces those ids by 0, 1, 2, ..., drops every row containing
    a missing value, and prints the values array and the column index.

    Args:
        lab: Lab name; selects the folder and file under ``data_folder``.
        pat_num_limit: Maximum number of distinct patients to keep.
    """
    data_file = "%s-normality-matrix-10000-episodes-processed.tab" % lab
    data_path = os.path.join(data_folder, lab, data_file)
    df = FeatureMatrixIO().read_file_to_data_frame(data_path)

    # Reset the pat ids: kept patients map to consecutive integers.
    kept_ids = sorted(set(df['pat_id'].values.tolist()))[:pat_num_limit]
    pat2pat = {old_id: new_id for new_id, old_id in enumerate(kept_ids)}

    # Patients beyond the limit map to None and are removed by dropna().
    df['pat_id'] = df['pat_id'].apply(lambda x: pat2pat.get(x))
    df = df.dropna()

    print(np.array_repr(df.values))
    print(df.columns)
def test(test_suite=None):
    """Run the selected ad-hoc checks against the LABA1C raw test matrix.

    Args:
        test_suite: Collection of check names to run; 'remove' and
            'impute' are recognised. ``None`` (the default) runs none. The
            former default was a shared mutable list.
    """
    import LabNormalityLearner_Config as Config
    from medinfo.dataconversion.FeatureMatrixIO import FeatureMatrixIO

    if test_suite is None:
        test_suite = []

    fm_io = FeatureMatrixIO()
    raw_matrix = fm_io.read_file_to_data_frame(
        'LabNormalityLearner_TestData/LABA1C-normality-matrix-raw.tab')

    if 'remove' in test_suite:
        remover = LNL.FeatureRemover(Config.features_to_remove)
        processed_matrix_removed = remover.transform(raw_matrix)
        # NOTE(review): these expect MORE rows and the SAME number of
        # columns after removing features, which looks inverted (one
        # would expect equal rows, fewer columns). Left unchanged --
        # confirm against FeatureRemover.transform.
        assert raw_matrix.shape[0] < processed_matrix_removed.shape[0]
        assert raw_matrix.shape[1] == processed_matrix_removed.shape[1]

    if 'impute' in test_suite:
        # Other suffixes: ('min', 'max', 'median', 'mean', 'std', 'first',
        # 'last', 'diff', 'slope', 'proximate')
        features_to_impute = [
            'TBIL.-14_0.max', 'TBIL.-14_0.median', 'TBIL.-14_0.mean',
            'TBIL.-14_0.std'
        ]
        imputation_dict = dict.fromkeys(features_to_impute, 0)
        imputer = LNL.FeatureImputer(imputation_dict=imputation_dict)

        columns_to_look = ['pat_id'] + features_to_impute

        # A single pre-formatted string prints identically under the
        # Python 2 print statement and the Python 3 print function.
        print('raw_matrix[columns_to_look].head(): %s'
              % (raw_matrix[columns_to_look].head(),))

        processed_matrix_imputed = imputer.fit_transform(raw_matrix)

        print('processed_matrix_imputed[columns_to_look].head(): %s'
              % (processed_matrix_imputed[columns_to_look].head(),))

        # No missing value may remain in the inspected columns ...
        assert not processed_matrix_imputed[columns_to_look].isna().any().any()
        # ... and the row order must be untouched.
        assert (raw_matrix['order_proc_id'].values ==
                processed_matrix_imputed['order_proc_id'].values).all()
def test_write_data_frame_to_file(self):
    """Written matrices are byte-identical to the reference fixtures.

    The manual test frame is written once without and once with the
    custom header; each output is compared to its fixture. The temp file
    paths are kept in ``self._no_header_temp_file_path`` and
    ``self._with_header_temp_file_path``.
    """
    fm_io = FeatureMatrixIO()

    # Reference fixtures live in the same folder as this module.
    here = os.path.dirname(
        os.path.abspath(inspect.getfile(inspect.currentframe())))
    reference_plain = os.path.join(here, 'test-matrix-no-header.tab')
    reference_with_header = os.path.join(here, 'test-matrix-with-header.tab')

    # Frame and header come from the in-memory manual test case.
    frame = MANUAL_TEST_CASE['matrix_no_header']
    custom_header = MANUAL_TEST_CASE['custom_header']

    # Write data frame without header.
    self._no_header_temp_file_path = os.path.join(
        here, 'no-header-temp-file.tab')
    fm_io.write_data_frame_to_file(frame, self._no_header_temp_file_path)

    # Write data frame with header.
    self._with_header_temp_file_path = os.path.join(
        here, 'header-temp-file.tab')
    fm_io.write_data_frame_to_file(
        frame, self._with_header_temp_file_path, custom_header)

    # Validate output files.
    self.assertTrue(
        filecmp.cmp(reference_plain, self._no_header_temp_file_path))
    self.assertTrue(
        filecmp.cmp(reference_with_header, self._with_header_temp_file_path))
def load_processed_matrix(lab, dataset_folderpath, type='full'):
    """Read one of a lab's processed matrices.

    Args:
        lab: Lab name; used as the sub-folder name and substituted into
            the file-name template.
        dataset_folderpath: Folder that holds one sub-folder per lab.
        type: 'train' or 'evalu' select the matching module-level
            template; any other value selects the full processed matrix.

    Returns:
        The matrix as read by ``FeatureMatrixIO.read_file_to_data_frame``.
        If the expected file is missing, the same path with '-test'
        replaced by '-evalu' is read instead.
    """
    lab_folder = os.path.join(dataset_folderpath, lab)

    if type == 'train':
        file_name = processed_matrix_train_template % lab
    elif type == 'evalu':
        file_name = processed_matrix_evalu_template % lab
    else:
        file_name = processed_matrix_template % lab
    matrix_filepath = os.path.join(lab_folder, file_name)

    fm_io = FeatureMatrixIO()
    # TODO: check if raw matrix exists
    if not os.path.exists(matrix_filepath):
        matrix_filepath = matrix_filepath.replace('-test', '-evalu')
    return fm_io.read_file_to_data_frame(matrix_filepath)
def write_matrix(self, dest_path, header=None):
    """Copy the factory's matrix file to ``dest_path`` without '#' lines.

    Every line of the file named by ``self._factory.getMatrixFileName()``
    that does not start with '#' is written to ``dest_path``; the source
    file is then deleted.

    Args:
        dest_path: Path of the matrix file to create.
        header: Currently unused (header writing is disabled); kept for
            interface compatibility.
    """
    log.info('Writing matrix file...')

    # Get old matrix file.
    source_path = self._factory.getMatrixFileName()

    # Both handles are closed by the ``with`` block, so the output is
    # flushed to disk before the source is removed. Previously neither
    # file was ever closed.
    with open(source_path, 'r') as source_file, \
            open(dest_path, 'w') as matrix_file:
        for line in source_file:
            if not line.startswith('#'):
                matrix_file.write(line)

    # Delete old matrix file.
    os.remove(source_path)
def _build_processed_feature_matrix(self, params):
    """Turn the raw feature matrix into the processed matrix file.

    All pipeline-specific variation is packed into ``params`` so that
    every pipeline shares this one sequence of steps.

    Keys read from ``params``:
        processed_matrix_path, raw_matrix_path, outcome_label,
        features_to_add, features_to_remove,
        features_to_filter_on (optional), imputation_strategies,
        selection_problem, selection_algorithm,
        percent_features_to_select, features_to_keep.
    ``params`` is also handed as a whole to
    ``self._build_processed_matrix_header``.

    If the processed matrix already exists and ``self._flush_cache`` is
    falsy, it is only read and split into train / test data.
    """
    # TODO(sbala): Determine which fields should have defaults.
    fm_io = FeatureMatrixIO()
    log.debug('params: %s' % params)

    processed_matrix_path = params['processed_matrix_path']
    cache_is_usable = (os.path.exists(processed_matrix_path)
                       and not self._flush_cache)
    if cache_is_usable:
        # Feature selection is assumed to have happened already; only the
        # train / test split has to be redone.
        cached_matrix = fm_io.read_file_to_data_frame(processed_matrix_path)
        self._train_test_split(cached_matrix, params['outcome_label'])
        return

    # Read raw matrix and hand it to the transform object.
    raw_matrix = fm_io.read_file_to_data_frame(params['raw_matrix_path'])
    fmt = FeatureMatrixTransform()
    fmt.set_input_matrix(raw_matrix)

    self._add_features(fmt, params['features_to_add'])
    self._remove_features(fmt, params['features_to_remove'])
    if 'features_to_filter_on' in params:
        self._filter_on_features(fmt, params['features_to_filter_on'])

    # HACK: When read_csv encounters duplicate columns, it deduplicates
    # them by appending '.1, ..., .N' to the column names.
    # In future versions of pandas, simply pass mangle_dupe_cols=True
    # to read_csv, but not ready as of pandas 0.22.0.
    mangled_duplicates = [
        name for name in raw_matrix.columns.values if name[-2:] == ".1"
    ]
    for name in mangled_duplicates:
        fmt.remove_feature(name)
        self._removed_features.append(name)

    self._impute_data(fmt, raw_matrix, params['imputation_strategies'])

    # All-null features created during preprocessing would break feature
    # selection, so drop them first.
    fmt.drop_null_features()
    interim_matrix = fmt.fetch_matrix()

    # The split must precede feature selection so that nothing is learnt
    # from the test data.
    self._train_test_split(interim_matrix, params['outcome_label'])
    self._select_features(params['selection_problem'],
                          params['percent_features_to_select'],
                          params['selection_algorithm'],
                          params['features_to_keep'])

    labelled_train = self._y_train.join(self._X_train)
    labelled_test = self._y_test.join(self._X_test)
    final_matrix = labelled_train.append(labelled_test)

    # Write output to new matrix file.
    matrix_header = self._build_processed_matrix_header(params)
    fm_io.write_data_frame_to_file(final_matrix, processed_matrix_path,
                                   matrix_header)
def get_raw_matrix(self):
    """Return the raw matrix read from ``self.raw_matrix_filepath``."""
    matrix_reader = FeatureMatrixIO()
    return matrix_reader.read_file_to_data_frame(self.raw_matrix_filepath)
def _build_processed_feature_matrix(self):
    """Build (or reload) the processed lab-change matrix.

    Assembles the processing parameters for the 'unchanged_yn' outcome and
    then runs the processing steps in this method rather than in the base
    pipeline. The train / test split happens on the raw matrix, and
    feature removal, filtering and imputation are fitted on the training
    rows only. The test rows are restricted to the surviving training
    columns and filled from ``self.feat2imputed_dict``.

    If the processed matrix already exists and ``self._flush_cache`` is
    falsy, it is only read, sorted by index and split.
    """
    params = {}
    raw_matrix_path = self._build_raw_matrix_path()
    processed_matrix_path = self._build_processed_matrix_path(raw_matrix_path)
    log.debug('params: %s' % params)

    # ---- Processing parameters -------------------------------------------
    prev_measurement_feature = self._change_params['feature_old']
    features_to_add = {'change': [self._change_params]}
    features_to_filter_on = [
        {'feature': prev_measurement_feature, 'value': np.nan}
    ]
    imputation_strategies = {}
    features_to_remove = [
        'pat_id', 'order_time', 'order_proc_id', 'ord_num_value',
        'proc_code', 'abnormal_panel', 'all_components_normal',
        'num_normal_components',
        'Birth.pre',
        'Male.preTimeDays', 'Female.preTimeDays',
        'RaceWhiteHispanicLatino.preTimeDays',
        'RaceWhiteNonHispanicLatino.preTimeDays',
        'RaceHispanicLatino.preTimeDays',
        'RaceAsian.preTimeDays',
        'RaceBlack.preTimeDays',
        'RacePacificIslander.preTimeDays',
        'RaceNativeAmerican.preTimeDays',
        'RaceOther.preTimeDays',
        'RaceUnknown.preTimeDays',
        'Death.post',
        'Death.postTimeDays',
        'num_components'
    ]
    # Keep the # of times it's been ordered in past, even if low info.
    features_to_keep = ['%s.pre' % self._var]
    outcome_label = 'unchanged_yn'
    selection_problem = FeatureSelector.CLASSIFICATION
    selection_algorithm = FeatureSelector.RECURSIVE_ELIMINATION
    percent_features_to_select = 0.05
    matrix_class = LabChangeMatrix
    pipeline_file_path = inspect.getfile(inspect.currentframe())
    # Free-text lines describing the matrix.
    data_overview = [
        'Overview',
        'The outcome label is %s.' % outcome_label,
        '%s is a boolean indicator which summarizes whether the lab test ' % outcome_label,
        'result is unchanged compared to the previous measurement.',
        'Each row represents a unique lab panel order.',
        "Each row contains fields summarizing the patient's demographics",
        'inpatient admit date, prior vitals, and prior lab results.',
        "Most cells in matrix represent a count statistic for an event's",
        "occurrence or a difference between an event's time and index_time.",
        "Lab panel orders were only included if a previous measurement of",
        "the same lab panel has been recorded."
    ]

    # Bundle parameters into single object.
    for key, value in (
            ('raw_matrix_path', raw_matrix_path),
            ('processed_matrix_path', processed_matrix_path),
            ('features_to_add', features_to_add),
            ('features_to_keep', features_to_keep),
            ('features_to_filter_on', features_to_filter_on),
            ('imputation_strategies', imputation_strategies),
            ('features_to_remove', features_to_remove),
            ('outcome_label', outcome_label),
            ('selection_problem', selection_problem),
            ('selection_algorithm', selection_algorithm),
            ('percent_features_to_select', percent_features_to_select),
            ('matrix_class', matrix_class),
            ('pipeline_file_path', pipeline_file_path),
            ('data_overview', data_overview)):
        params[key] = value

    # ---- Processing (adapted from SupervisedLearningPipeline, by SX) ------
    fm_io = FeatureMatrixIO()
    log.debug('params: %s' % params)

    if os.path.exists(processed_matrix_path) and not self._flush_cache:
        # Feature selection is assumed to have happened already; only the
        # train / test split has to be redone.
        cached_matrix = fm_io.read_file_to_data_frame(processed_matrix_path)
        # Make sure the order of rows is consistent before splitting.
        cached_matrix.sort_index(inplace=True)
        # TODO sxu: when reloading, no pat_id
        self._train_test_split(cached_matrix, outcome_label)
        return

    # Read raw matrix, filter it and add the outcome label.
    raw_matrix = fm_io.read_file_to_data_frame(raw_matrix_path)
    raw_fmt = FeatureMatrixTransform()
    raw_fmt.set_input_matrix(raw_matrix)
    self._filter_on_features(raw_fmt, features_to_filter_on)
    self._add_features(raw_fmt, features_to_add)
    raw_matrix = raw_fmt.fetch_matrix()

    # The split must precede feature selection so that nothing is learnt
    # from the test data.
    # TODO: work on this...
    self._train_test_split(raw_matrix, outcome_label)

    # Everything below is fitted on the training rows only.
    fmt = FeatureMatrixTransform()
    train_df = self._X_train.join(self._y_train)
    fmt.set_input_matrix(train_df)

    self._remove_features(fmt, features_to_remove)
    # 'features_to_filter_on' is always present in the params built above.
    self._filter_on_features(fmt, features_to_filter_on)

    # HACK: When read_csv encounters duplicate columns, it deduplicates
    # them by appending '.1, ..., .N' to the column names.
    # In future versions of pandas, simply pass mangle_dupe_cols=True
    # to read_csv, but not ready as of pandas 0.22.0.
    mangled_duplicates = [
        name for name in raw_matrix.columns.values if name[-2:] == ".1"
    ]
    for name in mangled_duplicates:
        fmt.remove_feature(name)
        self._removed_features.append(name)

    self._impute_data(fmt, train_df, imputation_strategies)

    # All-null features created during preprocessing would break feature
    # selection, so drop them first.
    fmt.drop_null_features()

    # Build interim matrix and separate outcome from features again.
    train_df = fmt.fetch_matrix()
    self._y_train = pd.DataFrame(train_df.pop(outcome_label))
    self._X_train = train_df

    # Select X_test columns according to processed X_train.
    self._X_test = self._X_test[self._X_train.columns]

    # Impute test data with the same values that were used for training.
    for column in self._X_test.columns:
        fill_value = self.feat2imputed_dict[column]
        self._X_test[column] = self._X_test[column].fillna(fill_value)

    self._select_features(selection_problem,
                          percent_features_to_select,
                          selection_algorithm,
                          features_to_keep)

    labelled_train = self._y_train.join(self._X_train)
    labelled_test = self._y_test.join(self._X_test)
    final_matrix = labelled_train.append(labelled_test)

    # Need to recover the order of rows before writing to disk.
    final_matrix.sort_index(inplace=True)

    # Write output to new matrix file.
    matrix_header = self._build_processed_matrix_header(params)
    fm_io.write_data_frame_to_file(final_matrix, processed_matrix_path,
                                   matrix_header)
from scripts.LabTestAnalysis.machine_learning import LabNormalityPredictionPipeline import matplotlib matplotlib.rcParams['backend'] = 'TkAgg' import matplotlib.pyplot as plt folder = '../machine_learning/data/' labs = LabNormalityPredictionPipeline.NON_PANEL_TESTS_WITH_GT_500_ORDERS if False: all_testset_leakage_percentages = [] all_best_aucrocs = [] for lab in labs: try: raw_matrix_file = '%s-normality-matrix-10000-episodes-raw.tab' % lab fm_io = FeatureMatrixIO() raw_matrix = fm_io.read_file_to_data_frame(folder + '/' + lab + '/' + raw_matrix_file) row, col = raw_matrix.shape from medinfo.ml.SupervisedClassifier import SupervisedClassifier algs = SupervisedClassifier.SUPPORTED_ALGORITHMS best_aucroc = 0 for alg in algs: report_file = '%s-normality-prediction-%s-report.tab' % (lab, alg) report_df = pd.read_csv(folder + '/' + lab + '/' + alg + '/' + report_file, sep='\t')
labs['predictable_CV'] = (labs['percent_predictably_positive'].astype('float') * labs['annual_median_charge_volume ($)'].astype('float') / 1000).map('${:,.0f}'.format) labs['predictable_CV[-0.95]'] = (labs['percent_predictably_positive_0.95_lower_ci'].astype('float') * labs['annual_median_charge_volume ($)'].astype('float') / 1000).map('${:,.0f}'.format) labs['predictable_CV[+0.95]'] = (labs['percent_predictably_positive_0.95_upper_ci'].astype('float') * labs['annual_median_charge_volume ($)'].astype('float') / 1000).map('${:,.0f}'.format) summary = DataFrame() summary['lab'] = labs['label'] summary['charge'] = labs['median_charge'].astype('float').map('${:,.0f}'.format) summary['volume'] = labs['volume'].floordiv(6).astype('float').map('{:,.0f}'.format) summary['normal rate'] = labs['normality'] summary['[email protected]'] = labs['[email protected]'] + ' [' + \ labs['[email protected][-0.95]'] + ', ' + \ labs['[email protected][+0.95]'] + ']' summary['predictable CV ($1,000s)'] = labs['predictable_CV'] + ' [' + \ labs['predictable_CV[-0.95]'] + ', ' + \ labs['predictable_CV[+0.95]'] + ']' return summary if __name__ == '__main__': fm_io = FeatureMatrixIO() summary_table = LabNormalityReport.build_lab_performance_summary_table() fm_io.write_data_frame_to_file(summary_table, 'lab-performance-summary.tab') summary = LabNormalityReport.build_algorithm_performance_summary_table() fm_io.write_data_frame_to_file(summary, 'algorithm-performance-summary.tab') LabNormalityReport.plot_predictable_and_expensive_charges() summary = LabNormalityReport.build_lab_predictability_summary_report() fm_io.write_data_frame_to_file(summary, 'predictable-labs.tab') summary = LabNormalityReport.build_lab_predictability_summary_report(all=True) fm_io.write_data_frame_to_file(summary, 'all-labs.tab')
def __init__(self):
    """Create the FeatureMatrixIO helper kept in ``self._fm_io``."""
    matrix_io = FeatureMatrixIO()
    self._fm_io = matrix_io
def _build_processed_feature_matrix(self, params):
    """Load the cached processed matrix, or build it from the raw matrix.

    The sequence of processing steps is identical across pipelines, so all
    pipeline-specific variability is packed into ``params`` rather than
    forcing each pipeline to re-implement the steps (principle: minimize
    overridden function calls).

    Keys of ``params`` read directly by this method:
        processed_matrix_path      -- cache location / output file
        raw_matrix_path            -- input file (only when rebuilding)
        outcome_label              -- name of the label column
        features_to_add
        features_to_filter_on      -- optional
        imputation_strategies
        features_to_remove
        selection_problem
        selection_algorithm
        percent_features_to_select
        features_to_keep           -- required here; raises KeyError if absent

    ``params`` is also forwarded whole to ``_build_processed_matrix_header``.

    Side effects: sets ``_X_train``/``_X_test``/``_y_train``/``_y_test`` (via
    ``_train_test_split`` and the steps below), sets ``_patIds_train`` and
    ``_patIds_test``, and, when rebuilding, writes the processed matrix plus
    separate train/test matrices to disk.

    Raises:
        AssertionError: if a pat_id appears in both the train and test sets.
    """
    # TODO(sbala): Determine which fields should have defaults.
    fm_io = FeatureMatrixIO()
    log.debug('params: %s' % params)

    # If the processed matrix exists and the client has not requested to
    # flush the cache, reuse it.
    processed_matrix_path = params['processed_matrix_path']
    if os.path.exists(processed_matrix_path) and not self._flush_cache:
        # Assume feature selection already happened, but we still need to
        # split the data into training and test data.
        processed_matrix = fm_io.read_file_to_data_frame(
            processed_matrix_path)
        self._train_test_split(processed_matrix, params['outcome_label'])
        # NOTE: pandas may convert bigint pat_ids to float (rounding the last
        # few digits), which can damage the uniqueness of pat_ids.
    else:
        # Read raw matrix.
        raw_matrix = fm_io.read_file_to_data_frame(params['raw_matrix_path'])

        # Split into training and test data BEFORE any feature processing or
        # selection, so nothing is learned from the test data.
        patIds_df = raw_matrix['pat_id'].copy()
        self._train_test_split(raw_matrix, params['outcome_label'])

        # All transformations below are fitted on the training rows only.
        fmt = FeatureMatrixTransform()
        train_df = self._X_train.join(self._y_train)
        fmt.set_input_matrix(train_df)

        # Add features.
        self._add_features(fmt, params['features_to_add'])

        # Filter on features.
        if 'features_to_filter_on' in params:
            self._filter_on_features(fmt, params['features_to_filter_on'])

        # HACK: When read_csv encounters duplicate columns, it deduplicates
        # them by appending '.1, ..., .N' to the column names; drop the
        # '.1' copies here.
        for feature in raw_matrix.columns.values:
            if feature.endswith(".1"):
                fmt.remove_feature(feature)
                self._removed_features.append(feature)

        # Impute data. Feature removal happens inside each branch (rather
        # than earlier) because pat_id is still needed for imputation.
        uses_sx_imputation = (
            params['imputation_strategies'] == {'sxu_new_imputation'})
        if uses_sx_imputation:
            train_df = fmt.fetch_matrix()
            # Column means are computed on training data only and then
            # reused to impute the test data.
            means = {}
            for column in train_df.columns.values.tolist():
                if train_df[column].dtype == 'float64':
                    means[column] = train_df[column].mean()
            train_df = fmt.do_impute_sx(train_df, means)
            fmt.set_input_matrix(train_df)
            self._X_test = fmt.do_impute_sx(self._X_test, means)
            self._remove_features(fmt, params['features_to_remove'])
        else:
            self._remove_features(fmt, params['features_to_remove'])
            self._impute_data(fmt, train_df, params['imputation_strategies'])

        # In case any all-null features were created in preprocessing,
        # drop them now so feature selection will work.
        fmt.drop_null_features()

        # Build interim matrix.
        train_df = fmt.fetch_matrix()
        self._y_train = pd.DataFrame(train_df.pop(params['outcome_label']))
        self._X_train = train_df

        # Select X_test columns according to the processed X_train.
        self._X_test = self._X_test[self._X_train.columns]

        if not uses_sx_imputation:
            # Fill test-set gaps with the values recorded for each feature
            # in self.feat2imputed_dict.
            for feat in self._X_test.columns:
                self._X_test[feat] = self._X_test[feat].fillna(
                    self.feat2imputed_dict[feat])

        self._select_features(params['selection_problem'],
                              params['percent_features_to_select'],
                              params['selection_algorithm'],
                              params['features_to_keep'])

        # Re-attach pat_id. The join is index-based by default; pat_id is
        # popped again at the end of this method.
        # TODO(sxu): make this more general in the future.
        self._X_train = self._X_train.join(patIds_df, how='left')
        self._X_test = self._X_test.join(patIds_df, how='left')

        train = self._y_train.join(self._X_train)
        test = self._y_test.join(self._X_test)

        # NOTE: str.replace substitutes every "matrix" occurrence in the
        # path, including any in directory names.
        processed_trainMatrix_path = processed_matrix_path.replace(
            "matrix", "train-matrix")
        train.to_csv(processed_trainMatrix_path, sep='\t', index=False)
        processed_testMatrix_path = processed_matrix_path.replace(
            "matrix", "test-matrix")
        test.to_csv(processed_testMatrix_path, sep='\t', index=False)

        # DataFrame.append was removed in pandas 2.0; pd.concat is the
        # equivalent that works on old and new pandas alike.
        processed_matrix = pd.concat([train, test])

        # Recover the original row order before writing to disk, where the
        # index information will be lost.
        processed_matrix.sort_index(inplace=True)

        # Write output to new matrix file.
        header = self._build_processed_matrix_header(params)
        fm_io.write_data_frame_to_file(processed_matrix,
                                       processed_matrix_path, header)

    # Pop pat_id out of the feature matrices and check for pat_id leakage
    # between the two sets.
    # NOTE(review): assumed to run for both the cached and the rebuilt path
    # (the matrix written above still contains pat_id) -- confirm.
    self._patIds_train = self._X_train.pop('pat_id').values.tolist()
    self._patIds_test = self._X_test.pop('pat_id').values.tolist()
    assert not (set(self._patIds_train) & set(self._patIds_test))
class ConditionMortalityPredictor:
    """Train and evaluate a 28-day mortality classifier for one condition.

    Constructing an instance runs the whole pipeline: (optionally) build the
    raw condition-mortality matrix, process it, train a
    ``SupervisedClassifier`` and compute its accuracy on the held-out rows.
    """

    def __init__(self, condition, num_patients, icd_list=None, use_cache=None):
        """Run the full pipeline.

        Args:
            condition: condition name; whitespace is replaced by '-' in the
                matrix file names.
            num_patients: patient count, used in file names and forwarded to
                ConditionMortalityMatrix.
            icd_list: optional value forwarded to ConditionMortalityMatrix.
            use_cache: the raw matrix is rebuilt only when this is None; any
                other value reuses the raw matrix file already on disk.
        """
        self._condition = condition
        self._num_patients = num_patients
        self._icd_list = icd_list
        self._FEATURES_TO_REMOVE = [
            'index_time', 'death_date', 'Death.post', 'Death.postTimeDays',
            'Birth.pre', 'Male.preTimeDays', 'Female.preTimeDays',
            'RaceWhiteHispanicLatino.preTimeDays',
            'RaceWhiteNonHispanicLatino.preTimeDays',
            'RaceHispanicLatino.preTimeDays', 'RaceAsian.preTimeDays',
            'RaceBlack.preTimeDays', 'RacePacificIslander.preTimeDays',
            'RaceNativeAmerican.preTimeDays', 'RaceOther.preTimeDays',
            'RaceUnknown.preTimeDays'
        ]
        self._eliminated_features = list()

        self._build_cmm_names()
        if use_cache is None:
            self._build_raw_feature_matrix()

        print('Processing raw feature matrix...')
        self._process_raw_feature_matrix()
        print('Training predictor...')
        self._train_predictor()
        print('Testing predictor...')
        self._test_predictor()

    def _build_cmm_names(self):
        """Set the raw and processed matrix file names from the condition."""
        slugified_condition = "-".join(self._condition.split())
        self._build_cmm_name_raw(slugified_condition, self._num_patients)
        self._build_cmm_name_processed(slugified_condition,
                                       self._num_patients)

    def _build_cmm_name_raw(self, slugified_condition, num_patients):
        """Set ``_cmm_name_raw``."""
        template = '%s-mortality-matrix-%d-pat-raw.tab'
        self._cmm_name_raw = template % (slugified_condition, num_patients)

    def _build_cmm_name_processed(self, slugified_condition, num_patients):
        """Set ``_cmm_name_processed``."""
        template = '%s-mortality-matrix-%d-pat-processed.tab'
        self._cmm_name_processed = template % (slugified_condition,
                                               num_patients)

    def _build_raw_feature_matrix(self):
        """Construct the ConditionMortalityMatrix for the raw matrix file."""
        self._cmm = ConditionMortalityMatrix(self._condition,
                                             self._num_patients,
                                             self._cmm_name_raw,
                                             self._icd_list)

    def _process_raw_feature_matrix(self):
        """Turn the raw matrix into the processed matrix and write it out."""
        # Read raw CMM.
        self._fm_io = FeatureMatrixIO()
        print('Reading raw matrix...')
        self._cmm_raw = self._fm_io.read_file_to_data_frame(
            self._cmm_name_raw)

        # Add and remove features to _cmm_processed.
        self._fmt = FeatureMatrixTransform()
        self._fmt.set_input_matrix(self._cmm_raw)
        print('Adding features...')
        self._add_features()
        print('Imputing data...')
        self._impute_data()
        self._remove_features()
        self._fmt.drop_duplicate_rows()
        self._cmm_processed = self._fmt.fetch_matrix()

        # Divide _cmm_processed into training and test data.
        # This must happen before feature selection so that we don't
        # accidentally learn information from the test data.
        self._train_test_split()
        print('Selecting features...')
        self._select_features()

        # Write output to new matrix. DataFrame.append was removed in
        # pandas 2.0; pd.concat is the equivalent on old and new pandas.
        train = self._y_train.join(self._X_train)
        test = self._y_test.join(self._X_test)
        self._cmm_processed = pd.concat([train, test])
        header = self._build_processed_matrix_header()
        self._fm_io.write_data_frame_to_file(self._cmm_processed,
                                             self._cmm_name_processed,
                                             header)

    def _build_processed_matrix_header(self):
        """Return the processed-matrix file header as a list of strings.

        FeatureMatrixFactory and FeatureMatrixIO expect a list of strings,
        one per header line.
        """
        timestamp = datetime.datetime.now().strftime("%Y-%m-%d %H:%M")
        if self._icd_list:
            command = 'ConditionMortalityPredictor(%s, %s, %s)' % \
                (self._condition, self._num_patients, self._icd_list)
        else:
            command = 'ConditionMortalityPredictor(%s, %s)' % \
                (self._condition, self._num_patients)

        # Provenance: file name, creation time, source module, command.
        header = [
            self._cmm_name_processed,
            'Created: %s' % timestamp,
            'Source: %s' % __name__,
            'Command: %s' % command,
            '',
        ]

        # Overview of the outcome label and the processing steps.
        # NOTE(review): step (3) says "top 100 features", but
        # _select_features keeps 1% of the columns -- confirm which is meant.
        header.extend([
            'Overview:',
            'This file is a post-processed version of %s.' %
            self._cmm_name_raw,
            'The outcome label is I(0<=Death.postTimeDays<=28), which is a boolean indicator',
            'for whether the patient given by pat_id passed away within 28 days',
            'of the time index represented by a given row.',
            'This matrix is the result of the following processing steps on the raw matrix:',
            ' (1) Imputing missing values with the mean value of each column.',
            ' (2) Manually removing low-information features:',
            ' %s' % str(self._FEATURES_TO_REMOVE),
            ' (3) Algorithmically selecting the top 100 features via recursive feature elimination.',
            ' The following features were eliminated:',
            ' %s' % str(self._eliminated_features),
            '',
            'Each row represents a decision point (proxied by clinical order).',
            "Each row contains fields summarizing the patient's demographics",
            'inpatient admit date, prior vitals, and prior lab results.',
            "Most cells in matrix represent a count statistic for an event's",
            "occurrence or a difference between an event's time and index_time.",
            '',
        ])

        # Field descriptions.
        header.extend([
            'Fields:',
            ' pat_id - ID # for patient in the STRIDE data set.',
            ' index_time - time at which clinical decision was made.',
            ' death_date - if patient died, date on which they died.',
            ' AdmitDxDate.[clinical_item] - admit diagnosis, pegged to admit date.',
            " Birth.preTimeDays - patient's age in days.",
            ' [Male|Female].pre - is patient male/female (binary)?',
            ' [RaceX].pre - is patient race [X]?',
            ' Team.[specialty].[clinical_item] - specialist added to treatment team.',
            ' Comorbidity.[disease].[clinical_item] - disease added to problem list.',
            ' ___.[flowsheet] - measurements for flowsheet biometrics.',
            ' Includes BP_High_Systolic, BP_Low_Diastolic, FiO2,',
            ' Glasgow Coma Scale Score, Pulse, Resp, Temp, and Urine.',
            ' ___.[lab_result] - lab component results.',
            ' Included standard components: WBC, HCT, PLT, NA, K, CO2, BUN,',
            ' CR, TBIL, ALB, CA, LAC, ESR, CRP, TNI, PHA, PO2A, PCO2A,',
            ' PHV, PO2V, PCO2V',
            '',
        ])

        # Suffixes for [clinical_item] fields.
        header.extend([
            ' [clinical_item] fields may have the following suffixes:',
            ' ___.pre - how many times has this occurred before order_time?',
            ' ___.pre.Xd - how many times has this occurred within X days before index_time?',
            ' ___.preTimeDays - how many days before order_time was last occurrence?',
            '',
        ])

        # Suffixes for [flowsheet] and [lab_result] fields.
        header.extend([
            ' [flowsheet] and [lab_result] fields may have the following suffixes:',
            ' ___.X_Y.count - # of result values between X and Y days of index_time.',
            ' ___.X_Y.countInRange - # of result values in normal range.',
            ' ___.X_Y.min - minimum result value.',
            ' ___.X_Y.max - maximum result value.',
            ' ___.X_Y.median - median result value.',
            ' ___.X_Y.std - standard deviation of result values.',
            ' ___.X_Y.first - first result value.',
            ' ___.X_Y.last - last result value.',
            ' ___.X_Y.diff - difference between penultimate and proximate values.',
            ' ___.X_Y.slope - slope between penultimate and proximate values.',
            ' ___.X_Y.proximate - closest result value to order_time.',
            ' ___.X_Y.firstTimeDays - time between first and order_time.',
            ' ___.X_Y.lastTimeDays - time between last and order_time.',
            ' ___.X_Y.proximateTimeDays - time between proximate and order_time.',
        ])

        return header

    def _train_predictor(self):
        """Fit a REGRESS_AND_ROUND SupervisedClassifier on the training split."""
        self._predictor = SupervisedClassifier(
            algorithm=SupervisedClassifier.REGRESS_AND_ROUND)
        # column_or_1d flattens the single-column label frame for the
        # classifier.
        self._predictor.train(self._X_train, column_or_1d(self._y_train))

    def _train_test_split(self):
        """Split ``_cmm_processed`` into X/y train and test parts.

        The label column is popped out of ``_cmm_processed`` in place.
        ``shuffle=False`` keeps the original row order.
        """
        y = pd.DataFrame(
            self._cmm_processed.pop('I(0<=Death.postTimeDays<=28)'))
        # y is kept as a one-column DataFrame here; it is flattened with
        # column_or_1d at the points where it is handed to sklearn-style
        # APIs, which avoids the "column-vector y was passed" warning.
        X = self._cmm_processed
        self._X_train, self._X_test, self._y_train, self._y_test = \
            train_test_split(X, y, shuffle=False)

    def _impute_data(self):
        """Impute missing values with the mean; drop all-null features."""
        for feature in self._cmm_raw.columns.values:
            # Features that will be removed anyway need no imputation.
            if feature in self._FEATURES_TO_REMOVE:
                continue
            null_mask = self._cmm_raw[feature].isnull()
            if null_mask.all():
                # If all values are null, just remove the feature.
                # Otherwise imputation will fail (there's no mean value).
                self._fmt.remove_feature(feature)
                self._eliminated_features.append(feature)
            elif null_mask.any():
                # Only try to impute if some of the values are null.
                # TODO(sbala): Impute all time features with non-mean value.
                self._fmt.impute(feature)

    def _add_features(self):
        """Add the outcome label I(0<=Death.postTimeDays<=28)."""
        # Threshold feature indicating whether death date is within 28 days
        # of index time.
        self._fmt.add_threshold_feature('Death.postTimeDays',
                                        lower_bound=0,
                                        upper_bound=28)

    def _remove_features(self):
        """Remove every feature listed in ``_FEATURES_TO_REMOVE``."""
        # Prune obviously unhelpful fields. In theory, FeatureSelector should
        # be able to prune these, but no reason not to help it out a bit.
        for feature in self._FEATURES_TO_REMOVE:
            self._fmt.remove_feature(feature)

    def _select_features(self):
        """Keep the best-ranked features via recursive elimination.

        NOTE(review): the number kept is 1% of the training columns, while
        the file header text says "top 100 features" -- confirm intent.
        """
        fs = FeatureSelector(algorithm=FeatureSelector.RECURSIVE_ELIMINATION,
                             problem=FeatureSelector.CLASSIFICATION)
        fs.set_input_matrix(self._X_train, column_or_1d(self._y_train))
        num_features_to_select = int(0.01 * len(self._X_train.columns.values))
        fs.select(k=num_features_to_select)

        # Enumerate eliminated features pre-transformation, while the column
        # positions still line up with the ranks.
        self._feature_ranks = fs.compute_ranks()
        for i, rank in enumerate(self._feature_ranks):
            if rank > num_features_to_select:
                self._eliminated_features.append(self._X_train.columns[i])

        self._X_train = fs.transform_matrix(self._X_train)
        self._X_test = fs.transform_matrix(self._X_test)

    def _test_predictor(self):
        """Store the predictor's accuracy on the test split."""
        self._accuracy = self._predictor.compute_accuracy(
            self._X_test, self._y_test)

    def predict(self, X):
        """Return the trained predictor's predictions for ``X``."""
        return self._predictor.predict(X)

    def summarize(self):
        """Return a multi-line text summary of the trained model."""
        summary_lines = list()

        # Condition: condition
        summary_lines.append('Condition: %s' % self._condition)

        # Algorithm: SupervisedClassifier(algorithm)
        algorithm = 'SupervisedClassifier(REGRESS_AND_ROUND)'
        summary_lines.append('Algorithm: %s' % algorithm)

        # Train/Test Size: training_size, test_size
        training_size = self._X_train.shape[0]
        test_size = self._X_test.shape[0]
        summary_lines.append('Train/Test Size: %s/%s' %
                             (training_size, test_size))

        # Model: only features with a positive coefficient are listed.
        coefs = self._predictor.coefs()
        cols = self._X_train.columns
        sig_features = [(coefs[cols.get_loc(f)], f) for f in cols.values
                        if coefs[cols.get_loc(f)] > 0]
        linear_model = ' + '.join('%s*%s' % (weight, feature)
                                  for weight, feature in sig_features)
        summary_lines.append('Model: logistic(%s)' % linear_model)

        # Baseline Episode Mortality: positives in the test split.
        # .get avoids a KeyError when the test split has no positive rows.
        counts = self._y_test[self._y_test.columns[0]].value_counts()
        summary_lines.append('Baseline Episode Mortality: %s/%s' %
                             (counts.get(1, 0), test_size))

        # AUC: auc
        auc = self._predictor.compute_roc_auc(self._X_test, self._y_test)
        summary_lines.append('AUC: %s' % auc)

        # Accuracy: accuracy
        summary_lines.append('Accuracy: %s' % self._accuracy)

        return '\n'.join(summary_lines)
lab = 'ALK' data_source = 'UMich' lab_type = 'component' data_folderpath = '../data-%s-component-10000-episodes/%s/' % (data_source, lab) rf_model = joblib.load(data_folderpath + "%s-normality-random-forest-model.pkl" % lab)._model # rf_model = joblib.load('Uric-Acid, Serum - Plasma-normality-random-forest-model.pkl')._model print(len(rf_model.feature_importances_)) from medinfo.dataconversion.FeatureMatrixIO import FeatureMatrixIO fm_io = FeatureMatrixIO() df_processed = fm_io.read_file_to_data_frame( data_folderpath + '%s-normality-matrix-processed.tab' % lab) # df_processed = fm_io.read_file_to_data_frame('Uric Acid, Serum - Plasma-normality-test-matrix-processed_byStanford.tab') df_processed.pop('pat_id') if lab_type == 'panel': df_processed.pop('all_components_normal') else: df_processed.pop('component_normal') cols = df_processed.columns.values.tolist() estimator = rf_model.estimators_[5] export_graphviz(estimator, out_file='tree.dot',
train_ids)].copy() y_train = pd.DataFrame(train_matrix.pop(outcome_label)) X_train = train_matrix test_matrix = processed_matrix[processed_matrix[columnToSplitOn].isin( test_ids)].copy() y_test = pd.DataFrame(test_matrix.pop(outcome_label)) X_test = test_matrix return X_train, y_train, X_test, y_test ''' Load data ''' lab = 'LABA1C' fm_io = FeatureMatrixIO() processed_matrix = fm_io.read_file_to_data_frame( "data-panels/%s/%s-normality-matrix-10000-episodes-processed.tab" % (lab, lab)) X_train, y_train, X_test, y_test = _train_test_split(processed_matrix, 'all_components_normal') X_train.pop('pat_id') X_test.pop('pat_id') features = X_train.columns.tolist() print(features) X_train, y_train, X_test, y_test = X_train.values, y_train.values, X_test.values, y_test.values scaler = preprocessing.StandardScaler().fit(X_train) X_train = scaler.transform(X_train)
def _train_and_analyze_predictors(self):
    """Train, analyze and report on every supported algorithm.

    For each algorithm in ``SupervisedClassifier.SUPPORTED_ALGORITHMS``:
    load the dumped model if one exists on disk, otherwise train one; then
    either write an error report (insufficient samples) or analyze the
    predictor, fold its report into a meta report and dump the model.
    Finally the meta report, if any, is written to the data directory.
    """
    # Function-scope import so this method does not depend on how the
    # enclosing module chose to import pandas.
    import pandas as pd

    log.info('Training and analyzing predictors...')
    problem = SupervisedLearningPipeline.CLASSIFICATION
    meta_report = None
    fm_io = FeatureMatrixIO()

    # Build paths for output.
    pipeline_file_name = inspect.getfile(inspect.currentframe())
    data_dir = self._fetch_data_dir_path(pipeline_file_name)

    # Test SupervisedClassifier algorithms only.
    # TODO(raikens): something in the BifurcatedSupervisedClassifier
    # pipeline is crashing, so 'bifurcated-<algorithm>' variants are not
    # added to the list for now.
    algorithms_to_test = list(SupervisedClassifier.SUPPORTED_ALGORITHMS)
    log.debug('algorithms_to_test: %s' % algorithms_to_test)

    # Train and analyse algorithms.
    for algorithm in algorithms_to_test:
        log.info('Training and analyzing %s...' % algorithm)

        # If report_dir does not exist, make it.
        report_dir = '/'.join([data_dir, algorithm])
        if not os.path.exists(report_dir):
            os.makedirs(report_dir)
        log.debug('report_dir: %s' % report_dir)

        # Define hyperparams.
        hyperparams = {}
        hyperparams['algorithm'] = algorithm
        hyperparams['hyperparam_strategy'] = \
            SupervisedClassifier.EXHAUSTIVE_SEARCH
        hyperparams['max_iter'] = 1024

        # If bifurcated algorithm, define bifurcator.
        if 'bifurcated' in algorithm:
            # bifurcator = LAB.pre == 0
            hyperparams['bifurcator'] = '%s.pre' % self._var
            hyperparams['bifurcation_strategy'] = \
                BifurcatedSupervisedClassifier.EQUAL
            hyperparams['bifurcation_value'] = 0
            hyperparams['bifurcated'] = True

        # Train classifier, or reuse a model already dumped to disk.
        predictor_path = self._build_model_dump_path(algorithm)
        if os.path.exists(predictor_path) and 'bifurcated' not in algorithm:
            log.debug('Loading model from disk...')
            # TODO(sbala): Fix joblib.load so that it works for bifurcated
            # supervised classifiers.
            self._predictor = joblib.load(predictor_path)
            self._features = self._X_train.columns
            status = SupervisedClassifier.TRAINED
        else:
            status = SupervisedLearningPipeline._train_predictor(
                self, problem, [0, 1], hyperparams)

        # Label distributions, reported if training failed.
        y_train_counts = self._y_train[
            self._y_train.columns[0]].value_counts()
        y_test_counts = self._y_test[
            self._y_test.columns[0]].value_counts()

        if status == SupervisedClassifier.INSUFFICIENT_SAMPLES:
            # Failed to train: skip all analysis and write an error report.
            # NOTE: the original comment says this will be true for all
            # algorithms "so just return", but the loop does continue.
            algorithm_report = DataFrame(
                {
                    'lab_panel': [self._var],
                    'algorithm': [algorithm],
                    'error': [status],
                    'y_train.value_counts()': [y_train_counts.to_dict()],
                    'y_test.value_counts()': [y_test_counts.to_dict()]
                },
                columns=[
                    'lab_panel', 'algorithm', 'error',
                    'y_train.value_counts()', 'y_test.value_counts()'
                ])
            header = [
                'LabChangePredictionPipeline("%s", %d)' %
                (self._var, self._num_rows)
            ]
            # Write error report.
            fm_io.write_data_frame_to_file(
                algorithm_report,
                '/'.join([report_dir,
                          '%s-change-prediction-report.tab' % (self._var)]),
                header)
        elif status == SupervisedClassifier.TRAINED:
            # Successfully trained: analyze and append to the meta report.
            pipeline_prefix = '%s-change-prediction-%s' % (self._var,
                                                           algorithm)
            SupervisedLearningPipeline._analyze_predictor(
                self, report_dir, pipeline_prefix)
            algorithm_report = fm_io.read_file_to_data_frame('/'.join(
                [report_dir, '%s-report.tab' % pipeline_prefix]))
            if meta_report is None:
                meta_report = algorithm_report
            else:
                log.debug('algorithm_report: %s' % algorithm_report)
                # DataFrame.append was removed in pandas 2.0; concat is the
                # equivalent on old and new pandas.
                meta_report = pd.concat([meta_report, algorithm_report])

            # Write predictor to disk.
            predictor = SupervisedLearningPipeline.predictor(self)
            predictor_path = self._build_model_dump_path(algorithm)
            joblib.dump(predictor, predictor_path)

    # After building per-algorithm reports, write to meta report.
    # Note that if there were insufficient samples to build any of the
    # algorithms, then meta_report will still be None.
    if meta_report is not None:
        header = [
            'LabChangePredictionPipeline("%s", %d)' %
            (self._var, self._num_rows)
        ]
        fm_io.write_data_frame_to_file(
            meta_report,
            '/'.join([data_dir,
                      '%s-change-prediction-report.tab' % self._var]),
            header)
def get_train_and_evalu_raw_matrices(lab,
                                     data_lab_folderpath,
                                     random_state,
                                     train_size=0.75,
                                     columnToSplitOn='pat_id'):
    '''
    Read a lab's raw matrix from disk and split it into train and
    evaluation rows.

    If a patient-split file (``pat_split_filename``) exists in the lab
    folder (old pipeline style), rows are assigned by its ``pat_id`` /
    ``in_train`` columns. Otherwise the rows are split with ``split_rows``.
    The two halves are not saved as separate raw matrices (too much space),
    and a freshly computed split is not written back to disk.

    Args:
        lab: lab name, substituted into ``raw_matrix_template``.
        data_lab_folderpath: folder holding the raw matrix and, optionally,
            the patient-split file.
        random_state: forwarded to ``split_rows``.
        train_size: forwarded to ``split_rows``.
        columnToSplitOn: forwarded to ``split_rows``.
        (The last three are ignored when a patient-split file exists.)

    Returns:
        (raw_matrix_train, raw_matrix_evalu)

    Raises:
        AssertionError: if a pat_id occurs in both parts, or if the parts
            do not add up to the full raw matrix.
    '''
    raw_matrix_filepath = os.path.join(data_lab_folderpath,
                                       raw_matrix_template % lab)

    fm_io = FeatureMatrixIO()
    # TODO: check if raw matrix exists
    raw_matrix = fm_io.read_file_to_data_frame(raw_matrix_filepath)

    pat_split_filepath = os.path.join(data_lab_folderpath,
                                      pat_split_filename)
    if os.path.exists(pat_split_filepath):
        # Old pipeline style: reuse the recorded patient assignment.
        pat_split_df = pd.read_csv(pat_split_filepath)

        pat_ids_train = pat_split_df[pat_split_df['in_train'] ==
                                     1]['pat_id'].values.tolist()
        raw_matrix_train = raw_matrix[raw_matrix['pat_id'].isin(
            pat_ids_train)]

        pat_ids_evalu = pat_split_df[pat_split_df['in_train'] ==
                                     0]['pat_id'].values.tolist()
        raw_matrix_evalu = raw_matrix[raw_matrix['pat_id'].isin(
            pat_ids_evalu)]
    else:
        raw_matrix_train, raw_matrix_evalu = split_rows(
            raw_matrix,
            train_size=train_size,
            columnToSplitOn=columnToSplitOn,
            random_state=random_state)

    # Sanity checks: no patient on both sides, and no row lost or duplicated.
    train_ids = set(raw_matrix_train['pat_id'].values.tolist())
    evalu_ids = set(raw_matrix_evalu['pat_id'].values.tolist())
    assert train_ids.isdisjoint(evalu_ids)
    assert raw_matrix_train.shape[0] + raw_matrix_evalu.shape[
        0] == raw_matrix.shape[0]

    return raw_matrix_train, raw_matrix_evalu