def get_raw_matrix(self, data_tag):
    """Read the raw feature matrix stored at ``self.raw_matrix_filepath``.

    Args:
        data_tag: Tag of the requested data set (the original code compared
            it against "src"). It is kept for interface compatibility only;
            it does not influence which file is read.

    Returns:
        The object returned by ``FeatureMatrixIO.read_file_to_data_frame``
        for ``self.raw_matrix_filepath``.
    """
    # NOTE(review): the original had an if/else on data_tag == "src" whose two
    # branches were identical, so the branch is collapsed here. If a non-"src"
    # tag was meant to read a different file, that path is not visible in this
    # block -- confirm with the callers.
    return FeatureMatrixIO().read_file_to_data_frame(self.raw_matrix_filepath)
def _build_processed_feature_matrix_holdout(self):
    """Build and write the processed feature matrix for the holdout data.

    The raw matrix is restricted to the features recorded in
    ``self.feat2imputed_dict``, then narrowed to the columns (and column
    order) of a previously written processed matrix. Missing values are
    filled with the recorded imputation values and the result is written to
    ``self._build_processed_matrix_path()`` without a header.
    """
    fm_io = FeatureMatrixIO()
    raw_matrix = fm_io.read_file_to_data_frame(self._build_raw_matrix_path())

    # TODO: feat2imputed_dict includes the outcome label.
    # list(...) keeps the column selection working when keys() is a view
    # (Python 3) rather than a list (Python 2).
    processed_matrix = raw_matrix[list(self.feat2imputed_dict)].copy()

    # TODO: tmp solution! The column set/order is borrowed from the processed
    # matrix found by rewriting this pipeline's own processed-matrix path.
    tmp_path = self._build_processed_matrix_path().replace(
        "2000", "10000").replace("-holdout", "")
    processed_matrix_previous = FeatureMatrixIO().read_file_to_data_frame(
        tmp_path)
    # .copy() so the column assignments below act on an independent frame.
    processed_matrix = processed_matrix[
        processed_matrix_previous.columns].copy()

    # Impute only the columns that survived the narrowing above. Iterating
    # over every key of feat2imputed_dict (as before) raised KeyError as soon
    # as the template held a feature that is not among the retained columns.
    for feat in processed_matrix.columns:
        processed_matrix[feat] = processed_matrix[feat].fillna(
            self.feat2imputed_dict[feat])

    fm_io.write_data_frame_to_file(processed_matrix,
                                   self._build_processed_matrix_path(), None)
def test_write_data_frame_to_file(self):
    """Write the reference matrix with and without a header and compare files.

    The frame ``MANUAL_TEST_CASE['matrix_no_header']`` is written twice next
    to this test module: once bare, once with
    ``MANUAL_TEST_CASE['custom_header']``. Each output is compared against the
    matching checked-in ``.tab`` file with ``filecmp.cmp``. The output paths
    are stored on ``self``.
    """
    test_dir = os.path.dirname(
        os.path.abspath(inspect.getfile(inspect.currentframe())))

    def beside_test(file_name):
        # Every fixture and temp file lives in the test module's directory.
        return os.path.join(test_dir, file_name)

    writer = FeatureMatrixIO()

    # Reference files the outputs are compared against.
    expected_plain_path = beside_test('test-matrix-no-header.tab')
    expected_headed_path = beside_test('test-matrix-with-header.tab')

    # In-memory inputs.
    frame = MANUAL_TEST_CASE['matrix_no_header']
    header_lines = MANUAL_TEST_CASE['custom_header']

    # Output without header.
    self._no_header_temp_file_path = beside_test('no-header-temp-file.tab')
    writer.write_data_frame_to_file(frame, self._no_header_temp_file_path)

    # Output with header.
    self._with_header_temp_file_path = beside_test('header-temp-file.tab')
    writer.write_data_frame_to_file(frame, self._with_header_temp_file_path,
                                    header_lines)

    # Both outputs must match their reference files.
    self.assertTrue(
        filecmp.cmp(expected_plain_path, self._no_header_temp_file_path))
    self.assertTrue(
        filecmp.cmp(expected_headed_path, self._with_header_temp_file_path))
def _analyze_predictors_on_holdout(self):
    """Run the holdout-set analysis once per supported algorithm.

    For every entry of ``SupervisedClassifier.SUPPORTED_ALGORITHMS`` the
    method derives a report directory (``<data_dir>/<algorithm>``) and a
    pipeline prefix, loads the dumped model with ``joblib.load`` when the
    dump file exists, and then calls
    ``SupervisedLearningPipeline._analyze_predictor_holdoutset``.
    """
    # NOTE(review): fm_io is never used in this method.
    fm_io = FeatureMatrixIO()
    algorithms_to_test = list()
    algorithms_to_test.extend(SupervisedClassifier.SUPPORTED_ALGORITHMS)
    # The data directory is resolved relative to the file defining this method.
    pipeline_file_name = inspect.getfile(inspect.currentframe())
    data_dir = SupervisedLearningPipeline._fetch_data_dir_path(
        self, pipeline_file_name)
    # for algorithm in SupervisedClassifier.SUPPORTED_ALGORITHMS:
    #     algorithms_to_test.append('bifurcated-%s' % algorithm)
    log.debug('algorithms_to_test: %s' % algorithms_to_test)
    for algorithm in algorithms_to_test:
        log.info('analyzing %s...' % algorithm)
        # Reports for this algorithm go to <data_dir>/<algorithm>. The
        # directory is not created here.
        report_dir = '/'.join([data_dir, algorithm])
        pipeline_prefix = '%s-normality-prediction-%s' % (self._var, algorithm)
        predictor_path = self._build_model_dump_path(algorithm)
        if os.path.exists(
                predictor_path) and 'bifurcated' not in algorithm:
            log.debug('Loading model from disk...')
            # TODO(sbala): Fix joblib.load so that it works for bifurcated
            # supervised classifiers.
            self._predictor = joblib.load(predictor_path)
            # self._features = self._X_train.columns
            # NOTE(review): status is assigned but never read afterwards.
            status = SupervisedClassifier.TRAINED
        # NOTE(review): this call is placed at loop level, so it also runs when
        # no model dump was found and self._predictor is whatever was set
        # earlier. The source this was recovered from does not show the
        # nesting unambiguously -- confirm it should not sit inside the `if`.
        SupervisedLearningPipeline._analyze_predictor_holdoutset(
            self, report_dir, pipeline_prefix)
def _process_raw_feature_matrix(self):
    """Turn the raw matrix file into the processed matrix file.

    Steps, in order: read ``self._cmm_name_raw``; add features, impute,
    remove features and drop duplicate rows through a
    ``FeatureMatrixTransform``; split into train and test data; select
    features; stack the train and test rows and write them, with the header
    from ``self._build_processed_matrix_header()``, to
    ``self._cmm_name_processed``.

    Progress messages are printed to stdout.
    """
    # Function-scope import: pd.concat replaces DataFrame.append, which was
    # deprecated in pandas 1.4 and removed in pandas 2.0. concat is what
    # append delegated to, so the result is the same on older versions.
    import pandas as pd

    # Read raw CMM.
    self._fm_io = FeatureMatrixIO()
    print('Reading raw matrix...')
    self._cmm_raw = self._fm_io.read_file_to_data_frame(self._cmm_name_raw)

    # Add and remove features to _cmm_processed.
    self._fmt = FeatureMatrixTransform()
    self._fmt.set_input_matrix(self._cmm_raw)
    print('Adding features...')
    self._add_features()
    print('Imputing data...')
    self._impute_data()
    self._remove_features()
    self._fmt.drop_duplicate_rows()
    self._cmm_processed = self._fmt.fetch_matrix()

    # Divide _cmm_processed into training and test data.
    # This must happen before feature selection so that we don't
    # accidentally learn information from the test data.
    self._train_test_split()
    print('Selecting features...')
    self._select_features()

    # Write output to new matrix: outcome column first, train rows before
    # test rows.
    train = self._y_train.join(self._X_train)
    test = self._y_test.join(self._X_test)
    self._cmm_processed = pd.concat([train, test])
    header = self._build_processed_matrix_header()
    self._fm_io.write_data_frame_to_file(self._cmm_processed,
                                         self._cmm_name_processed, header)
def load_imputation_template(lab, dataset_folderpath, lab_type='panel'):
    """Load the imputation template of ``lab`` from its dataset folder.

    The template is unpickled from
    ``<dataset_folderpath>/<lab>/feat2imputed_dict.pkl``.

    * If it has fewer than 200 entries it is treated as already restricted to
      the selected features and returned unchanged.
    * Otherwise the final feature columns are taken from
      ``<lab>-normality-matrix-processed.tab`` in the same folder (minus
      ``pat_id`` and the outcome column), and a new dict is returned that
      maps each of those columns to ``(column_position, imputed_value)``.

    Args:
        lab: Lab name; also the sub-folder name and file-name prefix.
        dataset_folderpath: Folder holding one sub-folder per lab.
        lab_type: 'panel' selects the outcome column 'all_components_normal';
            any other value selects 'component_normal'.

    Returns:
        dict: the raw template, or the ``{column: (position, value)}`` form.

    Raises:
        IOError/OSError: if the pickle file cannot be opened.
        KeyError: if a processed column has no entry in the template, or the
            expected columns are missing from the processed matrix.
    """
    data_lab_folderpath = os.path.join(dataset_folderpath, lab)

    # Pickles are binary data: open in 'rb' (text mode breaks on Python 3 and
    # is unreliable on Windows) and close the handle deterministically.
    # NOTE(review): pickle.load executes arbitrary code from the file -- only
    # use this on trusted dataset folders.
    template_path = os.path.join(data_lab_folderpath, 'feat2imputed_dict.pkl')
    with open(template_path, 'rb') as template_file:
        imputations = pickle.load(template_file)

    if len(imputations) < 200:
        # Only includes selected features.
        return imputations

    ylabel = 'all_components_normal' if lab_type == 'panel' else 'component_normal'

    # All raw matrix columns are included in the template, so the final
    # features have to be extracted from the processed matrix.
    df_processed = FeatureMatrixIO().read_file_to_data_frame(
        os.path.join(data_lab_folderpath,
                     '%s-normality-matrix-processed.tab' % lab))
    df_processed.pop('pat_id')
    df_processed.pop(ylabel)  # TODO?!

    return {
        column: (position, imputations[column])
        for position, column in enumerate(df_processed.columns.values.tolist())
    }
def load_raw_matrix(lab, dataset_folderpath):
    """Read the raw matrix of ``lab`` from ``<dataset_folderpath>/<lab>/``.

    The file name is produced by filling ``raw_matrix_template`` with ``lab``.

    Returns:
        The object returned by ``FeatureMatrixIO.read_file_to_data_frame``.
    """
    # TODO: check if raw matrix exists
    matrix_path = os.path.join(dataset_folderpath, lab,
                               raw_matrix_template % lab)
    return FeatureMatrixIO().read_file_to_data_frame(matrix_path)
def _build_raw_feature_matrix(self):
    """Build the raw matrix and, outside holdout mode, record its patients.

    Delegates the build to
    ``SupervisedLearningPipeline._build_raw_feature_matrix`` with
    ``LabNormalityMatrix`` as the matrix class. When ``self._holdOut`` is
    falsy, the written matrix is read back and the set of its ``pat_id``
    values is stored in ``self.usedPatIds``.
    """
    matrix_path = self._build_raw_matrix_path()
    SupervisedLearningPipeline._build_raw_feature_matrix(
        self, LabNormalityMatrix, matrix_path)

    if self._holdOut:
        # Holdout runs do not record patient ids.
        return

    raw_frame = FeatureMatrixIO().read_file_to_data_frame(matrix_path)
    self.usedPatIds = set(raw_frame['pat_id'].values)
def __init__(self, lab_panel, num_episodes, use_cache=None, random_state=None,
             timeLimit=None, notUsePatIds=None, holdOut=False,
             pat_batch_ind=None, includeLastNormality=True):
    """Build the raw matrix and run the training or holdout workflow.

    Args:
        lab_panel: Lab identifier; forwarded to the base pipeline and used to
            locate ``data/<lab_panel>/<lab_panel>-normality-matrix-raw.tab``.
        num_episodes, use_cache, random_state, timeLimit, notUsePatIds:
            Forwarded unchanged to ``SupervisedLearningPipeline.__init__``.
        holdOut: If true, load the saved ``feat2imputed_dict.pkl``, build the
            holdout processed matrix and analyze the stored predictors.
            Otherwise dump the used patient ids, build the processed matrix
            and baseline results, dump ``feat2imputed_dict`` and train and
            analyze predictors.
        pat_batch_ind: Stored on the instance as ``self.pat_batch_ind``.
        includeLastNormality: If true, a ``last_normality`` column is added
            to the raw matrix file, which is rewritten in place.
    """
    # self.notUsePatIds = notUsePatIds
    self.pat_batch_ind = pat_batch_ind
    self._holdOut = holdOut
    self.usedPatIds = []
    SupervisedLearningPipeline.__init__(self, lab_panel, num_episodes,
                                        use_cache, random_state, timeLimit,
                                        notUsePatIds)
    # TODO: naming of lab_panel
    self._factory = FeatureMatrixFactory()
    self._build_raw_feature_matrix()

    if LAB_TYPE == 'panel':
        self.ylabel = 'all_components_normal'
    else:
        self.ylabel = 'component_normal'

    self.includeLastNormality = includeLastNormality
    if self.includeLastNormality:
        raw_matrix_path = ('data/' + lab_panel +
                           '/%s-normality-matrix-raw.tab' % lab_panel)
        fm_io = FeatureMatrixIO()
        df = fm_io.read_file_to_data_frame(raw_matrix_path)
        df = df.sort_values(['pat_id', 'order_time']).reset_index(drop=True)
        # A row receives the previous row's outcome only when both rows
        # belong to the same patient; every other row stays NaN. This is the
        # vectorized form of the former row-by-row loop, which relied on
        # DataFrame.ix (removed in pandas 1.0).
        same_patient_as_previous = df['pat_id'] == df['pat_id'].shift(1)
        df['last_normality'] = df[self.ylabel].shift(1).where(
            same_patient_as_previous).astype(float)
        df.to_csv(raw_matrix_path, index=False, sep='\t')

    data_lab_folder = self._fetch_data_dir_path(
        inspect.getfile(inspect.currentframe()))
    feat2imputed_dict_path = data_lab_folder + '/feat2imputed_dict.pkl'

    # Pickle files are opened in binary mode inside `with` blocks so they are
    # flushed and closed before anything else reads them. Text mode ('r'/'w')
    # fails on Python 3 and can corrupt binary protocols on Windows.
    if holdOut:
        # For holdOut evaluation data, produce the raw matrix and pick
        # features according to the saved feat2imputed_dict.
        # NOTE(review): pickle.load must only be used on trusted files.
        with open(feat2imputed_dict_path, 'rb') as template_file:
            self.feat2imputed_dict = pickle.load(template_file)
        self._build_processed_feature_matrix_holdout()
        self._analyze_predictors_on_holdout()
    else:
        # For training/validation data, record the pat_ids, the selected
        # features and their imputed values.
        with open('data/used_patient_set_%s.pkl' % self._var,
                  'wb') as pat_ids_file:
            pickle.dump(self.usedPatIds, pat_ids_file,
                        pickle.HIGHEST_PROTOCOL)
        self._build_processed_feature_matrix()
        self._build_baseline_results()  # TODO: prototype in SLPP
        # TODO: find better place to put the dict.pkl
        with open(feat2imputed_dict_path, 'wb') as template_file:
            pickle.dump(self.feat2imputed_dict, template_file,
                        pickle.HIGHEST_PROTOCOL)
        self._train_and_analyze_predictors()
def _analyze_predictor_holdoutset(self, dest_dir, pipeline_prefix):
    """Evaluate ``self._predictor`` on the processed holdout matrix.

    The holdout matrix is read from the parent of ``dest_dir``. Its outcome
    column ('all_components_normal' when ``self._isLabPanel`` is truthy,
    otherwise 'component_normal') becomes ``y``; the remaining columns become
    ``X``. A ``ClassifierAnalyzer`` then writes the direct comparisons, the
    ROC, precision-recall and precision-at-k plots, and the report into
    ``dest_dir``.

    Args:
        dest_dir: Directory that receives every output file.
        pipeline_prefix: Prefix of every output file name and suffix of the
            plot titles.
    """
    var_slug = '-'.join(self._var.split())
    holdout_path = dest_dir + '/../' + \
        '%s-normality-matrix-%d-episodes-processed-holdout.tab' % (
            var_slug, self._num_rows)
    holdout_matrix = FeatureMatrixIO().read_file_to_data_frame(holdout_path)

    # Split the outcome column off; what is left is the feature matrix.
    outcome_column = ('all_components_normal'
                      if self._isLabPanel else 'component_normal')
    y_holdout = pd.DataFrame(holdout_matrix.pop(outcome_column))
    X_holdout = holdout_matrix
    analyzer = ClassifierAnalyzer(self._predictor, X_holdout, y_holdout)

    train_label = 'holdoutset'

    def in_dest_dir(name_template):
        # Fill the file-name template and place the file in dest_dir.
        return '/'.join([dest_dir,
                         name_template % (pipeline_prefix, train_label)])

    # Output locations, logged in a fixed order.
    direct_comparisons_path = in_dest_dir('%s-direct-compare-results-%s.csv')
    log.debug('direct_comparisons_path: %s' % direct_comparisons_path)
    precision_at_k_plot_path = in_dest_dir('%s-precision-at-k-plot-%s.png')
    log.debug('precision_at_k_plot_path: %s' % precision_at_k_plot_path)
    precision_recall_plot_path = in_dest_dir(
        '%s-precision-recall-plot-%s.png')
    log.debug('precision_recall_plot_path: %s' % precision_recall_plot_path)
    roc_plot_path = in_dest_dir('%s-roc-plot-%s.png')
    log.debug('roc_plot_path: %s' % roc_plot_path)
    report_path = in_dest_dir('%s-report-%s.tab')
    log.debug('report_path: %s' % report_path)

    # Emit every artifact; each plot gets its title built in place.
    analyzer.output_direct_comparisons(direct_comparisons_path)
    analyzer.plot_roc_curve('ROC (%s)' % pipeline_prefix, roc_plot_path)
    analyzer.plot_precision_recall_curve(
        'Precision-Recall (%s)' % pipeline_prefix,
        precision_recall_plot_path)
    analyzer.plot_precision_at_k_curve(
        'Precision @K (%s)' % pipeline_prefix, precision_at_k_plot_path)
    analyzer.write_report(report_path, ci=0.95)
def read_lab_meta_report(lab_panel):
    """Read the prediction report of ``lab_panel``.

    The panel-level report
    ``<data_dir>/<lab_panel>/<lab_panel>-normality-prediction-report.tab`` is
    used when it exists. Otherwise the report inside the
    ``SupervisedClassifier.REGRESS_AND_ROUND`` sub-folder is read instead
    (used for the data on class counts).

    Returns:
        The object returned by ``FeatureMatrixIO.read_file_to_data_frame``.
    """
    reader = FeatureMatrixIO()
    data_dir = LabNormalityReport.fetch_data_dir_path()

    report_path = data_dir + '/%s/%s-normality-prediction-report.tab' % (
        lab_panel, lab_panel)
    if not os.path.exists(report_path):
        # No meta report: fall back to the per-algorithm report.
        algorithm = SupervisedClassifier.REGRESS_AND_ROUND
        report_path = data_dir + '/%s/%s/%s-normality-prediction-report.tab' % (
            lab_panel, algorithm, lab_panel)

    return reader.read_file_to_data_frame(report_path)
def write_matrix(self, dest_path, header=None):
    """Copy the factory's matrix file to ``dest_path`` without '#' lines.

    Every line of ``self._factory.getMatrixFileName()`` that does not start
    with '#' is written to ``dest_path``; afterwards the source file is
    deleted.

    Args:
        dest_path: Path of the file to create (overwritten if present).
        header: Accepted for interface compatibility; currently not written.

    Raises:
        IOError/OSError: if either file cannot be opened or the source file
            cannot be removed.
    """
    log.info('Writing matrix file...')

    # Get old matrix file.
    source_path = self._factory.getMatrixFileName()

    # Both handles are closed (and the destination flushed) before the source
    # is removed. The previous version never closed either file, which leaks
    # descriptors and makes os.remove fail on platforms that refuse to delete
    # open files. The source is opened first, so a missing source no longer
    # leaves behind an empty destination file.
    with open(source_path, 'r') as source_file, \
            open(dest_path, 'w') as matrix_file:
        # for line in header:
        #     matrix_file.write('# %s\n' % line)
        for line in source_file:
            if not line.startswith('#'):
                matrix_file.write(line)

    # Delete old matrix file.
    os.remove(source_path)
def test_SupervisedLearner():
    """Run a minimal ``SupervisedLearner`` subclass on the LABA1C train matrix.

    The processed training matrix is read from a hard-coded relative path, the
    ``pat_id`` column is dropped, and ``run()`` is called on a learner whose
    outcome label is 'all_components_normal'.
    """
    from medinfo.ml.SupervisedLearner import SupervisedLearner
    import inspect
    from medinfo.dataconversion.FeatureMatrixIO import FeatureMatrixIO

    class LabNormalityLearner(SupervisedLearner):
        def __init__(self, input_matrix, ylabel):
            # Working folder = path of the defining file minus its last
            # '/'-separated component.
            defining_file = inspect.getfile(inspect.currentframe())
            self.working_folderpath = '/'.join(defining_file.split('/')[:-1])
            self.input_matrix = input_matrix
            self.ylabel = ylabel

    matrix_path = ('data-testingSupervisedLearner-panel-10000-episodes/'
                   'LABA1C/LABA1C-normality-train-matrix-processed.tab')
    features = FeatureMatrixIO().read_file_to_data_frame(matrix_path)
    features.pop('pat_id')

    learner = LabNormalityLearner(features, 'all_components_normal')
    learner.run()
def test_read_file_to_data_frame(self):
    """Reading a file with or without a header must yield the same frame.

    Both fixture files next to this test module are read through
    ``FeatureMatrixIO.read_file_to_data_frame`` and compared against
    ``MANUAL_TEST_CASE['matrix_no_header']``.
    """
    reader = FeatureMatrixIO()
    test_dir = os.path.dirname(
        os.path.abspath(inspect.getfile(inspect.currentframe())))

    # Read both fixtures first (with-header file first), then validate.
    fixture_names = ('test-matrix-with-header.tab',
                     'test-matrix-no-header.tab')
    frames = [
        reader.read_file_to_data_frame(os.path.join(test_dir, fixture_name))
        for fixture_name in fixture_names
    ]

    # Verify that FeatureMatrixIO correctly stripped the header.
    expected_matrix = MANUAL_TEST_CASE['matrix_no_header']
    for actual_matrix in frames:
        assert_frame_equal(expected_matrix, actual_matrix)
def test_strip_header(self):
    """``strip_header`` must produce a file that reads back as the bare matrix.

    The with-header fixture is stripped, and the path returned by
    ``strip_header`` is stored on ``self``. That file is then read with
    ``datetime_col_index=1`` and compared against
    ``MANUAL_TEST_CASE['matrix_no_header']``.
    """
    reader = FeatureMatrixIO()
    test_dir = os.path.dirname(
        os.path.abspath(inspect.getfile(inspect.currentframe())))
    headed_path = os.path.join(test_dir, 'test-matrix-with-header.tab')

    # The original reads the with-header file before stripping it; the call
    # is kept although its result is not inspected.
    reader.read_file_to_data_frame(headed_path)
    self._stripped_header_file_path = reader.strip_header(headed_path)

    # Validate matrix data.
    expected_matrix = MANUAL_TEST_CASE['matrix_no_header']
    stripped_matrix = reader.read_file_to_data_frame(
        self._stripped_header_file_path, datetime_col_index=1)
    assert_frame_equal(expected_matrix, stripped_matrix)
def jitter_processed_matrix(lab, pat_num_limit=100):
    """Print a patient-renumbered excerpt of a lab's processed matrix.

    Reads ``<data_folder>/<lab>/<lab>-normality-matrix-10000-episodes-
    processed.tab`` and keeps the ``pat_num_limit`` smallest patient ids,
    renumbering them 0..k-1 in sorted order. Rows of all other patients, and
    any row containing a missing value in any column, are dropped. The
    ``repr`` of the remaining values and the column index are printed.

    Args:
        lab: Lab name (sub-folder and file-name prefix).
        pat_num_limit: Maximum number of distinct patients to keep.
    """
    data_file = "%s-normality-matrix-10000-episodes-processed.tab" % lab
    data_path = os.path.join(data_folder, lab, data_file)
    df = FeatureMatrixIO().read_file_to_data_frame(data_path)

    # Reset the pat ids: the first `pat_num_limit` sorted ids map to their
    # rank; every other id maps to None and is removed by dropna() below.
    kept_pat_ids = sorted(set(df['pat_id'].values.tolist()))[:pat_num_limit]
    pat2pat = {pat_id: rank for rank, pat_id in enumerate(kept_pat_ids)}
    df['pat_id'] = df['pat_id'].apply(
        lambda x: pat2pat[x] if x in pat2pat else None)
    df = df.dropna()

    # Single-argument print(...) produces the same output on Python 2 and
    # still parses on Python 3, unlike the former print statements.
    print(np.array_repr(df.values))
    print(df.columns)
def test(test_suite=()):
    """Run the selected ad-hoc checks against the LABA1C raw test matrix.

    Args:
        test_suite: Collection of check names to run. Recognized names are
            'remove' (``LNL.FeatureRemover``) and 'impute'
            (``LNL.FeatureImputer``). The default runs nothing beyond loading
            the matrix. The default is an immutable tuple so it cannot be
            modified between calls.

    Raises:
        AssertionError: if a selected check fails.
    """
    import LabNormalityLearner_Config as Config
    from medinfo.dataconversion.FeatureMatrixIO import FeatureMatrixIO

    fm_io = FeatureMatrixIO()
    raw_matrix = fm_io.read_file_to_data_frame(
        'LabNormalityLearner_TestData/LABA1C-normality-matrix-raw.tab')

    if 'remove' in test_suite:
        remover = LNL.FeatureRemover(Config.features_to_remove)
        processed_matrix_removed = remover.transform(raw_matrix)
        # NOTE(review): these assertions require the transformed matrix to
        # have MORE rows and the SAME number of columns as the raw matrix,
        # which looks inverted for a feature remover. Kept as-is because
        # FeatureRemover is not visible here -- confirm the intended contract.
        assert raw_matrix.shape[0] < processed_matrix_removed.shape[0]
        assert raw_matrix.shape[1] == processed_matrix_removed.shape[1]

    if 'impute' in test_suite:
        features_to_impute = [
            'TBIL.-14_0.max', 'TBIL.-14_0.median', 'TBIL.-14_0.mean',
            'TBIL.-14_0.std'
        ]
        # ('min', 'max', 'median', 'mean', 'std', 'first', 'last', 'diff',
        #  'slope', 'proximate')
        imputation_dict = {feature: 0 for feature in features_to_impute}
        imputer = LNL.FeatureImputer(imputation_dict=imputation_dict)

        columns_to_look = ['pat_id'] + features_to_impute

        # "%s %s" formatting reproduces the "label: value" output of the
        # former two-argument print statements on both Python 2 and 3.
        print('%s %s' % ('raw_matrix[columns_to_look].head():',
                         raw_matrix[columns_to_look].head()))
        processed_matrix_imputed = imputer.fit_transform(raw_matrix)
        print('%s %s' % ('processed_matrix_imputed[columns_to_look].head():',
                         processed_matrix_imputed[columns_to_look].head()))

        # No missing value may remain in the inspected columns.
        assert not processed_matrix_imputed[columns_to_look].isna().any().any()
        # Imputation must neither drop nor reorder rows.
        assert (raw_matrix['order_proc_id'].values ==
                processed_matrix_imputed['order_proc_id'].values).all()
def load_processed_matrix(lab, dataset_folderpath, type='full'):
    """Read a processed matrix of ``lab`` from ``<dataset_folderpath>/<lab>/``.

    Args:
        lab: Lab name (sub-folder and value filled into the file template).
        dataset_folderpath: Folder holding one sub-folder per lab.
        type: 'train' or 'evalu' select the matching template; any other
            value selects the full processed-matrix template.

    Returns:
        The object returned by ``FeatureMatrixIO.read_file_to_data_frame``.
        If the chosen file does not exist, the path with '-test' replaced by
        '-evalu' is read instead.
    """
    if type == 'train':
        file_name = processed_matrix_train_template % lab
    elif type == 'evalu':
        file_name = processed_matrix_evalu_template % lab
    else:
        file_name = processed_matrix_template % lab
    matrix_filepath = os.path.join(dataset_folderpath, lab, file_name)

    # TODO: check if raw matrix exists
    if not os.path.exists(matrix_filepath):
        matrix_filepath = matrix_filepath.replace('-test', '-evalu')
    return FeatureMatrixIO().read_file_to_data_frame(matrix_filepath)
def _build_processed_feature_matrix(self):
    """Build (or reload) the processed matrix for the 'unchanged_yn' outcome.

    All processing parameters are collected in a local ``params`` dict.

    * If the processed matrix file already exists and ``self._flush_cache``
      is falsy, the file is read, sorted by index and split into train and
      test data.
    * Otherwise the raw matrix is read, filtered and extended with the
      'change' feature, then split into train and test data. The training
      frame has features removed, is filtered again, loses '.1' duplicate
      columns, and is imputed. The test frame is aligned to the training
      columns and filled from ``self.feat2imputed_dict``. Features are then
      selected, and the train and test rows are stacked, sorted by index and
      written with a header to the processed matrix path.
    """
    # Define parameters for processing steps.
    params = {}
    raw_matrix_path = self._build_raw_matrix_path()
    processed_matrix_path = self._build_processed_matrix_path(raw_matrix_path)
    # NOTE(review): params is still empty at this point; it is logged again
    # once it has been populated.
    log.debug('params: %s' % params)
    prev_measurement_feature = self._change_params['feature_old']
    features_to_add = {'change': [self._change_params]}
    # Filter on the previous-measurement feature with value NaN.
    # NOTE(review): the filter semantics live in _filter_on_features, which is
    # not visible here.
    features_to_filter_on = [{'feature': prev_measurement_feature,
                              'value':np.nan}]
    imputation_strategies = { }
    features_to_remove = [
        'pat_id', 'order_time', 'order_proc_id', 'ord_num_value',
        'proc_code', 'abnormal_panel', 'all_components_normal',
        'num_normal_components', 'Birth.pre', 'Male.preTimeDays',
        'Female.preTimeDays', 'RaceWhiteHispanicLatino.preTimeDays',
        'RaceWhiteNonHispanicLatino.preTimeDays',
        'RaceHispanicLatino.preTimeDays', 'RaceAsian.preTimeDays',
        'RaceBlack.preTimeDays', 'RacePacificIslander.preTimeDays',
        'RaceNativeAmerican.preTimeDays', 'RaceOther.preTimeDays',
        'RaceUnknown.preTimeDays', 'Death.post', 'Death.postTimeDays',
        'num_components'
    ]
    features_to_keep = [
        # Keep the # of times it's been ordered in past, even if low info.
        '%s.pre' % self._var
    ]
    outcome_label = 'unchanged_yn'
    selection_problem = FeatureSelector.CLASSIFICATION
    selection_algorithm = FeatureSelector.RECURSIVE_ELIMINATION
    percent_features_to_select = 0.05
    matrix_class = LabChangeMatrix
    pipeline_file_path = inspect.getfile(inspect.currentframe())
    # Free-text description of the data set, one list entry per line.
    data_overview = [
        # Overview:
        'Overview',
        # The outcome label is ___.
        'The outcome label is %s.' % outcome_label,
        # %s is a boolean indicator which summarizes whether the lab test
        '%s is a boolean indicator which summarizes whether the lab test ' % outcome_label,
        # result is unchanged compared to the previous measurement.
        'result is unchanged compared to the previous measurement.',
        # Each row represents a unique lab panel order.
        'Each row represents a unique lab panel order.',
        # Each row contains fields summarizing the patient's demographics,
        "Each row contains fields summarizing the patient's demographics",
        # inpatient admit date, prior vitals, and prior lab results.
        'inpatient admit date, prior vitals, and prior lab results.',
        # Most cells in matrix represent a count statistic for an event's
        "Most cells in matrix represent a count statistic for an event's",
        # occurrence or a difference between an event's time and index_time.
        "occurrence or a difference between an event's time and index_time.",
        # Lab panel orders were only included if a previous measurement of
        "Lab panel orders were only included if a previous measurement of",
        # the same lab panel has been recorded
        "the same lab panel has been recorded."
    ]
    # Bundle parameters into single object
    params['raw_matrix_path'] = raw_matrix_path
    params['processed_matrix_path'] = processed_matrix_path
    params['features_to_add'] = features_to_add
    params['features_to_keep'] = features_to_keep
    params['features_to_filter_on'] = features_to_filter_on
    params['imputation_strategies'] = imputation_strategies
    params['features_to_remove'] = features_to_remove
    params['outcome_label'] = outcome_label
    params['selection_problem'] = selection_problem
    params['selection_algorithm'] = selection_algorithm
    params['percent_features_to_select'] = percent_features_to_select
    params['matrix_class'] = matrix_class
    params['pipeline_file_path'] = pipeline_file_path
    params['data_overview'] = data_overview
    # defer to SupervisedLearningPipeline logic by SX
    fm_io = FeatureMatrixIO()
    log.debug('params: %s' % params)
    # If processed matrix exists, and the client has not requested to flush
    # the cache, just use the matrix that already exists and return.
    processed_matrix_path = params['processed_matrix_path']
    if os.path.exists(processed_matrix_path) and not self._flush_cache:
        # Assume feature selection already happened, but we still need
        # to split the data into training and test data.
        processed_matrix = fm_io.read_file_to_data_frame(processed_matrix_path)
        ''' Make sure the order of rows is consistent before splitting '''
        processed_matrix.sort_index(inplace=True)
        self._train_test_split(processed_matrix, params['outcome_label']) #TODO sxu: when reloading, no pat_id
    else:
        # Read raw matrix.
        raw_matrix = fm_io.read_file_to_data_frame(params['raw_matrix_path'])
        # Initialize FMT.
        # Add outcome label
        raw_fmt = FeatureMatrixTransform()
        raw_fmt.set_input_matrix(raw_matrix)
        self._filter_on_features(raw_fmt, params['features_to_filter_on'])
        self._add_features(raw_fmt, params['features_to_add'])
        raw_matrix = raw_fmt.fetch_matrix()
        # Divide processed_matrix into training and test data.
        # This must happen before feature selection so that we don't
        # accidentally learn information from the test data.
        # TODO: work on this...
        self._train_test_split(raw_matrix, params['outcome_label'])
        # From here on only the training frame goes through the transform.
        fmt = FeatureMatrixTransform()
        train_df = self._X_train.join(self._y_train)
        fmt.set_input_matrix(train_df)
        # Remove features.
        self._remove_features(fmt, params['features_to_remove'])
        # Filter on features
        # NOTE(review): the key is always present (set above), so this guard
        # never skips; the same filter was already applied to raw_fmt.
        if 'features_to_filter_on' in params:
            self._filter_on_features(fmt, params['features_to_filter_on'])
        # HACK: When read_csv encounters duplicate columns, it deduplicates
        # them by appending '.1, ..., .N' to the column names.
        # In future versions of pandas, simply pass mangle_dupe_cols=True
        # to read_csv, but not ready as of pandas 0.22.0.
        # Only the '.1' suffix is handled by this loop.
        for feature in raw_matrix.columns.values:
            if feature[-2:] == ".1":
                fmt.remove_feature(feature)
                self._removed_features.append(feature)
        # Impute data.
        self._impute_data(fmt, train_df, params['imputation_strategies'])
        # In case any all-null features were created in preprocessing,
        # drop them now so feature selection will work
        fmt.drop_null_features()
        # Build interim matrix.
        train_df = fmt.fetch_matrix()
        self._y_train = pd.DataFrame(train_df.pop(params['outcome_label']))
        self._X_train = train_df
        ''' Select X_test columns according to processed X_train '''
        self._X_test = self._X_test[self._X_train.columns]
        ''' Impute data according to the same strategy when training '''
        # NOTE(review): self.feat2imputed_dict is not assigned in this method;
        # presumably _impute_data fills it -- confirm.
        for feat in self._X_test.columns:
            self._X_test[feat] = self._X_test[feat].fillna(self.feat2imputed_dict[feat])
        self._select_features(params['selection_problem'],
                              params['percent_features_to_select'],
                              params['selection_algorithm'],
                              params['features_to_keep'])
        train = self._y_train.join(self._X_train)
        test = self._y_test.join(self._X_test)
        processed_matrix = train.append(test)
        ''' Need to recover the order of rows before writing into disk '''
        processed_matrix.sort_index(inplace=True)
        # Write output to new matrix file.
        header = self._build_processed_matrix_header(params)
        fm_io.write_data_frame_to_file(processed_matrix, \
                                       processed_matrix_path, header)
def get_raw_matrix(self):
    """Load and return the raw matrix stored at ``self.raw_matrix_filepath``."""
    reader = FeatureMatrixIO()
    raw_matrix = reader.read_file_to_data_frame(self.raw_matrix_filepath)
    return raw_matrix
def apply_src_to_dst(lab, lab_type, src_datasource, dst_datasource,
                     src_dataset_folderpath, dst_dataset_folderpath,
                     output_folderpath, use_cached_results=True):
    """Apply a model trained at one institute (src) to another one's data (dst).

    Inputs loaded:
        1. the dst raw matrix and the dst processed evaluation matrix (the
           latter only to obtain the evaluation patient ids),
        2. the src processed matrix (for the final feature columns) and the
           src imputation template ``feat2imputed_dict.pkl``,
        3. the src trained 'random-forest' model.

    Processing:
        1. Build a dst imputation template. Every src feature is mapped to a
           dst column. If the dst raw matrix lacks that column, or another
           src feature already claimed it, an all-zero dummy column is
           created. Each entry is ``(position, src imputation value)``.
        2. Run ``SL.process_matrix`` on the dst evaluation rows, drop
           ``pat_id`` and split into X and y.
        3. Call ``SL.predict``, which receives the output path
           ``<output_folderpath>/direct_comparisons.csv``.

    Args:
        lab: Lab name, mapped per data source through ``ml_utils.map_lab``.
        lab_type: 'panel' selects the outcome column
            'all_components_normal'; anything else 'component_normal'.
            TODO: automatically recognize lab_type.
        src_datasource, dst_datasource: Data source names passed to the
            mapping helpers.
        src_dataset_folderpath, dst_dataset_folderpath: Dataset folders of
            the two institutes.
        output_folderpath: Folder for the result file; created if missing.
        use_cached_results: If true and the result file already exists,
            return without doing anything.

    Returns:
        None.

    Raises:
        AssertionError: if the evaluation subset is not strictly smaller than
            the dst raw matrix.
    """
    print("Transfering %s %s from %s to %s..." % (
        lab_type, lab, src_datasource, dst_datasource))

    from medinfo.dataconversion.FeatureMatrixIO import FeatureMatrixIO
    import pickle
    import os

    if not os.path.exists(output_folderpath):
        # makedirs also creates missing parent folders (mkdir raised there).
        os.makedirs(output_folderpath)

    output_filepath = os.path.join(output_folderpath, 'direct_comparisons.csv')
    if use_cached_results and os.path.exists(output_filepath):
        return

    # TODO: UMich?
    ylabel = 'all_components_normal' if lab_type == 'panel' else 'component_normal'

    lab_src = ml_utils.map_lab(lab=lab, data_source=src_datasource,
                               lab_type=lab_type)
    lab_dst = ml_utils.map_lab(lab=lab, data_source=dst_datasource,
                               lab_type=lab_type)

    fm_io = FeatureMatrixIO()

    # Load raw data from dst.
    df_raw_dst = SL.load_raw_matrix(lab=lab_dst,
                                    dataset_folderpath=dst_dataset_folderpath)
    raw_columns_dst = df_raw_dst.columns.values.tolist()

    # From the dst processed evaluation matrix, get the evaluation patients.
    df_processed_evalu_dst = SL.load_processed_matrix(
        lab_dst, dst_dataset_folderpath, type='evalu')
    patIds_evalu_dst = ml_utils.get_patIds(df_processed_evalu_dst)
    # .copy(): dummy columns are added below; assigning onto a bare slice of
    # df_raw_dst triggers pandas' SettingWithCopy warning.
    df_raw_evalu_dst = df_raw_dst[
        df_raw_dst['pat_id'].isin(patIds_evalu_dst)].copy()
    assert df_raw_dst.shape[0] > df_raw_evalu_dst.shape[0]

    # Use the src processed matrix to select the final feature columns.
    # TODO: the src imputation template is old-versioned: (1) without column
    # order and (2) with a lot of unnecessary columns.
    df_processed_src = fm_io.read_file_to_data_frame(
        src_dataset_folderpath + '/' + lab_src +
        '/%s-normality-matrix-processed.tab' % lab_src)
    df_processed_src.pop('pat_id')
    df_processed_src.pop(ylabel)  # TODO?!
    processed_columns_src = df_processed_src.columns.values.tolist()

    classifier_src = SL.load_ML_model(
        lab=lab_src, alg='random-forest',
        dataset_folderpath=src_dataset_folderpath)

    # Pickles are binary: open in 'rb' and close the handle deterministically
    # (it was previously opened in text mode and never closed).
    # NOTE(review): pickle.load must only be used on trusted files.
    imputations_filepath = (src_dataset_folderpath + '/' + lab_src + '/' +
                            "feat2imputed_dict.pkl")
    with open(imputations_filepath, 'rb') as imputations_file:
        impute_dict_old = pickle.load(imputations_file)
    if ylabel in impute_dict_old:
        del impute_dict_old[ylabel]

    # Find the dst column of each src processed feature. If the dst raw
    # matrix has it, use it; if not, create a dummy feature for dst.
    #
    # NOTE(review): `i` is the position of the src feature and advances once
    # per feature. In the text this was recovered from, the increment sat
    # right after a '#', so it is unclear whether it was live code. A fixed
    # i == 0 would give every feature the same position and make all
    # 'dummy_%s' names collide, so the counter is made explicit here.
    impute_dict_new = {}
    for i, col_src in enumerate(processed_columns_src):
        col_dst = map_col(col_src, src=src_datasource, dst=dst_datasource)

        if col_src in raw_columns_dst:
            col_dst = col_src
        elif col_dst not in raw_columns_dst:
            print("Unknown: %s %s" % (col_src, col_dst))
            # Create dummy column for dst.
            df_raw_evalu_dst[col_dst] = 0

        if col_dst in impute_dict_new:
            # Different src features map onto the same dst feature, e.g.
            #   40 PCO2A.-14_0.proximate -> PCO2.-14_0.proximate
            #   41 PCO2V.-14_0.proximate -> PCO2.-14_0.proximate
            # TODO: XPPT and PPT are the same thing?
            col_dst = 'dummy_%s' % i
            df_raw_evalu_dst[col_dst] = 0

        # Use the src imputation value for the dst column.
        impute_dict_new[col_dst] = (i, impute_dict_old[col_src])

    # Feature auxiliary information.
    features = {'ylabel': ylabel, 'info': ['pat_id']}

    df_ucsf_processed_evalu, _ = SL.process_matrix(
        df_raw_evalu_dst, features, impute_template=impute_dict_new)
    df_ucsf_processed_evalu.pop('pat_id')

    # TODO: why is this step so slow?!
    X_evalu, y_evalu = SL.split_Xy(data_matrix=df_ucsf_processed_evalu,
                                   outcome_label=ylabel)
    SL.predict(X_evalu, y_evalu, classifier_src,
               output_filepath=output_filepath)
labs['predictable_CV'] = (labs['percent_predictably_positive'].astype('float') * labs['annual_median_charge_volume ($)'].astype('float') / 1000).map('${:,.0f}'.format) labs['predictable_CV[-0.95]'] = (labs['percent_predictably_positive_0.95_lower_ci'].astype('float') * labs['annual_median_charge_volume ($)'].astype('float') / 1000).map('${:,.0f}'.format) labs['predictable_CV[+0.95]'] = (labs['percent_predictably_positive_0.95_upper_ci'].astype('float') * labs['annual_median_charge_volume ($)'].astype('float') / 1000).map('${:,.0f}'.format) summary = DataFrame() summary['lab'] = labs['label'] summary['charge'] = labs['median_charge'].astype('float').map('${:,.0f}'.format) summary['volume'] = labs['volume'].floordiv(6).astype('float').map('{:,.0f}'.format) summary['normal rate'] = labs['normality'] summary['[email protected]'] = labs['[email protected]'] + ' [' + \ labs['[email protected][-0.95]'] + ', ' + \ labs['[email protected][+0.95]'] + ']' summary['predictable CV ($1,000s)'] = labs['predictable_CV'] + ' [' + \ labs['predictable_CV[-0.95]'] + ', ' + \ labs['predictable_CV[+0.95]'] + ']' return summary if __name__ == '__main__': fm_io = FeatureMatrixIO() summary_table = LabNormalityReport.build_lab_performance_summary_table() fm_io.write_data_frame_to_file(summary_table, 'lab-performance-summary.tab') summary = LabNormalityReport.build_algorithm_performance_summary_table() fm_io.write_data_frame_to_file(summary, 'algorithm-performance-summary.tab') LabNormalityReport.plot_predictable_and_expensive_charges() summary = LabNormalityReport.build_lab_predictability_summary_report() fm_io.write_data_frame_to_file(summary, 'predictable-labs.tab') summary = LabNormalityReport.build_lab_predictability_summary_report(all=True) fm_io.write_data_frame_to_file(summary, 'all-labs.tab')
def apply_Stanford_to_UCSF():
    """Apply the Stanford-trained LABMGN normality model to UCSF raw data.

    Steps, all relative to the hard-coded ``dataset_folder``:

    1. Read the UCSF raw feature matrix.
    2. Load the Stanford imputation template (a pickled dict) and the
       Stanford processed matrix, whose column order defines the features
       the model expects.
    3. Map every Stanford feature to its UCSF column name with
       ``map_col_Stanford_to_UCSF``. Columns missing from the UCSF matrix
       are created as all-zero dummy columns.
    4. Process the UCSF matrix with ``SL.process_matrix`` using the new
       template, load the stored classifier with ``joblib`` and write the
       predictions to ``direct_comparisons.csv`` via ``SL.predict``.

    Returns:
        None. Results are written to disk and progress is printed.
    """
    from medinfo.dataconversion.FeatureMatrixIO import FeatureMatrixIO
    import pickle

    # Data folder.
    dataset_folder = "data-apply-Stanford-to-UCSF-10000-episodes/LABMGN/"

    # Helper for reading the tab-delimited matrices.
    fm_io = FeatureMatrixIO()

    # Load raw data from UCSF.
    df_ucsf_raw = fm_io.read_file_to_data_frame(
        dataset_folder + "Magnesium, Serum - Plasma-normality-matrix-raw.tab")
    # Snapshot of the columns UCSF really has. A set gives O(1) membership
    # tests in the loop below; dummy columns added later are deliberately
    # not part of it (same as the original list snapshot).
    raw_columns_ucsf = set(df_ucsf_raw.columns.values.tolist())

    # Load imputation template from Stanford.
    # TODO: this is an old-versioned template, (1) without column order and
    # (2) with a lot of unnecessary columns.
    # NOTE: pickle must only be used on trusted, locally produced files.
    # The file is opened in binary mode and closed deterministically.
    with open(dataset_folder + "feat2imputed_dict.pkl", 'rb') as template_file:
        impute_dict_old = pickle.load(template_file)
    del impute_dict_old['all_components_normal']  # TODO?!

    # Use the Stanford processed matrix to select (and order) the columns.
    df_stanford_processed = fm_io.read_file_to_data_frame(
        dataset_folder + 'LABMGN-normality-matrix-processed.tab')
    df_stanford_processed.pop('pat_id')
    df_stanford_processed.pop('all_components_normal')  # TODO?!
    processed_columns_stanford = df_stanford_processed.columns.values.tolist()

    # Find the corresponding UCSF column of each processed Stanford feature.
    # If the feature exists in UCSF, reuse Stanford's imputation value.
    # If not, create a dummy feature in the UCSF raw matrix.
    impute_dict_new = {}
    for i, col_selected in enumerate(processed_columns_stanford):
        col_mapped = map_col_Stanford_to_UCSF(col_selected)
        if col_mapped in raw_columns_ucsf:
            impute_dict_new[col_mapped] = (i, impute_dict_old[col_selected])
        else:
            # Feature known to Stanford but absent from the UCSF matrix.
            df_ucsf_raw[col_mapped] = 0
            impute_dict_new[col_mapped] = (i, 0)  # TODO: better strategy later?
            print("Unknown: %s" % col_mapped)

    # Auxiliary description of the non-feature columns.
    features = {'ylabel': 'all_components_normal', 'info': ['pat_id']}

    df_ucsf_processed, _ = SL.process_matrix(df_ucsf_raw, features,
                                             impute_template=impute_dict_new)
    print("Finished processing!")

    # The outcome column stays in the frame; split_Xy() separates it below.
    df_ucsf_processed.pop('pat_id')

    # Load model.
    classifier = joblib.load(dataset_folder +
                             'LABMGN-normality-random-forest-model.pkl')
    print("Finished Loading!")

    # TODO: classifier.description() was observed to be very slow; left out.
    print(classifier._params_random_forest()['decision_features'])

    X_evalu, y_evalu = SL.split_Xy(data_matrix=df_ucsf_processed,
                                   outcome_label='all_components_normal')
    SL.predict(X_evalu, y_evalu, classifier,
               output_filepath=dataset_folder + 'direct_comparisons.csv')
def __init__(self):
    """Create the FeatureMatrixIO helper and store it as ``self._fm_io``."""
    self._fm_io = FeatureMatrixIO()
def get_train_and_evalu_raw_matrices(lab, data_lab_folderpath, random_state,
                                     train_size=0.75,
                                     columnToSplitOn='pat_id'):
    '''
    Read the raw feature matrix of one lab and split it into a training
    part and an evaluation part.

    The raw matrix is read from
    os.path.join(data_lab_folderpath, raw_matrix_template % lab).
    Only this single raw matrix is kept on disk; the two halves are not
    saved separately (too much space).

    If a patient-split file (pat_split_filename) exists in
    data_lab_folderpath, its 'pat_id' and 'in_train' columns decide which
    rows go where (old pipeline style). Otherwise split_rows() draws a new
    split; that split is not persisted.

    Args:
        lab: lab identifier substituted into raw_matrix_template.
        data_lab_folderpath: folder holding the raw matrix and, optionally,
            the patient-split file.
        random_state: forwarded to split_rows() when a new split is drawn.
        train_size: forwarded to split_rows(); ignored when a split file
            exists.
        columnToSplitOn: forwarded to split_rows(); ignored when a split
            file exists.

    Returns:
        Tuple (raw_matrix_train, raw_matrix_evalu) of data frames.

    Raises:
        AssertionError: if the two parts share a pat_id or do not add up
            to the full raw matrix.
    '''
    raw_matrix_filepath = os.path.join(data_lab_folderpath,
                                       raw_matrix_template % lab)

    # TODO: check if raw matrix exists
    raw_matrix = FeatureMatrixIO().read_file_to_data_frame(raw_matrix_filepath)

    pat_split_filepath = os.path.join(data_lab_folderpath, pat_split_filename)

    if os.path.exists(pat_split_filepath):
        # Old pipeline style: reuse the stored patient assignment.
        pat_split_df = pd.read_csv(pat_split_filepath)

        pat_ids_train = pat_split_df[
            pat_split_df['in_train'] == 1]['pat_id'].values.tolist()
        raw_matrix_train = raw_matrix[raw_matrix['pat_id'].isin(pat_ids_train)]

        pat_ids_evalu = pat_split_df[
            pat_split_df['in_train'] == 0]['pat_id'].values.tolist()
        raw_matrix_evalu = raw_matrix[raw_matrix['pat_id'].isin(pat_ids_evalu)]
    else:
        # No stored assignment: draw a fresh one. The previous code also
        # built a per-row 'in_train' frame here, but it was never written
        # or returned, so that work has been dropped.
        raw_matrix_train, raw_matrix_evalu = split_rows(
            raw_matrix,
            train_size=train_size,
            columnToSplitOn=columnToSplitOn,
            random_state=random_state)

    # Sanity checks on the split (kept as asserts, as before).
    # NOTE(review): these always look at 'pat_id', even when
    # columnToSplitOn names a different column - confirm that is intended.
    train_ids = set(raw_matrix_train['pat_id'].values.tolist())
    evalu_ids = set(raw_matrix_evalu['pat_id'].values.tolist())
    assert not (train_ids & evalu_ids), \
        'train and evaluation parts share pat_ids'
    assert raw_matrix_train.shape[0] + raw_matrix_evalu.shape[0] \
        == raw_matrix.shape[0], \
        'train and evaluation parts do not cover the raw matrix'

    return raw_matrix_train, raw_matrix_evalu
def _build_processed_feature_matrix(self, params):
    """Turn the raw feature matrix into the processed matrix.

    ``params`` is a dict carrying everything that varies between
    pipelines, so that the sequence of processing steps lives in one
    place (principle: minimize overridden function calls). Keys read here:

        params['processed_matrix_path']
        params['raw_matrix_path']
        params['features_to_add']
        params['features_to_remove']
        params['features_to_filter_on']   (optional)
        params['imputation_strategies']
        params['outcome_label']
        params['selection_problem']
        params['percent_features_to_select']
        params['selection_algorithm']
        params['features_to_keep']

    The whole dict is also handed to _build_processed_matrix_header().

    TODO(sbala): Determine which fields should have defaults.
    """
    matrix_io = FeatureMatrixIO()
    log.debug('params: %s' % params)

    output_path = params['processed_matrix_path']

    # Cache hit: the processed matrix is already on disk and the client did
    # not ask for a flush. Feature selection is assumed to have happened,
    # so only the train/test split has to be redone.
    if os.path.exists(output_path) and not self._flush_cache:
        cached_matrix = matrix_io.read_file_to_data_frame(output_path)
        self._train_test_split(cached_matrix, params['outcome_label'])
        return

    # Cache miss: start from the raw matrix.
    raw_matrix = matrix_io.read_file_to_data_frame(params['raw_matrix_path'])

    transform = FeatureMatrixTransform()
    transform.set_input_matrix(raw_matrix)

    # Add, remove, then optionally filter features.
    self._add_features(transform, params['features_to_add'])
    self._remove_features(transform, params['features_to_remove'])
    if 'features_to_filter_on' in params:
        self._filter_on_features(transform, params['features_to_filter_on'])

    # HACK: When read_csv encounters duplicate columns, it deduplicates
    # them by appending '.1, ..., .N' to the column names.
    # In future versions of pandas, simply pass mangle_dupe_cols=True
    # to read_csv, but not ready as of pandas 0.22.0.
    for column_name in raw_matrix.columns.values:
        if column_name[-2:] != ".1":
            continue
        transform.remove_feature(column_name)
        self._removed_features.append(column_name)

    # Impute data.
    self._impute_data(transform, raw_matrix, params['imputation_strategies'])

    # Preprocessing may have produced all-null features; drop them so
    # feature selection will work.
    transform.drop_null_features()

    # Split before feature selection so that nothing is learned from the
    # test data by accident.
    interim_matrix = transform.fetch_matrix()
    self._train_test_split(interim_matrix, params['outcome_label'])

    self._select_features(params['selection_problem'],
                          params['percent_features_to_select'],
                          params['selection_algorithm'],
                          params['features_to_keep'])

    # Re-assemble outcome + features, training rows first, then test rows.
    train_part = self._y_train.join(self._X_train)
    test_part = self._y_test.join(self._X_test)
    combined_matrix = train_part.append(test_part)

    # Write output to new matrix file.
    matrix_io.write_data_frame_to_file(
        combined_matrix, output_path,
        self._build_processed_matrix_header(params))
def _train_and_analyze_predictors(self):
    """Train, analyze and persist one classifier per supported algorithm.

    For every algorithm in SupervisedClassifier.SUPPORTED_ALGORITHMS:

    * a report directory ``<data_dir>/<algorithm>`` is created if missing;
    * a previously dumped model is loaded from disk when available
      (non-bifurcated algorithms only), otherwise a new one is trained;
    * if training reports INSUFFICIENT_SAMPLES, an error report with the
      label counts is written into the report directory;
    * if the status is TRAINED, the predictor is analyzed, its report is
      appended to a meta report and the predictor is dumped with joblib.

    After the loop the meta report, if any, is written to ``data_dir``.

    Returns:
        None. All results are written to disk.
    """
    log.info('Training and analyzing predictors...')
    problem = SupervisedLearningPipeline.CLASSIFICATION
    meta_report = None
    fm_io = FeatureMatrixIO()

    # Build paths for output.
    pipeline_file_name = inspect.getfile(inspect.currentframe())
    data_dir = self._fetch_data_dir_path(pipeline_file_name)

    # Only the plain SupervisedClassifier algorithms are tested.
    # TODO(raikens): something in the BifurcatedSupervisedClassifier
    # pipeline is crashing; once fixed, also append
    # 'bifurcated-%s' % algorithm for every supported algorithm.
    algorithms_to_test = list(SupervisedClassifier.SUPPORTED_ALGORITHMS)
    log.debug('algorithms_to_test: %s', algorithms_to_test)

    # Train and analyse algorithms.
    for algorithm in algorithms_to_test:
        log.info('Training and analyzing %s...', algorithm)

        # If report_dir does not exist, make it.
        report_dir = '/'.join([data_dir, algorithm])
        if not os.path.exists(report_dir):
            os.makedirs(report_dir)
        log.debug('report_dir: %s', report_dir)

        # Define hyperparams.
        hyperparams = {
            'algorithm': algorithm,
            'hyperparam_strategy': SupervisedClassifier.EXHAUSTIVE_SEARCH,
            'max_iter': 1024,
        }
        # If bifurcated algorithm, define bifurcator.
        if 'bifurcated' in algorithm:
            # bifurcator = LAB.pre == 0
            hyperparams['bifurcator'] = '%s.pre' % self._var
            hyperparams['bifurcation_strategy'] = \
                BifurcatedSupervisedClassifier.EQUAL
            hyperparams['bifurcation_value'] = 0
            hyperparams['bifurcated'] = True

        # Train classifier, or reuse the one already dumped to disk.
        predictor_path = self._build_model_dump_path(algorithm)
        if os.path.exists(predictor_path) and 'bifurcated' not in algorithm:
            log.debug('Loading model from disk...')
            # TODO(sbala): Fix joblib.load so that it works for bifurcated
            # supervised classifiers.
            self._predictor = joblib.load(predictor_path)
            self._features = self._X_train.columns
            status = SupervisedClassifier.TRAINED
        else:
            status = SupervisedLearningPipeline._train_predictor(
                self, problem, [0, 1], hyperparams)

        if status == SupervisedClassifier.INSUFFICIENT_SAMPLES:
            # Failed to train: skip analysis and write an error report.
            # The same will presumably happen for every other algorithm,
            # but the loop deliberately continues so each gets a report.
            y_train_counts = self._y_train[
                self._y_train.columns[0]].value_counts()
            y_test_counts = self._y_test[
                self._y_test.columns[0]].value_counts()
            algorithm_report = DataFrame(
                {
                    'lab_panel': [self._var],
                    'algorithm': [algorithm],
                    'error': [status],
                    'y_train.value_counts()': [y_train_counts.to_dict()],
                    'y_test.value_counts()': [y_test_counts.to_dict()]
                },
                columns=[
                    'lab_panel', 'algorithm', 'error',
                    'y_train.value_counts()', 'y_test.value_counts()'
                ])
            header = [
                'LabChangePredictionPipeline("%s", %d)' %
                (self._var, self._num_rows)
            ]
            # Write error report.
            fm_io.write_data_frame_to_file(
                algorithm_report,
                '/'.join([report_dir,
                          '%s-change-prediction-report.tab' % (self._var)]),
                header)
        elif status == SupervisedClassifier.TRAINED:
            # Successfully trained: analyze and append to the meta report.
            pipeline_prefix = '%s-change-prediction-%s' % (self._var,
                                                           algorithm)
            SupervisedLearningPipeline._analyze_predictor(
                self, report_dir, pipeline_prefix)

            algorithm_report = fm_io.read_file_to_data_frame('/'.join(
                [report_dir, '%s-report.tab' % pipeline_prefix]))
            if meta_report is None:
                meta_report = algorithm_report
            else:
                log.debug('algorithm_report: %s', algorithm_report)
                meta_report = meta_report.append(algorithm_report)

            # Write predictor to disk.
            predictor = SupervisedLearningPipeline.predictor(self)
            predictor_path = self._build_model_dump_path(algorithm)
            joblib.dump(predictor, predictor_path)

    # After building per-algorithm reports, write to meta report.
    # Note that if there were insufficient samples to build any of the
    # algorithms, then meta_report will still be None.
    if meta_report is not None:
        header = [
            'LabChangePredictionPipeline("%s", %d)' %
            (self._var, self._num_rows)
        ]
        fm_io.write_data_frame_to_file(
            meta_report,
            '/'.join([data_dir,
                      '%s-change-prediction-report.tab' % self._var]),
            header)
def _build_processed_feature_matrix(self, params):
    """Build (or reload) the processed feature matrix and the train/test split.

    Cache hit: when the file at params['processed_matrix_path'] exists and
    self._flush_cache is false, that file is read and only
    self._train_test_split() is run on it.

    Cache miss: the raw matrix is split into train and test rows first;
    feature addition, filtering, imputation, removal and null-column
    dropping are then run through a FeatureMatrixTransform loaded with the
    training rows only. The test rows are restricted to the resulting
    training columns and imputed separately. The train, test and combined
    matrices are written to disk.

    In both cases 'pat_id' is finally popped from self._X_train and
    self._X_test into self._patIds_train / self._patIds_test, and an
    assertion checks that the two id sets are disjoint.

    NOTE(review): the nesting of that final pop/assert (function level,
    i.e. after both branches) was reconstructed from context - confirm.
    """
    # params is a dict defining the details of how the raw feature matrix
    # should be transformed into the processed matrix. Given the sequence
    # of steps will be identical across all pipelines, sbala decided to
    # pack all the variability into this dict. It's not ideal because the
    # dict has 10+ values, but that seems better than forcing all pipelines
    # to reproduce the logic of the processing steps.
    # Principle: Minimize overridden function calls.
    #   params['features_to_add'] = features_to_add
    #   params['features_to_filter_on'] (optional) = features_to_filter_on
    #   params['imputation_strategies'] = imputation_strategies
    #   params['features_to_remove'] = features_to_remove
    #   params['outcome_label'] = outcome_label
    #   params['selection_problem'] = selection_problem
    #   params['selection_algorithm'] = selection_algorithm
    #   params['percent_features_to_select'] = percent_features_to_select
    #   params['matrix_class'] = matrix_class
    #   params['pipeline_file_path'] = pipeline_file_path
    # This method additionally reads params['processed_matrix_path'],
    # params['raw_matrix_path'] and params['features_to_keep'].
    # TODO(sbala): Determine which fields should have defaults.
    fm_io = FeatureMatrixIO()
    log.debug('params: %s' % params)

    # If processed matrix exists, and the client has not requested to flush
    # the cache, just use the matrix that already exists and return.
    processed_matrix_path = params['processed_matrix_path']
    if os.path.exists(processed_matrix_path) and not self._flush_cache:
        # Assume feature selection already happened, but we still need
        # to split the data into training and test data.
        processed_matrix = fm_io.read_file_to_data_frame(
            processed_matrix_path)
        # processed_matrix['pat_id'] = processed_matrix['pat_id'].apply(lambda x: str(x))
        self._train_test_split(processed_matrix, params['outcome_label'])
        ''' Pandas dataframe may automatically convert bigint to float (and round the last few digits), which may damage the uniqueness of pat_ids. '''
        # processed_matrix['pat_id'] = processed_matrix['pat_id'].apply(lambda x: str(x))
    else:
        # Read raw matrix.
        raw_matrix = fm_io.read_file_to_data_frame(
            params['raw_matrix_path'])
        # raw_matrix['pat_id'] = raw_matrix['pat_id'].apply(lambda x: str(x))

        # Divide the raw matrix into training and test data.
        # This must happen before feature selection so that we don't
        # accidentally learn information from the test data.
        # A copy of the pat_id column is kept so the ids can be joined
        # back (by index) after feature processing.
        patIds_df = raw_matrix['pat_id'].copy()
        self._train_test_split(raw_matrix, params['outcome_label'])

        # Disabled debugging aid: dump the raw train/test parts and check
        # that no patient appears in both.
        # folder_path = '/'.join(params['raw_matrix_path'].split('/')[:-1])
        # self._X_train.join(self._y_train).to_csv(folder_path + '/' + 'train_raw.csv', index=False)
        # self._X_test.join(self._y_test).to_csv(folder_path + '/' + 'test_raw.csv', index=False)
        # assert bool(set(self._X_train['pat_id'].values) & set(self._X_test['pat_id'].values)) == False

        # Initialize FMT with the training rows only (features + outcome).
        fmt = FeatureMatrixTransform()
        train_df = self._X_train.join(self._y_train)
        fmt.set_input_matrix(train_df)

        # Add features.
        self._add_features(fmt, params['features_to_add'])

        # Filter on features
        if 'features_to_filter_on' in params:
            self._filter_on_features(fmt, params['features_to_filter_on'])

        # HACK: When read_csv encounters duplicate columns, it deduplicates
        # them by appending '.1, ..., .N' to the column names.
        # In future versions of pandas, simply pass mangle_dupe_cols=True
        # to read_csv, but not ready as of pandas 0.22.0.
        # Only names ending in ".1" are removed here.
        for feature in raw_matrix.columns.values:
            if feature[-2:] == ".1":
                fmt.remove_feature(feature)
                self._removed_features.append(feature)

        # Impute data.
        # The strategy is compared against a one-element set literal.
        if params['imputation_strategies'] == {'sxu_new_imputation'}:
            # Column means are computed on the training rows only, for
            # float64 columns, and then applied to both train and test.
            train_df = fmt.fetch_matrix()
            means = {}
            for column in train_df.columns.values.tolist():
                # column_tail = column.split('.')[-1].strip()
                if train_df[column].dtype == 'float64':
                    means[column] = train_df[column].mean()
            train_df = fmt.do_impute_sx(train_df, means)
            fmt.set_input_matrix(train_df)
            self._X_test = fmt.do_impute_sx(self._X_test, means)
            # Features are removed only after imputation in this branch.
            self._remove_features(fmt, params['features_to_remove'])
        else:
            # Default path: remove features first, then impute.
            # NOTE(review): train_df here is the frame joined before
            # features were added/filtered - confirm that is intended.
            self._remove_features(fmt, params['features_to_remove'])
            self._impute_data(fmt, train_df, params['imputation_strategies'])

        # Remove features.
        ''' Moved here, since still need pat_id for imputation! '''
        # self._remove_features(fmt, params['features_to_remove'])

        # In case any all-null features were created in preprocessing,
        # drop them now so feature selection will work
        fmt.drop_null_features()

        # Build interim matrix and separate the outcome column again.
        train_df = fmt.fetch_matrix()
        self._y_train = pd.DataFrame(train_df.pop(params['outcome_label']))
        self._X_train = train_df

        ''' Select X_test columns according to processed X_train '''
        self._X_test = self._X_test[self._X_train.columns]

        # Outside the 'sxu_new_imputation' path the test rows have not been
        # imputed yet; fill them from self.feat2imputed_dict (presumably
        # populated by _impute_data - verify).
        if not params['imputation_strategies'] == {'sxu_new_imputation'}:
            for feat in self._X_test.columns:
                self._X_test[feat] = self._X_test[feat].fillna(
                    self.feat2imputed_dict[feat])

        self._select_features(params['selection_problem'],
                              params['percent_features_to_select'],
                              params['selection_algorithm'],
                              params['features_to_keep'])

        ''' The join is based on index by default. Will remove 'pat_id' (TODO sxu: more general in the future) later in train(). '''
        self._X_train = self._X_train.join(patIds_df, how='left')
        self._X_test = self._X_test.join(patIds_df, how='left')
        # print set(self._X_train['pat_id'].values.tolist()) & set(self._X_test['pat_id'].values.tolist())

        train = self._y_train.join(self._X_train)
        test = self._y_test.join(self._X_test)

        # The train and test parts are also written as separate files.
        # Note that str.replace substitutes every occurrence of "matrix"
        # in the path, not only the one in the file name.
        processed_trainMatrix_path = processed_matrix_path.replace(
            "matrix", "train-matrix")
        train.to_csv(processed_trainMatrix_path, sep='\t', index=False)
        processed_testMatrix_path = processed_matrix_path.replace(
            "matrix", "test-matrix")
        test.to_csv(processed_testMatrix_path, sep='\t', index=False)

        processed_matrix = train.append(test)
        ''' Recover the order of rows before writing into disk, where the index info will be missing. '''
        processed_matrix.sort_index(inplace=True)

        # Write output to new matrix file.
        header = self._build_processed_matrix_header(params)
        fm_io.write_data_frame_to_file(processed_matrix, \
            processed_matrix_path, header)

    ''' Pop out pat_id from the feature matrices. Also check whether there is pat_id leakage. '''
    self._patIds_train = self._X_train.pop('pat_id').values.tolist()
    self._patIds_test = self._X_test.pop('pat_id').values.tolist()
    assert not (set(self._patIds_train) & set(self._patIds_test))