예제 #1
0
 def get_raw_matrix(self, data_tag):
     """Read the raw feature matrix stored at ``self.raw_matrix_filepath``.

     ``data_tag`` is accepted but does not influence the result: the "src"
     tag and every other value load the very same file.

     Returns:
         Whatever ``FeatureMatrixIO.read_file_to_data_frame`` returns for
         that path.
     """
     matrix_reader = FeatureMatrixIO()
     return matrix_reader.read_file_to_data_frame(self.raw_matrix_filepath)
예제 #2
0
    def _process_raw_feature_matrix(self):
        """Transform the raw matrix file into the processed matrix file.

        Reads the file named by ``self._cmm_name_raw``, runs the add /
        impute / remove / de-duplicate steps through a
        ``FeatureMatrixTransform``, splits into train and test data, selects
        features, and finally writes the recombined train+test rows to
        ``self._cmm_name_processed`` together with a header.

        Intermediate results are kept on the instance (``_fm_io``,
        ``_cmm_raw``, ``_fmt``, ``_cmm_processed``).
        """
        # Read raw CMM.
        self._fm_io = FeatureMatrixIO()
        print('Reading raw matrix...')
        self._cmm_raw = self._fm_io.read_file_to_data_frame(self._cmm_name_raw)

        # Add and remove features to _cmm_processed.
        self._fmt = FeatureMatrixTransform()
        self._fmt.set_input_matrix(self._cmm_raw)
        print('Adding features...')
        self._add_features()
        print('Imputing data...')
        self._impute_data()
        self._remove_features()
        self._fmt.drop_duplicate_rows()
        self._cmm_processed = self._fmt.fetch_matrix()

        # Divide _cmm_processed into training and test data.
        # This must happen before feature selection so that we don't
        # accidentally learn information from the test data.
        self._train_test_split()
        print('Selecting features...')
        self._select_features()

        # Write output to new matrix.
        # The outcome column(s) come first, followed by the selected features;
        # training rows are placed before test rows.
        train = self._y_train.join(self._X_train)
        test = self._y_test.join(self._X_test)
        self._cmm_processed = train.append(test)

        header = self._build_processed_matrix_header()

        self._fm_io.write_data_frame_to_file(self._cmm_processed,
                                             self._cmm_name_processed, header)
예제 #3
0
def load_imputation_template(lab, dataset_folderpath, lab_type='panel'):
    """Load the saved feature -> imputed-value mapping for ``lab``.

    The mapping is unpickled from ``feat2imputed_dict.pkl`` inside
    ``<dataset_folderpath>/<lab>``.

    Args:
        lab: Lab name; used as the sub-folder name and inside the processed
            matrix file name.
        dataset_folderpath: Folder containing one sub-folder per lab.
        lab_type: ``'panel'`` selects the ``'all_components_normal'`` outcome
            column; any other value selects ``'component_normal'``.

    Returns:
        The unpickled dict unchanged when it has fewer than 200 entries.
        Otherwise a new dict that maps every column of the processed matrix
        (except ``'pat_id'`` and the outcome column) to a tuple
        ``(column_position, imputed_value)``.

    Raises:
        KeyError: if a processed-matrix column has no entry in the pickled
            dict, or an expected column is missing from the processed matrix.
    """
    data_lab_folderpath = os.path.join(dataset_folderpath, lab)

    # Pickles are binary data: open in 'rb' and close the handle
    # deterministically.
    # NOTE(review): pickle.load executes arbitrary code from the file; only
    # use this on trusted, locally produced dumps.
    pickle_path = os.path.join(data_lab_folderpath, "feat2imputed_dict.pkl")
    with open(pickle_path, 'rb') as pickle_file:
        imputations = pickle.load(pickle_file)

    if len(imputations) < 200:
        # Small dict: it only includes the selected features already.
        return imputations

    if lab_type == 'panel':
        ylabel = 'all_components_normal'
    else:
        ylabel = 'component_normal'

    # All raw matrix's columns are included in the pickled dict, so the final
    # features have to be extracted from the processed matrix.
    fm_io = FeatureMatrixIO()
    df_processed = fm_io.read_file_to_data_frame(
        os.path.join(data_lab_folderpath,
                     '%s-normality-matrix-processed.tab' % lab))
    df_processed.pop('pat_id')
    df_processed.pop(ylabel)  # TODO?!

    processed_columns_stanford = df_processed.columns.values.tolist()

    return {
        col_selected: (i, imputations[col_selected])
        for i, col_selected in enumerate(processed_columns_stanford)
    }
예제 #4
0
def load_raw_matrix(lab, dataset_folderpath):
    """Read the raw feature matrix of ``lab`` into a data frame.

    The file is looked up at
    ``<dataset_folderpath>/<lab>/<raw_matrix_template % lab>``.
    """
    matrix_path = os.path.join(dataset_folderpath, lab,
                               raw_matrix_template % lab)

    # TODO: check if raw matrix exists
    return FeatureMatrixIO().read_file_to_data_frame(matrix_path)
    def _build_raw_feature_matrix(self):
        """Build the raw matrix via the base pipeline and record patient ids.

        Delegates to ``SupervisedLearningPipeline._build_raw_feature_matrix``
        with ``LabNormalityMatrix`` as the matrix class. Unless this is a
        hold-out run, the resulting file is read back and the distinct
        ``pat_id`` values are stored in ``self.usedPatIds``.
        """
        raw_matrix_path = self._build_raw_matrix_path()
        SupervisedLearningPipeline._build_raw_feature_matrix(
            self, LabNormalityMatrix, raw_matrix_path)

        if self._holdOut:
            return

        raw_frame = FeatureMatrixIO().read_file_to_data_frame(raw_matrix_path)
        self.usedPatIds = set(raw_frame['pat_id'].values)
예제 #6
0
    def __init__(self, lab_panel, num_episodes, use_cache=None, random_state=None,
                 timeLimit=None, notUsePatIds=None, holdOut=False, pat_batch_ind=None, includeLastNormality=True):
        """Build the raw matrix for ``lab_panel`` and run the pipeline.

        Args:
            lab_panel, num_episodes, use_cache, random_state, timeLimit,
            notUsePatIds: forwarded unchanged to
                ``SupervisedLearningPipeline.__init__``.
            holdOut: when truthy, load the previously saved
                ``feat2imputed_dict.pkl``, build the hold-out processed matrix
                and analyze the predictors on it. Otherwise save the used
                patient ids, build the processed matrix and baseline results,
                save ``feat2imputed_dict`` and train/analyze the predictors.
            pat_batch_ind: stored on ``self.pat_batch_ind``.
            includeLastNormality: when truthy, the raw matrix file is
                rewritten with an extra ``last_normality`` column.
        """
        # self.notUsePatIds = notUsePatIds
        self.pat_batch_ind = pat_batch_ind
        self._holdOut = holdOut
        self.usedPatIds = []
        SupervisedLearningPipeline.__init__(self, lab_panel, num_episodes, use_cache, random_state,
                                            timeLimit, notUsePatIds)
        # TODO: naming of lab_panel
        self._factory = FeatureMatrixFactory()
        self._build_raw_feature_matrix()

        if LAB_TYPE == 'panel':
            self.ylabel = 'all_components_normal'
        else:
            self.ylabel = 'component_normal'

        self.includeLastNormality = includeLastNormality

        if self.includeLastNormality:
            # Same file is read and then overwritten in place.
            raw_matrix_path = 'data/'+lab_panel+'/%s-normality-matrix-raw.tab'%lab_panel
            fm_io = FeatureMatrixIO()
            df = fm_io.read_file_to_data_frame(raw_matrix_path)
            df = df.sort_values(['pat_id', 'order_time']).reset_index(drop=True)
            df['last_normality'] = df['order_proc_id'].apply(lambda x:float('nan'))
            # After the sort, a patient's rows are adjacent: copy the previous
            # row's outcome whenever it belongs to the same patient. The index
            # was reset to 0..n-1, so label-based .loc addresses the same rows
            # the removed .ix indexer did.
            for i in range(1,df.shape[0]):
                if df.loc[i, 'pat_id'] == df.loc[i-1, 'pat_id']:
                    df.loc[i, 'last_normality'] = df.loc[i-1, self.ylabel]
            df.to_csv(raw_matrix_path, index=False, sep='\t')

        data_lab_folder = self._fetch_data_dir_path(inspect.getfile(inspect.currentframe()))
        feat2imputed_dict_path = data_lab_folder + '/feat2imputed_dict.pkl'

        if holdOut:
            # For holdOut evaluation data, produce the raw matrix, pick
            # features according to the saved feat2imputed_dict.
            # Pickles are binary: open with 'rb' and close the handle.
            with open(feat2imputed_dict_path, 'rb') as dict_file:
                self.feat2imputed_dict = pickle.load(dict_file)
            self._build_processed_feature_matrix_holdout()
            self._analyze_predictors_on_holdout()
        else:
            # For training/validation data, record the pat_ids,
            # selected features and their imputed value correspondingly.
            with open('data/used_patient_set_%s.pkl'%self._var, 'wb') as pat_ids_file:
                pickle.dump(self.usedPatIds, pat_ids_file, pickle.HIGHEST_PROTOCOL)
            self._build_processed_feature_matrix()
            self._build_baseline_results()  # TODO: prototype in SLPP
            # return

            # TODO: find better place to put the dict.pkl
            with open(feat2imputed_dict_path, 'wb') as dict_file:
                pickle.dump(self.feat2imputed_dict, dict_file, pickle.HIGHEST_PROTOCOL)
            self._train_and_analyze_predictors()
예제 #7
0
    def _analyze_predictor_holdoutset(self, dest_dir, pipeline_prefix):
        """Evaluate ``self._predictor`` on the processed hold-out matrix.

        The hold-out matrix is read from the parent folder of ``dest_dir``.
        Direct comparisons, ROC / precision-recall / precision-at-k plots and
        a report (95% CI) are written into ``dest_dir``; every output file
        name starts with ``pipeline_prefix`` and is tagged 'holdoutset'.
        """
        # Locate and read the processed hold-out matrix.
        var_slug = '-'.join(self._var.split())
        holdout_file_name = '%s-normality-matrix-%d-episodes-processed-holdout.tab' % (
            var_slug, self._num_rows)
        holdout_frame = FeatureMatrixIO().read_file_to_data_frame(
            dest_dir + '/../' + holdout_file_name)

        # Split off the outcome column; what remains are the predictors.
        if self._isLabPanel:
            outcome_column = 'all_components_normal'
        else:
            outcome_column = 'component_normal'
        y_holdout = pd.DataFrame(holdout_frame.pop(outcome_column))
        analyzer = ClassifierAnalyzer(self._predictor, holdout_frame, y_holdout)
        train_label = 'holdoutset'

        def output_path(name_template):
            # Fill in prefix and label, then place the file inside dest_dir.
            file_name = name_template % (pipeline_prefix, train_label)
            return '/'.join([dest_dir, file_name])

        # Build output paths for plots and report.
        direct_comparisons_path = output_path('%s-direct-compare-results-%s.csv')
        log.debug('direct_comparisons_path: %s' % direct_comparisons_path)
        precision_at_k_plot_path = output_path('%s-precision-at-k-plot-%s.png')
        log.debug('precision_at_k_plot_path: %s' % precision_at_k_plot_path)
        precision_recall_plot_path = output_path('%s-precision-recall-plot-%s.png')
        log.debug('precision_recall_plot_path: %s' %
                  precision_recall_plot_path)
        roc_plot_path = output_path('%s-roc-plot-%s.png')
        log.debug('roc_plot_path: %s' % roc_plot_path)
        report_path = output_path('%s-report-%s.tab')
        log.debug('report_path: %s' % report_path)

        # Write output.
        analyzer.output_direct_comparisons(direct_comparisons_path)
        analyzer.plot_roc_curve('ROC (%s)' % pipeline_prefix, roc_plot_path)
        analyzer.plot_precision_recall_curve(
            'Precision-Recall (%s)' % pipeline_prefix,
            precision_recall_plot_path)
        analyzer.plot_precision_at_k_curve(
            'Precision @K (%s)' % pipeline_prefix,
            precision_at_k_plot_path)
        analyzer.write_report(report_path, ci=0.95)
예제 #8
0
 def read_lab_meta_report(lab_panel):
     """Read the normality prediction report for ``lab_panel``.

     The lab-level report under ``<data_dir>/<lab_panel>/`` is preferred.
     When that file does not exist, the report of the REGRESS_AND_ROUND
     algorithm (one folder deeper) is read instead.
     """
     fm_io = FeatureMatrixIO()
     data_dir = LabNormalityReport.fetch_data_dir_path()
     meta_report_path = data_dir + '/%s/%s-normality-prediction-report.tab' % (lab_panel, lab_panel)

     if not os.path.exists(meta_report_path):
         # No meta report: fetch the data on class counts from the
         # per-algorithm report instead.
         algorithm = SupervisedClassifier.REGRESS_AND_ROUND
         fallback_path = data_dir + '/%s/%s/%s-normality-prediction-report.tab' % (lab_panel, algorithm, lab_panel)
         return fm_io.read_file_to_data_frame(fallback_path)

     return fm_io.read_file_to_data_frame(meta_report_path)
    def _build_processed_feature_matrix_holdout(self):
        """Build and write the processed matrix for the hold-out data.

        Columns are restricted to the keys of ``self.feat2imputed_dict``,
        reordered to match a previously written processed matrix, missing
        values are filled with the saved imputation values, and the result
        is written (without header) to the processed matrix path.
        """
        fm_io = FeatureMatrixIO()
        raw_frame = fm_io.read_file_to_data_frame(self._build_raw_matrix_path())

        # TODO: feat2imputed_dict includes the outcome label
        holdout_frame = raw_frame[self.feat2imputed_dict.keys()].copy()

        # TODO: tmp solution!
        # Derive the path of an earlier processed matrix from the current one
        # and adopt its column selection and order.
        reference_path = self._build_processed_matrix_path()
        reference_path = reference_path.replace("2000", "10000")
        reference_path = reference_path.replace("-holdout", "")
        reference_frame = FeatureMatrixIO().read_file_to_data_frame(reference_path)
        holdout_frame = holdout_frame[reference_frame.columns]
        # TODO: tmp solution!

        for feat, imputed_value in self.feat2imputed_dict.items():
            holdout_frame[feat] = holdout_frame[feat].fillna(imputed_value)

        output_path = self._build_processed_matrix_path()
        fm_io.write_data_frame_to_file(holdout_frame, output_path, None)
    def _analyze_predictors_on_holdout(self):
        """Analyze every supported algorithm's saved model on the hold-out set.

        For each entry of ``SupervisedClassifier.SUPPORTED_ALGORITHMS`` the
        dumped model is loaded from disk (when the dump exists) into
        ``self._predictor`` and
        ``SupervisedLearningPipeline._analyze_predictor_holdoutset`` is run
        with the algorithm's report directory and file prefix.
        """
        algorithms_to_test = list(SupervisedClassifier.SUPPORTED_ALGORITHMS)

        pipeline_file_name = inspect.getfile(inspect.currentframe())
        data_dir = SupervisedLearningPipeline._fetch_data_dir_path(
            self, pipeline_file_name)
        # for algorithm in SupervisedClassifier.SUPPORTED_ALGORITHMS:
        #     algorithms_to_test.append('bifurcated-%s' % algorithm)
        log.debug('algorithms_to_test: %s' % algorithms_to_test)
        for algorithm in algorithms_to_test:
            log.info('analyzing %s...' % algorithm)
            # Reports go into one sub-directory per algorithm. The directory
            # is not created here; it is expected to exist already.
            report_dir = '/'.join([data_dir, algorithm])

            pipeline_prefix = '%s-normality-prediction-%s' % (self._var,
                                                              algorithm)

            predictor_path = self._build_model_dump_path(algorithm)

            if os.path.exists(
                    predictor_path) and 'bifurcated' not in algorithm:
                log.debug('Loading model from disk...')
                # TODO(sbala): Fix joblib.load so that it works for bifurcated
                # supervised classifiers.
                self._predictor = joblib.load(predictor_path)
            # NOTE(review): when no dump is found, self._predictor keeps
            # whatever value it had before this iteration -- confirm that
            # analyzing with it is intended.

            SupervisedLearningPipeline._analyze_predictor_holdoutset(
                self, report_dir, pipeline_prefix)
예제 #11
0
def test_SupervisedLearner():
    # Smoke test: run a minimal SupervisedLearner subclass on the processed
    # LABA1C training matrix (with the 'pat_id' column removed).
    from medinfo.ml.SupervisedLearner import SupervisedLearner
    import inspect
    from medinfo.dataconversion.FeatureMatrixIO import FeatureMatrixIO

    class LabNormalityLearner(SupervisedLearner):
        def __init__(self, input_matrix, ylabel):
            # Working folder is the directory part of this source file's path.
            this_file = inspect.getfile(inspect.currentframe())
            self.working_folderpath = '/'.join(this_file.split('/')[:-1])
            self.input_matrix = input_matrix
            self.ylabel = ylabel

    matrix_path = 'data-testingSupervisedLearner-panel-10000-episodes/LABA1C/LABA1C-normality-train-matrix-processed.tab'
    processed_matrix = FeatureMatrixIO().read_file_to_data_frame(matrix_path)
    processed_matrix.pop('pat_id')

    learner = LabNormalityLearner(processed_matrix, 'all_components_normal')
    learner.run()
예제 #12
0
    def test_read_file_to_data_frame(self):
        # Reading a matrix file must give the same frame whether or not the
        # file carries a header.
        fm_io = FeatureMatrixIO()

        # The fixture files live next to this test module.
        app_dir = os.path.dirname(os.path.abspath(inspect.getfile(inspect.currentframe())))
        with_header_file_path = os.path.join(app_dir, 'test-matrix-with-header.tab')
        no_header_file_path = os.path.join(app_dir, 'test-matrix-no-header.tab')

        # Read both fixtures.
        matrix_stripped_header = fm_io.read_file_to_data_frame(with_header_file_path)
        matrix_no_header = fm_io.read_file_to_data_frame(no_header_file_path)

        # Both must equal the manually built expectation.
        expected_matrix = MANUAL_TEST_CASE['matrix_no_header']
        for actual_matrix in (matrix_stripped_header, matrix_no_header):
            assert_frame_equal(expected_matrix, actual_matrix)
예제 #13
0
    def test_strip_header(self):
        # strip_header must produce a file whose content equals the manually
        # built header-less matrix.
        fm_io = FeatureMatrixIO()

        # The fixture file lives next to this test module.
        app_dir = os.path.dirname(os.path.abspath(inspect.getfile(inspect.currentframe())))
        with_header_file_path = os.path.join(app_dir, 'test-matrix-with-header.tab')

        # Read the file once (result unused), then strip its header.
        fm_io.read_file_to_data_frame(with_header_file_path)
        stripped_path = fm_io.strip_header(with_header_file_path)
        self._stripped_header_file_path = stripped_path

        # Validate matrix data.
        actual_matrix = fm_io.read_file_to_data_frame(
            self._stripped_header_file_path, datetime_col_index=1)
        assert_frame_equal(MANUAL_TEST_CASE['matrix_no_header'], actual_matrix)
예제 #14
0
def jitter_processed_matrix(lab, pat_num_limit=100):
    """Print a processed matrix restricted to the first patients, re-numbered.

    Reads ``<data_folder>/<lab>/<lab>-normality-matrix-10000-episodes-processed.tab``,
    keeps the ``pat_num_limit`` smallest ``pat_id`` values and replaces each
    by its rank (0, 1, ...). Rows of other patients get a null id and are
    dropped, together with any other row containing a null value. The
    remaining values and the column index are printed; nothing is returned.
    """
    data_path = os.path.join(
        data_folder, lab,
        "%s-normality-matrix-10000-episodes-processed.tab" % lab)
    df = FeatureMatrixIO().read_file_to_data_frame(data_path)

    # Reset the pat ids
    kept_pat_ids = sorted(set(df['pat_id'].values.tolist()))[:pat_num_limit]
    pat2pat = {pat_id: new_id for new_id, pat_id in enumerate(kept_pat_ids)}
    df['pat_id'] = df['pat_id'].apply(lambda old_id: pat2pat.get(old_id))
    df = df.dropna()

    print(np.array_repr(df.values))
    print(df.columns)
예제 #15
0
def test(test_suite=()):
    """Run the selected LabNormalityLearner transformer checks.

    Args:
        test_suite: Collection of check names. ``'remove'`` exercises
            ``LNL.FeatureRemover`` and ``'impute'`` exercises
            ``LNL.FeatureImputer``; with the (empty) default only the raw
            test matrix is loaded.

    Raises:
        AssertionError: if one of the selected checks fails.
    """
    import LabNormalityLearner_Config as Config
    from medinfo.dataconversion.FeatureMatrixIO import FeatureMatrixIO

    fm_io = FeatureMatrixIO()
    raw_matrix = fm_io.read_file_to_data_frame(
        'LabNormalityLearner_TestData/LABA1C-normality-matrix-raw.tab')

    if 'remove' in test_suite:
        remover = LNL.FeatureRemover(Config.features_to_remove)
        processed_matrix_removed = remover.transform(raw_matrix)
        # NOTE(review): these expect MORE rows and the SAME number of columns
        # after removal, which looks inverted for a feature remover --
        # confirm against FeatureRemover before relying on this check.
        assert raw_matrix.shape[0] < processed_matrix_removed.shape[0]
        assert raw_matrix.shape[1] == processed_matrix_removed.shape[1]

    if 'impute' in test_suite:
        features_to_impute = [
            'TBIL.-14_0.max', 'TBIL.-14_0.median', 'TBIL.-14_0.mean',
            'TBIL.-14_0.std'
        ]
        #('min', 'max', 'median', 'mean', 'std', 'first', 'last', 'diff', 'slope', 'proximate')
        # Every listed feature is imputed with the constant 0.
        imputation_dict = {feature: 0 for feature in features_to_impute}

        imputer = LNL.FeatureImputer(imputation_dict=imputation_dict)
        columns_to_look = ['pat_id'] + features_to_impute
        print('%s %s' % ('raw_matrix[columns_to_look].head():',
                         raw_matrix[columns_to_look].head()))

        processed_matrix_imputed = imputer.fit_transform(raw_matrix)
        print('%s %s' % ('processed_matrix_imputed[columns_to_look].head():',
                         processed_matrix_imputed[columns_to_look].head()))

        # No inspected column may contain a missing value after imputation.
        assert not processed_matrix_imputed[columns_to_look].isna().any().any()
        # Imputation must neither reorder nor drop rows.
        assert (raw_matrix['order_proc_id'].values ==
                processed_matrix_imputed['order_proc_id'].values).all()
예제 #16
0
    def test_write_data_frame_to_file(self):
        # Writing the manual matrix with and without a custom header must
        # reproduce the two reference files byte for byte.
        fm_io = FeatureMatrixIO()

        # Reference files live next to this test module.
        app_dir = os.path.dirname(os.path.abspath(inspect.getfile(inspect.currentframe())))
        expected_no_header_path = os.path.join(app_dir, 'test-matrix-no-header.tab')
        expected_with_header_path = os.path.join(app_dir, 'test-matrix-with-header.tab')

        # Manual test data.
        matrix_no_header = MANUAL_TEST_CASE['matrix_no_header']
        matrix_header = MANUAL_TEST_CASE['custom_header']

        # Write data frame without header.
        self._no_header_temp_file_path = os.path.join(app_dir, 'no-header-temp-file.tab')
        fm_io.write_data_frame_to_file(matrix_no_header, self._no_header_temp_file_path)

        # Write data frame with header.
        self._with_header_temp_file_path = os.path.join(app_dir, 'header-temp-file.tab')
        fm_io.write_data_frame_to_file(matrix_no_header, self._with_header_temp_file_path, matrix_header)

        # Validate output files.
        comparisons = (
            (expected_no_header_path, self._no_header_temp_file_path),
            (expected_with_header_path, self._with_header_temp_file_path),
        )
        for expected_path, written_path in comparisons:
            self.assertTrue(filecmp.cmp(expected_path, written_path))
예제 #17
0
def load_processed_matrix(lab, dataset_folderpath, type='full'):
    """Read a processed matrix of ``lab`` into a data frame.

    ``type`` picks the file name template: 'train' and 'evalu' use their
    dedicated templates, any other value uses the full processed matrix
    template. When the resulting file does not exist, the same path with
    '-test' replaced by '-evalu' is read instead.
    """
    if type == 'train':
        file_template = processed_matrix_train_template
    elif type == 'evalu':
        file_template = processed_matrix_evalu_template
    else:
        file_template = processed_matrix_template
    matrix_filepath = os.path.join(dataset_folderpath, lab,
                                   file_template % lab)

    fm_io = FeatureMatrixIO()

    # TODO: check if raw matrix exists
    if not os.path.exists(matrix_filepath):
        matrix_filepath = matrix_filepath.replace('-test', '-evalu')
    return fm_io.read_file_to_data_frame(matrix_filepath)
예제 #18
0
 def write_matrix(self, dest_path, header=None):
     """Copy the factory's matrix file to ``dest_path`` without '#' lines.

     The source file is the one reported by
     ``self._factory.getMatrixFileName()``; it is deleted after the copy.

     Args:
         dest_path: Path of the file to (over)write.
         header: Currently unused; the code that wrote it is commented out.
     """
     log.info('Writing matrix file...')
     # Get old matrix file.
     source_path = self._factory.getMatrixFileName()
     # Write to new matrix file. Both handles are closed (and the output
     # flushed) before the source file is removed.
     with open(dest_path, 'w') as matrix_file:
         # for line in header:
         #     matrix_file.write('# %s\n' % line)
         with open(source_path, 'r') as source_file:
             for line in source_file:
                 # Lines starting with '#' are header/comment lines: skip.
                 if not line.startswith('#'):
                     matrix_file.write(line)
     # Delete old matrix file.
     os.remove(source_path)
    def _build_processed_feature_matrix(self, params):
        """Build, or reload from cache, the processed feature matrix.

        If the file at ``params['processed_matrix_path']`` exists and
        ``self._flush_cache`` is falsy, it is read and only the train/test
        split is redone. Otherwise the raw matrix at
        ``params['raw_matrix_path']`` is read, features are added, removed
        and optionally filtered, data is imputed, all-null features are
        dropped, the data is split, features are selected, and the recombined
        train+test rows are written to ``params['processed_matrix_path']``
        together with a header.

        Args:
            params: dict of processing options; the keys read here are listed
                in the comment below.
        """
        # params is a dict defining the details of how the raw feature matrix
        # should be transformed into the processed matrix. Given the sequence
        # of steps will be identical across all pipelines, sbala decided to
        # pack all the variability into this dict. It's not ideal because the
        # dict has 10+ values, but that seems better than forcing all pipelines
        # to reproduce the logic of the processing steps.
        # Principle: Minimize overridden function calls.
        #   params['raw_matrix_path'] = raw_matrix_path
        #   params['processed_matrix_path'] = processed_matrix_path
        #   params['features_to_add'] = features_to_add
        #   params['features_to_keep'] = features_to_keep
        #   params['features_to_filter_on'] (optional) = features_to_filter_on
        #   params['imputation_strategies'] = imputation_strategies
        #   params['features_to_remove'] = features_to_remove
        #   params['outcome_label'] = outcome_label
        #   params['selection_problem'] = selection_problem
        #   params['selection_algorithm'] = selection_algorithm
        #   params['percent_features_to_select'] = percent_features_to_select
        #   params['matrix_class'] = matrix_class
        #   params['pipeline_file_path'] = pipeline_file_path
        #   TODO(sbala): Determine which fields should have defaults.
        fm_io = FeatureMatrixIO()
        log.debug('params: %s' % params)
        # If processed matrix exists, and the client has not requested to flush
        # the cache, just use the matrix that already exists and return.
        processed_matrix_path = params['processed_matrix_path']
        if os.path.exists(processed_matrix_path) and not self._flush_cache:
            # Assume feature selection already happened, but we still need
            # to split the data into training and test data.
            processed_matrix = fm_io.read_file_to_data_frame(
                processed_matrix_path)
            self._train_test_split(processed_matrix, params['outcome_label'])
        else:
            # Read raw matrix.
            raw_matrix = fm_io.read_file_to_data_frame(
                params['raw_matrix_path'])
            # Initialize FMT.
            fmt = FeatureMatrixTransform()
            fmt.set_input_matrix(raw_matrix)

            # Add features.
            self._add_features(fmt, params['features_to_add'])
            # Remove features.
            self._remove_features(fmt, params['features_to_remove'])
            # Filter on features
            if 'features_to_filter_on' in params:
                self._filter_on_features(fmt, params['features_to_filter_on'])

            # HACK: When read_csv encounters duplicate columns, it deduplicates
            # them by appending '.1, ..., .N' to the column names.
            # In future versions of pandas, simply pass mangle_dupe_cols=True
            # to read_csv, but not ready as of pandas 0.22.0.
            # Only the '.1' suffix is checked here; such columns are removed
            # and recorded in self._removed_features.
            for feature in raw_matrix.columns.values:
                if feature[-2:] == ".1":
                    fmt.remove_feature(feature)
                    self._removed_features.append(feature)

            # Impute data.
            self._impute_data(fmt, raw_matrix, params['imputation_strategies'])

            # In case any all-null features were created in preprocessing,
            # drop them now so feature selection will work
            fmt.drop_null_features()

            # Build interim matrix.
            processed_matrix = fmt.fetch_matrix()

            # Divide processed_matrix into training and test data.
            # This must happen before feature selection so that we don't
            # accidentally learn information from the test data.
            self._train_test_split(processed_matrix, params['outcome_label'])

            self._select_features(params['selection_problem'],
                                  params['percent_features_to_select'],
                                  params['selection_algorithm'],
                                  params['features_to_keep'])

            # Recombine outcome and features; training rows come before test
            # rows in the written matrix.
            train = self._y_train.join(self._X_train)
            test = self._y_test.join(self._X_test)
            processed_matrix = train.append(test)

            # Write output to new matrix file.
            header = self._build_processed_matrix_header(params)
            fm_io.write_data_frame_to_file(processed_matrix, \
                processed_matrix_path, header)
예제 #20
0
 def get_raw_matrix(self):
     """Read the file at ``self.raw_matrix_filepath`` into a data frame."""
     matrix_reader = FeatureMatrixIO()
     raw_matrix = matrix_reader.read_file_to_data_frame(
         self.raw_matrix_filepath)
     return raw_matrix
예제 #21
0
    def _build_processed_feature_matrix(self):
        """Build, or reload from cache, the processed lab-change matrix.

        The first half assembles the ``params`` dict (paths, features to
        add / keep / filter on / remove, outcome label 'unchanged_yn',
        feature selection settings, descriptive overview text). The second
        half processes the matrix:

        * If the processed matrix file exists and ``self._flush_cache`` is
          falsy, it is read, sorted by index and split into train/test.
        * Otherwise the raw matrix is read, filtered and extended with the
          'change' feature, split into train/test, and only the training
          part is run through feature removal, filtering, imputation and
          null-feature dropping. The test part is then restricted to the
          training columns and filled from ``self.feat2imputed_dict``.
          After feature selection, train and test are recombined, sorted by
          index and written to the processed matrix path with a header.
        """
        # Define parameters for processing steps.
        params = {}
        raw_matrix_path = self._build_raw_matrix_path()
        processed_matrix_path = self._build_processed_matrix_path(raw_matrix_path)

        # NOTE(review): params is still empty at this point, so this logs '{}'.
        log.debug('params: %s' % params)

        prev_measurement_feature = self._change_params['feature_old']
        features_to_add = {'change': [self._change_params]}
        features_to_filter_on = [{'feature': prev_measurement_feature,
                                  'value':np.nan}]
        imputation_strategies = {
        }

        features_to_remove = [
            'pat_id', 'order_time', 'order_proc_id', 'ord_num_value',
            'proc_code', 'abnormal_panel', 'all_components_normal',
            'num_normal_components', 'Birth.pre',
            'Male.preTimeDays', 'Female.preTimeDays',
            'RaceWhiteHispanicLatino.preTimeDays',
            'RaceWhiteNonHispanicLatino.preTimeDays',
            'RaceHispanicLatino.preTimeDays',
            'RaceAsian.preTimeDays',
            'RaceBlack.preTimeDays',
            'RacePacificIslander.preTimeDays',
            'RaceNativeAmerican.preTimeDays',
            'RaceOther.preTimeDays',
            'RaceUnknown.preTimeDays',
            'Death.post',
            'Death.postTimeDays',
            'num_components'
        ]
        features_to_keep = [
            # Keep the # of times it's been ordered in past, even if low info.
            '%s.pre' % self._var
        ]
        outcome_label = 'unchanged_yn'
        selection_problem = FeatureSelector.CLASSIFICATION
        selection_algorithm = FeatureSelector.RECURSIVE_ELIMINATION
        percent_features_to_select = 0.05
        matrix_class = LabChangeMatrix
        pipeline_file_path = inspect.getfile(inspect.currentframe())
        data_overview = [
            # Overview:
            'Overview',
            # The outcome label is ___.
            'The outcome label is %s.' % outcome_label,
            # %s is a boolean indicator which summarizes whether the lab test
            '%s is a boolean indicator which summarizes whether the lab test ' % outcome_label,
            # result is unchanged compared to the previous measurement.
            'result is unchanged compared to the previous measurement.',
            # Each row represents a unique lab panel order.
            'Each row represents a unique lab panel order.',
            # Each row contains fields summarizing the patient's demographics,
            "Each row contains fields summarizing the patient's demographics",
            # inpatient admit date, prior vitals, and prior lab results.
            'inpatient admit date, prior vitals, and prior lab results.',
            # Most cells in matrix represent a count statistic for an event's
            "Most cells in matrix represent a count statistic for an event's",
            # occurrence or a difference between an event's time and index_time.
            "occurrence or a difference between an event's time and index_time.",
            # Lab panel orders were only included if a previous measurement of
            "Lab panel orders were only included if a previous measurement of",
            # the same lab panel has been recorded
            "the same lab panel has been recorded."
        ]

        # Bundle parameters into single object
        params['raw_matrix_path'] = raw_matrix_path
        params['processed_matrix_path'] = processed_matrix_path
        params['features_to_add'] = features_to_add
        params['features_to_keep'] = features_to_keep
        params['features_to_filter_on'] = features_to_filter_on
        params['imputation_strategies'] = imputation_strategies
        params['features_to_remove'] = features_to_remove
        params['outcome_label'] = outcome_label
        params['selection_problem'] = selection_problem
        params['selection_algorithm'] = selection_algorithm
        params['percent_features_to_select'] = percent_features_to_select
        params['matrix_class'] = matrix_class
        params['pipeline_file_path'] = pipeline_file_path
        params['data_overview'] = data_overview

        # defer to SupervisedLearningPipeline logic by SX
        fm_io = FeatureMatrixIO()
        log.debug('params: %s' % params)
        # If processed matrix exists, and the client has not requested to flush
        # the cache, just use the matrix that already exists and return.
        processed_matrix_path = params['processed_matrix_path']
        if os.path.exists(processed_matrix_path) and not self._flush_cache:
            # Assume feature selection already happened, but we still need
            # to split the data into training and test data.
            processed_matrix = fm_io.read_file_to_data_frame(processed_matrix_path)
            '''
            Make sure the order of rows is consistent before splitting
            '''
            processed_matrix.sort_index(inplace=True)
            self._train_test_split(processed_matrix, params['outcome_label']) #TODO sxu: when reloading, no pat_id
        else:
            # Read raw matrix.
            raw_matrix = fm_io.read_file_to_data_frame(params['raw_matrix_path'])
            # Initialize FMT.

            # Add outcome label
            # A dedicated transform is used on the full raw matrix so that
            # filtering and the added 'change' feature exist before the split.
            raw_fmt = FeatureMatrixTransform()
            raw_fmt.set_input_matrix(raw_matrix)
            self._filter_on_features(raw_fmt, params['features_to_filter_on'])
            self._add_features(raw_fmt, params['features_to_add'])
            raw_matrix = raw_fmt.fetch_matrix()

            # Divide processed_matrix into training and test data.
            # This must happen before feature selection so that we don't
            # accidentally learn information from the test data.

            # TODO: work on this...
            self._train_test_split(raw_matrix, params['outcome_label'])

            # From here on only the training rows go through the transform.
            fmt = FeatureMatrixTransform()
            train_df = self._X_train.join(self._y_train)
            fmt.set_input_matrix(train_df)

            # Remove features.
            self._remove_features(fmt, params['features_to_remove'])
            # Filter on features
            # (the key is always present here: it is set unconditionally above)
            if 'features_to_filter_on' in params:
                self._filter_on_features(fmt, params['features_to_filter_on'])

            # HACK: When read_csv encounters duplicate columns, it deduplicates
            # them by appending '.1, ..., .N' to the column names.
            # In future versions of pandas, simply pass mangle_dupe_cols=True
            # to read_csv, but not ready as of pandas 0.22.0.
            for feature in raw_matrix.columns.values:
                if feature[-2:] == ".1":
                    fmt.remove_feature(feature)
                    self._removed_features.append(feature)

            # Impute data.
            self._impute_data(fmt, train_df, params['imputation_strategies'])

            # In case any all-null features were created in preprocessing,
            # drop them now so feature selection will work
            fmt.drop_null_features()

            # Build interim matrix.
            train_df = fmt.fetch_matrix()

            # Split the transformed training frame back into outcome/features.
            self._y_train = pd.DataFrame(train_df.pop(params['outcome_label']))
            self._X_train = train_df

            '''
            Select X_test columns according to processed X_train
            '''
            self._X_test = self._X_test[self._X_train.columns]

            '''
            Impute data according to the same strategy when training
            '''
            # NOTE(review): self.feat2imputed_dict is not assigned in this
            # method; presumably it is filled by _impute_data -- confirm.
            for feat in self._X_test.columns:
                self._X_test[feat] = self._X_test[feat].fillna(self.feat2imputed_dict[feat])

            self._select_features(params['selection_problem'],
                params['percent_features_to_select'],
                params['selection_algorithm'],
                params['features_to_keep'])

            train = self._y_train.join(self._X_train)
            test = self._y_test.join(self._X_test)

            processed_matrix = train.append(test)
            '''
            Need to recover the order of rows before writing into disk
            '''
            processed_matrix.sort_index(inplace=True)

            # Write output to new matrix file.
            header = self._build_processed_matrix_header(params)
            fm_io.write_data_frame_to_file(processed_matrix, \
                processed_matrix_path, header)
예제 #22
0
from scripts.LabTestAnalysis.machine_learning import LabNormalityPredictionPipeline
import matplotlib
matplotlib.rcParams['backend'] = 'TkAgg'
import matplotlib.pyplot as plt

folder = '../machine_learning/data/'
labs = LabNormalityPredictionPipeline.NON_PANEL_TESTS_WITH_GT_500_ORDERS

if False:
    all_testset_leakage_percentages = []
    all_best_aucrocs = []
    for lab in labs:

        try:
            raw_matrix_file = '%s-normality-matrix-10000-episodes-raw.tab' % lab
            fm_io = FeatureMatrixIO()
            raw_matrix = fm_io.read_file_to_data_frame(folder + '/' + lab +
                                                       '/' + raw_matrix_file)

            row, col = raw_matrix.shape

            from medinfo.ml.SupervisedClassifier import SupervisedClassifier
            algs = SupervisedClassifier.SUPPORTED_ALGORITHMS

            best_aucroc = 0
            for alg in algs:
                report_file = '%s-normality-prediction-%s-report.tab' % (lab,
                                                                         alg)
                report_df = pd.read_csv(folder + '/' + lab + '/' + alg + '/' +
                                        report_file,
                                        sep='\t')
예제 #23
0
        labs['predictable_CV'] = (labs['percent_predictably_positive'].astype('float') * labs['annual_median_charge_volume ($)'].astype('float') / 1000).map('${:,.0f}'.format)
        labs['predictable_CV[-0.95]'] = (labs['percent_predictably_positive_0.95_lower_ci'].astype('float') * labs['annual_median_charge_volume ($)'].astype('float') / 1000).map('${:,.0f}'.format)
        labs['predictable_CV[+0.95]'] = (labs['percent_predictably_positive_0.95_upper_ci'].astype('float') * labs['annual_median_charge_volume ($)'].astype('float') / 1000).map('${:,.0f}'.format)

        summary = DataFrame()
        summary['lab'] = labs['label']
        summary['charge'] = labs['median_charge'].astype('float').map('${:,.0f}'.format)
        summary['volume'] = labs['volume'].floordiv(6).astype('float').map('{:,.0f}'.format)
        summary['normal rate'] = labs['normality']
        summary['[email protected]'] = labs['[email protected]'] + ' [' + \
            labs['[email protected][-0.95]'] + ', ' + \
            labs['[email protected][+0.95]'] + ']'
        summary['predictable CV ($1,000s)'] = labs['predictable_CV'] + ' [' + \
            labs['predictable_CV[-0.95]'] + ', ' + \
            labs['predictable_CV[+0.95]'] + ']'

        return summary


if __name__ == '__main__':
    # Script entry point: build each LabNormalityReport summary in turn and
    # write it to a tab-delimited file in the working directory.
    fm_io = FeatureMatrixIO()

    # Per-lab performance summary.
    lab_performance = LabNormalityReport.build_lab_performance_summary_table()
    fm_io.write_data_frame_to_file(lab_performance,
                                   'lab-performance-summary.tab')

    # Per-algorithm performance summary.
    algorithm_performance = \
        LabNormalityReport.build_algorithm_performance_summary_table()
    fm_io.write_data_frame_to_file(algorithm_performance,
                                   'algorithm-performance-summary.tab')

    # Plot step (its output is handled inside LabNormalityReport).
    LabNormalityReport.plot_predictable_and_expensive_charges()

    # Predictability report, first with default arguments and then with
    # all=True.
    predictability_outputs = (
        ('predictable-labs.tab', {}),
        ('all-labs.tab', {'all': True}),
    )
    for output_name, report_kwargs in predictability_outputs:
        report = LabNormalityReport.build_lab_predictability_summary_report(
            **report_kwargs)
        fm_io.write_data_frame_to_file(report, output_name)
예제 #24
0
 def __init__(self):
     """Create a FeatureMatrixIO helper and keep it on the instance as ``_fm_io``."""
     self._fm_io = FeatureMatrixIO()
예제 #25
0
    def _build_processed_feature_matrix(self, params):
        """Build (or reload from cache) the processed feature matrix.

        params is a dict defining the details of how the raw feature matrix
        should be transformed into the processed matrix. Given the sequence
        of steps will be identical across all pipelines, sbala decided to
        pack all the variability into this dict. It's not ideal because the
        dict has 10+ values, but that seems better than forcing all pipelines
        to reproduce the logic of the processing steps.
        Principle: Minimize overridden function calls.

        Keys read directly by this method:
            params['processed_matrix_path'] - path of the processed matrix.
            params['raw_matrix_path'] - path of the raw matrix.
            params['outcome_label'] - name of the outcome column.
            params['features_to_add']
            params['features_to_filter_on'] (optional)
            params['imputation_strategies']
            params['features_to_remove']
            params['selection_problem']
            params['selection_algorithm']
            params['percent_features_to_select']
            params['features_to_keep']
        The whole dict is also forwarded to _build_processed_matrix_header
        (the original notes list 'matrix_class' and 'pipeline_file_path' as
        further expected keys).
        TODO(sbala): Determine which fields should have defaults.

        Side effects:
            * When the processed matrix has to be rebuilt, writes the
              processed matrix plus separate train/test matrices to disk.
            * Leaves self._X_train / self._X_test without the 'pat_id'
              column and stores the popped ids in self._patIds_train /
              self._patIds_test.

        Raises:
            AssertionError: if a pat_id appears in both train and test sets.
        """
        fm_io = FeatureMatrixIO()
        log.debug('params: %s', params)

        # If processed matrix exists, and the client has not requested to flush
        # the cache, just use the matrix that already exists.
        processed_matrix_path = params['processed_matrix_path']
        if os.path.exists(processed_matrix_path) and not self._flush_cache:
            # Assume feature selection already happened, but we still need
            # to split the data into training and test data.
            processed_matrix = fm_io.read_file_to_data_frame(
                processed_matrix_path)

            # NOTE: pandas may automatically convert bigint to float (and
            # round the last few digits), which may damage the uniqueness
            # of pat_ids.
            self._train_test_split(processed_matrix, params['outcome_label'])
        else:
            # Read raw matrix.
            raw_matrix = fm_io.read_file_to_data_frame(
                params['raw_matrix_path'])

            # Keep the patient ids aside; they are joined back (by index)
            # after feature selection.
            patIds_df = raw_matrix['pat_id'].copy()

            # Divide the matrix into training and test data.
            # This must happen before feature selection so that we don't
            # accidentally learn information from the test data.
            self._train_test_split(raw_matrix, params['outcome_label'])

            # Initialize FMT on the training rows only.
            fmt = FeatureMatrixTransform()
            train_df = self._X_train.join(self._y_train)
            fmt.set_input_matrix(train_df)

            # Add features.
            self._add_features(fmt, params['features_to_add'])

            # Filter on features
            if 'features_to_filter_on' in params:
                self._filter_on_features(fmt, params['features_to_filter_on'])

            # HACK: When read_csv encounters duplicate columns, it deduplicates
            # them by appending '.1, ..., .N' to the column names.
            # In future versions of pandas, simply pass mangle_dupe_cols=True
            # to read_csv, but not ready as of pandas 0.22.0.
            for feature in raw_matrix.columns.values:
                if feature[-2:] == ".1":
                    fmt.remove_feature(feature)
                    self._removed_features.append(feature)

            # Impute data. Feature removal happens inside each branch rather
            # than up front, since pat_id is still needed for imputation.
            use_sx_imputation = \
                params['imputation_strategies'] == {'sxu_new_imputation'}
            if use_sx_imputation:
                train_df = fmt.fetch_matrix()
                # Column means are computed on the training data only and
                # then reused for the test data.
                means = {
                    column: train_df[column].mean()
                    for column in train_df.columns.values.tolist()
                    if train_df[column].dtype == 'float64'
                }

                train_df = fmt.do_impute_sx(train_df, means)
                fmt.set_input_matrix(train_df)
                self._X_test = fmt.do_impute_sx(self._X_test, means)

                self._remove_features(fmt, params['features_to_remove'])
            else:
                self._remove_features(fmt, params['features_to_remove'])
                self._impute_data(fmt, train_df,
                                  params['imputation_strategies'])

            # In case any all-null features were created in preprocessing,
            # drop them now so feature selection will work
            fmt.drop_null_features()

            # Build interim matrix.
            train_df = fmt.fetch_matrix()

            self._y_train = pd.DataFrame(train_df.pop(params['outcome_label']))
            self._X_train = train_df

            # Select X_test columns according to processed X_train.
            self._X_test = self._X_test[self._X_train.columns]

            if not use_sx_imputation:
                # Impute test data with the values recorded in
                # self.feat2imputed_dict (same strategy as for training).
                for feat in self._X_test.columns:
                    self._X_test[feat] = self._X_test[feat].fillna(
                        self.feat2imputed_dict[feat])

            self._select_features(params['selection_problem'],
                                  params['percent_features_to_select'],
                                  params['selection_algorithm'],
                                  params['features_to_keep'])

            # The join is based on index by default.
            # 'pat_id' is popped again at the end of this method
            # (TODO sxu: more general in the future).
            self._X_train = self._X_train.join(patIds_df, how='left')
            self._X_test = self._X_test.join(patIds_df, how='left')

            train = self._y_train.join(self._X_train)
            test = self._y_test.join(self._X_test)

            # NOTE(review): str.replace acts on the whole path, so a
            # directory name containing "matrix" is rewritten as well.
            processed_trainMatrix_path = processed_matrix_path.replace(
                "matrix", "train-matrix")
            train.to_csv(processed_trainMatrix_path, sep='\t', index=False)
            processed_testMatrix_path = processed_matrix_path.replace(
                "matrix", "test-matrix")
            test.to_csv(processed_testMatrix_path, sep='\t', index=False)

            # DataFrame.append was removed in pandas 2.0; pd.concat is the
            # equivalent that works on old and new pandas alike.
            processed_matrix = pd.concat([train, test])

            # Recover the order of rows before writing into disk,
            # where the index info will be missing.
            processed_matrix.sort_index(inplace=True)

            # Write output to new matrix file.
            header = self._build_processed_matrix_header(params)
            fm_io.write_data_frame_to_file(processed_matrix,
                                           processed_matrix_path, header)

        # Pop out pat_id from the feature matrices.
        # Also check whether there is pat_id leakage.
        self._patIds_train = self._X_train.pop('pat_id').values.tolist()
        self._patIds_test = self._X_test.pop('pat_id').values.tolist()
        assert not (set(self._patIds_train) & set(self._patIds_test))
예제 #26
0
class ConditionMortalityPredictor:
    """Train and evaluate a 28-day mortality predictor for one condition.

    Constructing an instance runs the whole pipeline: (optionally) build the
    raw condition mortality matrix, process it into a feature matrix, train
    a SupervisedClassifier on it and compute its accuracy on the test split.
    """

    def __init__(self, condition, num_patients, icd_list=None, use_cache=None):
        """Run the full build / process / train / test pipeline.

        Args:
            condition: condition name; whitespace is replaced by '-' to form
                the matrix file names.
            num_patients: number of patients, used in the file names and
                passed to ConditionMortalityMatrix.
            icd_list: optional value passed through to
                ConditionMortalityMatrix and recorded in the file header.
            use_cache: when None (the default) the raw matrix is rebuilt;
                any other value skips the rebuild and the existing raw
                matrix file is read instead.
        """
        self._condition = condition
        self._num_patients = num_patients
        self._icd_list = icd_list

        self._FEATURES_TO_REMOVE = [
            'index_time', 'death_date', 'Death.post', 'Death.postTimeDays',
            'Birth.pre', 'Male.preTimeDays', 'Female.preTimeDays',
            'RaceWhiteHispanicLatino.preTimeDays',
            'RaceWhiteNonHispanicLatino.preTimeDays',
            'RaceHispanicLatino.preTimeDays', 'RaceAsian.preTimeDays',
            'RaceBlack.preTimeDays', 'RacePacificIslander.preTimeDays',
            'RaceNativeAmerican.preTimeDays', 'RaceOther.preTimeDays',
            'RaceUnknown.preTimeDays'
        ]
        # Features dropped during imputation (all-null) or feature selection.
        self._eliminated_features = []

        self._build_cmm_names()
        if use_cache is None:
            self._build_raw_feature_matrix()
        print('Processing raw feature matrix...')
        self._process_raw_feature_matrix()
        print('Training predictor...')
        self._train_predictor()
        print('Testing predictor...')
        self._test_predictor()

    def _build_cmm_names(self):
        """Derive the raw and processed matrix file names."""
        slugified_condition = "-".join(self._condition.split())
        self._build_cmm_name_raw(slugified_condition, self._num_patients)
        self._build_cmm_name_processed(slugified_condition, self._num_patients)

    def _build_cmm_name_raw(self, slugified_condition, num_patients):
        """Set self._cmm_name_raw from the slugified condition and size."""
        template = '%s-mortality-matrix-%d-pat-raw.tab'
        self._cmm_name_raw = template % (slugified_condition, num_patients)

    def _build_cmm_name_processed(self, slugified_condition, num_patients):
        """Set self._cmm_name_processed from the slugified condition and size."""
        template = '%s-mortality-matrix-%d-pat-processed.tab'
        self._cmm_name_processed = template % (slugified_condition,
                                               num_patients)

    def _build_raw_feature_matrix(self):
        """Instantiate ConditionMortalityMatrix for the raw matrix file name."""
        self._cmm = ConditionMortalityMatrix(self._condition,
                                             self._num_patients,
                                             self._cmm_name_raw,
                                             self._icd_list)

    def _process_raw_feature_matrix(self):
        """Turn the raw matrix file into the processed matrix file.

        Reads the raw matrix, adds/imputes/removes features, splits into
        train and test data, runs feature selection and writes the
        recombined matrix (with a descriptive header) to
        self._cmm_name_processed.
        """
        # Read raw CMM.
        self._fm_io = FeatureMatrixIO()
        print('Reading raw matrix...')
        self._cmm_raw = self._fm_io.read_file_to_data_frame(self._cmm_name_raw)

        # Add and remove features to _cmm_processed.
        self._fmt = FeatureMatrixTransform()
        self._fmt.set_input_matrix(self._cmm_raw)
        print('Adding features...')
        self._add_features()
        print('Imputing data...')
        self._impute_data()
        self._remove_features()
        self._fmt.drop_duplicate_rows()
        self._cmm_processed = self._fmt.fetch_matrix()

        # Divide _cmm_processed into training and test data.
        # This must happen before feature selection so that we don't
        # accidentally learn information from the test data.
        self._train_test_split()
        print('Selecting features...')
        self._select_features()

        # Write output to new matrix.
        train = self._y_train.join(self._X_train)
        test = self._y_test.join(self._X_test)
        # DataFrame.append was removed in pandas 2.0; pd.concat is the
        # equivalent that works on old and new pandas alike.
        self._cmm_processed = pd.concat([train, test])

        header = self._build_processed_matrix_header()

        self._fm_io.write_data_frame_to_file(self._cmm_processed,
                                             self._cmm_name_processed, header)

    def _build_processed_matrix_header(self):
        """Return the processed-matrix file header as a list of strings.

        FeatureMatrixFactory and FeatureMatrixIO expect a list of strings,
        one per header line.
        """
        timestamp = datetime.datetime.now().strftime("%Y-%m-%d %H:%M")

        # The recorded command includes the ICD list only when one was given.
        if self._icd_list:
            command = 'ConditionMortalityPredictor(%s, %s, %s)' % \
                (self._condition, self._num_patients, self._icd_list)
        else:
            command = 'ConditionMortalityPredictor(%s, %s)' % \
                (self._condition, self._num_patients)

        header = [
            # File identification.
            self._cmm_name_processed,
            'Created: %s' % timestamp,
            'Source: %s' % __name__,
            'Command: %s' % command,
            '',
            # Overview of the processing steps.
            'Overview:',
            'This file is a post-processed version of %s.' % self._cmm_name_raw,
            'The outcome label is I(0<=Death.postTimeDays<=28), which is a boolean indicator',
            'for whether the patient given by pat_id passed away within 28 days',
            'of the time index represented by a given row.',
            'This matrix is the result of the following processing steps on the raw matrix:',
            '  (1) Imputing missing values with the mean value of each column.',
            '  (2) Manually removing low-information features:',
            '      %s' % str(self._FEATURES_TO_REMOVE),
            '  (3) Algorithmically selecting the top 100 features via recursive feature elimination.',
            '      The following features were eliminated:',
            '        %s' % str(self._eliminated_features),
            '',
            'Each row represents a decision point (proxied by clinical order).',
            "Each row contains fields summarizing the patient's demographics",
            'inpatient admit date, prior vitals, and prior lab results.',
            "Most cells in matrix represent a count statistic for an event's",
            "occurrence or a difference between an event's time and index_time.",
            '',
            # Field descriptions.
            'Fields:',
            '  pat_id - ID # for patient in the STRIDE data set.',
            '  index_time - time at which clinical decision was made.',
            '  death_date - if patient died, date on which they died.',
            '  AdmitDxDate.[clinical_item] - admit diagnosis, pegged to admit date.',
            "  Birth.preTimeDays - patient's age in days.",
            '  [Male|Female].pre - is patient male/female (binary)?',
            '  [RaceX].pre - is patient race [X]?',
            '  Team.[specialty].[clinical_item] - specialist added to treatment team.',
            '  Comorbidity.[disease].[clinical_item] - disease added to problem list.',
            '  ___.[flowsheet] - measurements for flowsheet biometrics.',
            '    Includes BP_High_Systolic, BP_Low_Diastolic, FiO2,',
            '      Glasgow Coma Scale Score, Pulse, Resp, Temp, and Urine.',
            '  ___.[lab_result] - lab component results.',
            '    Included standard components: WBC, HCT, PLT, NA, K, CO2, BUN,',
            '      CR, TBIL, ALB, CA, LAC, ESR, CRP, TNI, PHA, PO2A, PCO2A,',
            '      PHV, PO2V, PCO2V',
            '',
            # Suffixes for clinical item fields.
            '  [clinical_item] fields may have the following suffixes:',
            '    ___.pre - how many times has this occurred before order_time?',
            '    ___.pre.Xd - how many times has this occurred within X days before index_time?',
            '    ___.preTimeDays - how many days before order_time was last occurrence?',
            '',
            # Suffixes for flowsheet and lab result fields.
            '  [flowsheet] and [lab_result] fields may have the following suffixes:',
            '    ___.X_Y.count - # of result values between X and Y days of index_time.',
            '    ___.X_Y.countInRange - # of result values in normal range.',
            '    ___.X_Y.min - minimum result value.',
            '    ___.X_Y.max - maximum result value.',
            '    ___.X_Y.median - median result value.',
            '    ___.X_Y.std - standard deviation of result values.',
            '    ___.X_Y.first - first result value.',
            '    ___.X_Y.last - last result value.',
            '    ___.X_Y.diff - difference between penultimate and proximate values.',
            '    ___.X_Y.slope - slope between penultimate and proximate values.',
            '    ___.X_Y.proximate - closest result value to order_time.',
            '    ___.X_Y.firstTimeDays - time between first and order_time.',
            '    ___.X_Y.lastTimeDays - time between last and order_time.',
            '    ___.X_Y.proximateTimeDays - time between proximate and order_time.',
        ]

        return header

    def _train_predictor(self):
        """Fit a REGRESS_AND_ROUND SupervisedClassifier on the training data."""
        self._predictor = SupervisedClassifier(
            algorithm=SupervisedClassifier.REGRESS_AND_ROUND)
        self._predictor.train(self._X_train, column_or_1d(self._y_train))

    def _train_test_split(self):
        """Split self._cmm_processed into train/test features and outcomes.

        The outcome column 'I(0<=Death.postTimeDays<=28)' is popped out of
        the processed matrix and kept as a one-column DataFrame.
        """
        y = pd.DataFrame(
            self._cmm_processed.pop('I(0<=Death.postTimeDays<=28)'))
        # sklearn complains about a column-vector y ("DataConversionWarning:
        # A column-vector y was passed when a 1d array was expected"), so
        # callers wrap y in column_or_1d() when handing it to sklearn. y is
        # kept as a DataFrame here so it can be joined back to X later.
        X = self._cmm_processed
        self._X_train, self._X_test, self._y_train, self._y_test = train_test_split(
            X, y, shuffle=False)

    def _impute_data(self):
        """Impute missing values, dropping features that are entirely null."""
        for feature in self._cmm_raw.columns.values:
            # Features slated for manual removal need no imputation.
            if feature in self._FEATURES_TO_REMOVE:
                continue
            # If all values are null, just remove the feature.
            # Otherwise, imputation will fail (there's no mean value),
            # and sklearn will ragequit.
            if self._cmm_raw[feature].isnull().all():
                self._fmt.remove_feature(feature)
                self._eliminated_features.append(feature)
            # Only try to impute if some of the values are null.
            elif self._cmm_raw[feature].isnull().any():
                # TODO(sbala): Impute all time features with non-mean value.
                self._fmt.impute(feature)

    def _add_features(self):
        """Add the outcome feature derived from 'Death.postTimeDays'."""
        # Add threshold feature indicating whether death date
        # is within 28 days of index time.
        self._fmt.add_threshold_feature('Death.postTimeDays',
                                        lower_bound=0,
                                        upper_bound=28)

    def _remove_features(self):
        """Remove every feature listed in self._FEATURES_TO_REMOVE."""
        # Prune obviously unhelpful fields.
        # In theory, FeatureSelector should be able to prune these, but no
        # reason not to help it out a little bit.
        for feature in self._FEATURES_TO_REMOVE:
            self._fmt.remove_feature(feature)

    def _select_features(self):
        """Select features by recursive elimination and transform X.

        Features ranked outside the selected set are appended to
        self._eliminated_features.
        """
        fs = FeatureSelector(algorithm=FeatureSelector.RECURSIVE_ELIMINATION,
                             problem=FeatureSelector.CLASSIFICATION)

        fs.set_input_matrix(self._X_train, column_or_1d(self._y_train))
        # NOTE(review): this keeps 1% of the training columns, whereas the
        # file header text says "top 100 features" - confirm which is meant.
        num_features_to_select = int(0.01 * len(self._X_train.columns.values))
        fs.select(k=num_features_to_select)

        # Enumerate eliminated features pre-transformation.
        self._feature_ranks = fs.compute_ranks()
        for position, rank in enumerate(self._feature_ranks):
            if rank > num_features_to_select:
                self._eliminated_features.append(
                    self._X_train.columns[position])

        self._X_train = fs.transform_matrix(self._X_train)
        self._X_test = fs.transform_matrix(self._X_test)

    def _test_predictor(self):
        """Store the predictor's accuracy on the test split."""
        self._accuracy = self._predictor.compute_accuracy(
            self._X_test, self._y_test)

    def predict(self, X):
        """Return the trained predictor's prediction for X."""
        return self._predictor.predict(X)

    def summarize(self):
        """Return a multi-line text summary of the trained model.

        The lines are: condition, algorithm, train/test sizes, the features
        with positive coefficients, baseline mortality in the test split,
        AUC and accuracy.
        """
        summary_lines = []

        # Condition: condition
        summary_lines.append('Condition: %s' % self._condition)

        # Algorithm: SupervisedClassifier(algorithm)
        algorithm = 'SupervisedClassifier(REGRESS_AND_ROUND)'
        summary_lines.append('Algorithm: %s' % algorithm)

        # Train/Test Size: training_size, test_size
        training_size = self._X_train.shape[0]
        test_size = self._X_test.shape[0]
        summary_lines.append('Train/Test Size: %s/%s' %
                             (training_size, test_size))

        # Model: only features with a strictly positive coefficient are
        # listed; coefficients are paired with columns by position.
        coefs = self._predictor.coefs()
        sig_features = [(weight, feature)
                        for weight, feature in zip(coefs,
                                                   self._X_train.columns.values)
                        if weight > 0]
        linear_model = ' + '.join('%s*%s' % (weight, feature)
                                  for weight, feature in sig_features)
        summary_lines.append('Model: logistic(%s)' % linear_model)

        # Baseline Episode Mortality: episode_mortality
        # Use .get so a test split without any positive outcome reports 0
        # instead of raising KeyError.
        counts = self._y_test[self._y_test.columns[0]].value_counts()
        summary_lines.append('Baseline Episode Mortality: %s/%s' %
                             (counts.get(1, 0), test_size))

        # AUC: auc
        auc = self._predictor.compute_roc_auc(self._X_test, self._y_test)
        summary_lines.append('AUC: %s' % auc)
        # Accuracy: accuracy
        summary_lines.append('Accuracy: %s' % self._accuracy)

        return '\n'.join(summary_lines)
예제 #27
0
# Pick one tree out of a trained normality random forest so it can be
# visualised, together with the feature names of the processed matrix.
lab = 'ALK'
data_source = 'UMich'
lab_type = 'component'
data_folderpath = '../data-%s-component-10000-episodes/%s/' % (
    data_source, lab)

# The pickled object wraps the fitted forest in its `_model` attribute.
# Alternative input used previously:
#   'Uric-Acid, Serum - Plasma-normality-random-forest-model.pkl'
model_filename = "%s-normality-random-forest-model.pkl" % lab
rf_model = joblib.load(data_folderpath + model_filename)._model

print(len(rf_model.feature_importances_))

from medinfo.dataconversion.FeatureMatrixIO import FeatureMatrixIO
fm_io = FeatureMatrixIO()
# Alternative input used previously:
#   'Uric Acid, Serum - Plasma-normality-test-matrix-processed_byStanford.tab'
matrix_filename = '%s-normality-matrix-processed.tab' % lab
df_processed = fm_io.read_file_to_data_frame(data_folderpath + matrix_filename)

# Drop the patient id and the outcome column; what remains are the features.
df_processed.pop('pat_id')
outcome_column = ('all_components_normal'
                  if lab_type == 'panel' else 'component_normal')
df_processed.pop(outcome_column)
cols = df_processed.columns.values.tolist()

# Arbitrary single tree of the forest to export.
estimator = rf_model.estimators_[5]

export_graphviz(estimator,
                out_file='tree.dot',
예제 #28
0
        train_ids)].copy()
    y_train = pd.DataFrame(train_matrix.pop(outcome_label))
    X_train = train_matrix

    test_matrix = processed_matrix[processed_matrix[columnToSplitOn].isin(
        test_ids)].copy()
    y_test = pd.DataFrame(test_matrix.pop(outcome_label))
    X_test = test_matrix
    return X_train, y_train, X_test, y_test


# Load data
lab = 'LABA1C'
fm_io = FeatureMatrixIO()
matrix_path = (
    "data-panels/%s/%s-normality-matrix-10000-episodes-processed.tab" %
    (lab, lab))
processed_matrix = fm_io.read_file_to_data_frame(matrix_path)
X_train, y_train, X_test, y_test = _train_test_split(
    processed_matrix, 'all_components_normal')

# The patient id was only needed for splitting; it is not a feature.
for feature_frame in (X_train, X_test):
    feature_frame.pop('pat_id')

features = X_train.columns.tolist()
print(features)

# Continue with plain arrays from here on.
X_train = X_train.values
y_train = y_train.values
X_test = X_test.values
y_test = y_test.values

# Fit the scaler on the training features, then standardise them.
scaler = preprocessing.StandardScaler()
scaler.fit(X_train)
X_train = scaler.transform(X_train)
    def _train_and_analyze_predictors(self):
        '''
        Train (or load), analyze and persist one classifier per algorithm.

        For every algorithm in SupervisedClassifier.SUPPORTED_ALGORITHMS a
        report directory named after the algorithm is created under the
        pipeline data directory. A predictor previously dumped to disk is
        loaded when available (non-bifurcated algorithms only); otherwise a
        new one is trained. Depending on the resulting status:

        * INSUFFICIENT_SAMPLES: an error report containing the train/test
          label counts is written to the algorithm's report directory.
        * TRAINED: the predictor is analyzed, its report is added to the
          combined meta report, and the predictor is dumped to disk.

        Once all algorithms are processed, the meta report (if at least one
        predictor was trained) is written to the data directory.
        '''
        # DataFrame.append was removed in pandas 2.0; pandas.concat is the
        # equivalent that works on both old and new pandas versions.
        import pandas as pd

        log.info('Training and analyzing predictors...')
        problem = SupervisedLearningPipeline.CLASSIFICATION
        meta_report = None
        fm_io = FeatureMatrixIO()

        # Build paths for output.
        pipeline_file_name = inspect.getfile(inspect.currentframe())
        data_dir = self._fetch_data_dir_path(pipeline_file_name)

        # Only the plain SupervisedClassifier algorithms are tested.
        # TODO(raikens): something in the BifurcatedSupervisedClassifier
        # pipeline is crashing; once fixed, also add
        # 'bifurcated-%s' % algorithm for every supported algorithm.
        algorithms_to_test = list(SupervisedClassifier.SUPPORTED_ALGORITHMS)
        log.debug('algorithms_to_test: %s' % algorithms_to_test)

        # Train and analyse algorithms.
        for algorithm in algorithms_to_test:
            log.info('Training and analyzing %s...' % algorithm)
            # If report_dir does not exist, make it.
            report_dir = '/'.join([data_dir, algorithm])
            if not os.path.exists(report_dir):
                os.makedirs(report_dir)

            log.debug('report_dir: %s' % report_dir)

            # Define hyperparams.
            hyperparams = {
                'algorithm': algorithm,
                'hyperparam_strategy': SupervisedClassifier.EXHAUSTIVE_SEARCH,
                'max_iter': 1024,
            }

            # If bifurcated algorithm, define bifurcator.
            is_bifurcated = 'bifurcated' in algorithm
            if is_bifurcated:
                # bifurcator = LAB.pre == 0
                hyperparams['bifurcator'] = '%s.pre' % self._var
                hyperparams[
                    'bifurcation_strategy'] = BifurcatedSupervisedClassifier.EQUAL
                hyperparams['bifurcation_value'] = 0
                hyperparams['bifurcated'] = True

            # Train classifier, or reuse a model dumped by an earlier run.
            predictor_path = self._build_model_dump_path(algorithm)
            if os.path.exists(predictor_path) and not is_bifurcated:
                log.debug('Loading model from disk...')
                # TODO(sbala): Fix joblib.load so that it works for bifurcated
                # supervised classifiers.
                self._predictor = joblib.load(predictor_path)
                self._features = self._X_train.columns
                status = SupervisedClassifier.TRAINED
            else:
                status = SupervisedLearningPipeline._train_predictor(
                    self, problem, [0, 1], hyperparams)

            if status == SupervisedClassifier.INSUFFICIENT_SAMPLES:
                # Failed to train: skip all analysis and write an error
                # report for this algorithm instead. The loop then moves on
                # to the next algorithm (it does not return early).
                y_train_counts = self._y_train[
                    self._y_train.columns[0]].value_counts()
                y_test_counts = self._y_test[
                    self._y_test.columns[0]].value_counts()
                algorithm_report = DataFrame(
                    {
                        'lab_panel': [self._var],
                        'algorithm': [algorithm],
                        'error': [status],
                        'y_train.value_counts()': [y_train_counts.to_dict()],
                        'y_test.value_counts()': [y_test_counts.to_dict()]
                    },
                    columns=[
                        'lab_panel', 'algorithm', 'error',
                        'y_train.value_counts()', 'y_test.value_counts()'
                    ])
                header = [
                    'LabChangePredictionPipeline("%s", %d)' %
                    (self._var, self._num_rows)
                ]
                # Write error report.
                error_report_path = '/'.join([
                    report_dir,
                    '%s-change-prediction-report.tab' % (self._var)
                ])
                fm_io.write_data_frame_to_file(algorithm_report,
                                               error_report_path, header)
            elif status == SupervisedClassifier.TRAINED:
                # Successfully trained: analyze and add to the meta report.
                pipeline_prefix = '%s-change-prediction-%s' % (self._var,
                                                               algorithm)
                SupervisedLearningPipeline._analyze_predictor(
                    self, report_dir, pipeline_prefix)
                # _analyze_predictor is expected to have written this file.
                report_path = '/'.join(
                    [report_dir, '%s-report.tab' % pipeline_prefix])
                algorithm_report = fm_io.read_file_to_data_frame(report_path)
                if meta_report is None:
                    meta_report = algorithm_report
                else:
                    log.debug('algorithm_report: %s' % algorithm_report)
                    meta_report = pd.concat([meta_report, algorithm_report])
                # Write predictor to disk.
                predictor = SupervisedLearningPipeline.predictor(self)
                joblib.dump(predictor, predictor_path)

        # After building per-algorithm reports, write to meta report.
        # Note that if there were insufficient samples to build any of the
        # algorithms, then meta_report will still be None.
        if meta_report is not None:
            header = [
                'LabChangePredictionPipeline("%s", %d)' %
                (self._var, self._num_rows)
            ]
            meta_report_path = '/'.join(
                [data_dir, '%s-change-prediction-report.tab' % self._var])
            fm_io.write_data_frame_to_file(meta_report, meta_report_path,
                                           header)
예제 #30
0
def get_train_and_evalu_raw_matrices(lab,
                                     data_lab_folderpath,
                                     random_state,
                                     train_size=0.75,
                                     columnToSplitOn='pat_id'):
    '''
    Read the raw feature matrix of a lab and split it into train/evaluation.

    The raw matrix is read from `raw_matrix_template % lab` inside
    `data_lab_folderpath`. Only this single raw matrix is kept on disk;
    saving the two halves as separate matrices would take too much space.

    If a patient-split file (`pat_split_filename`) exists in
    `data_lab_folderpath` (old pipeline style), rows are assigned by the
    `in_train` flag recorded there for their `pat_id`. Otherwise the rows
    are split with `split_rows`.

    Args:
        lab: lab name substituted into `raw_matrix_template`.
        data_lab_folderpath: folder holding the raw matrix and, optionally,
            the patient-split file.
        random_state: passed to `split_rows` when no split file exists.
        train_size: passed to `split_rows` when no split file exists.
        columnToSplitOn: passed to `split_rows` when no split file exists.

    Returns:
        Tuple (raw_matrix_train, raw_matrix_evalu).

    Raises:
        AssertionError: if the two parts share a `pat_id`, or if together
            they do not contain exactly as many rows as the raw matrix.
    '''
    raw_matrix_filepath = os.path.join(data_lab_folderpath,
                                       raw_matrix_template % lab)

    # TODO: check if raw matrix exists
    raw_matrix = FeatureMatrixIO().read_file_to_data_frame(raw_matrix_filepath)

    pat_split_filepath = os.path.join(data_lab_folderpath, pat_split_filename)
    if os.path.exists(pat_split_filepath):
        # Old pipeline style: reuse the patient assignment stored on disk.
        pat_split_df = pd.read_csv(pat_split_filepath)

        pat_ids_train = pat_split_df[pat_split_df['in_train'] ==
                                     1]['pat_id'].tolist()
        raw_matrix_train = raw_matrix[raw_matrix['pat_id'].isin(pat_ids_train)]

        pat_ids_evalu = pat_split_df[pat_split_df['in_train'] ==
                                     0]['pat_id'].tolist()
        raw_matrix_evalu = raw_matrix[raw_matrix['pat_id'].isin(pat_ids_evalu)]
    else:
        # No stored assignment: split now. The resulting split is not
        # written back to pat_split_filepath.
        raw_matrix_train, raw_matrix_evalu = split_rows(
            raw_matrix,
            train_size=train_size,
            columnToSplitOn=columnToSplitOn,
            random_state=random_state)

    # Sanity checks: no patient on both sides, and no row lost or duplicated.
    train_ids = set(raw_matrix_train['pat_id'].tolist())
    evalu_ids = set(raw_matrix_evalu['pat_id'].tolist())
    assert train_ids.isdisjoint(evalu_ids), \
        'train and evaluation matrices share pat_id values'
    assert (raw_matrix_train.shape[0] + raw_matrix_evalu.shape[0] ==
            raw_matrix.shape[0]), \
        'train and evaluation rows do not add up to the raw matrix'

    return raw_matrix_train, raw_matrix_evalu