def _load_note_input_file(self):
        """Load the labeled notes parquet file into ``self._loaded_df``.

        Validates the configured path, upper-cases all column names,
        checks the required schema, filters rows down to the configured
        note categories and, when ``self._debug_row_id`` is set,
        restricts the frame to that single row for debugging.

        Raises:
            RuntimeError: if no input path was configured.
            AssertionError: if the file is missing or a required column
                is absent.
        """
        if not self._df_notes_labeled_path:
            raise RuntimeError('Please specify a valid note input file.')

        filename = utils.default_dataframe_name(self._df_notes_labeled_path)

        assert os.path.isfile(
            filename), 'Could not find note parquet file: {}'.format(filename)
        self._loaded_df = pd.read_parquet(filename)

        # Normalize column names so the schema check below is case-insensitive.
        self._loaded_df.columns = [_.upper() for _ in self._loaded_df.columns]
        required = {'SUBJECT_ID', 'CHARTDATE', 'CATEGORY', 'TEXT',
                    'ROW_ID', 'HADM_ID'}
        assert required.issubset(self._loaded_df.columns), \
            'Notes file needs to have columns: Row_id, Subject_id, Hadm_id, chartdate, category and text'

        logger.log_info('Notes before category removal: {}'.format(
            len(self._loaded_df)))
        # Lower-case both sides so the category filter is case-insensitive.
        self._loaded_df['CATEGORY'] = self._loaded_df['CATEGORY'].str.lower()
        self._keep_categories = [_.lower() for _ in self._keep_categories]
        self._loaded_df = self._loaded_df[self._loaded_df['CATEGORY'].isin(
            self._keep_categories)].copy()
        logger.log_info('Notes after category removal: {}'.format(
            len(self._loaded_df)))

        if self._debug_row_id is not None:
            self._loaded_df = self._loaded_df[self._loaded_df.ROW_ID ==
                                              self._debug_row_id]
            if self._loaded_df.empty:
                logger.log_error('Could not find requested debugging row id.')
 def _dump_filtered_df(self):
     """Write the filtered cohort dataframe to the configured output file.

     Logs a warning and writes nothing when filtering left no rows.
     """
     target = utils.default_dataframe_name(self._output_note_file)
     if self._filtered_cohort_df.empty:
         logger.log_warn(
             'There are no more entries left after filtering the dataframe.'
         )
         return
     self._filtered_cohort_df.to_parquet(target)
    def _load_note_input_file(self):
        """Read the labeled notes parquet file into ``self._loaded_df``.

        Fails fast when no path is configured, when the file is absent,
        or when any required column is missing after upper-casing the
        column names.
        """
        if not self._df_notes_labeled_path:
            raise RuntimeError('Please specify a valid note input file.')

        filename = utils.default_dataframe_name(self._df_notes_labeled_path)
        assert os.path.isfile(filename), 'Could not find note parquet file: {}'.format(filename)

        frame = pd.read_parquet(filename)
        frame.columns = [column.upper() for column in frame.columns]
        self._loaded_df = frame

        expected = ('PREDICTED_CATEGORIES', 'SUBJECT_ID', 'CHARTDATE',
                    'CATEGORY', 'TEXT', 'ROW_ID', 'HADM_ID')
        assert all(name in self._loaded_df.columns for name in expected), \
            'Notes file need to have columns: Row_id, Subject_id, Hadm_id, predicted_categories, chartdate, category and text'
    def _loading_section_info_file(self):
        """Load the section-info parquet file and validate its schema.

        NOTE(review): this rebinds ``self._section_info__file`` — which
        held the input path on entry — to the loaded dataframe; callers
        appear to rely on that, so the behavior is preserved.
        """
        filename = utils.default_dataframe_name(self._section_info__file)
        assert os.path.isfile(
            filename), 'Could not find section parquet file: {}'.format(
                filename)

        sections = pd.read_parquet(filename)
        sections.columns = [column.upper() for column in sections.columns]

        # Same check order and message as the original per-column asserts.
        for required in ('SECTION_GROUP', 'ROW_ID', 'SECTION_ID', 'TEXT'):
            assert required in sections.columns, 'Section file need to have columns: Section_id, section_group, row_id, text'

        self._section_info__file = sections
        def load_sentence_info_file(path, allowed_class_labels):
            """Load a processed sentence-level parquet file.

            Verifies that each allowed class label column exists together
            with every ``<label>_<tag>`` column demanded by
            ``self._required_tag_list`` (``self`` is captured from the
            enclosing scope, not passed in).
            """
            filename = utils.default_dataframe_name(path)
            assert os.path.isfile(
                filename), 'Could not find note parquet file: {}'.format(
                    filename)

            sentence_df = pd.read_parquet(filename)
            sentence_df.columns = [col.upper() for col in sentence_df.columns]

            assert 'ROW_ID' in sentence_df.columns, 'Notes file need to have columns: Row_id, predicted_categories'
            for label in allowed_class_labels:
                assert label in sentence_df.columns, "Processed note file has no {} column - class label not found!".format(
                    label)
                for tag in self._required_tag_list:
                    column = label + '_' + tag
                    assert column in sentence_df.columns, "Processed note file has no {} column - the file needs to be generated by the negex_negation_filter module!".format(
                        column)

            return sentence_df
        def load_prediction_file(path):
            """Load a prediction parquet file and normalize its categories.

            The ``PREDICTED_CATEGORIES`` column is upper-cased, spaces are
            replaced with underscores and the ``|``-separated string is
            split into a list. When a ``FOUND_EVIDENCE`` column exists,
            only rows with evidence are kept.
            """
            filename = utils.default_dataframe_name(path)
            assert os.path.isfile(
                filename), 'Could not find note parquet file: {}'.format(
                    filename)

            predictions = pd.read_parquet(filename)
            predictions.columns = [col.upper() for col in predictions.columns]

            assert 'ROW_ID' in predictions.columns, 'Notes file need to have columns: Row_id, predicted_categories'
            assert 'PREDICTED_CATEGORIES' in predictions.columns, "Processed note file needs to have the PREDICTED_CATEGORIES column generated by e.g. the negation module."

            # e.g. 'a b|c d' -> ['A_B', 'C_D']
            predictions['PREDICTED_CATEGORIES'] = (
                predictions.PREDICTED_CATEGORIES.str.upper()
                .str.replace(' ', '_')
                .str.split('|'))

            if 'FOUND_EVIDENCE' in predictions.columns:
                predictions['FOUND_EVIDENCE'] = predictions['FOUND_EVIDENCE'].astype(bool)
                predictions = predictions[predictions['FOUND_EVIDENCE']]

            return predictions
 def _dump_df(self):
     """Persist the extracted notes dataframe as a parquet file."""
     logger.log_info('Dumping the extracted notes into a parquet file.')
     target = utils.default_dataframe_name(self._output_note_file)
     self._labeled_df.to_parquet(target)
     logger.log_info(
         'DONE: Dumping the extracted notes into a parquet file.')
# 示例#8 ("Example #8") and a stray "0" — scraping artifacts, not code; commented out so the file parses.
 def _dump_processed_df(self):
     """Write the processed notes dataframe to the configured output file."""
     out_path = utils.default_dataframe_name(self._output_note_file)
     self._loaded_df.to_parquet(out_path)