def _load_note_input_file(self):
    """Load the labeled notes parquet file and filter it by category.

    Reads the parquet file derived from ``self._df_notes_labeled_path``,
    upper-cases the column names, validates the expected schema, keeps only
    rows whose CATEGORY is in ``self._keep_categories`` (case-insensitive),
    and optionally narrows the frame to a single ROW_ID for debugging.

    Raises:
        RuntimeError: if no input path was configured.
        AssertionError: if the file is missing or lacks a required column.
    """
    if not self._df_notes_labeled_path:
        raise RuntimeError('Please specify a valid note input file.')
    filename = utils.default_dataframe_name(self._df_notes_labeled_path)
    assert os.path.isfile(
        filename), 'Could not find note parquet file: {}'.format(filename)
    self._loaded_df = pd.read_parquet(filename)
    # Normalize column names so the schema check is case-insensitive.
    self._loaded_df.columns = [_.upper() for _ in self._loaded_df.columns]
    required = ('SUBJECT_ID', 'CHARTDATE', 'CATEGORY', 'TEXT', 'ROW_ID',
                'HADM_ID')
    assert all(col in self._loaded_df.columns for col in required), \
        'Notes file needs to have columns: Row_id, Subject_id, Hadm_id, chartdate, category and text'
    logger.log_info('Notes before category removal: {}'.format(
        len(self._loaded_df)))
    # Compare categories case-insensitively on both sides.
    self._loaded_df['CATEGORY'] = self._loaded_df['CATEGORY'].str.lower()
    self._keep_categories = [_.lower() for _ in self._keep_categories]
    self._loaded_df = self._loaded_df[self._loaded_df['CATEGORY'].isin(
        self._keep_categories)].copy()
    logger.log_info('Notes after category removal: {}'.format(
        len(self._loaded_df)))
    if self._debug_row_id is not None:
        # Narrow to a single note for debugging runs.
        self._loaded_df = self._loaded_df[self._loaded_df.ROW_ID ==
                                          self._debug_row_id]
        if self._loaded_df.empty:
            logger.log_error('Could not find requested debugging row id.')
def _dump_filtered_df(self):
    """Write the filtered cohort dataframe to parquet, warning when empty."""
    filename = utils.default_dataframe_name(self._output_note_file)
    # Guard clause: nothing useful to write when filtering removed everything.
    if self._filtered_cohort_df.empty:
        logger.log_warn(
            'There are no more entries left after filtering the dataframe.'
        )
        return
    self._filtered_cohort_df.to_parquet(filename)
def _load_note_input_file(self):
    """Load the labeled notes parquet file and verify its schema.

    Requires a configured input path; column names of the loaded frame are
    upper-cased before the expected columns are checked.
    """
    if not self._df_notes_labeled_path:
        raise RuntimeError('Please specify a valid note input file.')
    filename = utils.default_dataframe_name(self._df_notes_labeled_path)
    assert os.path.isfile(filename), \
        'Could not find note parquet file: {}'.format(filename)
    self._loaded_df = pd.read_parquet(filename)
    df = self._loaded_df
    df.columns = [col.upper() for col in df.columns]
    expected = ('PREDICTED_CATEGORIES', 'SUBJECT_ID', 'CHARTDATE',
                'CATEGORY', 'TEXT', 'ROW_ID', 'HADM_ID')
    assert all(name in df.columns for name in expected), \
        'Notes file need to have columns: Row_id, Subject_id, Hadm_id, predicted_categories, chartdate, category and text'
def _loading_section_info_file(self):
    """Load the section-info parquet file and store the frame on the instance.

    NOTE(review): this replaces ``self._section_info__file`` (previously the
    configured path) with the loaded DataFrame — confirm callers expect that.
    """
    filename = utils.default_dataframe_name(self._section_info__file)
    assert os.path.isfile(filename), \
        'Could not find section parquet file: {}'.format(filename)
    df = pd.read_parquet(filename)
    df.columns = [col.upper() for col in df.columns]
    # Every required column shares the same failure message.
    schema_msg = 'Section file need to have columns: Section_id, section_group, row_id, text'
    for required in ('SECTION_GROUP', 'ROW_ID', 'SECTION_ID', 'TEXT'):
        assert required in df.columns, schema_msg
    self._section_info__file = df
def load_sentence_info_file(path, allowed_class_labels, required_tag_list=()):
    """Load a processed sentence-info parquet file and validate its columns.

    Args:
        path: Input path resolved through ``utils.default_dataframe_name``.
        allowed_class_labels: Class-label column names that must be present.
        required_tag_list: Optional tag suffixes; for every label L and tag T
            the column ``L_T`` must also be present (those columns are
            produced by the negex_negation_filter module). Defaults to no
            tag checks.

    Returns:
        The loaded pandas DataFrame with upper-cased column names.

    Raises:
        AssertionError: if the file is missing or a required column is absent.
    """
    filename = utils.default_dataframe_name(path)
    assert os.path.isfile(
        filename), 'Could not find note parquet file: {}'.format(filename)
    df = pd.read_parquet(filename)
    df.columns = [col.upper() for col in df.columns]
    assert 'ROW_ID' in df.columns, 'Notes file need to have columns: Row_id, predicted_categories'
    for label in allowed_class_labels:
        assert label in df.columns, "Processed note file has no {} column - class label not found!".format(
            label)
        # BUG FIX: the original referenced ``self._required_tag_list`` from a
        # module-level function (NameError at runtime); the tag list is now a
        # backward-compatible parameter with an empty default.
        for tag in required_tag_list:
            tagged = label + '_' + tag
            assert tagged in df.columns, "Processed note file has no {} column - the file needs to be generated by the negex_negation_filter module!".format(
                tagged)
    return df
def load_prediction_file(path):
    """Load a prediction parquet file and normalize PREDICTED_CATEGORIES.

    The category strings are upper-cased, spaces become underscores, and the
    '|'-separated values are split into lists. When a FOUND_EVIDENCE column
    exists, it is cast to bool and only rows with evidence are kept.

    Returns:
        The loaded and normalized pandas DataFrame.
    """
    filename = utils.default_dataframe_name(path)
    assert os.path.isfile(filename), \
        'Could not find note parquet file: {}'.format(filename)
    df = pd.read_parquet(filename)
    df.columns = [column.upper() for column in df.columns]
    assert 'ROW_ID' in df.columns, 'Notes file need to have columns: Row_id, predicted_categories'
    assert 'PREDICTED_CATEGORIES' in df.columns, "Processed note file needs to have the PREDICTED_CATEGORIES column generated by e.g. the negation module."
    categories = df['PREDICTED_CATEGORIES'].str.upper()
    categories = categories.str.replace(' ', '_')
    df['PREDICTED_CATEGORIES'] = categories.str.split('|')
    if 'FOUND_EVIDENCE' in df.columns:
        evidence = df['FOUND_EVIDENCE'].astype(bool)
        df['FOUND_EVIDENCE'] = evidence
        df = df[evidence]
    return df
def _dump_df(self):
    """Persist the labeled notes dataframe as a parquet file."""
    logger.log_info('Dumping the extracted notes into a parquet file.')
    target = utils.default_dataframe_name(self._output_note_file)
    self._labeled_df.to_parquet(target)
    logger.log_info(
        'DONE: Dumping the extracted notes into a parquet file.')
def _dump_processed_df(self):
    """Write the processed notes dataframe to the configured output file."""
    target = utils.default_dataframe_name(self._output_note_file)
    self._loaded_df.to_parquet(target)