Example #1
    def __init__(self):
        super().__init__()

        self._validation_set = config.get_pipeline_config_item(
            self.module_name(), 'validation_set_file', None)
        self._df_notes_labeled_path = config.get_pipeline_config_item(
            self.module_name(), 'input_note_file', None)

        self._loaded_df = None
        self._compare_df = None
        self._orig_df = None
        self._loaded_validation = None
        self._loaded_validation_labels = None
        self._loaded_validation_label_map = None

        logger.log_info('Loading validation note labeling file')
        self._loading_validation_labeling_file()
        logger.log_info('DONE: Loading validation note labeling file')

        logger.log_info('Loading NLP pipeline processed note files')
        self._loading_note_files()
        logger.log_info('DONE: Loading NLP pipeline processed note files')

        logger.log_info('Computing and outputting statistics')
        self._do_statistics()
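A note on the pattern every constructor in this listing shares: all settings come through config.get_pipeline_config_item(module_name, key, default). The helper itself is not part of the excerpt; as a rough mental model only (the real pipeline's config module may behave differently), it amounts to a per-module dictionary lookup with a fallback default:

    # Hypothetical sketch of the config accessor, not the pipeline's real code.
    # The module name and keys below are placeholders.
    _PIPELINE_CONFIG = {
        'validation_stats': {
            'validation_set_file': 'validation_notes.csv',
            # 'input_note_file' deliberately unset to show the fallback
        },
    }

    def get_pipeline_config_item(module_name, key, default):
        return _PIPELINE_CONFIG.get(module_name, {}).get(key, default)

    print(get_pipeline_config_item('validation_stats', 'validation_set_file', None))
    # -> validation_notes.csv
    print(get_pipeline_config_item('validation_stats', 'input_note_file', None))
    # -> None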
Example #2
    def __init__(self):
        super().__init__()

        if not google_tools.check_google_authenticated():
            logger.log_error('User not authenticated for Google cloud access.')

        self._output_note_file = config.get_pipeline_config_item(
            self.module_name(), 'output_note_file',
            '_labeled_lexicons.parquet')
        self._cohort_file = config.get_pipeline_config_item(
            self.module_name(), 'cohort_file', '')
        self._lexicon_dir = config.get_pipeline_config_item(
            self.module_name(), 'lexicon_dir', './used_lexicons')

        self._lexicon_map = {'positive': {}, 'negative': {}}
        self._lexicon_weights = {'positive': {}, 'negative': {}}
        self._labeled_df = None
        self._ignore_cat_list = config.get_pipeline_config_item(
            self.module_name(), 'ignore_categories', [])
        self._dump_all = config.get_pipeline_config_item(
            self.module_name(), 'dump_all_notes', True)

        # Normalize to a list: the config value may be a single category or a
        # list. (collections.Sequence was removed in Python 3.10; the abc
        # module is the supported home, and a plain string is itself a
        # Sequence, so it is tested for explicitly.)
        if isinstance(self._ignore_cat_list, str) or not isinstance(
                self._ignore_cat_list, collections.abc.Sequence):
            self._ignore_cat_list = [self._ignore_cat_list]

        self._parse_lexicons()
        self._query_bigquery()
        self._dump_df()
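_parse_lexicons is not shown in this excerpt, but initializing _lexicon_map as {'positive': {}, 'negative': {}} suggests one term list per lexicon name and polarity, read from lexicon_dir. A minimal sketch under that assumption; the directory layout and file format are guesses, not the pipeline's documented format:

    import os

    def parse_lexicons(lexicon_dir):
        # Assumed layout: <lexicon_dir>/<polarity>/<name>.txt, one term per line.
        lexicon_map = {'positive': {}, 'negative': {}}
        for polarity in lexicon_map:
            pol_dir = os.path.join(lexicon_dir, polarity)
            if not os.path.isdir(pol_dir):
                continue
            for fname in os.listdir(pol_dir):
                name, ext = os.path.splitext(fname)
                if ext != '.txt':
                    continue
                with open(os.path.join(pol_dir, fname)) as fh:
                    lexicon_map[polarity][name] = [
                        line.strip() for line in fh if line.strip()
                    ]
        return lexicon_map

Note that _query_bigquery in Example #8 merges the two polarities by iterating over the positive keys, so whatever the file format, both maps must share the same lexicon names.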
Example #3
    def __init__(self):
        super().__init__()

        self._output_note_file = config.get_pipeline_config_item(
            self.module_name(), 'output_note_file',
            '_negex_filtered_notes.parquet')
        self._df_notes_labeled_path = config.get_pipeline_config_item(
            self.module_name(), 'input_note_file', None)
        self._catchall = config.get_pipeline_config_item(
            self.module_name(), 'catchall', False)

        self._loaded_df = None
        self._filtered_cohort_df = None
        self._labeled_df = None

        logger.log_info('Loading note file')
        self._load_note_input_file()
        logger.log_info('DONE: Loading note file.')

        logger.log_info('Replacing catchall labels.')
        self._replace_catchall()
        logger.log_info('DONE: Replacing catchall labels.')

        logger.log_info('Dumping processed notes.')
        self._dump_processed_df()
        logger.log_info('DONE: Dumping processed notes.')
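_replace_catchall is not shown. Judging from the boolean catchall option, the idea is to clear the catchall label on notes where a more specific lexicon column already matched. A rough pandas sketch of that reading; the column names are assumptions:

    import pandas as pd

    def replace_catchall(df, label_cols, catchall_col='CATCHALL'):
        # Assumed behavior: keep the catchall label only where no specific
        # lexicon column fired. The real module may differ.
        df = df.copy()
        df[catchall_col] = df[catchall_col].astype(bool) & ~df[label_cols].any(axis=1)
        return df

    df = pd.DataFrame({'SEPSIS': [True, False], 'CATCHALL': [True, True]})
    print(replace_catchall(df, ['SEPSIS']))
    #    SEPSIS  CATCHALL
    # 0    True     False
    # 1   False      True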
Example #4
    def __init__(self):
        super().__init__()

        self._output_note_file = config.get_pipeline_config_item(
            self.module_name(), 'output_note_file',
            '_negex_filtered_notes.parquet')
        self._df_notes_labeled_path = config.get_pipeline_config_item(
            self.module_name(), 'input_note_file', None)
        self._word_distance = config.get_pipeline_config_item(
            self.module_name(), 'word_distance', 10)
        self._debug_check = config.get_pipeline_config_item(
            self.module_name(), 'debug_check', True)

        self._loaded_df = None

        self._lexicon_dir = config.get_pipeline_config_item(
            self.module_name(), 'word_filter_dir', './word_cohort_filters')
        self._lexicon_map = {}

        self._njobs = config.get('njobs', multiprocessing.cpu_count())

        logger.log_info('Loading note file')
        self._load_note_input_file()
        logger.log_info('DONE: Loading note file.')

        logger.log_info('Parsing word filter lexicons.')
        self._load_word_matching_lexicons()
        logger.log_info('DONE: Parsing word filter lexicons.')

        logger.log_info('Word filtering and cohort adaptations.')
        self._change_cohort_mappings()
        logger.log_info('DONE: Word filtering and cohort adaptations.')

        logger.log_info('Dumping changed notes.')
        self._dump_filtered_df()
        logger.log_info('DONE: Dumping changed notes.')
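The njobs setting (defaulting to multiprocessing.cpu_count()) implies the per-note word-distance check runs in parallel. The worker itself is not shown; this is only the generic fan-out pattern such a module would use, with a stand-in check function:

    import multiprocessing

    def check_note(text):
        # Stand-in for the real word-distance / negation check on one note.
        return len(text.split())

    def run_parallel(texts, njobs):
        with multiprocessing.Pool(njobs) as pool:
            return pool.map(check_note, texts)

    if __name__ == '__main__':
        print(run_parallel(['no acute distress', 'denies chest pain'], 2))
        # -> [3, 3]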
Example #5
    def __init__(self):
        super().__init__()

        self._validation_set = config.get_pipeline_config_item(
            self.module_name(), 'validation_set_file', None)
        self._df_path_a = config.get_pipeline_config_item(
            self.module_name(), 'file_prediction_a', None)
        self._df_path_b = config.get_pipeline_config_item(
            self.module_name(), 'file_prediction_b', None)
        self._ignore_b = config.get_pipeline_config_item(
            self.module_name(), 'ignore_b', False)
        self._section_info_file = config.get_pipeline_config_item(
            self.module_name(), 'section_info_file', None)

        self._df_path_a_negated = config.get_pipeline_config_item(
            self.module_name(), 'file_sentence_info_a', None)
        self._df_path_b_negated = config.get_pipeline_config_item(
            self.module_name(), 'file_sentence_info_b', None)
        self._get_examples_for_categories = config.get_pipeline_config_item(
            self.module_name(), 'get_examples_for_categories', None)
        self._compare_dumping_dir_name = 'dumped_validation_sentences_between__{}_and_{}'

        self._required_tag_list = [
            'POSITIVE_LEXICON_SENTENCES', 'POSITIVE_LEXICON_AFFIRMED_PHRASES',
            'POSITIVE_LEXICON_NEGATED_PHRASES', 'NEGATIVE_LEXICON_SENTENCES',
            'NEGATIVE_LEXICON_AFFIRMED_PHRASES',
            'NEGATIVE_LEXICON_NEGATED_PHRASES'
        ]

        self._df_a = None
        self._df_b = None
        self._df_sents_a = None
        self._df_sents_b = None

        self._loaded_validation = None
        self._loaded_validation_labels = None
        self._loaded_validation_label_map = None

        logger.log_info('Processing validation file')
        self._loading_validation_labeling_file()
        logger.log_info('DONE: Processing validation file')

        if self._section_info_file:
            logger.log_info(
                'Loading section file to include section information')
            self._loading_section_info_file()
            logger.log_info(
                'DONE: Loading section file to include section information')

        logger.log_info(
            'Loading prediction and sentence info file for dataframes A and B to be compared.'
        )
        self._loading_note_files()
        logger.log_info(
            'DONE: Loading prediction and sentence info file for dataframes A and B to be compared.'
        )

        logger.log_info('Dumping sentences into folder')
        self._dump_examples_for_comparison()
        logger.log_info('DONE: Dumping sentences into folder')
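_loading_note_files and _dump_examples_for_comparison are not shown. The config keys (file_prediction_a/_b plus file_sentence_info_a/_b) suggest this module aligns two prediction dataframes on a shared note id and dumps the sentences on which the two runs disagree. A minimal sketch of the alignment step; the key and label column names are assumptions:

    import pandas as pd

    def disagreements(df_a, df_b, key='ROW_ID', label='LABEL'):
        # Join predictions A and B on the note id, keep rows whose labels differ.
        merged = df_a.merge(df_b, on=key, suffixes=('_A', '_B'))
        return merged[merged[label + '_A'] != merged[label + '_B']]

    df_a = pd.DataFrame({'ROW_ID': [1, 2], 'LABEL': ['POS', 'NEG']})
    df_b = pd.DataFrame({'ROW_ID': [1, 2], 'LABEL': ['POS', 'POS']})
    print(disagreements(df_a, df_b))
    #    ROW_ID LABEL_A LABEL_B
    # 1       2     NEG     POS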
Example #6
    def __init__(self):
        super().__init__()
        nltk.download('punkt')

        self._output_note_file = config.get_pipeline_config_item(
            self.module_name(), 'output_note_file',
            '_negex_filtered_notes.parquet')
        self._df_notes_labeled_path = config.get_pipeline_config_item(
            self.module_name(), 'input_note_file', None)
        self._negex_triggers = config.get_pipeline_config_item(
            self.module_name(), 'negex_triggers', 'negex_trigger.txt')
        self._add_fullstop_after_newline_uppercase = config.get_pipeline_config_item(
            self.module_name(), 'add_fullstop_after_newline_uppercase', False)
        self._keep_categories = config.get_pipeline_config_item(
            self.module_name(), 'keep_categories', [])
        self._dont_include_predicted_categories = config.get_pipeline_config_item(
            self.module_name(), 'dont_include_predicted_categories', False)
        self._debug_row_id = config.get_pipeline_config_item(
            self.module_name(), 'debug_row_id', None)

        self._loaded_df = None
        self._pre_filtered_df = None
        self._filtered_cohort_df = None
        self._labeled_df = None

        self._lexicon_dir = config.get_pipeline_config_item(
            self.module_name(), 'lexicon_dir', './used_lexicons')
        self._lexicon_map = {'positive': {}}
        self._debug_check = config.get_pipeline_config_item(
            self.module_name(), 'debug_check', False)

        self._njobs = config.get('njobs', multiprocessing.cpu_count())

        logger.log_info('Loading note file')
        self._load_note_input_file()
        logger.log_info('DONE: Loading note file.')

        logger.log_info('Parsing Negex triggers.')
        self._load_negex_triggers()
        logger.log_info('DONE: Parsing Negex triggers.')

        logger.log_info('Negex note filtering.')
        self._parse_lexicons()
        self._check_note_negations()
        logger.log_info('DONE: Negex note filtering.')

        logger.log_info('New cohort labeling.')
        self._label_improve_cohort()
        logger.log_info('DONE: New cohort labeling.')

        logger.log_info('Dumping filtered notes.')
        self._dump_filtered_df()
        logger.log_info('DONE: Dumping filtered notes.')
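nltk.download('punkt') in the constructor implies notes are split into sentences with NLTK's Punkt tokenizer before the NegEx triggers are applied, and add_fullstop_after_newline_uppercase likely compensates for clinical notes in which a bare line break acts as a sentence boundary. A sketch of both ideas; the regex heuristic is an assumption, not the module's actual rule:

    import re
    import nltk

    nltk.download('punkt', quiet=True)

    def split_sentences(text, add_fullstop_after_newline_uppercase=False):
        if add_fullstop_after_newline_uppercase:
            # Assumed heuristic: inject a period at a newline followed by an
            # uppercase letter so Punkt sees a sentence boundary there.
            text = re.sub(r'\n(?=[A-Z])', '.\n', text)
        return nltk.sent_tokenize(text)

    print(split_sentences('No fever\nPatient denies pain', True))
    # -> ['No fever.', 'Patient denies pain']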
Example #7
    def __init__(self):
        super().__init__()

        self._validation_set = config.get_pipeline_config_item(
            self.module_name(), 'validation_set_file', None)
        self._df_notes_labeled_paths = config.get_pipeline_config_item(
            self.module_name(), 'input_note_files', [])

        self._loaded_df = []
        self._compare_df = None
        self._orig_df = None
        self._loaded_validation = None
        self._loaded_validation_labels = None
        self._loaded_validation_label_map = None

        logger.log_info('Loading validation note labeling file')
        self._loading_validation_labeling_file()
        logger.log_info('DONE: Loading validation note labeling file')

        logger.log_info('Loading NLP pipeline processed note files')
        self._loading_note_files()
        logger.log_info('DONE: Loading NLP pipeline processed note files')

        logger.log_info('Computing and outputting statistics')

        line_list = []
        for _df in self._loaded_df:
            line_list.append(';')
            # _do_statistics returns the table as a list of columns;
            # zip(*table) transposes it into printable rows.
            table = self._do_statistics(_df)
            for row in zip(*table):
                line_list.append(';'.join(row))
            line_list.append(';')
            line_list.append(';')

        logger.log_info('CSV Table Output:')

        for line in line_list:
            print(line)
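To make the transpose above concrete: _do_statistics apparently returns its table as a list of columns, and zip(*table) turns that into printable rows. With a toy two-column table (the values are illustrative only):

    table = [['label', 'precision', 'recall'],  # first column: row headers
             ['sepsis', '0.91', '0.87']]        # second column: one result set
    for row in zip(*table):
        print(';'.join(row))
    # label;sepsis
    # precision;0.91
    # recall;0.87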
Example #8
    def _query_bigquery(self):

        sql_search = ""

        merged_lexicon_map = {
            _k: self._lexicon_map['positive'][_k] +
            self._lexicon_map['negative'][_k]
            for _k in self._lexicon_map['positive'].keys()
        }

        for _name, _terms in merged_lexicon_map.items():
            if not _terms:
                sql_search = sql_search + "," + " FALSE AS " + _name
            else:
                lex = [r'\\b' + x + r'\\b' for x in _terms]
                sql_search = sql_search + "," + " REGEXP_CONTAINS(text, '(?i)(" + '|'.join(
                    lex) + ")') AS " + _name

        ignore_str = '\n'.join([
            'AND category NOT LIKE "%{}%"'.format(_)
            for _ in self._ignore_cat_list
        ])

        use_bqstorage_api = config.get_pipeline_config_item(
            self.module_name(), "use_bqstorage_api", False)

        limitstr = ""
        if config.get_pipeline_config_item(self.module_name(),
                                           "debug_download", False):
            limitstr = 'LIMIT 10'

        cohort_ids = []
        if self._cohort_file and os.path.isfile(self._cohort_file):
            cohort_ids = pd.read_csv(self._cohort_file)
            cohort_ids.columns = [_.lower() for _ in cohort_ids.columns]
            cohort_ids = list(cohort_ids.loc[:, 'hadm_id'])

        sql = """
        SELECT row_id, subject_id, hadm_id, chartdate, category, text{}
        FROM `physionet-data.mimiciii_notes.noteevents`
        WHERE hadm_id IS NOT NULL 
        AND hadm_id IN ({})
        {}
        {}
        """.format(sql_search, ','.join([str(_) for _ in cohort_ids]),
                   ignore_str, limitstr)

        logger.log_info('Querying noteevents for lexicon occurrences.')
        self._labeled_df = pandas_gbq.read_gbq(
            sql,
            project_id=google_tools.PROJECT_ID,
            dialect='standard',
            use_bqstorage_api=use_bqstorage_api
        )  #, progress_bar_type=utils.PROGRESSBAR_TYPE)
        self._labeled_df.columns = [
            _.upper() for _ in self._labeled_df.columns
        ]

        if not self._dump_all:
            mask = None
            for _ in self._labeled_df.columns:
                if _.lower() in [
                        'subject_id', 'row_id', 'hadm_id', 'chartdate',
                        'category', 'text'
                ]:
                    continue
                if mask is None:
                    mask = self._labeled_df[_].astype(bool)
                else:
                    mask = mask | self._labeled_df[_].astype(bool)
            self._labeled_df = self._labeled_df[mask].copy()

        logger.log_info('DONE: Querying noteevents for lexicon occurrences.')
        logger.log_debug('Number of admissions {}, number of notes {}.'.format(
            self._labeled_df['HADM_ID'].nunique(), len(self._labeled_df)))
        for _key in self._lexicon_map['positive'].keys():
            logger.log_debug('Number of notes with {}: {}.'.format(
                _key.lower(), self._labeled_df[_key.upper()].sum()))
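The doubled backslashes in the lexicon terms are easy to misread, so here is that assembly in isolation: r'\\b' puts a literal backslash pair into the SQL text, and BigQuery's string parser then collapses '\\' to '\', handing RE2 the word boundary \b. For a toy lexicon {'sepsis': ['sepsis', 'septic shock']} the generated column expression is:

    terms = ['sepsis', 'septic shock']
    lex = [r'\\b' + t + r'\\b' for t in terms]
    print(", REGEXP_CONTAINS(text, '(?i)(" + '|'.join(lex) + ")') AS sepsis")
    # , REGEXP_CONTAINS(text, '(?i)(\\bsepsis\\b|\\bseptic shock\\b)') AS sepsis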