Example #1
    def __init__(self):
        super().__init__()

        self._validation_set = config.get_pipeline_config_item(
            self.module_name(), 'validation_set_file', None)
        self._df_notes_labeled_path = config.get_pipeline_config_item(
            self.module_name(), 'input_note_file', None)

        self._loaded_df = None
        self._compare_df = None
        self._orig_df = None
        self._loaded_validation = None
        self._loaded_validation_labels = None
        self._loaded_validation_label_map = None

        logger.log_info('Loading validation note labeling file')
        self._loading_validation_labeling_file()
        logger.log_info('DONE: Loading validation note labeling file')

        logger.log_info('Loading NLP pipeline processed note files')
        self._loading_note_files()
        logger.log_info('DONE: Loading NLP pipeline processed note files')

        logger.log_info('Computing and outputting statistics')
        self._do_statistics()

    def _load_note_input_file(self):
        if not self._df_notes_labeled_path:
            raise RuntimeError('Please specify a valid note input file.')

        filename = utils.default_dataframe_name(self._df_notes_labeled_path)

        assert os.path.isfile(
            filename), 'Could not find note parquet file: {}'.format(filename)
        self._loaded_df = pd.read_parquet(filename)

        #self._loaded_df = self._loaded_df[self._loaded_df.ROW_ID == 23191] # 3083

        self._loaded_df.columns = [_.upper() for _ in self._loaded_df.columns]
        required_cols = [
            'ROW_ID', 'SUBJECT_ID', 'HADM_ID', 'CHARTDATE', 'CATEGORY', 'TEXT'
        ]
        assert all(_ in self._loaded_df.columns for _ in required_cols), (
            'Notes file needs to have the columns: ROW_ID, SUBJECT_ID, '
            'HADM_ID, CHARTDATE, CATEGORY and TEXT')
        logger.log_info('Notes before category removal: {}'.format(
            len(self._loaded_df)))
        self._loaded_df['CATEGORY'] = self._loaded_df['CATEGORY'].str.lower()
        self._keep_categories = [_.lower() for _ in self._keep_categories]
        self._loaded_df = self._loaded_df[self._loaded_df['CATEGORY'].isin(
            self._keep_categories)].copy()
        logger.log_info('Notes after category removal: {}'.format(
            len(self._loaded_df)))

        if self._debug_row_id is not None:
            self._loaded_df = self._loaded_df[self._loaded_df.ROW_ID ==
                                              self._debug_row_id]
            if self._loaded_df.empty:
                logger.log_error('Could not find requested debugging row id.')

    def _label_improve_cohort(self):
        self._pre_filtered_df['FOUND_EVIDENCE_NEGATED'] = 0
        self._pre_filtered_df['PREDICTED_CATEGORIES'] = ''

        self._pre_filtered_df['MAX_SCORE_CAT'] = ''
        self._pre_filtered_df['MAX_SCORE'] = -np.inf

        for _k in self._lexicon_map['positive'].keys():
            _k = _k.upper()

            self._pre_filtered_df[
                'MAX_SCORE_CAT'] = self._pre_filtered_df.apply(
                    lambda x: _k if x[_k + '_TOTAL_SCORE_SUM'] > x[
                        'MAX_SCORE'] else x['MAX_SCORE_CAT'],
                    axis=1)
            self._pre_filtered_df['MAX_SCORE'] = self._pre_filtered_df.apply(
                lambda x: x[_k + '_TOTAL_SCORE_SUM']
                if x[_k + '_TOTAL_SCORE_SUM'] > x['MAX_SCORE'] else x[
                    'MAX_SCORE'],
                axis=1)

            self._pre_filtered_df[
                '_PREDICTED_CATEGORIES'] = self._pre_filtered_df[
                    _k +
                    '_TOTAL_SCORE_SUM'].apply(lambda x: _k if x > 0 else '')
            self._pre_filtered_df[
                'PREDICTED_CATEGORIES'] = self._pre_filtered_df.apply(
                    lambda x: x['PREDICTED_CATEGORIES'] + '|' + _k
                    if len(x['_PREDICTED_CATEGORIES']) > 0 else x[
                        'PREDICTED_CATEGORIES'],
                    axis=1)
            del self._pre_filtered_df['_PREDICTED_CATEGORIES']

        self._pre_filtered_df.loc[self._pre_filtered_df['MAX_SCORE'] > 0,
                                  'FOUND_EVIDENCE_NEGATED'] = 1
        self._filtered_cohort_df = self._pre_filtered_df

        self._filtered_cohort_df[
            'PREDICTED_CATEGORIES'] = self._filtered_cohort_df[
                'PREDICTED_CATEGORIES'].apply(lambda x: x[1:]
                                              if len(x) > 1 else x)
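
        # For illustration (hypothetical categories): a row scoring positive
        # for DYSPNEA and EDEMA carries PREDICTED_CATEGORIES ==
        # '|DYSPNEA|EDEMA' after the loop above; the apply() strips the
        # leading '|' to yield 'DYSPNEA|EDEMA'.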

        if 'FOUND_EVIDENCE' in self._filtered_cohort_df.columns:
            self._filtered_cohort_df['FOUND_EVIDENCE'] = (
                self._filtered_cohort_df['FOUND_EVIDENCE'] >
                0) & (self._filtered_cohort_df['FOUND_EVIDENCE_NEGATED'] > 0)
        else:
            self._filtered_cohort_df[
                'FOUND_EVIDENCE'] = self._filtered_cohort_df[
                    'FOUND_EVIDENCE_NEGATED'] > 0
        del self._filtered_cohort_df['FOUND_EVIDENCE_NEGATED']

        logger.log_info(
            'Total patients (after negex filtering): {} / Total admissions: {}'
            .format(self._filtered_cohort_df['SUBJECT_ID'].nunique(),
                    self._filtered_cohort_df['HADM_ID'].nunique()))

    def _parse_lexicons(self):
        assert os.path.isdir(
            self._lexicon_dir), 'Invalid lexicon dir. Does not exist.'
        assert len(os.listdir(self._lexicon_dir)) > 0, 'Lexicon dir is empty.'
        pos_dir = os.path.join(self._lexicon_dir, 'positive')
        neg_dir = os.path.join(self._lexicon_dir, 'negative')

        assert os.path.isdir(pos_dir), (
            'There needs to be a positive lexicon. If you only want to use '
            'one variant of the lexicons, create a positive lexicon folder '
            'with empty text files.')
        assert os.path.isdir(neg_dir), (
            'There needs to be a negative lexicon. If you only want to use '
            'one variant of the lexicons, create a negative lexicon folder '
            'with empty text files.')

        pos_files = os.listdir(pos_dir)
        neg_files = os.listdir(neg_dir)

        def parse_dir(dirlist, prefix):
            for _lexi in dirlist:
                filename = _lexi.strip()
                filename = re.sub(r'\..*', '', filename)
                filename = helper_classes.Module.camelcase_to_snakecase(
                    filename)
                filename = filename.replace(' ', '_')

                with open(os.path.join(self._lexicon_dir, prefix, _lexi),
                          'r') as file:
                    lines = [_.strip() for _ in file.readlines()]
                lines = [_ for _ in lines if len(_) > 0]

                self._lexicon_map[prefix][filename] = []
                for _ in lines:
                    # Lines may carry an optional ';;weight' suffix; this
                    # variant only stores the term itself.
                    term = _.split(';;')
                    if len(term[0]) < 1:
                        continue
                    self._lexicon_map[prefix][filename].append(term[0])

        logger.log_debug('Parsing the lexicons...')
        parse_dir(pos_files, 'positive')
        parse_dir(neg_files, 'negative')

        for cat in ['positive', 'negative']:
            for _k, _v in self._lexicon_map[cat].items():
                logger.log_info('{} {} lexicon: {} entries'.format(
                    cat, _k, len(_v)))

        logger.log_info('Parsed and stored all lexicons.')

    def _change_cohort_mappings(self):
        for _search_type, _lexicons in self._lexicon_map.items():
            for _col in [
                    _search_type + '_POSITIVE_LEXICON_SENTENCES',
                    _search_type + '_POSITIVE_LEXICON_NEGATED_PHRASES',
                    _search_type + '_POSITIVE_LEXICON_AFFIRMED_PHRASES'
            ]:
                assert _col in self._loaded_df.columns, (
                    'Missing column in dataframe: {}. This module only '
                    'supports inputs from the negex_negation_filter '
                    'module.'.format(_col))

            if self._debug_check:
                self._loaded_df = self._loaded_df.iloc[:100]

            logger.log_info('Long dist. matching for: ' +
                            str(_lexicons.keys()))

            # Process the notes in parallel in chunks of 7000 rows.
            chunks = [
                self._loaded_df.iloc[_:min(_ + 7000, len(self._loaded_df)), :]
                for _ in range(0, len(self._loaded_df), 7000)
            ]
            notes_parsed = Parallel(n_jobs=self._njobs)(
                delayed(self._process_note)(note, _search_type, _lexicons)
                for note in chunks)
            notes_parsed = [__ for _ in notes_parsed for __ in _]
            self._loaded_df = pd.DataFrame(notes_parsed,
                                           columns=self._loaded_df.columns)

    def _parse_lexicons(self):
        assert os.path.isdir(
            self._lexicon_dir), 'Invalid lexicon dir. Does not exist.'
        assert len(os.listdir(self._lexicon_dir)) > 0, 'Lexicon dir is empty.'
        pos_dir = os.path.join(self._lexicon_dir, 'positive')
        neg_dir = os.path.join(self._lexicon_dir, 'negative')
        assert os.path.isdir(pos_dir), 'There needs to be a positive lexicon.'
        assert os.path.isdir(neg_dir), 'There needs to be a negative lexicon.'

        pos_files = os.listdir(pos_dir)
        neg_files = os.listdir(neg_dir)

        unknown_files = [_ for _ in neg_files if not _ in pos_files]
        assert len(
            unknown_files
        ) == 0, 'The lexicon filenames in the positive and negative dirs need to match! Found: ' + str(
            unknown_files)

        def parse_dir(dirlist, prefix):
            for _lexi in dirlist:
                filename = _lexi.strip()
                filename = re.sub(r'\..*', '', filename)
                filename = helper_classes.Module.camelcase_to_snakecase(
                    filename)
                filename = filename.replace(' ', '_')

                with open(os.path.join(self._lexicon_dir, prefix, _lexi),
                          'r') as file:
                    lines = [_.strip() for _ in file.readlines()]
                lines = [_ for _ in lines if len(_) > 0]

                self._lexicon_map[prefix][filename] = []
                self._lexicon_weights[prefix][filename] = {}
                for _ in lines:
                    # Lines follow the 'term;;weight' format; the weight is
                    # optional and defaults to 2.
                    term = _.split(';;')
                    if len(term[0]) < 1:
                        continue
                    self._lexicon_map[prefix][filename].append(term[0])
                    if len(term) < 2:
                        self._lexicon_weights[prefix][filename][
                            term[0].lower()] = 2
                    else:
                        self._lexicon_weights[prefix][filename][
                            term[0].lower()] = int(term[1])

        logger.log_debug('Parsing the lexicons now...')
        parse_dir(pos_files, 'positive')
        parse_dir(neg_files, 'negative')

        for cat in ['positive', 'negative']:
            for _k, _v in self._lexicon_map[cat].items():
                logger.log_info('{} {} lexicon: {} entries'.format(
                    cat, _k, len(_v)))

        logger.log_info('Parsed and stored all lexicons.')
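
    # A minimal sketch of the lexicon layout this parser expects (directory
    # and file names below are hypothetical; the 'term;;weight' line format
    # and the default weight of 2 follow from the parsing above):
    #
    #   used_lexicons/
    #       positive/
    #           HeartFailure.txt    e.g. 'shortness of breath;;3'
    #       negative/
    #           HeartFailure.txt    e.g. 'no edema' (weight defaults to 2)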

    def _dump_df(self):
        logger.log_info('Dumping the extracted notes into a parquet file.')
        filename = utils.default_dataframe_name(self._output_note_file)
        self._labeled_df.to_parquet(filename)
        logger.log_info(
            'DONE: Dumping the extracted notes into a parquet file.')

    def _query_bigquery(self):

        sql_search = ""

        merged_lexicon_map = {
            _k: self._lexicon_map['positive'][_k] +
            self._lexicon_map['negative'][_k]
            for _k in self._lexicon_map['positive'].keys()
        }

        for _name, _terms in merged_lexicon_map.items():
            if not _terms:
                sql_search = sql_search + "," + " FALSE AS " + _name
            else:
                lex = [r'\\b' + x + r'\\b' for x in _terms]
                sql_search = sql_search + "," + " REGEXP_CONTAINS(text, '(?i)(" + '|'.join(
                    lex) + ")') AS " + _name

        ignore_str = '\n'.join([
            'AND category NOT LIKE "%{}%"'.format(_)
            for _ in self._ignore_cat_list
        ])

        use_bqstorage_api = config.get_pipeline_config_item(
            self.module_name(), "use_bqstorage_api", False)

        limitstr = ""
        if config.get_pipeline_config_item(self.module_name(),
                                           "debug_download", False):
            limitstr = 'LIMIT 10'

        cohort_ids = []
        if self._cohort_file and os.path.isfile(self._cohort_file):
            cohort_ids = pd.read_csv(self._cohort_file)
            cohort_ids.columns = [_.lower() for _ in cohort_ids.columns]
            cohort_ids = list(cohort_ids.loc[:, 'hadm_id'])

        sql = """
        SELECT row_id, subject_id, hadm_id, chartdate, category, text{}
        FROM `physionet-data.mimiciii_notes.noteevents`
        WHERE hadm_id IS NOT NULL 
        AND hadm_id IN ({})
        {}
        {}
        """.format(sql_search, ','.join([str(_) for _ in cohort_ids]),
                   ignore_str, limitstr)
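
        # For illustration (hypothetical lexicon 'dyspnea' with the terms
        # 'sob' and 'short of breath'), the assembled query roughly reads:
        #
        #   SELECT row_id, subject_id, hadm_id, chartdate, category, text,
        #          REGEXP_CONTAINS(text, '(?i)(\\bsob\\b|\\bshort of breath\\b)') AS dyspnea
        #   FROM `physionet-data.mimiciii_notes.noteevents`
        #   WHERE hadm_id IS NOT NULL
        #   AND hadm_id IN (...)
        #   AND category NOT LIKE "%<ignored category>%"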

        logger.log_info('Querying noteevents for lexicon occurrences.')
        self._labeled_df = pandas_gbq.read_gbq(
            sql,
            project_id=google_tools.PROJECT_ID,
            dialect='standard',
            use_bqstorage_api=use_bqstorage_api
        )  #, progress_bar_type=utils.PROGRESSBAR_TYPE)
        self._labeled_df.columns = [
            _.upper() for _ in self._labeled_df.columns
        ]

        if not self._dump_all:
            mask = None
            for _ in self._labeled_df.columns:
                if _.lower() in [
                        'subject_id', 'row_id', 'hadm_id', 'chartdate',
                        'category', 'text'
                ]:
                    continue
                if mask is None:
                    mask = self._labeled_df[_].astype(bool)
                else:
                    mask = mask | self._labeled_df[_].astype(bool)
            self._labeled_df = self._labeled_df[mask].copy()

        logger.log_info('DONE: Querying noteevents for lexicon occurrences.')
        logger.log_debug('Number of admissions {}, number of notes {}.'.format(
            self._labeled_df['HADM_ID'].nunique(), len(self._labeled_df)))
        for _key in self._lexicon_map['positive'].keys():
            _key = _key.upper()
            logger.log_debug('Number of notes with {}: {}.'.format(
                _key.lower(), self._labeled_df[_key.upper()].sum()))

Example #9
def print_row(*args):
    logger.log_info(' '.join([str(_) for _ in args]))

Example #10
    def _do_statistics(self):
        validset = self._loaded_validation.sort_values('ROW_ID').reset_index(
            drop=True)[['ROW_ID', 'NOTE_TYPES']].copy()
        validset = validset.drop_duplicates(subset=['ROW_ID'])

        predicted = self._loaded_df[['ROW_ID', 'PREDICTED_CATEGORIES']].copy()
        predicted = predicted.rename(
            columns={'PREDICTED_CATEGORIES': 'PREDICTED_CAT'})
        predicted = predicted.drop_duplicates(subset=['ROW_ID'])

        validset = validset.merge(predicted, how='left', on='ROW_ID')
        validset.loc[validset['PREDICTED_CAT'].isnull(),
                     'PREDICTED_CAT'] = pd.Series([[1]] * validset.shape[0])
        validset.loc[validset['NOTE_TYPES'].isnull(),
                     'NOTE_TYPES'] = pd.Series([[1]] * validset.shape[0])
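
        # Note: assigning a pd.Series of [1] lists through .loc aligns on the
        # index, so only the rows matched by the null mask actually receive
        # the placeholder list.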

        validset['MATCHED'] = validset.apply(
            lambda x: [_ for _ in x.NOTE_TYPES if _ in x.PREDICTED_CAT],
            axis=1)
        validset['UNMATCHED_VALID'] = validset.apply(
            lambda x: [_ for _ in x.NOTE_TYPES if _ not in x.PREDICTED_CAT],
            axis=1)
        validset['UNMATCHED_PREDICTED'] = validset.apply(
            lambda x: [_ for _ in x.PREDICTED_CAT if _ not in x.NOTE_TYPES],
            axis=1)
        validset['CORRECT_NOTE'] = False

        validset.loc[(validset.UNMATCHED_VALID.str.len() == 0) &
                     (validset.UNMATCHED_PREDICTED.str.len() == 0),
                     'CORRECT_NOTE'] = True

        max_index = max(self._loaded_validation_label_map.values())

        one_hot_valid = np.zeros((validset.shape[0], max_index))
        one_hot_pred = np.zeros((validset.shape[0], max_index))

        _i = 0
        for _, _row in validset.iterrows():
            predicted = [_ - 1 for _ in _row.PREDICTED_CAT if _ != 0]
            valid = [_ - 1 for _ in _row.NOTE_TYPES if _ != 0]

            if valid:
                one_hot_valid[_i, valid] = 1
            if predicted:
                one_hot_pred[_i, predicted] = 1

            _i += 1

        # validset_types = []
        # for _, _row in validset.iterrows():
        #     unmatched_valid = sorted(_row.UNMATCHED_VALID)
        #     unmatched_predicted = sorted(_row.UNMATCHED_PREDICTED)
        #     if 0 in unmatched_valid:
        #         for _ in _row.PREDICTED_CAT:
        #            assert _ > 0
        #            _row['PREDICTED'] = _
        #            _row['VALIDATION'] = 0
        #         continue

        #     if 0 in unmatched_predicted:
        #         for _ in _row.NOTE_TYPES:
        #             assert _ > 0
        #             _row['PREDICTED'] = 0
        #             _row['VALIDATION'] = _
        #             validset_types.append(list(_row))
        #         continue

        #     for _nomatch in unmatched_valid:
        #         _row['PREDICTED']  = 0
        #         _row['VALIDATION'] = _nomatch
        #         validset_types.append(list(_row))

        #     for _nomatch in unmatched_predicted:
        #         _row['PREDICTED']  = _nomatch
        #         _row['VALIDATION'] = 0
        #         validset_types.append(list(_row))

        #     for _match in sorted(_row.MATCHED):
        #         _row['PREDICTED'] = _match
        #         _row['VALIDATION'] = _match
        #         validset_types.append(list(_row))

        # validset = pd.DataFrame(validset_types, columns=list(validset.columns) + ['PREDICTED', 'VALIDATION'])

        # validset['_CORRECT_ENTRIES'] = validset['PREDICTED'] == validset['VALIDATION']
        # validset['_CORRECT_ENTRIES'] *= 1
        # validset['_CORRECT_ENTRIES'] = validset.groupby('ROW_ID')['_CORRECT_ENTRIES'].transform(lambda x: sum(x))
        # validset['_TOTAL_ENTRIES'] = validset.groupby('ROW_ID')['_CORRECT_ENTRIES'].transform(lambda x: len(x))
        # validset['CORRECT_NOTE'] = validset['_TOTAL_ENTRIES'] == validset['_CORRECT_ENTRIES']
        # validset = validset.drop(columns=['_CORRECT_ENTRIES', '_TOTAL_ENTRIES'])

        assert len(
            validset.groupby('ROW_ID').first().reset_index()) == len(validset)

        logger.log_info('Correctly identified notes: {}/{} ({}%)'.format(
            validset['CORRECT_NOTE'].sum(), len(validset),
            validset['CORRECT_NOTE'].sum() * 100 / len(validset)))

        # predicted_labels = validset['PREDICTED'].values
        # valid_labels = validset['VALIDATION'].values

        def hamming_score(y_true, y_pred, normalize=True, sample_weight=None):
            '''
            Compute the Hamming score (a.k.a. label-based accuracy) for the multi-label case
            https://stackoverflow.com/q/32239577/395857
            '''
            acc_list = []
            for i in range(y_true.shape[0]):
                set_true = set(np.where(y_true[i])[0])
                set_pred = set(np.where(y_pred[i])[0])
                #print('\nset_true: {0}'.format(set_true))
                #print('set_pred: {0}'.format(set_pred))
                tmp_a = None
                if len(set_true) == 0 and len(set_pred) == 0:
                    tmp_a = 1
                else:
                    tmp_a = len(set_true.intersection(set_pred))/\
                            float( len(set_true.union(set_pred)) )
                #print('tmp_a: {0}'.format(tmp_a))
                acc_list.append(tmp_a)
            return np.mean(acc_list)

        logger.log_info('Hamming score: {0}'.format(
            hamming_score(one_hot_valid,
                          one_hot_pred)))  # 0.375 (= (0.5+1+0+0)/4)
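
        # Worked example (hypothetical inputs): y_true = [[1,1,0], [0,1,0]],
        # y_pred = [[1,0,0], [0,1,0]] gives per-row scores [0.5, 1.0], i.e. a
        # Hamming score of 0.75.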

        # Subset accuracy
        # 0.25 (= 0+1+0+0 / 4) --> 1 if the prediction for one sample fully matches the gold. 0 otherwise.
        logger.log_info('Subset accuracy: {0}'.format(
            sklearn.metrics.accuracy_score(one_hot_valid,
                                           one_hot_pred,
                                           normalize=True,
                                           sample_weight=None)))

        logger.log_info('Hamming loss: {0}'.format(
            sklearn.metrics.hamming_loss(one_hot_valid, one_hot_pred)))
        logger.log_info('Total instances: {0}'.format(len(one_hot_valid)))

        logger.log_info('')

        def print_report(y_true, y_pred):
            def print_row(*args):
                logger.log_info(' '.join([str(_) for _ in args]))

            #print_row('F1 Score', sklearn.metrics.f1_score(y_true, y_pred))
            print_row('Precision',
                      sklearn.metrics.precision_score(y_true, y_pred))
            print_row('Recall', sklearn.metrics.recall_score(y_true, y_pred))
            print_row('Accuracy',
                      sklearn.metrics.accuracy_score(y_true, y_pred),
                      np.sum(y_true == y_pred), '/', len(y_pred),
                      '({}, {})'.format(y_true.sum(), y_pred.sum()))

        for _cat in range(1, max_index + 1):
            mapname = [
                _[0] for _ in self._loaded_validation_label_map.items()
                if _cat == _[1]
            ][0]
            logger.log_info('{} =================='.format(mapname))
            print_report(one_hot_valid[:, _cat - 1], one_hot_pred[:, _cat - 1])
            logger.log_info('')

    def __init__(self):
        super().__init__()

        self._validation_set = config.get_pipeline_config_item(
            self.module_name(), 'validation_set_file', None)
        self._df_notes_labeled_paths = config.get_pipeline_config_item(
            self.module_name(), 'input_note_files', [])

        self._loaded_df = []
        self._compare_df = None
        self._orig_df = None
        self._loaded_validation = None
        self._loaded_validation_labels = None
        self._loaded_validation_label_map = None

        logger.log_info('Loading validation note labeling file')
        self._loading_validation_labeling_file()
        logger.log_info('DONE: Loading validation note labeling file')

        logger.log_info('Loading NLP pipeline processed note files')
        self._loading_note_files()
        logger.log_info('DONE: Loading NLP pipeline processed note files')

        logger.log_info('Computing and outputting statistics')

        line_list = []
        for _ in self._loaded_df:
            line_list.append(';')
            table = self._do_statistics(_)
            for _r in range(len(table[0])):
                elems = [_c[_r] for _c in table]
                line_list.append(';'.join(elems))
            line_list.append(';')
            line_list.append(';')

        logger.log_info('CSV Table Output:')

        for _ in line_list:
            print(_)

    def __init__(self):
        super().__init__()

        self._output_note_file = config.get_pipeline_config_item(
            self.module_name(), 'output_note_file',
            '_negex_filtered_notes.parquet')
        self._df_notes_labeled_path = config.get_pipeline_config_item(
            self.module_name(), 'input_note_file', None)
        self._word_distance = config.get_pipeline_config_item(
            self.module_name(), 'word_distance', 10)
        self._debug_check = config.get_pipeline_config_item(
            self.module_name(), 'debug_check', True)
        #self._has_negation_flag = False

        self._loaded_df = None

        self._lexicon_dir = config.get_pipeline_config_item(
            self.module_name(), 'word_filter_dir', './word_cohort_filters')
        self._lexicon_map = {}

        self._njobs = config.get('njobs', multiprocessing.cpu_count())

        logger.log_info('Loading note file')
        self._load_note_input_file()
        logger.log_info('DONE: Loading note file.')

        logger.log_info('Parsing word filter lexicons.')
        self._load_word_matching_lexicons()
        logger.log_info('DONE: Parsing word filter lexicons.')

        logger.log_info('Word filtering and cohort adaptions.')
        self._change_cohort_mappings()
        logger.log_info('DONE: Word filtering and cohort adaptions.')

        logger.log_info('Dumping changed notes.')
        self._dump_filtered_df()
        logger.log_info('DONE: Dumping changed notes.')
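
    # A minimal sketch of the pipeline config consumed above (the file format
    # and the module key are assumptions; the item names and defaults are
    # taken from the get_pipeline_config_item calls):
    #
    #   word_matcher:                        # hypothetical module name
    #       input_note_file: notes.parquet   # hypothetical path
    #       output_note_file: _negex_filtered_notes.parquet
    #       word_distance: 10
    #       debug_check: true
    #       word_filter_dir: ./word_cohort_filters
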
    def __init__(self):
        super().__init__()
        nltk.download('punkt')

        self._output_note_file = config.get_pipeline_config_item(
            self.module_name(), 'output_note_file',
            '_negex_filtered_notes.parquet')
        self._df_notes_labeled_path = config.get_pipeline_config_item(
            self.module_name(), 'input_note_file', None)
        self._keep_categories = config.get_pipeline_config_item(
            self.module_name(), 'keep_categories', [])
        self._dont_include_predicted_categories = config.get_pipeline_config_item(
            self.module_name(), 'dont_include_predicted_categories', False)
        self._debug_row_id = config.get_pipeline_config_item(
            self.module_name(), 'debug_row_id', None)

        self._loaded_df = None
        self._pre_filtered_df = None
        self._filtered_cohort_df = None
        self._labeled_df = None

        self._lexicon_dir = config.get_pipeline_config_item(
            self.module_name(), 'lexicon_dir', './used_lexicons')
        self._lexicon_map = {'positive': {}, 'negative': {}}
        self._debug_check = config.get_pipeline_config_item(
            self.module_name(), 'debug_check', False)

        self._njobs = config.get('njobs', multiprocessing.cpu_count())

        logger.log_info('Loading note file')
        self._load_note_input_file()
        logger.log_info('DONE: Loading note file.')

        logger.log_info('Negex note filtering.')
        self._parse_lexicons()
        self._check_note_negations()
        logger.log_info('DONE: Negex note filtering.')

        logger.log_info('New cohort labeling.')
        self._label_improve_cohort()
        logger.log_info('DONE: New cohort labeling.')

        logger.log_info('Dumping filtered notes.')
        self._dump_filtered_df()
        logger.log_info('DONE: Dumping filtered notes.')

    def _check_note_negations(self):
        mask = None
        for _ in self._loaded_df.columns:
            if _.lower() in [
                    'subject_id', 'row_id', 'hadm_id', 'chartdate', 'category',
                    'text', 'section_id', 'section_group', 'section_group_new',
                    'section_name'
            ]:
                continue
            if mask is None:
                mask = self._loaded_df[_].astype(bool)
            else:
                mask = mask | self._loaded_df[_].astype(bool)

        logger.log_info('Starting negation checking loop')
        logger.log_debug(
            str(self._njobs) + ' processes used for check routine.')
        note_pos_df = self._loaded_df[mask].copy()

        logger.log_info(
            'Total patients (before negex filtering): {} / Total admissions: {}'
            .format(note_pos_df['SUBJECT_ID'].nunique(),
                    note_pos_df['HADM_ID'].nunique()))
        del self._loaded_df

        if self._debug_check:
            #note_pos_df = note_pos_df[note_pos_df['ROW_ID'] == 33059]
            note_pos_df = note_pos_df.iloc[0:10]

        note_infos = Parallel(n_jobs=self._njobs)(
            delayed(self._process_note)(note)
            for _, note in progressbar(note_pos_df.iterrows(),
                                       total=len(note_pos_df)))
        logger.log_debug('Found {} note infos.'.format(len(note_infos)))
        if note_infos:
            example_note = note_infos[0]
            logger.log_debug(str(example_note))

        note_infos_df = []
        cols = ['ROW_ID']

        for _entry in note_infos:
            for _id, _cat_dict in _entry.items():
                lis = [_id]

                for _cat in self._lexicon_map['positive'].keys():
                    if _cat not in _cat_dict:
                        lis.append(0)
                        lis.append(0)

                        lis.append('')
                        lis.append('')
                        lis.append('')
                        continue

                    _negated = _cat_dict[_cat]['negated']
                    _key = _cat.upper()

                    positive_terms = len(_cat_dict[_cat]['occurences'])

                    lis.append(positive_terms)
                    lis.append(positive_terms)

                    lis.append('\n'.join(_cat_dict[_cat]['sentences']))
                    lis.append('\n'.join(
                        [str(_) for _ in _cat_dict[_cat]['occurences']]))
                    lis.append('')

                note_infos_df.append(lis)

        cols_suffix = [
            'TOTAL_SCORE_SUM', 'SCORE_SUM_POSITIVE',
            'POSITIVE_LEXICON_SENTENCES', 'POSITIVE_LEXICON_AFFIRMED_PHRASES',
            'POSITIVE_LEXICON_NEGATED_PHRASES'
        ]
        for _key in self._lexicon_map['positive'].keys():
            for _suff in cols_suffix:
                cols.append(_key.upper() + '_' + _suff.upper())

        info_df = pd.DataFrame(note_infos_df, columns=cols)
        note_pos_df = note_pos_df.merge(info_df, how='left', on='ROW_ID')

        note_pos_df.loc[:, cols] = note_pos_df[cols].fillna('')
        self._pre_filtered_df = note_pos_df
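
    # Each positive lexicon category <CAT> contributes five columns to the
    # merged note frame (see cols_suffix above): <CAT>_TOTAL_SCORE_SUM,
    # <CAT>_SCORE_SUM_POSITIVE, <CAT>_POSITIVE_LEXICON_SENTENCES,
    # <CAT>_POSITIVE_LEXICON_AFFIRMED_PHRASES and
    # <CAT>_POSITIVE_LEXICON_NEGATED_PHRASES.
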
    def _loading_note_files(self):
        if not self._df_path_a or not self._df_path_b:
            raise RuntimeError('Please specify a valid note input file.')

        def load_prediction_file(path):
            filename = utils.default_dataframe_name(path)
            assert os.path.isfile(
                filename), 'Could not find note parquet file: {}'.format(
                    filename)
            df = pd.read_parquet(filename)
            df.columns = [_.upper() for _ in df.columns]

            assert 'ROW_ID' in df.columns, (
                'Notes file needs to have the columns: ROW_ID, '
                'PREDICTED_CATEGORIES')
            assert 'PREDICTED_CATEGORIES' in df.columns, "Processed note file needs to have the PREDICTED_CATEGORIES column generated by e.g. the negation module."
            df['PREDICTED_CATEGORIES'] = df.PREDICTED_CATEGORIES.str.upper()
            df['PREDICTED_CATEGORIES'] = df.PREDICTED_CATEGORIES.str.replace(
                ' ', '_')
            df['PREDICTED_CATEGORIES'] = df.PREDICTED_CATEGORIES.str.split('|')
            if 'FOUND_EVIDENCE' in df.columns:
                df['FOUND_EVIDENCE'] = df['FOUND_EVIDENCE'].astype(bool)
                df = df[df['FOUND_EVIDENCE']]

            return df

        def load_sentence_info_file(path, allowed_class_labels):
            filename = utils.default_dataframe_name(path)
            assert os.path.isfile(
                filename), 'Could not find note parquet file: {}'.format(
                    filename)
            df = pd.read_parquet(filename)
            df.columns = [_.upper() for _ in df.columns]

            assert 'ROW_ID' in df.columns, (
                'Notes file needs to have the columns: ROW_ID, '
                'PREDICTED_CATEGORIES')
            for _ in allowed_class_labels:
                assert _ in df.columns, "Processed note file has no {} column - class label not found!".format(
                    _)

                for __ in self._required_tag_list:
                    assert _ + '_' + __ in df.columns, "Processed note file has no {} column - the file needs to be generated by the negex_negation_filter module!".format(
                        _ + '_' + __)

            return df

        self._df_a = load_prediction_file(self._df_path_a)
        self._df_b = load_prediction_file(self._df_path_b)

        # Identify and map all class labels to integer numbers
        unique_labels = []
        for _ in [
                *self._df_a.PREDICTED_CATEGORIES,
                *self._df_b.PREDICTED_CATEGORIES,
                self._loaded_validation_labels
        ]:
            unique_labels.extend(_)

        unique_labels = set(unique_labels)
        unique_labels = set([_.upper() for _ in unique_labels])
        unique_labels_unmatched = unique_labels - self._loaded_validation_labels

        logger.log_info(
            'Found the following labels which are present in the predicted notes but not in the validation set: '
            + str(unique_labels_unmatched))
        lbl_id = 1
        self._loaded_validation_label_map = {'NONE': 0}
        self._inv_loaded_validation_label_map = {0: 'NONE'}

        for _lbl in unique_labels:
            self._loaded_validation_label_map[_lbl] = lbl_id
            self._inv_loaded_validation_label_map[lbl_id] = _lbl
            lbl_id += 1

        for _lbl in unique_labels_unmatched:
            self._loaded_validation_label_map[_lbl] = 0
            self._inv_loaded_validation_label_map[0] = _lbl

        logger.log_info('Label string to int map: {}'.format(
            str(self._loaded_validation_label_map)))
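
        # For illustration (hypothetical labels): with unique labels
        # {'DYSPNEA', 'EDEMA'} the map becomes
        # {'NONE': 0, 'DYSPNEA': 1, 'EDEMA': 2}; labels that are missing from
        # the validation set were remapped to 0 above.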

        class_labels = [
            _ for _ in self._loaded_validation_label_map.keys() if _ != 'NONE'
        ]

        self._df_sents_a = load_sentence_info_file(self._df_path_a_negated,
                                                   class_labels)
        self._df_sents_b = load_sentence_info_file(self._df_path_b_negated,
                                                   class_labels)

        self._df_a[
            'PREDICTED_CATEGORIES'] = self._df_a.PREDICTED_CATEGORIES.apply(
                lambda x: [self._loaded_validation_label_map[_] for _ in x])
        self._df_b[
            'PREDICTED_CATEGORIES'] = self._df_b.PREDICTED_CATEGORIES.apply(
                lambda x: [self._loaded_validation_label_map[_] for _ in x])

        self._loaded_validation[
            'NOTE_TYPES'] = self._loaded_validation.NOTE_TYPES.apply(
                lambda x: [self._loaded_validation_label_map[_] for _ in x])

        if not self._get_examples_for_categories:
            self._get_examples_for_categories = [*class_labels, 'NONE']
        else:
            self._get_examples_for_categories = [
                _.upper() for _ in self._get_examples_for_categories
            ]
        self._get_examples_for_categories = [
            _ for _ in self._get_examples_for_categories if _ != 'NONE'
        ]

        logger.log_info(
            'Dumping the following class labels of interest: {}'.format(
                str(self._get_examples_for_categories)))

Example #16
    def __init__(self):
        super().__init__()

        self._output_note_file = config.get_pipeline_config_item(
            self.module_name(), 'output_note_file',
            '_negex_filtered_notes.parquet')
        self._df_notes_labeled_path = config.get_pipeline_config_item(
            self.module_name(), 'input_note_file', None)
        self._catchall = config.get_pipeline_config_item(
            self.module_name(), 'catchall', False)

        self._loaded_df = None
        self._filtered_cohort_df = None
        self._labeled_df = None

        logger.log_info('Loading note file')
        self._load_note_input_file()
        logger.log_info('DONE: Loading note file.')

        logger.log_info('Replacing catchall labels.')
        self._replace_catchall()
        logger.log_info('DONE: Replacing catchall labels.')

        logger.log_info('Dumping processed notes.')
        self._dump_processed_df()
        logger.log_info('DONE: Dumping processed notes.')

    def __init__(self):
        super().__init__()

        self._validation_set = config.get_pipeline_config_item(
            self.module_name(), 'validation_set_file', None)
        self._df_path_a = config.get_pipeline_config_item(
            self.module_name(), 'file_prediction_a', None)
        self._df_path_b = config.get_pipeline_config_item(
            self.module_name(), 'file_prediction_b', None)
        self._ignore_b = config.get_pipeline_config_item(
            self.module_name(), 'ignore_b', False)
        self._section_info__file = config.get_pipeline_config_item(
            self.module_name(), 'section_info__file', None)

        self._df_path_a_negated = config.get_pipeline_config_item(
            self.module_name(), 'file_sentence_info_a', None)
        self._df_path_b_negated = config.get_pipeline_config_item(
            self.module_name(), 'file_sentence_info_b', None)
        self._get_examples_for_categories = config.get_pipeline_config_item(
            self.module_name(), 'get_examples_for_categories', None)
        self._compare_dumping_dir_name = 'dumped_validaton_sentences_between__{}_and_{}'

        self._required_tag_list = [
            'POSITIVE_LEXICON_SENTENCES', 'POSITIVE_LEXICON_AFFIRMED_PHRASES',
            'POSITIVE_LEXICON_NEGATED_PHRASES', 'NEGATIVE_LEXICON_SENTENCES',
            'NEGATIVE_LEXICON_AFFIRMED_PHRASES',
            'NEGATIVE_LEXICON_NEGATED_PHRASES'
        ]

        self._df_a = None
        self._df_b = None
        self._df_sents_a = None
        self._df_sents_b = None

        self._loaded_validation = None
        self._loaded_validation_labels = None
        self._loaded_validation_label_map = None

        logger.log_info('Processing validation file')
        self._loading_validation_labeling_file()
        logger.log_info('DONE: Processing validation file')

        if self._section_info__file:
            logger.log_info(
                'Loading section file to include section information')
            self._loading_section_info_file()
            logger.log_info(
                'DONE: Loading section file to include section information')

        logger.log_info(
            'Loading prediction and sentence info file for dataframes A and B to be compared.'
        )
        self._loading_note_files()
        logger.log_info(
            'DONE: Loading prediction and sentence info file for dataframes A and B to be compared.'
        )

        logger.log_info('Dumping sentences into folder')
        self._dump_examples_for_comparison()
        logger.log_info('DONE: Dumping sentences into folder')

Example #18
    def _label_improve_cohort(self):
        self._pre_filtered_df['FOUND_EVIDENCE_NEGATED'] = 0
        if not self._dont_include_predicted_categories:
            self._pre_filtered_df['PREDICTED_CATEGORIES'] = ''

        self._pre_filtered_df['MAX_SCORE_CAT'] = ''
        self._pre_filtered_df['MAX_SCORE'] = -np.inf

        for _k in self._lexicon_map['positive'].keys():
            _k = _k.upper()

            if self._use_only_positive_lexicons:
                self._pre_filtered_df[
                    _k + '_TOTAL_SCORE_SUM'] = self._pre_filtered_df[
                        _k + '_SCORE_SUM_POSITIVE']

            if self._use_old_negation_scheme:
                mask = (self._pre_filtered_df[
                    _k + '_POSITIVE_LEXICONS_NEGATED_OCCURENCES']
                        == 1) & (self._pre_filtered_df[
                            _k + '_POSITIVE_LEXICON_OCCURENCES'] == 1)
                self._pre_filtered_df[
                    _k + '_TOTAL_SCORE_SUM'] = self._pre_filtered_df[
                        _k + '_POSITIVE_LEXICON_OCCURENCES']
                self._pre_filtered_df.loc[mask, _k + '_TOTAL_SCORE_SUM'] = -1
                self._pre_filtered_df.loc[~mask, _k + '_TOTAL_SCORE_SUM'] = 1
                self._pre_filtered_df.loc[
                    self._pre_filtered_df[_k +
                                          '_POSITIVE_LEXICON_OCCURENCES'] == 0,
                    _k + '_TOTAL_SCORE_SUM'] = 0

            self._pre_filtered_df[
                'MAX_SCORE_CAT'] = self._pre_filtered_df.apply(
                    lambda x: _k if x[_k + '_TOTAL_SCORE_SUM'] > x[
                        'MAX_SCORE'] else x['MAX_SCORE_CAT'],
                    axis=1)
            self._pre_filtered_df['MAX_SCORE'] = self._pre_filtered_df.apply(
                lambda x: x[_k + '_TOTAL_SCORE_SUM']
                if x[_k + '_TOTAL_SCORE_SUM'] > x['MAX_SCORE'] else x[
                    'MAX_SCORE'],
                axis=1)
            if not self._dont_include_predicted_categories:
                self._pre_filtered_df[
                    '_PREDICTED_CATEGORIES'] = self._pre_filtered_df[
                        _k + '_TOTAL_SCORE_SUM'].apply(lambda x: _k
                                                       if x > 0 else '')
                self._pre_filtered_df[
                    'PREDICTED_CATEGORIES'] = self._pre_filtered_df.apply(
                        lambda x: x['PREDICTED_CATEGORIES'] + '|' + _k
                        if len(x['_PREDICTED_CATEGORIES']) > 0 else x[
                            'PREDICTED_CATEGORIES'],
                        axis=1)
                del self._pre_filtered_df['_PREDICTED_CATEGORIES']

        if not self._dont_include_predicted_categories:
            self._pre_filtered_df.loc[self._pre_filtered_df['MAX_SCORE'] > 0,
                                      'FOUND_EVIDENCE_NEGATED'] = 1
            if not self._dump_all_notes:
                self._filtered_cohort_df = self._pre_filtered_df[
                    self._pre_filtered_df['FOUND_EVIDENCE_NEGATED'] ==
                    1].copy()
            else:
                self._filtered_cohort_df = self._pre_filtered_df
        else:
            self._filtered_cohort_df = self._pre_filtered_df

        if not self._dont_include_predicted_categories:
            self._filtered_cohort_df[
                'PREDICTED_CATEGORIES'] = self._filtered_cohort_df[
                    'PREDICTED_CATEGORIES'].apply(lambda x: x[1:]
                                                  if len(x) > 1 else x)

            if 'FOUND_EVIDENCE' in self._filtered_cohort_df.columns:
                self._filtered_cohort_df['FOUND_EVIDENCE'] = (
                    self._filtered_cohort_df['FOUND_EVIDENCE'] > 0) & (
                        self._filtered_cohort_df['FOUND_EVIDENCE_NEGATED'] > 0)
            else:
                self._filtered_cohort_df[
                    'FOUND_EVIDENCE'] = self._filtered_cohort_df[
                        'FOUND_EVIDENCE_NEGATED']
            del self._filtered_cohort_df['FOUND_EVIDENCE_NEGATED']

        logger.log_info(
            'Total patients (after negex filtering): {} / Total admissions: {}'
            .format(self._filtered_cohort_df['SUBJECT_ID'].nunique(),
                    self._filtered_cohort_df['HADM_ID'].nunique()))