Example #1
    def load_raw(self):
        """Load for sklearn training"""

        labels = []  # string labels
        examples = []  # examples as strings
        no_labels = []  # docs with no labels

        # document id -> label mapping
        doc2label = i2b2.parse_standoff(self.annot_xml, self.disease,
                                        self.judgement)

        for f in os.listdir(self.corpus_path):
            doc_id = f.split('.')[0]
            file_path = os.path.join(self.corpus_path, f)
            file_feat_list = utils.read_cuis(file_path)

            # no labels for some documents for some reason
            if doc_id in doc2label:
                string_label = doc2label[doc_id]
                int_label = LABEL2INT[string_label]
                labels.append(int_label)
                examples.append(' '.join(file_feat_list))
            else:
                no_labels.append(doc_id)

        print('%d documents with no labels for %s/%s in %s' %
              (len(no_labels), self.disease,
               self.judgement, self.annot_xml.split('/')[-1]))
        return examples, labels
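
A minimal usage sketch, not from the original source: it assumes `provider` is an instance of the class that defines load_raw() above, and simply shows that the space-joined CUI strings and integer labels can go straight into a standard sklearn bag-of-words pipeline.

from sklearn.feature_extraction.text import TfidfVectorizer
from sklearn.linear_model import LogisticRegression
from sklearn.model_selection import cross_val_score

# 'provider' is a hypothetical instance of the dataset class shown above
examples, labels = provider.load_raw()

# space-joined CUI strings vectorize directly as bag-of-words features
vectorizer = TfidfVectorizer()
x = vectorizer.fit_transform(examples)

classifier = LogisticRegression(class_weight='balanced', max_iter=1000)
scores = cross_val_score(classifier, x, labels, cv=5, scoring='f1_macro')
print('macro f1: %.3f' % scores.mean())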
Example #2
    def make_token_alphabet(self):
        """Map tokens (CUIs) to integers"""

        # count tokens in the entire corpus
        token_counts = collections.Counter()

        for f in os.listdir(self.corpus_path):
            file_path = os.path.join(self.corpus_path, f)
            # read CUIs or raw tokens depending on configuration
            if self.use_cuis:
                file_feat_list = utils.read_cuis(file_path)
            else:
                file_feat_list = utils.read_tokens(file_path)
            token_counts.update(file_feat_list)

        # now make alphabet (high freq tokens first)
        index = 1
        self.token2int['oov_word'] = 0
        with open(ALPHABET_FILE, 'w') as outfile:
            for token, count in token_counts.most_common():
                if count > self.min_token_freq:
                    outfile.write('%s|%s\n' % (token, count))
                    self.token2int[token] = index
                    index += 1

        # pickle alphabet so other components can reuse the mapping
        with open(self.alphabet_pickle, 'wb') as pickle_file:
            pickle.dump(self.token2int, pickle_file)
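
A short sketch of consuming the pickled alphabet later on; the 'alphabet.p' path is an assumption standing in for whatever self.alphabet_pickle pointed to.

import pickle

# load the token -> integer mapping written by make_token_alphabet()
with open('alphabet.p', 'rb') as pickle_file:
    token2int = pickle.load(pickle_file)

# unseen tokens fall back to the reserved 'oov_word' index (0)
tokens = ['C0011849', 'C0020538', 'some_unseen_cui']
indices = [token2int.get(token, token2int['oov_word']) for token in tokens]
print(indices)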
Example #3
    def __init__(self, corpus_path, max_tokens_in_file):
        """Load documents as strings"""

        self.samples = []

        for f in os.listdir(corpus_path):
            file_path = os.path.join(corpus_path, f)
            file_feat_list = utils.read_cuis(file_path)
            if len(file_feat_list) < max_tokens_in_file:
                self.samples.append(' '.join(file_feat_list))
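
The snippet only shows the constructor, so the class name and path in the sketch below are placeholders; the point is just that samples ends up as a list of space-joined CUI strings for documents under the length cutoff.

# hypothetical usage; DatasetProvider and the corpus path are placeholders
dataset = DatasetProvider('/path/to/cui/corpus', max_tokens_in_file=10000)
print('%d documents under the length cutoff' % len(dataset.samples))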
Example #4
    def load(self, maxlen=float('inf'), tokens_as_set=True):
        """Convert examples into lists of indices for keras"""

        labels = []  # int labels
        examples = []  # examples as int sequences
        no_labels = []  # docs with no labels

        # document id -> label mapping
        doc2label = i2b2.parse_standoff(self.annot_xml, self.disease,
                                        self.judgement)

        # load examples and labels
        for f in os.listdir(self.corpus_path):
            doc_id = f.split('.')[0]
            file_path = os.path.join(self.corpus_path, f)
            if self.use_cuis:
                file_feat_list = utils.read_cuis(file_path)
            else:
                file_feat_list = utils.read_tokens(file_path)

            example = []
            if tokens_as_set:
                file_feat_list = set(file_feat_list)

            for token in file_feat_list:
                if token in self.token2int:
                    example.append(self.token2int[token])
                else:
                    example.append(self.token2int['oov_word'])

            if len(example) > maxlen:
                example = example[0:maxlen]

            # no labels for some documents for some reason
            if doc_id in doc2label:
                string_label = doc2label[doc_id]
                int_label = LABEL2INT[string_label]
                labels.append(int_label)
                examples.append(example)
            else:
                no_labels.append(doc_id)

        print('%d documents with no labels for %s/%s in %s' %
              (len(no_labels), self.disease,
               self.judgement, self.annot_xml.split('/')[-1]))
        return examples, labels
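
A hedged sketch of feeding the integer sequences into Keras; 'provider' and maxlen=5000 are assumptions, and depending on the Keras version the imports may live under tensorflow.keras instead.

import numpy as np
from keras.preprocessing.sequence import pad_sequences
from keras.utils import to_categorical

# 'provider' is a hypothetical instance of the dataset class shown above
examples, labels = provider.load(maxlen=5000, tokens_as_set=True)

x = pad_sequences(examples, maxlen=5000)   # pad/truncate to a fixed length
y = to_categorical(np.array(labels))       # one-hot labels for a softmax output
print(x.shape, y.shape)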
Example #5
    def load_vectorized(self, exclude, maxlen=float('inf')):
        """Same as above but labels are vectors"""

        labels = []  # int labels
        examples = []  # examples as int sequences
        no_labels = []  # docs with no labels

        # document id -> vector of labels
        doc2labels = i2b2.parse_standoff_vectorized(self.annot_xml,
                                                    self.judgement, exclude)

        # load examples and labels
        for f in os.listdir(self.corpus_path):
            doc_id = f.split('.')[0]
            file_path = os.path.join(self.corpus_path, f)
            file_feat_list = utils.read_cuis(file_path)

            example = []
            # TODO: use unique tokens or not?
            for token in set(file_feat_list):
                if token in self.token2int:
                    example.append(self.token2int[token])
                else:
                    example.append(self.token2int['oov_word'])

            if len(example) > maxlen:
                example = example[0:maxlen]

            # no labels for some documents for some reason
            if doc_id in doc2labels:
                label_vector = doc2labels[doc_id]
                labels.append(label_vector)
                examples.append(example)
            else:
                no_labels.append(doc_id)

        print('%d documents with no labels for %s/%s in %s' %
              (len(no_labels), self.disease,
             self.judgement, self.annot_xml.split('/')[-1]))
        return examples, labels
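
For the vectorized variant the labels are per-document vectors (one entry per disease), so a multi-label setup is the natural fit. A sketch, with 'provider', exclude=set() and maxlen=5000 as placeholder assumptions:

import numpy as np
from keras.preprocessing.sequence import pad_sequences

# 'provider' is a hypothetical instance of the dataset class shown above
examples, labels = provider.load_vectorized(exclude=set(), maxlen=5000)

x = pad_sequences(examples, maxlen=5000)
y = np.array(labels)   # shape (n_documents, n_diseases): one column per disease

# each column is an independent binary target, so this label matrix pairs
# naturally with a sigmoid output layer and binary cross-entropy loss
print(x.shape, y.shape)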