Example No. 1
    def make_token_alphabet(self):
        """Map tokens (CUIs) to integers"""

        # count tokens in the entire corpus
        token_counts = collections.Counter()

        for f in os.listdir(self.corpus_path):
            file_path = os.path.join(self.corpus_path, f)
            if self.use_cuis:
                file_feat_list = utils.read_cuis(file_path)
            else:
                file_feat_list = utils.read_tokens(file_path)
            token_counts.update(file_feat_list)

        # now make the alphabet (high-frequency tokens first)
        index = 1
        self.token2int['oov_word'] = 0
        with open(ALPHABET_FILE, 'w') as outfile:
            for token, count in token_counts.most_common():
                if count > self.min_token_freq:
                    outfile.write('%s|%s\n' % (token, count))
                    self.token2int[token] = index
                    index += 1

        # pickle the alphabet (token -> int mapping)
        with open(self.alphabet_pickle, 'wb') as pickle_file:
            pickle.dump(self.token2int, pickle_file)
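The method above reserves index 0 for out-of-vocabulary tokens and hands out the remaining indices in descending frequency order. Below is a minimal standalone sketch of that pattern, with an in-memory toy corpus and hypothetical names (build_alphabet, min_freq) standing in for the class attributes:

import collections

def build_alphabet(docs, min_freq=1):
    """Map tokens to integers, most frequent first; index 0 is reserved for OOV."""
    counts = collections.Counter()
    for doc in docs:
        counts.update(doc.split())

    token2int = {'oov_word': 0}
    index = 1
    for token, count in counts.most_common():
        if count > min_freq:
            token2int[token] = index
            index += 1
    return token2int

# toy corpus: each string stands in for one file's CUIs/tokens
docs = ['c01 c02 c01', 'c01 c03', 'c02 c01']
print(build_alphabet(docs))  # {'oov_word': 0, 'c01': 1, 'c02': 2}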
Example No. 2
    def load_raw(self):
        """Load for sklearn training"""

        labels = []  # string labels
        examples = []  # examples as strings
        no_labels = []  # docs with no labels

        # document id -> label mapping
        doc2label = i2b2.parse_standoff(self.annot_xml, self.disease,
                                        self.judgement)

        for f in os.listdir(self.corpus_path):
            doc_id = f.split('.')[0]
            file_path = os.path.join(self.corpus_path, f)
            if self.use_cuis:
                file_feat_list = utils.read_cuis(file_path)
            else:
                file_feat_list = utils.read_tokens(file_path)

            # no labels for some documents for some reason
            if doc_id in doc2label:
                string_label = doc2label[doc_id]
                int_label = LABEL2INT[string_label]
                labels.append(int_label)
                examples.append(' '.join(file_feat_list))
            else:
                no_labels.append(doc_id)

        print('%d documents with no labels for %s/%s in %s' %
              (len(no_labels), self.disease,
               self.judgement, self.annot_xml.split('/')[-1]))
        return examples, labels
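Because load_raw returns each document as a whitespace-joined feature string plus an integer label, its output plugs directly into an sklearn pipeline. A hedged usage sketch, with made-up CUI strings standing in for the real (examples, labels) pair since the provider class's constructor is not part of the snippet:

from sklearn.feature_extraction.text import CountVectorizer
from sklearn.linear_model import LogisticRegression

# stand-ins for the (examples, labels) pair that load_raw() would return
examples = ['c0011849 c0027051', 'c0011849 c0011860',
            'c0020538 c0027051', 'c0011860 c0020538']
labels = [1, 1, 0, 0]

vectorizer = CountVectorizer()               # bag-of-CUIs features
x_train = vectorizer.fit_transform(examples)
clf = LogisticRegression().fit(x_train, labels)

x_test = vectorizer.transform(['c0011849 c0020538'])
print(clf.predict(x_test))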
Example No. 3
    def load(self, maxlen=float('inf'), tokens_as_set=True):
        """Convert examples into lists of indices for keras"""

        labels = []  # int labels
        examples = []  # examples as int sequences
        no_labels = []  # docs with no labels

        # document id -> label mapping
        doc2label = i2b2.parse_standoff(self.annot_xml, self.disease,
                                        self.judgement)

        # load examples and labels
        for f in os.listdir(self.corpus_path):
            doc_id = f.split('.')[0]
            file_path = os.path.join(self.corpus_path, f)
            if self.use_cuis:
                file_feat_list = utils.read_cuis(file_path)
            else:
                file_feat_list = utils.read_tokens(file_path)

            example = []
            if tokens_as_set:
                file_feat_list = set(file_feat_list)

            for token in file_feat_list:
                if token in self.token2int:
                    example.append(self.token2int[token])
                else:
                    example.append(self.token2int['oov_word'])

            if len(example) > maxlen:
                example = example[:maxlen]

            # no labels for some documents for some reason
            if doc_id in doc2label:
                string_label = doc2label[doc_id]
                int_label = LABEL2INT[string_label]
                labels.append(int_label)
                examples.append(example)
            else:
                no_labels.append(doc_id)

        print('%d documents with no labels for %s/%s in %s' %
              (len(no_labels), self.disease,
               self.judgement, self.annot_xml.split('/')[-1]))
        return examples, labels
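load returns variable-length integer sequences, which typically need padding to a fixed length before a Keras model can consume them. A minimal sketch of that step, assuming TensorFlow's pad_sequences helper and made-up sequences in place of the real output; note that index 0 does double duty here as both the 'oov_word' index and the padding value:

import numpy as np
from tensorflow.keras.preprocessing.sequence import pad_sequences

# stand-ins for the int sequences and labels that load(maxlen=4) would return
examples = [[5, 12, 7], [3], [9, 2, 2, 8]]
labels = [1, 0, 1]

# pad (or truncate) every sequence to the same length with zeros
x = pad_sequences(examples, maxlen=4, padding='post', truncating='post')
y = np.array(labels)
print(x.shape)  # (3, 4)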
Example No. 4
    parser.add_option("-d",
                      "--debug",
                      action="store_true",
                      help="turn on debug mode")

    (options, args) = parser.parse_args()
    if len(args) != 2:
        parser.error("Please provide required arguments")

    if options.debug:
        logging.basicConfig(level=logging.DEBUG)
    else:
        logging.basicConfig(level=logging.CRITICAL)

    training_file = args[0]
    training_sents = utils.read_tokens(training_file)
    test_file = args[1]
    test_sents = utils.read_tokens(test_file)

    model = create_model(training_sents)

    ## read sentences again because predict_tags(...) rewrites the tags
    sents = utils.read_tokens(training_file)
    predictions = predict_tags(sents, model)
    accuracy = utils.calc_accuracy(training_sents, predictions)
    print "Accuracy in training [%s sentences]: %s" % (len(sents), accuracy)

    ## read sentences again because predict_tags(...) rewrites the tags
    sents = utils.read_tokens(test_file)
    predictions = predict_tags(sents, model)
    accuracy = utils.calc_accuracy(test_sents, predictions)
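The script re-reads each file before evaluation because predict_tags(...) overwrites the tags in place, then scores the predictions against the gold sentences read earlier. The real utils.calc_accuracy is not shown, so the sketch below is only an assumption of what a token-level accuracy helper might look like, using (word, tag) pairs as a hypothetical sentence representation:

def calc_accuracy(gold_sents, pred_sents):
    """Fraction of tokens whose predicted tag matches the gold tag.
    Assumes each sentence is a list of (word, tag) pairs; this mirrors a
    common tagger-evaluation convention, not necessarily utils.calc_accuracy."""
    correct = total = 0
    for gold, pred in zip(gold_sents, pred_sents):
        for (g_word, g_tag), (p_word, p_tag) in zip(gold, pred):
            total += 1
            correct += int(g_tag == p_tag)
    return correct / total if total else 0.0

gold = [[('the', 'DT'), ('dog', 'NN')], [('runs', 'VBZ')]]
pred = [[('the', 'DT'), ('dog', 'VB')], [('runs', 'VBZ')]]
print(calc_accuracy(gold, pred))  # 0.666...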