Example #1
    @classmethod
    def setUpClass(cls):
        print("Setting up full integration tests")
        testing_dir = util.get_testing_dir()
        pretrained = os.path.join(testing_dir, "pretrained",
                                  "random_forest.sav")
        cls.classifier = clf.Classifier()
        cls.classifier.load_model(pretrained)

        test_xml = os.path.join(testing_dir, "xml")
        paper_paths = util.get_paper_paths(test_xml)
        paper_soups = util.load_paper_xmls(paper_paths)
        cls.good = paper_soups[0]
        cls.good_col = tu.TokenCollection(cls.good)
        cls.good_col.build_tokens()

        feature_matrix = cls.good_col.generate_feature_matrix()
        tokens_count, _ = feature_matrix.shape
        bias_vec = np.ones((tokens_count, 1))
        feature_matrix = np.hstack([feature_matrix, bias_vec])

        # classify A1, A2, R1, R2, OC, P
        prob_matrix = cls.classifier.clf.predict_proba(feature_matrix)
        cls.final_prob_matrix = prob_matrix.copy()

        # re-normalize each row so the class probabilities sum to 1
        for tok_i in range(len(cls.final_prob_matrix)):
            cls.final_prob_matrix[tok_i, :] /= \
                np.sum(cls.final_prob_matrix[tok_i, :])

        # predict_proba columns follow the sorted class labels (-1, 0, 1, ...),
        # so the column for a given EvLabel is at index ev_label.value + 1
        cls.predictions = {}
        for ev_label in tu.EvLabel:
            cls.predictions[ev_label] = \
                cls.final_prob_matrix[:, ev_label.value + 1]
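
The fixture above leaves the normalized probability matrix and the per-label prediction vectors on the test class, so individual tests only need to read them. A minimal sketch of such a test, assuming the class is a unittest.TestCase (the method name and the sanity checks are hypothetical, not part of the original suite):

    def test_probabilities_are_normalized(self):
        # each row of the re-normalized matrix should sum to 1
        row_sums = np.sum(self.final_prob_matrix, axis=1)
        np.testing.assert_allclose(row_sums, np.ones_like(row_sums))
        # and there is one probability per token for every EvLabel
        for ev_label in tu.EvLabel:
            self.assertEqual(len(self.predictions[ev_label]),
                             self.final_prob_matrix.shape[0])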
Example #2
    def train(self, paper_paths):
        paper_soups = util.load_paper_xmls(paper_paths)
        paper_count = len(paper_soups)

        # start timing the training run
        train_start = time.time()
        print("Training on {} paper(s)...".format(paper_count))

        # Extract feature vectors from all papers
        token_cols = [None] * paper_count
        cum_feat_matrix = np.zeros((0, ft.feature_count + 1))  # +1 is bias
        cum_labels_vec = np.zeros((0, 1))
        cum_labels = np.zeros((0, len(tu.EvLabel)))  # one column per label class

        for i in range(paper_count):
            # going through all papers
            soup = paper_soups[i]
            paper_id = soup.pmid.text
            # print("Processing papers {} out of {}\r".format(i + 1, paper_count))
            # print("Paper #", paper_id)

            # start = time.time()
            col = tu.TokenCollection(soup)
            col.build_tokens()
            feature_matrix = col.generate_feature_matrix()
            tokens_count, _ = feature_matrix.shape
            bias_vec = np.ones((tokens_count, 1))
            feature_matrix = np.hstack([feature_matrix, bias_vec])

            # converts a one-hot matrix (labels) into a vector of size
            # (tokens_count,1) where each value corresponds to the class ID
            # from Enum EvLabel or -1 for unclassified tokens
            labels = col.generate_train_labels()
            labels_vec = np.ones((tokens_count, 1)) * -1
            for token_i in range(tokens_count):
                for ev_label in tu.EvLabel:
                    if labels[token_i, ev_label.value] > 0:
                        labels_vec[token_i] = ev_label.value

            cum_labels = np.vstack((cum_labels, labels))

            # append current feature_matrix to cum_feat_matrix
            cum_feat_matrix = np.vstack((cum_feat_matrix, feature_matrix))
            cum_labels_vec = np.vstack((cum_labels_vec, labels_vec))
            token_cols[i] = col

            # end = time.time()
            # print("Time elapsed on paper #{} ({}): {}"
            #       .format(i + 1, paper_id, np.round(end - start, 4)))

        # fit the classifier on the accumulated features and flattened class IDs
        self.clf.fit(cum_feat_matrix, cum_labels_vec.flatten())

        train_end = time.time()
        print("Done training. Time elapsed: ", train_end - train_start)
        self.last_train_paths = paper_paths
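
As an aside, the inner double loop that turns the one-hot labels matrix into a vector of class IDs could be written in vectorized NumPy. A behavior-equivalent sketch, assuming each row of labels contains at most one positive entry:

    # rows with no positive entry stay -1; otherwise take the index of the hot column
    has_label = labels.max(axis=1) > 0
    labels_vec = np.where(has_label, labels.argmax(axis=1), -1).reshape(-1, 1)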
Example #3
    def test(self, paper_paths):
        # Test how good our prediction is
        paper_soups = util.load_paper_xmls(paper_paths)
        paper_count = len(paper_soups)
        print("Testing on {} paper(s)...".format(paper_count))

        # Extract feature vectors from all papers
        test_results = [None] * paper_count
        losses = np.zeros((paper_count,))
        precisions = [0] * 6  # one hit counter per label: A1, A2, R1, R2, OC, P
        for paper_i in range(paper_count):
            soup = paper_soups[paper_i]
            print("---- Paper #{} [{}]".format(paper_i + 1, soup.pmid.text))

            col = tu.TokenCollection(soup)
            col.build_tokens()

            feature_matrix = col.generate_feature_matrix()
            tokens_count, _ = feature_matrix.shape
            bias_vec = np.ones((tokens_count, 1))
            feature_matrix = np.hstack([feature_matrix, bias_vec])

            # classify A1, A2, R1, R2, OC, P
            prob_matrix = self.clf.predict_proba(feature_matrix)
            final_prob_matrix = prob_matrix.copy()
            # re-normalize each row so the class probabilities sum to 1
            for tok_i in range(len(final_prob_matrix)):
                final_prob_matrix[tok_i, :] /= np.sum(
                    final_prob_matrix[tok_i, :])

            # the +1 column offset skips the unlabeled (-1) class column
            predictions = {}
            for ev_label in tu.EvLabel:
                predictions[ev_label] = \
                    final_prob_matrix[:, ev_label.value + 1]

            label_assignment = self.assign_ev_labels(col, predictions)
            loss = self.eval_loss(col, prob_matrix)
            losses[paper_i] = loss

            predicted_phrases = [None] * 6

            for ev_label in tu.EvLabel:
                true_ev_label_data = col.ev_labels.get(ev_label)
                if true_ev_label_data is None:
                    print("Label not found: ", ev_label)
                else:
                    ev_label_data = label_assignment[ev_label]
                    # tokens outside any chunk fall back to the bare word;
                    # otherwise start from the chunk's full string
                    if ev_label_data.token.chunk is None:
                        predicted_phrase = ev_label_data.token.word
                    else:
                        c_i = col.chunks.index(ev_label_data.token.chunk)
                        predicted_phrase = ev_label_data.token.chunk.string

                        # single-token chunks get extended with the following chunk(s)
                        if len(ev_label_data.token.chunk.tokens) == 1:
                            next_tok_i = 1 + col.tokens.index(
                                ev_label_data.token.chunk.tokens[-1])
                            next_tok = col.tokens[next_tok_i].word

                            if next_tok == "(":
                                predicted_phrase += " ("
                            elif next_tok == ",":
                                predicted_phrase += ","

                            predicted_phrase = predicted_phrase + " {}".format(
                                col.chunks[c_i + 1].string)
                            if col.chunks[c_i + 1].string == "with":
                                predicted_phrase = predicted_phrase + " {}".format(
                                    col.chunks[c_i + 2].string)

                            if next_tok == "(":
                                predicted_phrase += " )"

                        # for the P label, append the next two chunks as well
                        elif ev_label == tu.EvLabel.P:
                            c_i = col.chunks.index(ev_label_data.token.chunk)
                            predicted_phrase = predicted_phrase + " {} {}".format(
                                col.chunks[c_i + 1].string,
                                col.chunks[c_i + 2].string)

                    print("Predicted: ", ev_label.name, predicted_phrase,
                          " --- True Label: ", true_ev_label_data.word)

                    predicted_phrases[ev_label.value] = predicted_phrase
                    phrase_lowered = predicted_phrase.lower()
                    if true_ev_label_data.word in phrase_lowered or \
                            (true_ev_label_data.word == "iop" and
                             "pressure" in phrase_lowered) or \
                            (true_ev_label_data.word == "pressure" and
                             "iop" in phrase_lowered):
                        precisions[ev_label.value] += 1

            loss = np.round(loss, 4)
            print("loss for this paper is: ", loss)
            test_result = {
                "soup": soup,
                "paper_path": paper_paths[paper_i],
                "token_collection": col,
                "true_label_assignment": col.ev_labels,
                "predicted_label_assignment": label_assignment,
                "predicted_phrases": predicted_phrases,
                "feature_matrix": feature_matrix,
                "loss": loss
            }
            test_results[paper_i] = test_result

        total_loss = np.sum(losses)
        print("\n\n---------------")
        precisions = [np.round(p / paper_count, 4) for p in precisions]
        print("Average precisions for this run is: \nA1:{}\t A2:{}\t R1:"
              "{}\t R2:{}\t OC:{}\t P:{}".format(precisions[0], precisions[1],
                                                 precisions[2], precisions[3],
                                                 precisions[4], precisions[5]))
        dist_loss = np.round(total_loss, 4)
        print("total loss is: ", dist_loss)
        print("average loss is: ", np.round(dist_loss / paper_count, 4))
        self.last_total_loss = dist_loss
        self.last_precisions = precisions
        self.last_test_results = test_results
        return total_loss, precisions
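
A minimal end-to-end sketch of how train and test are meant to be chained (the corpus directory and the 80/20 split are illustrative assumptions, not part of the original code):

    classifier = clf.Classifier()
    paths = util.get_paper_paths("data/xml")   # hypothetical corpus directory
    split = int(0.8 * len(paths))
    classifier.train(paths[:split])            # fit on the first 80% of papers
    total_loss, precisions = classifier.test(paths[split:])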
Example #4
    def setUp(self):
        self.good = self.paper_soups[0]
        self.good_col = tu.TokenCollection(self.good)
        self.bad = self.paper_soups[1]
        self.bad_col = tu.TokenCollection(self.bad)
Example #5
    def process_word(word_i, word):
        # look up the UMLS semantic classes for a word, consulting the cache first
        cached_classes = umls_cache.get(word)
        if cached_classes is not None:
            return word, cached_classes

        classes = util.get_umls_classes(word)
        return word, classes

    paper_count = len(paper_paths)
    for i in range(paper_count):
        soup = util.parse_paper(paper_paths[i])

        print('---- Paper #{}/{} [{}]'.format(i + 1, paper_count,
                                              soup.pmid.text))

        col = tu.TokenCollection(soup)
        col.build_tokens(umls_cache=True)

        print('Total tokens: {}'.format(len(col.tokens)))

        # look up all token words with 4 parallel joblib workers
        result = Parallel(n_jobs=4)(delayed(process_word)(i, token.word)
                                    for i, token in enumerate(col.tokens))

        for word, classes in result:
            umls_cache.set(word, classes)

        umls_cache.save()
        print('\n\n')

    print('Done!')
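
The snippet assumes a umls_cache object exposing get, set and save. A dictionary-backed stand-in consistent with that interface might look like the following (purely illustrative; the file name and JSON format are assumptions, not the project's actual cache):

    import json
    import os

    class UmlsCache:
        def __init__(self, path="umls_cache.json"):  # hypothetical cache file
            self.path = path
            self.data = {}
            if os.path.exists(path):
                with open(path) as f:
                    self.data = json.load(f)

        def get(self, word):
            # returns None on a cache miss, matching the check in process_word
            return self.data.get(word)

        def set(self, word, classes):
            self.data[word] = classes

        def save(self):
            with open(self.path, "w") as f:
                json.dump(self.data, f)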