Example #1
def __init__(self):
    self.model = Model('model.pkl')
    self.path = '../i2b2_data/data/'
    self.identifiers = {
        'DOS': 'do',
        'UNIT': 'do',
        'FREQ': 'f',
        'PER': 'du'
    }
    label_files = os.listdir(self.path + 'annotations_ground_truth/pool/')
    self.label_file_dict = {
        name.split('.')[0]: name
        for name in label_files
    }
Example #2
def view_tree(sentence):
    model = Model('../model.pkl')
    ne_labels = model.predict(sentence)
    sent = nltk.word_tokenize(sentence)
    pos = nltk.pos_tag(sent)

    sent = list(zip([x[0] for x in pos], [x[1] for x in pos], ne_labels[0]))
    # sent = convert_to_IOB(sent)
    print(sent)
    text = ''
    for t, p, n in sent:
        text += t + ' ' + p + ' ' + n[1] + '\n'

    tree = nltk.chunk.conllstr2tree(
        text, chunk_types=['DOS', 'UNIT', 'WHO', 'O', 'FREQ', 'PER'])
    tree.draw()
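
A minimal usage sketch for view_tree, assuming a trained model exists at '../model.pkl' and the NLTK tokenizer/tagger data is installed; the sentence is hypothetical:

# hypothetical input; opens an NLTK window with the chunk tree
view_tree('Take 2 tablets twice daily for 7 days .')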
Example #3
    def __init__(self, parent=None):
        super(App, self).__init__(parent)
        self.setupUi(self)

        self.model = Model()
        self.trainer = Trainer()

        self.pushButton.clicked.connect(self.__extract__)
        self.pushButton_2.clicked.connect(self.__load_text__)
        self.pushButton_3.clicked.connect(self.__train_model__)
        self.pushButton_4.clicked.connect(self.__load_model__)

        self.css = '''
        label {
            font-style: normal;
            padding-right: 4px;
        }
        '''
        self.text_doc = QtGui.QTextDocument()
        self.text_doc.setDefaultStyleSheet(self.css)
        self.text_doc.setHtml('<body></body>')
        self.textEdit.setDocument(self.text_doc)
        self.color_dict = {
            'O': '#FFFFFF',
            'DOS': '#EE964B',
            'UNIT': '#F95738',
            'WHO': '#8F78AD',
            'FREQ': '#D4BA6A',
            'DUR': '#BBCD67'
        }

        self.label.setText(f"<html><head/><body><p><span style=\" background-color:"
                           f"{self.color_dict['DOS']};\">Dosage</span></p></body></html>")
        self.label_2.setText(f"<html><head/><body><p><span style=\" background-color:"
                             f"{self.color_dict['UNIT']};\">Unit</span></p></body></html>")
        self.label_3.setText(f"<html><head/><body><p><span style=\" background-color:"
                             f"{self.color_dict['WHO']};\">Who</span></p></body></html>")
        self.label_4.setText(f"<html><head/><body><p><span style=\" background-color:"
                             f"{self.color_dict['FREQ']};\">Frequency</span></p></body></html>")
        self.label_5.setText(f"<html><head/><body><p><span style=\" background-color:"
                             f"{self.color_dict['DUR']};\">Period</span></p></body></html>")
Example #4
def convert_to_ne_tree(sentence):
    """takes a string sentence as input and returns a tree structure
    with named entities grouped in subtrees

    returns a bracket notation tree
    """
    model = Model()
    ne_labels = model.predict(sentence)
    sent = nltk.word_tokenize(sentence)
    pos = nltk.pos_tag(sent)

    sent = list(zip([x[0] for x in pos], [x[1] for x in pos], ne_labels))
    sent = convert_to_IOB(sent)

    text = ''
    for t, p, n in sent:
        text += t + ' ' + p + ' ' + n + '\n'

    tree = nltk.chunk.conllstr2tree(
        text, chunk_types=['DOS', 'UNIT', 'WHO', 'O', 'FREQ', 'PER'])

    return tree
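
A brief usage sketch for convert_to_ne_tree, assuming Model() can locate its default model file and convert_to_IOB is in scope; the sentence is made up:

tree = convert_to_ne_tree('Take 2 tablets twice daily .')
print(tree)  # bracket-notation tree with entity tokens grouped in subtrees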
Example #5
class App(QtWidgets.QMainWindow, design.Ui_MainWindow):
    """
    MVC controller.
    """
    def __init__(self, parent=None):
        super(App, self).__init__(parent)
        self.setupUi(self)

        self.model = Model()
        self.trainer = Trainer()

        self.pushButton.clicked.connect(self.__extract__)
        self.pushButton_2.clicked.connect(self.__load_text__)
        self.pushButton_3.clicked.connect(self.__train_model__)
        self.pushButton_4.clicked.connect(self.__load_model__)

        self.css = '''
        label {
            font-style: normal;
            padding-right: 4px;
        }
        '''
        self.text_doc = QtGui.QTextDocument()
        self.text_doc.setDefaultStyleSheet(self.css)
        self.text_doc.setHtml('<body></body>')
        self.textEdit.setDocument(self.text_doc)
        self.color_dict = {
            'O': '#FFFFFF',
            'DOS': '#EE964B',
            'UNIT': '#F95738',
            'WHO': '#8F78AD',
            'FREQ': '#D4BA6A',
            'DUR': '#BBCD67'
        }

        self.label.setText(f"<html><head/><body><p><span style=\" background-color:"
                           f"{self.color_dict['DOS']};\">Dosage</span></p></body></html>")
        self.label_2.setText(f"<html><head/><body><p><span style=\" background-color:"
                             f"{self.color_dict['UNIT']};\">Unit</span></p></body></html>")
        self.label_3.setText(f"<html><head/><body><p><span style=\" background-color:"
                             f"{self.color_dict['WHO']};\">Who</span></p></body></html>")
        self.label_4.setText(f"<html><head/><body><p><span style=\" background-color:"
                             f"{self.color_dict['FREQ']};\">Frequency</span></p></body></html>")
        self.label_5.setText(f"<html><head/><body><p><span style=\" background-color:"
                             f"{self.color_dict['DUR']};\">Period</span></p></body></html>")

    def __load_text__(self):
        file_path = QtWidgets.QFileDialog.getOpenFileName(
            self, 'Open file', '~', "All files *")[0]
        if file_path:
            with open(file_path) as file:
                text = file.read()

            self.text_doc.setHtml('<body>' + text + '</body>')

    def __train_finished__(self, value):
        print('received result: ' + str(value))
        self.pushButton_3.setText("Train")
        self.pushButton_3.setEnabled(True)

    def __train_model__(self):
        def __train_in_thread__(data_set):
            process = subprocess.Popen(['python', 'model/crf_trainer.py', data_set],
                                       stdout=subprocess.PIPE)
            # communicate() waits for the trainer process to finish
            out, _ = process.communicate()
            # note: touching widgets from a worker thread is unsafe in Qt;
            # a signal/slot connection would be more robust here
            self.__train_finished__(out)

        data_set = QtWidgets.QFileDialog.getOpenFileName(
            self, 'Select data set', '~', "*.tsv")[0]
        if not data_set:
            return

        thread = threading.Thread(target=__train_in_thread__, args=(data_set,))
        thread.start()

        self.pushButton_3.setText("Training...")
        self.pushButton_3.setEnabled(False)

    def __load_model__(self):
        model_file = QtWidgets.QFileDialog.getOpenFileName(
            self, 'Select model', '~', "*.pkl")[0]
        if model_file:
            self.model.load(model_file)

    def __extract__(self):
        text = self.textEdit.toPlainText()

        parsed_sentences = self.model.predict(text)
        self.textEdit.clear()

        labeled_text = []
        for sentence in parsed_sentences:
            labeled_text += sentence

        self.textEdit.setDocument(self.__get_rich_text(labeled_text))

    def __get_rich_text(self, parsed_text):
        rich_text = '<body>'
        length = len(parsed_text)
        for i, (word, label) in enumerate(parsed_text):
            if label == 'O':
                rich_text += f'<label>{word}</label>'
            else:
                label = label.split('-')[1]
                rich_text += f'<nobr><label style="background-color:' \
                             f'{self.color_dict[label]}">{word}</label></nobr>'

            # keep the final full stop attached to the last word
            if i < length - 2:
                rich_text += '<label> </label>'

        rich_text += '</body>'
        self.text_doc.setHtml(rich_text)
        return self.text_doc
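
App is the Qt entry point of the tool; a minimal launch sketch, assuming PyQt5 and the pyuic-generated design module imported by this file:

import sys
from PyQt5 import QtWidgets

if __name__ == '__main__':
    qt_app = QtWidgets.QApplication(sys.argv)  # one QApplication per process
    window = App()
    window.show()
    sys.exit(qt_app.exec_())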
Example #6
def __init__(self, model_file='model/model.pkl'):
    self.model = Model()
Example #7
class Trainer:
    """
    Generates a CRF model given a data set of labeled words.
    """
    def __init__(self, model_file='model/model.pkl'):
        self.model = Model()

    def generate_model(self, data_set):
        """
        Generates a CRF model given the data set.
        It saves the model to disk with the name 'model.pkl'.
        :param data_set: Path to the labeled data set.
        :return: Performance results.
        """
        x_train, y_train, x_test, y_test = self.gen_test_train(data_set)

        results = self.gen_model(x_train, y_train, x_test, y_test)
        return results

    def gen_model(self, x_train, y_train, x_test, y_test):

        # strip the IOB prefixes ('B-', 'I-', 'O-') so the model is trained
        # and scored on entity types only
        for i in range(len(y_train)):
            for j in range(len(y_train[i])):
                for prefix in ('B-', 'O-', 'I-'):
                    y_train[i][j] = y_train[i][j].replace(prefix, '')

        for i in range(len(y_test)):
            for j in range(len(y_test[i])):
                for prefix in ('B-', 'O-', 'I-'):
                    y_test[i][j] = y_test[i][j].replace(prefix, '')

        labels = ['DOS', 'UNIT', 'FREQ', 'DUR', 'WHO']
        # labels = ['O-DOS', 'B-DOS', 'I-UNIT', 'B-UNIT', 'O-UNIT', 'I-FREQ', 'B-FREQ', 'O-FREQ', 'I-DUR', 'B-DUR', 'O-DUR', 'I-WHO', 'B-WHO', 'O-WHO']
        # labels = ['m', 'r', 'f', 'do', 'du', 'mo']
        crf = sklearn_crfsuite.CRF(algorithm='lbfgs',
                                   max_iterations=100,
                                   all_possible_transitions=True)
        params_space = {
            'c1': scipy.stats.expon(scale=0.5),
            'c2': scipy.stats.expon(scale=0.05),
        }
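        # c1/c2 are CRFsuite's L1/L2 regularisation weights; exponential
        # priors are a common choice for randomized search over them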

        # use the same metric for evaluation
        f1_scorer = make_scorer(metrics.flat_f1_score,
                                average='weighted',
                                labels=labels)

        # search
        rand_search = RandomizedSearchCV(crf,
                                         params_space,
                                         cv=3,
                                         verbose=1,
                                         n_jobs=-1,
                                         n_iter=50,
                                         scoring=f1_scorer)
        rand_search.fit(x_train, y_train)

        crf = rand_search.best_estimator_

        y_prediction = crf.predict(x_test)

        # fix the label order so the report columns are stable
        sorted_labels = sorted(labels, key=lambda name: (name[1:], name[0]))

        joblib.dump(crf, 'model.pkl')

        precision = metrics.flat_precision_score(y_test,
                                                 y_prediction,
                                                 labels=sorted_labels,
                                                 average='micro')
        recall = metrics.flat_recall_score(y_test,
                                           y_prediction,
                                           labels=sorted_labels,
                                           average='micro')
        f1 = metrics.flat_f1_score(y_test,
                                   y_prediction,
                                   labels=sorted_labels,
                                   average='micro')

        print('MICRO')
        print(precision, recall, f1)

        precision = metrics.flat_precision_score(y_test,
                                                 y_prediction,
                                                 labels=sorted_labels,
                                                 average='macro')
        recall = metrics.flat_recall_score(y_test,
                                           y_prediction,
                                           labels=sorted_labels,
                                           average='macro')
        f1 = metrics.flat_f1_score(y_test,
                                   y_prediction,
                                   labels=sorted_labels,
                                   average='macro')

        print('MACRO')
        print(precision, recall, f1)

        return metrics.flat_classification_report(y_test,
                                                  y_prediction,
                                                  labels=sorted_labels,
                                                  digits=3)

    def validate_performance(self, test_set):
        sentences = self.__load_corpus__(test_set)

        y_test = [self.model.sentence2labels(s) for s in sentences]

        y_prediction = []
        for i, sent in enumerate(sentences):
            new_sent = ' '.join([word[0] for word in sent])
            prediction = self.model.predict(new_sent)
            new_prediction = []
            if len(prediction) > 1:
                for p in prediction:
                    new_prediction += [p1 for p1 in p]
                # print(prediction)
                # print(new_prediction)

                prediction = new_prediction
            else:
                prediction = prediction[0]

            try:
                pred = [w[1] for w in prediction]
            except Exception:
                print(prediction)
                return

            # if len(pred) != len(y_test[i]):
            #     print(sent)
            #     print(new_sent)
            #     print(y_test[i])
            #     print(len(y_test[i]))
            #     print(pred)
            #     print(len(pred))

            y_prediction.append(pred)

        # full IOB label set, unused since the prefixes are stripped below:
        # labels = ['O-DOS', 'B-DOS', 'I-UNIT', 'B-UNIT', 'O-UNIT', 'I-FREQ', 'B-FREQ',
        #           'O-FREQ', 'I-DUR', 'B-DUR', 'O-DUR', 'I-WHO', 'B-WHO', 'O-WHO']

        # strip IOB prefixes so predictions and gold labels are compared on
        # entity types only
        for i in range(len(y_prediction)):
            for j in range(len(y_prediction[i])):
                for prefix in ('B-', 'O-', 'I-'):
                    y_prediction[i][j] = y_prediction[i][j].replace(prefix, '')

        for i in range(len(y_test)):
            for j in range(len(y_test[i])):
                for prefix in ('B-', 'O-', 'I-'):
                    y_test[i][j] = y_test[i][j].replace(prefix, '')

        labels = ['DOS', 'UNIT', 'FREQ', 'DUR', 'WHO']

        # labels = ['DOS', 'UNIT', 'WHO', 'DUR', 'FREQ']

        sorted_labels = sorted(labels, key=lambda name: (name[1:], name[0]))

        precision = metrics.flat_precision_score(y_test,
                                                 y_prediction,
                                                 labels=sorted_labels,
                                                 average='micro')
        recall = metrics.flat_recall_score(y_test,
                                           y_prediction,
                                           labels=sorted_labels,
                                           average='micro')
        f1 = metrics.flat_f1_score(y_test,
                                   y_prediction,
                                   labels=sorted_labels,
                                   average='micro')

        print('MICRO')
        print(precision, recall, f1)

        precision = metrics.flat_precision_score(y_test,
                                                 y_prediction,
                                                 labels=sorted_labels,
                                                 average='macro')
        recall = metrics.flat_recall_score(y_test,
                                           y_prediction,
                                           labels=sorted_labels,
                                           average='macro')
        f1 = metrics.flat_f1_score(y_test,
                                   y_prediction,
                                   labels=sorted_labels,
                                   average='macro')

        print('MACRO')
        print(precision, recall, f1)

        print(
            metrics.flat_classification_report(y_test,
                                               y_prediction,
                                               labels=sorted_labels,
                                               digits=3))

    def gen_test_train(self, corpus_file):
        """
        Given the corpus file, it will generate the train and test sets
            whose cardinality is in a 90%/10% ratio.
        """
        sentences = self.__load_corpus__(corpus_file)

        test_number = int(len(sentences) * 0.1)
        test = random.sample(sentences, test_number)
        train = [sent for sent in sentences if sent not in test]

        x_train = [self.model.sentence2features(s) for s in train]
        y_train = [self.model.sentence2labels(s) for s in train]

        x_test = [self.model.sentence2features(s) for s in test]
        y_test = [self.model.sentence2labels(s) for s in test]

        return x_train, y_train, x_test, y_test

    @staticmethod
    def __write_tsv__(sentences, name):
        with open(name, 'w') as file:
            for sent in sentences:
                for line in sent:
                    file.write(line[0] + '\t' + line[1] + '\t' + line[2] +
                               '\n')
                file.write('\n')

    @staticmethod
    def __load_corpus__(corpus_file):
        with open(corpus_file, newline='', encoding='utf-8') as file:
            data = list(csv.reader(file, delimiter='\t'))

        sentences = []
        sent = []

        for line in data:
            if line == [] or line[0] == '':
                sentences.append(sent)
                sent = []
            else:
                sent.append(line)

        sentences.append(sent)

        # for sent in sentences:
        #     if sent and sent[0] == '0' and sent[1] == '0':
        #         sent[0] = '.'
        #         sent[1] = '.'

        return sentences
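
A usage sketch for Trainer, assuming a tab-separated corpus of word/POS/label lines (the path below is an example):

trainer = Trainer()
report = trainer.generate_model('corpus_i2b2.tsv')  # also writes model.pkl
print(report)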
Example #8
class I2B2Extractor:
    def __init__(self):
        self.model = Model('model.pkl')
        self.path = '../i2b2_data/data/'
        self.identifiers = {
            'DOS': 'do',
            'UNIT': 'do',
            'FREQ': 'f',
            'PER': 'du'
        }
        label_files = os.listdir(self.path + 'annotations_ground_truth/pool/')
        self.label_file_dict = {
            name.split('.')[0]: name
            for name in label_files
        }

    def parse_file(self, file):
        with open(file) as f:
            lines = f.readlines()
        for i, line in enumerate(lines):
            self.label_line(line, i + 1)

    def extract_all(self):
        training_set_folders = [
            self.path + 'training.sets.released/' + str(ind)
            for ind in range(1, 11)
        ]

        for training_set in training_set_folders:
            for file in os.listdir(training_set):
                if file in self.label_file_dict.keys():
                    self.parse_file(training_set + '/' + file)

    def write_output(self):
        pass

    def label_line(self, line, index):
        prediction = self.model.predict(line)
        if len(prediction) == 0:
            return
        prediction = prediction[0]

        print(prediction)
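        # i2b2 2009 medication-challenge field codes: m=medication, do=dosage,
        # mo=mode, f=frequency, du=duration, r=reason, ln=list/narrative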
        entities = {
            'm': ['nm'],
            'do': [],
            'mo': ['nm'],
            'f': [],
            'du': [],
            'r': ['nm'],
            'ln': ['nm']
        }
        for i, (word, entity) in enumerate(prediction):
            if entity not in self.identifiers.keys():
                continue
            entities[self.identifiers[entity]].append(
                (word, str(index) + ':' + str(i)))
        print(entities.items())
        print()

    def generate_labeled_file(self, file):
        with open(self.path + 'train.test.released.8.17.09/' + file) as f:
            lines = f.readlines()

        words1 = []
        locations = []
        for i, line in enumerate(lines):
            words = line.split(' ')
            words = [w.replace('\n', '') for w in words if w not in ['\n', '']]
            locations += [(i + 1, n) for n in range(len(words))]
            words1 += words

        text = ' '.join(words1)
        sentences = nltk.sent_tokenize(text)

        k = 0
        new_sentences = []
        for i, sent in enumerate(sentences):
            new_sent = sent.split(' ')
            j = 0

            while j < len(new_sent):
                word = new_sent[j]

                if not word == words1[k]:
                    if j == 0 and words1[k - 1].endswith(new_sent[0]):
                        del new_sent[0]
                        j -= 1
                        k -= 1

                    if words1[k].startswith(word):

                        if len(new_sent) > j + 1:
                            if words1[k].endswith(new_sent[j + 1]):
                                new_sent[j] = words1[k]
                                del new_sent[j + 1]
                                k -= 1
                        elif words1[k].endswith(sentences[i + 1].split(' ')[0]):
                            new_sent[j] = words1[k]
                        elif (words1[k] + words1[k + 1]).endswith(
                                sentences[i + 1].split(' ')[0]):
                            new_sent[j] = words1[k]
                k += 1
                j += 1

            tmp = []
            for word in new_sent:
                if '\t' in word:
                    word = word.replace('\t', ' ')

                tmp.append(word)
            new_sentences.append(tmp)

        k = 0
        labeled_sentences = []
        for sent in new_sentences:
            pos = nltk.pos_tag(sent)
            loc = locations[k:k + len(pos)]
            k += len(pos)

            labeled_sentences.append(list(zip(pos, loc)))

        labels = self.get_labels(file)

        with open('i2b2_corpus/' + file + '.tsv', 'w') as f:
            for sent in labeled_sentences:
                for tup in sent:
                    if tup[1] in labels.keys():
                        f.write(tup[0][0] + '\t' + tup[0][1] + '\t' +
                                labels[tup[1]] + '\n')
                    else:
                        f.write(tup[0][0] + '\t' + tup[0][1] + '\t' + 'O\n')
                f.write('\n')

    def get_labels(self, file):
        label_file = self.label_file_dict[file]

        with open(self.path +
                  'annotations_ground_truth/converted.noduplicates.sorted/' +
                  label_file) as f:
            lines = f.readlines()

        labels = []
        for line in lines:
            labels += line.split('||')

        result = {}
        for label in labels:
            entity = label.split('=')[0]
            if len(label.split(' ')) < 3:
                continue
            else:
                if ',' in label and '...' in label:
                    positions = label.split('\" ')[1].split(',')

                    for pos in positions:
                        position = pos.split(' ')[-2:]
                        line_no = int(position[0].split(':')[0])
                        start = int(position[0].split(':')[1])
                        stop = int(position[1].split(':')[1])
                        for i in range(start, stop + 1):
                            result.update({tuple([line_no, i]): entity})

                else:
                    position = label.split(' ')[-2:]
                    line_no = int(position[0].split(':')[0])
                    start = int(position[0].split(':')[1])
                    stop = int(position[1].split(':')[1])
                    for i in range(start, stop + 1):
                        result.update({tuple([line_no, i]): entity})

        return result

    def generate_corpus_files(self):
        training_set = self.path + 'train.test.released.8.17.09/'
        for file in os.listdir(training_set):
            if file in self.label_file_dict.keys():
                self.generate_labeled_file(file)

    def concatenate_corpus_files(self):
        files = os.listdir('i2b2_corpus/')
        print(files)

        lines = []
        for file in files:
            with open('i2b2_corpus/' + file) as f:
                lines.append(f.readlines())

        with open('corpus_i2b2.tsv', 'w') as f:
            for text in lines:
                f.writelines(text)
                f.write('\n')
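
A sketch of the corpus-generation pipeline, assuming the hard-coded i2b2 directory layout from __init__ and an existing i2b2_corpus/ output directory:

extractor = I2B2Extractor()
extractor.generate_corpus_files()     # one .tsv per annotated record
extractor.concatenate_corpus_files()  # merges them into corpus_i2b2.tsv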
Example #9
def chunk_sentence(sentence):
    model = Model('../model.pkl')
    sentence = fix_dashes_slashes(sentence)
    ne_labels = model.predict(sentence)
    sent = nltk.word_tokenize(sentence)
    pos = nltk.pos_tag(sent)

    sent = list(zip([x[0] for x in pos], [n[1] for n in ne_labels]))
    sent = convert_to_IOB(sent)

    # mark connector tokens so the grammar below can treat them specially
    for i, t in enumerate(sent):
        if t[0] in ('or', 'to', '-', '/'):
            sent[i] = (t[0], t[1] + '_' + t[0])
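    # in the grammar below, runs of identical entity tags collapse into one
    # chunk, and the *_or/_to/_-/_/ tags stay distinct so that ranges such
    # as "1 to 2 tablets" can be re-joined into a single DOSAGE chunk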

    grammar = r"""
    DOS: {<DOS.*>+}
    UNIT: {<UNIT.*>+}
    FREQ: {<FREQ.*>+}
    PER: {<PER.*>+}
    WHO: {<WHO.*>+}
    O: {<O>+}
    O_or: {<O_or>}
    O_to: {<O_to>}
    O_-: {<O_->}
    O_/: {<O_/>}
    DOSAGE: {<DOS><UNIT>?<O_.*><DOS>?<UNIT>}
    DOSAGE: {<DOS><UNIT>}
    O: {<DOS>}
    O: {<UNIT>}
    """

    cp = nltk.RegexpParser(grammar)
    print(sent)
    result = cp.parse(sent)

    for st in result.subtrees(lambda t: '_' in t.label()):
        st.set_label(st.label().split('_')[0])

    for leafPos in result.treepositions('leaves'):
        result[leafPos] = result[leafPos][0]

    res = nltk.Tree('S', [])
    i = 0
    while i < len(result):
        t = result[i]
        if t.label() == 'O':
            leaves = t.leaves()
            while t.label() == 'O':
                i += 1
                try:
                    t = result[i]
                except IndexError:
                    break
                if t.label() == 'O':
                    leaves += t.leaves()
            res.append(nltk.Tree('O', leaves))
            continue
        res.append(t)
        i += 1

    # res.draw()
    return res
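
A usage sketch for chunk_sentence, assuming '../model.pkl', fix_dashes_slashes, and convert_to_IOB are available; the sentence is hypothetical:

tree = chunk_sentence('Take 1 to 2 tablets twice daily .')
print(tree)  # adjacent O chunks merged; dose ranges grouped under DOSAGE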