Example #1
0
    def setup_class(cls):
        """Build a two-part sample dataset with per-token predicted labels."""
        cls.dataset = Dataset()

        # Part 1: sentence with a DNA-level (c.A100G) and a protein-level
        # (p.V100Q) mutation mention.
        token_spec = [
            ('some', 0), ('text', 5), ('c', 10), ('.', 11), ('A', 12),
            ('100', 13), ('G', 16), ('p', 18), ('.', 19), ('V', 20),
            ('100', 21), ('Q', 24), ('some', 26), ('text', 31)
        ]
        first_part = Part('some text c.A100G p.V100Q some text')
        first_part.sentences = [[Token(text, offset)
                                 for text, offset in token_spec]]

        expected = [
            'O', 'O', 'B', 'I', 'I', 'I', 'E', 'A', 'I', 'I', 'I', 'E', 'O',
            'O'
        ]
        for token, value in zip(first_part.sentences[0], expected):
            token.predicted_labels = [Label(value)]

        cls.dataset.documents['doc_1'] = Document()
        cls.dataset.documents['doc_1'].parts['p1'] = first_part

        # Part 2: edge case where the mention (A927B) follows the word 'DNA'.
        token_spec = [
            ('test', 0), ('edge', 5), ('case', 10), ('DNA', 15),
            ('A', 19), ('927', 20), ('B', 23), ('test', 25)
        ]
        second_part = Part('test edge case DNA A927B test')
        second_part.sentences = [[Token(text, offset)
                                  for text, offset in token_spec]]

        expected = ['O', 'O', 'O', 'O', 'M', 'P', 'M', 'O']
        for token, value in zip(second_part.sentences[0], expected):
            token.predicted_labels = [Label(value)]

        cls.dataset.documents['doc_1'].parts['p2'] = second_part
Example #2
0
    def tag(data, model_file, class_id):
        """
        Tag every sentence of `data` with a CRF model loaded from `model_file`
        and materialise the predictions as mention-level annotations.

        Deprecated: use the non-static `annotate` instead.

        :type data: nalaf.structures.data.Dataset
        :type model_file: str
        :type class_id: str
        """
        # FIX: the docstring used to sit *after* this warn() call, which made
        # it an inert string expression (PEP 257: a docstring must be the
        # first statement, otherwise __doc__ is None).
        warnings.warn('Use non-static `annotate` instead', DeprecationWarning)

        tagger = pycrfsuite.Tagger()

        try:
            tagger.open(model_file)

            for sentence in data.sentences():
                # Predict one label per token for the whole sentence at once.
                labels = tagger.tag(
                    pycrfsuite.ItemSequence(token.features
                                            for token in sentence))

                for token_index, label in enumerate(labels):
                    # Store the label together with its marginal probability.
                    sentence[token_index].predicted_labels = [
                        Label(label, tagger.marginal(label, token_index))
                    ]

            # Aggregate token-level labels into mention-level annotations.
            data.form_predicted_annotations(class_id)

        finally:
            # Always release the model handle, even when tagging fails.
            tagger.close()
Example #3
0
    def read_predictions(self, dataset, class_id, prediction_file='output.txt'):
        """
        :type dataset: nalaf.structures.data.Dataset

        Read the CRF output file and attach a predicted label (with its
        marginal probability) to every token of `dataset`, then populate the
        mention-level predictions via form_predicted_annotations().

        Expected file format, one token per line:
            * [predicted label]:[marginal probability]
            * followed by a blank line marking the end of each sentence

        IMPORTANT NOTE:
        Assumes test() was previously called with the 'i' option included, on
        the very same dataset object used to create the test file.

        For example first we would call:
            * crf.create_input_file(dataset=test, mode='test')
            * crf.test(options='-m example_entity_model -i test > output.txt')
        Then we would call:
            * crf.read_predictions(dataset=test)
        """
        # NOTE(review): changes the process-wide working directory as a side
        # effect; other code may rely on this, so it is kept as-is.
        os.chdir(self.directory)

        with open(prediction_file) as predictions:
            for sentence in dataset.sentences():
                for token in sentence:
                    value, probability = predictions.readline().split(':')
                    token.predicted_labels = [Label(value, float(probability))]

                # Consume the blank line separating sentences.
                predictions.readline()

        # Populate the mention-level predictions from the token labels.
        dataset.form_predicted_annotations(class_id)
Example #4
0
    def annotate(self, corpus, class_id):
        """
        Tag every sentence of `corpus` and materialise the predictions.

        :type corpus: nalaf.structures.data.Dataset
        :type class_id: str ~ to annotate with
        """
        for sentence in corpus.sentences():
            # One CRF prediction per token for the whole sentence at once.
            features = pycrfsuite.ItemSequence(token.features for token in sentence)
            predicted = self.tagger.tag(features)

            for position, value in enumerate(predicted):
                probability = self.tagger.marginal(value, position)
                sentence[position].predicted_labels = [Label(value, probability)]

        # Aggregate token-level labels into mention-level annotations.
        corpus.form_predicted_annotations(class_id)
Example #5
0
    def label(self, dataset):
        """
        Assign an original label to every token: 'O' by default, or
        'I-<class_id>' when the token lies fully inside an annotation.

        :type dataset: nalaf.structures.data.Dataset
        """
        for part in dataset.parts():
            for sentence in part.sentences:
                for token in sentence:
                    # Default: outside any annotation.
                    token.original_labels = [Label('O')]

                    token_start, token_end = token.start, token.end
                    # No early exit: when spans overlap, the last matching
                    # annotation wins (same as the original behaviour).
                    for annotation in part.annotations:
                        ann_start = annotation.offset
                        ann_end = annotation.offset + len(annotation.text)
                        if ann_start <= token_start < token_end <= ann_end:
                            token.original_labels[0].value = 'I-{}'.format(
                                annotation.class_id)
Example #6
0
    def annotate(self, corpus, class_id):
        """
        Tag every sentence of `corpus` and materialise the predictions.

        :type corpus: nalaf.structures.data.Dataset
        :type class_id: str ~ to annotate with

        :raises Exception: when assigning a predicted label fails
            (historically observed as a multi-threading problem).
        """
        for sentence in corpus.sentences():
            labels = self.tagger.tag(pycrfsuite.ItemSequence(token.features for token in sentence))

            for token_index in range(len(sentence)):
                label = labels[token_index]
                try:
                    sentence[token_index].predicted_labels = [Label(label, self.tagger.marginal(label, token_index))]
                except Exception as e:
                    # FIX: corrected "assining" typo and chained the original
                    # exception (`from e`) so its traceback is preserved
                    # instead of being stuffed into the args tuple.
                    raise Exception("Exception when assigning the predicted labels; likely a multi-thread problem") from e

        corpus.form_predicted_annotations(class_id)
Example #7
0
    def label(self, dataset):
        """
        :type dataset: nalaf.structures.data.Dataset

        Assigns per-token original labels for mutation annotations using an
        alternating 'W'/'M' scheme, then post-processes each sentence so that
        any 'P I P' label run becomes 'P P P'.
        """
        for part in dataset.parts():
            previous_token = None
            for sentence in part.sentences:
                # Alternation state; flips between 'W' and 'M' while walking
                # through the tokens of a single mutation annotation.
                alternate = 'W'
                for token in sentence:
                    # Default label: outside any annotation.
                    token.original_labels = [Label('O')]

                    for ann in part.annotations:
                        start = ann.offset
                        end = ann.offset + len(ann.text)
                        # Token starts exactly at, or falls inside, the span.
                        if start == token.start or start < token.start < end:
                            if ann.class_id == self.mut_class_id:
                                # _match_regex_label presumably assigns the
                                # token's label (the '*' check below implies
                                # it may leave a temporary '*') — the helper
                                # is not visible here; confirm in its source.
                                self._match_regex_label(previous_token, token)
                                previous_token = token

                                # replace temporary label with W or M
                                if token.original_labels[0].value == '*':
                                    token.original_labels[0].value = alternate
                                    alternate = 'W' if alternate == 'M' else 'M'
                                # reset the alternation to W since we reached end
                                if token.end == end:
                                    alternate = 'W'
                                # First matching mutation annotation wins.
                                break

                # iterate a sliding window of 3
                # when you find 'P I P' labels replace them with 'P P P'
                for previous, current, next in zip(sentence, sentence[1:],
                                                   sentence[2:]):
                    if previous.original_labels[
                            0].value == 'P' and next.original_labels[
                                0].value == 'P':
                        if current.original_labels[0].value == 'I':
                            current.original_labels[0].value = 'P'