Пример #1
0
    def test_iob1_to_bioul(self):
        """Check that IOB1-encoded tag sequences are rewritten as BIOUL."""
        # Single-token spans: each lone "I-" tag is expected to become "U-".
        assert span_utils.to_bioul(
            ["I-ORG", "O", "I-MISC", "O"], encoding="IOB1"
        ) == ["U-ORG", "O", "U-MISC", "O"]

        # Adjacent spans of the same type, separated by an explicit "B-" tag.
        assert span_utils.to_bioul(
            ["O", "I-PER", "B-PER", "I-PER", "I-PER", "B-PER"], encoding="IOB1"
        ) == ["O", "U-PER", "B-PER", "I-PER", "L-PER", "U-PER"]
Пример #2
0
    def test_iob1_to_bioul(self):
        """Convert IOB1 sequences to BIOUL and compare with the expected tags."""
        # Each entry pairs an IOB1 input with the BIOUL output it must produce.
        cases = [
            (
                ['I-ORG', 'O', 'I-MISC', 'O'],
                ['U-ORG', 'O', 'U-MISC', 'O'],
            ),
            (
                ['O', 'I-PER', 'B-PER', 'I-PER', 'I-PER', 'B-PER'],
                ['O', 'U-PER', 'B-PER', 'I-PER', 'L-PER', 'U-PER'],
            ),
        ]
        for iob1_tags, expected_bioul in cases:
            converted = span_utils.to_bioul(iob1_tags, encoding="IOB1")
            assert converted == expected_bioul
Пример #3
0
    def test_bio_to_bioul(self):
        """BIO input converts to BIOUL; a non-BIO sequence is rejected."""
        bio_tags = ['B-ORG', 'O', 'B-MISC', 'O', 'B-MISC', 'I-MISC', 'I-MISC']
        expected = ['U-ORG', 'O', 'U-MISC', 'O', 'B-MISC', 'I-MISC', 'L-MISC']
        assert span_utils.to_bioul(bio_tags, encoding="BIO") == expected

        # This sequence is in IOB format, so declaring it as BIO must raise.
        iob_tags = ['O', 'I-PER', 'B-PER', 'I-PER', 'I-PER', 'B-PER']
        with self.assertRaises(span_utils.InvalidTagSequence):
            span_utils.to_bioul(iob_tags, encoding="BIO")
Пример #4
0
    def test_bio_to_bioul(self):
        """BIO input converts to BIOUL; a non-BIO sequence is rejected."""
        converted = span_utils.to_bioul(
            ["B-ORG", "O", "B-MISC", "O", "B-MISC", "I-MISC", "I-MISC"],
            encoding="BIO",
        )
        assert converted == ["U-ORG", "O", "U-MISC", "O", "B-MISC", "I-MISC", "L-MISC"]

        # This sequence is in IOB format, so declaring it as BIO must raise.
        iob_tags = ["O", "I-PER", "B-PER", "I-PER", "I-PER", "B-PER"]
        with pytest.raises(span_utils.InvalidTagSequence):
            span_utils.to_bioul(iob_tags, encoding="BIO")
Пример #5
0
    def test_iob1_to_bioul(self):
        """Convert IOB1 sequences to BIOUL and compare with the expected tags."""
        # (IOB1 input, expected BIOUL output) pairs.
        cases = (
            (
                [u'I-ORG', u'O', u'I-MISC', u'O'],
                [u'U-ORG', u'O', u'U-MISC', u'O'],
            ),
            (
                [u'O', u'I-PER', u'B-PER', u'I-PER', u'I-PER', u'B-PER'],
                [u'O', u'U-PER', u'B-PER', u'I-PER', u'L-PER', u'U-PER'],
            ),
        )
        for source_tags, wanted in cases:
            assert span_utils.to_bioul(source_tags, encoding=u"IOB1") == wanted
Пример #6
0
    def test_bio_to_bioul(self):
        """BIO input converts to BIOUL; a non-BIO sequence is rejected."""
        well_formed = ['B-ORG', 'O', 'B-MISC', 'O', 'B-MISC', 'I-MISC', 'I-MISC']
        result = span_utils.to_bioul(well_formed, encoding="BIO")
        assert result == ['U-ORG', 'O', 'U-MISC', 'O', 'B-MISC', 'I-MISC', 'L-MISC']

        # This sequence is in IOB format, so declaring it as BIO must raise.
        malformed = ['O', 'I-PER', 'B-PER', 'I-PER', 'I-PER', 'B-PER']
        with self.assertRaises(span_utils.InvalidTagSequence):
            span_utils.to_bioul(malformed, encoding="BIO")
    def test_decode_runs_correctly(self):
        """Decoded tag lists match per-instance lengths and are accepted as BIO."""
        batch = self.dataset.as_tensor_dict()
        readable = self.model.make_output_human_readable(self.model(**batch))
        mask_lengths = get_lengths_from_binary_sequence_mask(readable["mask"])
        instance_lengths = mask_lengths.data.tolist()

        # There is little else concrete to verify here, so compare each tag
        # list against its own instance length rather than the padded maximum.
        for tags, expected_len in zip(readable["tags"], instance_lengths):
            assert len(tags) == expected_len

            # to_bioul raises on a malformed BIO sequence, so a clean call
            # doubles as a well-formedness check.
            to_bioul(tags, encoding="BIO")
Пример #8
0
    def test_decode_runs_correctly(self):
        """Wordpiece tags match instance lengths; word-level tags are accepted as BIO."""
        batch = self.dataset.as_tensor_dict()
        readable = self.model.make_output_human_readable(self.model(**batch))
        mask_lengths = get_lengths_from_binary_sequence_mask(readable["mask"])
        instance_lengths = mask_lengths.data.tolist()

        # There is little else concrete to verify here, so compare each
        # wordpiece tag list against its own instance length rather than the
        # padded maximum.
        for wordpiece_tags, expected_len in zip(readable["wordpiece_tags"], instance_lengths):
            assert len(wordpiece_tags) == expected_len

        # to_bioul raises on a malformed tag sequence, so calling it on every
        # prediction checks that the produced sequences are well formed.
        for tags, _ in zip(readable["tags"], instance_lengths):
            to_bioul(tags, encoding="BIO")
Пример #9
0
def convert_conll2003_ner_to_bioul(filename: str, out_filename: str):
    """Convert the NER column of a CoNLL-2003 file to BIOUL tags and write it out.

    Every non-blank input line is split on whitespace; the first column is
    taken as the word and the fourth column as its IOB1 NER tag. A blank line
    ends the current sentence. Lines whose first column is ``-DOCSTART-`` are
    document markers: they end the current sentence and are not written to
    the output.

    Each output line has the form ``word tag tag tag`` (the BIOUL tag is
    repeated three times so the file keeps four columns), and every sentence
    is followed by a blank line.

    Parameters
    ----------
    filename: str
        Path of the input file in CoNLL-2003 format.
    out_filename: str
        Path of the file to write the BIOUL-tagged sentences to.

    Returns
    -------
    None

    Raises
    ------
    IndexError
        If a non-blank, non-marker line has fewer than four columns.

    """
    msg_printer = wasabi.Printer()
    lines: List[List[str]] = []
    labels: List[List[str]] = []

    with open(filename) as fp:
        lines_: List[str] = []
        labels_: List[str] = []
        for text in fp:
            columns = text.split()
            is_doc_marker = bool(columns) and columns[0] == "-DOCSTART-"
            if columns and not is_doc_marker:
                lines_.append(columns[0])
                labels_.append(columns[3])
                continue

            # Blank line or document marker: close the sentence collected so
            # far. The marker must be tested before treating the line as a
            # token, otherwise it is emitted as a one-word sentence.
            if lines_:
                lines.append(lines_)
                labels.append(labels_)
                lines_ = []
                labels_ = []

        # Keep the last sentence when the file has no trailing blank line.
        if lines_:
            lines.append(lines_)
            labels.append(labels_)

    bilou_tags = [to_bioul(tag_sequence=label, encoding="IOB1") for label in labels]

    with msg_printer.loading(f"writing BILOU tags for {filename}"):
        with open(out_filename, "w") as fp:
            for line, bilou_tags_ in zip(lines, bilou_tags):
                assert len(line) == len(bilou_tags_)
                for word, tag in zip(line, bilou_tags_):
                    fp.write(" ".join([word, tag, tag, tag]))
                    fp.write("\n")

                fp.write("\n")
    msg_printer.good(f"Finished writing BILOU tags for {filename}")
Пример #10
0
def spans_to_bio_tags(spans, length):
    """Lay ``(start, end, label)`` spans onto ``length`` positions and return BIOUL tags.

    Positions not covered by any span are tagged ``'O'``. A span is skipped
    when ``is_x_in_y`` reports it as lying inside some other span that
    ``is_same_span`` does not consider identical to it. For the remaining
    spans ``start`` gets a ``B-`` tag and positions ``start + 1`` up to, but
    not including, ``end`` get ``I-`` tags. The resulting BIO sequence is
    converted with ``to_bioul``.
    """
    bio_tags = ['O'] * length
    for candidate in spans:
        nested = any(
            (not is_same_span(candidate, other)) and is_x_in_y(candidate, other)
            for other in spans
        )
        if nested:
            continue

        first, stop, label = candidate
        bio_tags[first] = 'B-' + label
        for position in range(first + 1, stop):
            bio_tags[position] = 'I-' + label

    return to_bioul(bio_tags, encoding='BIO')
Пример #11
0
    def text_to_instance(
            self,  # type: ignore
            tokens: List[Token],
            verb_label: List[int],
            parseTree: Tree,
            tags: List[str] = None,
            fout=None) -> Instance:
        """
        Build an ``Instance`` from `pre-tokenized` input plus a verb label.

        The verb label should be a one-hot binary vector, the same length as the
        tokens, indicating the position of the verb to find arguments for.

        When ``self.label_encoding`` is ``"BIOUL"``:

        * ``tags`` (if given) are converted with ``to_bioul(..., encoding="BIO")``
          and checked with ``bioul_tags_to_spans``; a failed check is only
          reported on stdout and the converted tags are still used.
        * a ``len(tokens) x len(tokens)`` matrix is built from ``parseTree`` and
          stored in the ``span_matrix`` field.

        Parameters
        ----------
        tokens:
            The tokens of the sentence.
        verb_label:
            One integer per token; the first ``1`` marks the verb. If every
            entry is ``0`` the ``verb`` metadata entry is ``None``.
        parseTree:
            Parse tree passed to ``leftMost`` / ``rightMost`` to derive the
            span matrix.
        tags:
            Optional gold tags. When truthy they are added as the ``tags``
            field and as ``gold_tags`` in the metadata.
        fout:
            Optional writable binary file object. When given, a dict with the
            parse tree and the span matrix is pickled to it. ``span_matrix``
            is ``None`` in that dict when the label encoding is not BIOUL.

        Raises
        ------
        ValueError
            If the lists derived from the left-most and right-most labelled
            trees have different lengths.
        """
        # pylint: disable=arguments-differ

        # Bound up front so the pickle dump at the end never reads an
        # unassigned local when the label encoding is not BIOUL.
        span_matrix = None

        if self.label_encoding == "BIOUL":
            # NOTE(review): tags are converted as if BIO-encoded; the original
            # author left open whether the input is BIO or IOB1 - confirm.
            if tags is not None:
                old_tags = deepcopy(tags)
                tags = to_bioul(tags, encoding="BIO")
                try:
                    # Called only for validation; the spans are not needed.
                    bioul_tags_to_spans(tags)
                except InvalidTagSequence:
                    print(f"Old tags: {old_tags}")
                    print(f"New tags: {tags}\n")

            # Create span matrix from parse tree
            leftLabelsTree = leftMost(parseTree)
            rightLabelsTree = rightMost(parseTree)

            leftList = []
            rightList = []

            addToList(leftLabelsTree, leftList)
            addToList(rightLabelsTree, rightList)

            if len(leftList) != len(rightList):
                raise ValueError(
                    f"For tree {parseTree}, leftList and rightList lengths do not match"
                )

            span_matrix = np.zeros([len(tokens), len(tokens)])

            # Presumably each pair holds the first and last token index of one
            # tree node - TODO confirm against leftMost/rightMost/addToList.
            # Pairs with equal entries are not marked.
            for leftLabel, rightLabel in zip(leftList, rightList):
                if leftLabel != rightLabel:
                    span_matrix[leftLabel, rightLabel] = 1

        fields: Dict[str, Field] = {}
        text_field = TextField(tokens, token_indexers=self._token_indexers)
        fields['tokens'] = text_field
        fields['verb_indicator'] = SequenceLabelField(verb_label, text_field)
        if self.label_encoding == "BIOUL":
            fields['span_matrix'] = ArrayField(span_matrix)

        if all(x == 0 for x in verb_label):
            verb = None
        else:
            verb = tokens[verb_label.index(1)].text
        metadata_dict = {"words": [x.text for x in tokens], "verb": verb}
        if tags:
            fields['tags'] = SequenceLabelField(tags, text_field)
            metadata_dict["gold_tags"] = tags
        fields["metadata"] = MetadataField(metadata_dict)

        if fout is not None:
            srl_dict = {"parse_tree": parseTree, "span_matrix": span_matrix}
            pickle.dump(srl_dict, fout)

        return Instance(fields)