示例#1
0
def read_conllz_from_string(conllz_string):
    """Parse CoNLL-Z formatted text and lazily yield one ``Sentence`` per block.

    Blocks are separated by an empty line (``'\\n\\n'``). Inside a block the
    first line carries the sentence id (surrounding ``#``, tab, newline and
    space characters are removed from it); every following non-empty line is
    split on tabs and appended to the sentence through ``write_as_row``.

    Args:
        conllz_string: the complete text to parse.

    Yields:
        A populated ``Sentence`` for every block that is not pure whitespace.
    """
    for raw_sentence in conllz_string.split('\n\n'):
        if not raw_sentence.strip():
            # Nothing but whitespace between two separators: no sentence here.
            continue

        # Drop surrounding newlines before splitting into lines. Otherwise a
        # block that begins with '\n' (leading blank line in the input, or
        # three or more newlines between sentences) would present an empty
        # string as its "id line" and push the real header into the rows.
        # Only newlines are removed so that significant tabs on the last row
        # (empty trailing columns) are preserved.
        header, *body = raw_sentence.strip('\n').split('\n')

        sentence = Sentence()
        sentence.id = header.strip('#\t\n ')

        for raw_line in body:
            if not raw_line:
                # Empty line inside a block carries no row.
                continue

            sentence.write_as_row(raw_line.split('\t'))

        yield sentence
def test_write_conllz():
    """Build two identical two-row sentences and pass them to ``write_conllz``."""
    row_templates = (('char-1', 'tag-1'), ('char-2', 'tag-2'))

    corpus = []
    for sentence_id in ('SID-1', 'SID-2'):
        entry = Sentence()
        entry.id = sentence_id
        for template in row_templates:
            # A fresh list per call, so no row object is shared between sentences.
            entry.write_as_row(list(template))
        corpus.append(entry)

    # NOTE(review): the second argument is a one-element list holding a file
    # name, not an open file object -- confirm this matches write_conllz.
    write_conllz(corpus, ['corpus4.txt'])
def conllz_to_offset(sentence_data: Sentence,
                     raise_exception=False,
                     attr_index=0) -> Tuple[Document, bool]:
    """Decode one tagged sentence into a ``Document`` via the BILUO decoder.

    Args:
        sentence_data: sentence supplying ``word_lines``, the tag column
            selected by ``attr_index``, ``meta`` and ``id``.
        raise_exception: when true, a ``TagSetDecodeError`` from the decoder
            propagates to the caller instead of being absorbed.
        attr_index: index passed to ``get_attribute_by_index`` to pick the
            tag sequence.

    Returns:
        ``(document, failed)``. ``failed`` is ``True`` only when decoding
        raised ``TagSetDecodeError`` and ``raise_exception`` is false; the
        document is then built from the input text alone.

    Raises:
        TagSetDecodeError: decoding failed and ``raise_exception`` is true.
    """
    biluo_decoder = BILUOSequenceEncoderDecoder()

    text = sentence_data.word_lines
    tags = sentence_data.get_attribute_by_index(attr_index)

    # Work on a copy so popping 'label' never touches the sentence's own meta.
    extra = copy.deepcopy(sentence_data.meta)

    try:
        # 'label' is taken out first so it is not passed twice through **extra.
        label = extra.pop('label', None)
        document = biluo_decoder.to_offset(tags,
                                           text,
                                           label=label,
                                           id=sentence_data.id,
                                           **extra)
    except TagSetDecodeError:
        if raise_exception:
            raise
        # Invalid tag sequence: fall back to a document built from the text only.
        return Document(text), True

    return document, False
def test_write_as_row():
    """Append one row to a ``Sentence`` built with initial data and print it."""
    sentence = Sentence(['1'], ['cfv', 'dddwedwf'], ['cfv'], 4)
    sentence.write_as_row(['a', 'b', 'c'])
    print(sentence)
def test_get_attribute():
    """Print what ``get_attribute(1)`` returns for a sample ``Sentence``."""
    sample = Sentence(1, ['2', 3], 3, 4)
    print(sample.get_attribute(1))
def test_init():
    """Check that positional constructor arguments land on the expected attributes."""
    built = Sentence(1, 2, 3, 4)

    # Checked in the same order as a short-circuiting ``and`` chain would.
    assert built.id == 4
    assert built.attribute_names == 3
    assert built.attribute_lines == 2
    assert built.word_lines == 1
示例#7
0
        sentence_id = sentence.id

        for index, row in enumerate(sentence.read_as_row()):
            if index == 0:  # only write at head
                sentence_id = sentence_id if sentence_id else str(uuid.uuid4())
                output_fd.write('{}\n'.format('\t'.join(['#', sentence_id])))

            output_fd.write('{}'.format("\t".join(row)))
            output_fd.write('\n')

        output_fd.write('\n')


if __name__ == "__main__":
    # Smoke check: serialise two small sentences into an in-memory buffer and
    # compare the text with the expected CoNLL-Z output.
    gold = "#\tSID-1\nchar-1\ttag-1\nchar-2\ttag-2\n\n#\tSID-2\nchar-1\ttag-1\nchar-2\ttag-2\n\n"

    sentences = []
    for sentence_id in ('SID-1', 'SID-2'):
        current = Sentence()
        current.id = sentence_id
        for row in (['char-1', 'tag-1'], ['char-2', 'tag-2']):
            current.write_as_row(row)
        sentences.append(current)

    with io.StringIO() as buffer:
        write_conllz(sentences, buffer)
        assert buffer.getvalue() == gold