def read_conllz_from_string(conllz_string):
    """Parse CoNLL-Z text and yield one ``Sentence`` per blank-line-separated block.

    The first line of every block is the id line: ``#``, tabs, newlines and
    spaces are stripped from both ends and the remainder becomes
    ``sentence.id``.  Every following non-empty line is split on tabs and
    stored through ``Sentence.write_as_row``.

    Args:
        conllz_string: The complete document as a single string.

    Yields:
        A populated ``Sentence`` for each block that contains any
        non-whitespace text; whitespace-only blocks are skipped.
    """
    for raw_sentence in conllz_string.split('\n\n'):
        if not raw_sentence.strip():
            # Nothing but whitespace between separators.
            continue

        sentence = Sentence()

        # A block can begin with a stray newline (e.g. three consecutive
        # '\n' in the input, or a document that starts with one).  Drop
        # leading whitespace so the id line is really the first line instead
        # of being mistaken for a data row.  Trailing whitespace is left
        # alone so a final row ending in an empty column keeps its tab.
        header, *body = raw_sentence.lstrip().split('\n')
        sentence.id = header.strip('#\t\n ')

        for raw_line in body:
            if not raw_line:
                # Blank line inside the block.
                continue
            sentence.write_as_row(raw_line.split('\t'))

        yield sentence
def test_write_conllz():
    """Build two identical two-row sentences and hand them to ``write_conllz``."""
    row_templates = (['char-1', 'tag-1'], ['char-2', 'tag-2'])

    corpus = []
    for sentence_id in ('SID-1', 'SID-2'):
        entry = Sentence()
        entry.id = sentence_id
        for template in row_templates:
            # Copy so every sentence receives its own list objects.
            entry.write_as_row(list(template))
        corpus.append(entry)

    write_conllz(corpus, ['corpus4.txt'])
def conllz_to_offset(sentence_data: Sentence, raise_exception=False, attr_index=0) -> Tuple[Document, bool]:
    """Decode one attribute column of a sentence via ``BILUOSequenceEncoderDecoder.to_offset``.

    Args:
        sentence_data: Sentence supplying ``word_lines``, the attribute column,
            ``meta`` and ``id``.
        raise_exception: When true, a ``TagSetDecodeError`` propagates to the
            caller instead of being reported through the returned flag.
        attr_index: Index passed to ``get_attribute_by_index`` to select the
            tag column.

    Returns:
        ``(document, failed)``.  ``failed`` is ``True`` only when decoding
        raised ``TagSetDecodeError`` and ``raise_exception`` is false; in that
        case ``document`` is a ``Document`` built from the text alone.

    Raises:
        TagSetDecodeError: If decoding fails and ``raise_exception`` is true.
    """
    biluo = BILUOSequenceEncoderDecoder()
    text = sentence_data.word_lines
    tags = sentence_data.get_attribute_by_index(attr_index)

    # Work on a copy: 'label' is popped below and must not vanish from the
    # sentence's own meta mapping.
    extra = copy.deepcopy(sentence_data.meta)

    try:
        document = biluo.to_offset(
            tags,
            text,
            label=extra.pop('label', None),
            id=sentence_data.id,
            **extra
        )
    except TagSetDecodeError:
        if raise_exception:
            raise
        # Invalid tag sequence: fall back to a document with the text only.
        return Document(text), True

    return document, False
def test_write_as_row():
    """Append one three-column row to a pre-filled ``Sentence`` and print it."""
    sentence = Sentence(['1'], ['cfv', 'dddwedwf'], ['cfv'], 4)
    sentence.write_as_row(['a', 'b', 'c'])
    print(sentence)
def test_get_attribute():
    """Print the result of ``get_attribute(1)`` on a hand-built ``Sentence``."""
    sentence = Sentence(1, ['2', 3], 3, 4)
    print(sentence.get_attribute(1))
def test_init():
    """Check that the four positional constructor arguments land on the expected attributes."""
    sentence = Sentence(1, 2, 3, 4)

    # Same order as the original chained condition; stops at the first mismatch.
    assert sentence.id == 4
    assert sentence.attribute_names == 3
    assert sentence.attribute_lines == 2
    assert sentence.word_lines == 1
sentence_id = sentence.id for index, row in enumerate(sentence.read_as_row()): if index == 0: # only write at head sentence_id = sentence_id if sentence_id else str(uuid.uuid4()) output_fd.write('{}\n'.format('\t'.join(['#', sentence_id]))) output_fd.write('{}'.format("\t".join(row))) output_fd.write('\n') output_fd.write('\n') if __name__ == "__main__": gold = "#\tSID-1\nchar-1\ttag-1\nchar-2\ttag-2\n\n#\tSID-2\nchar-1\ttag-1\nchar-2\ttag-2\n\n" "" sentence_1 = Sentence() sentence_1.id = 'SID-1' sentence_1.write_as_row(['char-1', 'tag-1']) sentence_1.write_as_row(['char-2', 'tag-2']) sentence_2 = Sentence() sentence_2.id = 'SID-2' sentence_2.write_as_row(['char-1', 'tag-1']) sentence_2.write_as_row(['char-2', 'tag-2']) with io.StringIO() as fd: write_conllz([sentence_1, sentence_2], fd) output = fd.getvalue() assert output == gold