def main(file_prefix):
    """Convert one raw data file into a ``.conllx`` file.

    Reads ``./data/raw/<file_prefix>`` line by line, strips each line, skips
    blank ones and passes the rest to ``process_one_line``. Lines for which
    ``process_one_line`` raises ``CheckFailedError`` are skipped; the sentences
    from all other lines are written, in input order, to
    ``./data/domain/<base_name>.conllx`` via ``write_conllx``.

    ``./data/error/<base_name>.error`` is opened (and truncated) for every run
    and handed to ``process_one_line`` as its logger.

    Args:
        file_prefix: Name of the input file inside ``./data/raw`` (including
            its extension). The extension is removed to build the names of
            the error log and of the output file.
    """
    base_name, _ = os.path.splitext(file_prefix)
    log_file = './data/error/{}.error'.format(base_name)

    output_lines = []

    with open('./data/raw/{}'.format(file_prefix)) as fd, \
            open(log_file, 'wt') as logger:
        for raw_line in fd:
            line = raw_line.strip()
            if not line:
                continue

            try:
                # Only the sentence is needed here; the first element of the
                # returned pair is intentionally discarded.
                _seq, sentence = process_one_line(line, logger)
            except CheckFailedError:
                # NOTE(review): presumably process_one_line has already
                # reported the failure to `logger` - confirm.
                continue

            output_lines.append(sentence)

    with open('./data/domain/{}.conllx'.format(base_name), 'wt') as output_fd:
        write_conllx(output_lines, output_fd)
def test_write_conllx():
    """Smoke-test ``write_conllx`` with two small sentences.

    Builds two ``SentenceX`` objects (ids ``SID-1`` and ``SID-2``), each with
    the same two rows, and writes them to ``corpus4.txt`` in the current
    working directory. The test makes no assertions about the file content;
    it passes as long as no exception is raised.
    """
    sentences = []
    for sentence_id in ('SID-1', 'SID-2'):
        sentence = SentenceX()
        sentence.id = sentence_id
        sentence.write_as_row(['char-1', 'tag-1'])
        sentence.write_as_row(['char-2', 'tag-2'])
        sentences.append(sentence)

    # Open the output through a context manager so the handle is flushed and
    # closed deterministically instead of being leaked until garbage
    # collection.
    with open('corpus4.txt', 'w') as output_fd:
        write_conllx(sentences, output_fd)
def write_to_file(self, output_file):
    """Save the items of this object to ``output_file`` through ``write_conllx``.

    Every item yielded by iterating ``self`` is converted with
    ``offset_to_sentence``; the resulting list is then written to
    ``output_file``, which is opened in text-write mode (and therefore
    truncated if it already exists).

    Args:
        output_file: Path of the file to write.
    """
    # Convert everything before opening the file, so a failing conversion
    # never touches `output_file`.
    sentence_list = list(map(offset_to_sentence, self))

    with open(output_file, "wt") as fd:
        write_conllx(sentence_list, fd)
#!/usr/bin/env python
# Read ./data/all_data.conllx, split it with split_data() and write the three
# resulting parts to ./data/train.conllx, ./data/dev.conllx and
# ./data/test.conllx.
from tokenizer_tools.conllz.iterator_reader import conllx_iterator_reader
from tokenizer_tools.split_data import split_data
from tokenizer_tools.conllz.writer import write_conllx

# Materialise the reader so the whole corpus is handed to split_data() at once.
data = list(conllx_iterator_reader(['./data/all_data.conllx']))
train, dev, test = split_data(data)

# Each part goes to its own file; the files are written in this order.
for subset_path, subset in (
    ('./data/train.conllx', train),
    ('./data/dev.conllx', dev),
    ('./data/test.conllx', test),
):
    with open(subset_path, 'wt') as fd:
        write_conllx(subset, fd)
#!/usr/bin/env python
# Collect every regular file directly inside ./data/domain, read them all with
# conllx_iterator_reader() and write the combined result to ./data/all.conllx.
import os
import pathlib

from tokenizer_tools.conllz.iterator_reader import conllx_iterator_reader
from tokenizer_tools.conllz.writer import write_conllx

current_dir = os.path.dirname(os.path.abspath(__file__))

# Only regular files are kept; sub-directories are ignored (no recursion).
# Path.iterdir() yields entries in arbitrary order, so the order of the
# input files - and hence of the merged output - is not guaranteed.
domain_files = filter(pathlib.Path.is_file, pathlib.Path('./data/domain').iterdir())
input_file_list = list(map(str, domain_files))

data = list(conllx_iterator_reader(input_file_list))

with open('./data/all.conllx', 'wt') as fd:
    write_conllx(data, fd)