def convert(ctb_root, out_root):
    """Extract sentence lines from CTB ``bracketed`` files into ``.fid`` files.

    Scans ``<ctb_root>/bracketed`` for regular files whose name ends in
    ``.nw``, ``.mz`` or ``.wb``. For each of them, the lines located between
    an opening ``<S ID=`` / ``<seg id=`` tag and the matching ``</S>`` /
    ``</seg>`` tag are copied to ``<out_root>/<stem>.fid``, where ``<stem>``
    is the file name up to its first dot. Any other line starting with ``<``
    and any line of at most one character (e.g. a bare newline) is dropped.

    Args:
        ctb_root: Directory containing the ``bracketed`` sub-directory.
        out_root: Directory receiving the output files; it is passed to
            ``make_sure_path_exists`` before anything is written.
    """
    ctb_root = join(ctb_root, 'bracketed')
    # The isfile() test has to guard every extension. The previous
    # ``isfile(...) and a or b or c`` chain only guarded '.nw', so a directory
    # named '*.mz' or '*.wb' was selected and later broke open().
    fids = [
        f for f in listdir(ctb_root)
        if isfile(join(ctb_root, f)) and f.endswith(('.nw', '.mz', '.wb'))
    ]
    make_sure_path_exists(out_root)
    for f in fids:
        # NOTE: both files use the platform default encoding; the original
        # author left an ``encoding='GB2312'`` override commented out here.
        with open(join(ctb_root, f), 'r') as src, \
                open(join(out_root, f.split('.')[0] + '.fid'), 'w') as out:
            in_s_tag = False
            try:
                for line in src:
                    if line.startswith(('<S ID=', '<seg id=')):
                        in_s_tag = True
                    elif line.startswith(('</S>', '</seg>')):
                        in_s_tag = False
                    elif line.startswith('<'):
                        # Any other markup line is never part of a sentence.
                        continue
                    elif in_s_tag and len(line) > 1:
                        out.write(line)
            except UnicodeDecodeError:
                # Best effort: a decoding failure ends this file early but
                # keeps everything written so far. Other errors (and Ctrl-C)
                # are no longer silently swallowed.
                pass
def convert_ctb8_to_bracketed(ctb_root, out_root):
    """Copy CTB8 ``chtb*`` files to ``out_root`` without their markup lines.

    Every regular file in ``<ctb_root>/bracketed`` whose name starts with
    ``chtb`` is read as UTF-8 and rewritten as ``<out_root>/<name>.txt``
    (UTF-8), keeping only the lines that do not start with ``<``.

    Args:
        ctb_root: Directory containing the ``bracketed`` sub-directory.
        out_root: Directory receiving the ``.txt`` files; it is passed to
            ``make_sure_path_exists`` before anything is written.
    """
    bracketed_dir = join(ctb_root, 'bracketed')

    # Collect the candidate file names up front, before touching out_root.
    selected = []
    for entry in listdir(bracketed_dir):
        if entry.startswith('chtb') and isfile(join(bracketed_dir, entry)):
            selected.append(entry)

    make_sure_path_exists(out_root)

    for name in selected:
        source_path = join(bracketed_dir, name)
        target_path = join(out_root, name + '.txt')
        with open(source_path, encoding='utf-8') as src, \
                open(target_path, 'w', encoding='utf-8') as out:
            # Lines beginning with '<' are markup and are left out.
            out.writelines(
                text for text in src if not text.startswith('<'))
def convert(ctb_root, out_root):
    """Extract the sentence lines of CTB ``.fid`` files into ``out_root``.

    Every regular file ending in ``.fid`` inside ``<ctb_root>/bracketed`` is
    read as GB2312. The lines located between a ``<S ID=`` line and the
    following ``</S>`` line are written to a file of the same name in
    ``out_root``, using the platform default encoding.

    Args:
        ctb_root: Directory containing the ``bracketed`` sub-directory.
        out_root: Directory receiving the output files; it is passed to
            ``make_sure_path_exists`` before anything is written.
    """
    ctb_root = join(ctb_root, 'bracketed')
    fids = [
        f for f in listdir(ctb_root)
        if isfile(join(ctb_root, f)) and f.endswith('.fid')
    ]
    make_sure_path_exists(out_root)
    for f in fids:
        with open(join(ctb_root, f), encoding='GB2312') as src, \
                open(join(out_root, f), 'w') as out:
            in_s_tag = False
            try:
                for line in src:
                    if line.startswith('<S ID='):
                        in_s_tag = True
                    elif line.startswith('</S>'):
                        in_s_tag = False
                    elif in_s_tag:
                        out.write(line)
            except UnicodeDecodeError:
                # Best effort: bytes that are not valid GB2312 end this file
                # early but keep everything written so far. Only the decoding
                # failure is tolerated; other errors (and Ctrl-C) propagate
                # instead of being hidden by a bare ``except``.
                pass
def convert_ctb5_to_backeted(ctb_root, out_root):
    """Extract the sentence lines of CTB5 ``.fid`` files into ``out_root``.

    Every regular file ending in ``.fid`` inside ``<ctb_root>/bracketed`` is
    read as GB2312. The lines located between a ``<S ID=`` line and the
    following ``</S>`` line are written to a file of the same name in
    ``out_root``, using the platform default encoding.

    Args:
        ctb_root: Directory containing the ``bracketed`` sub-directory.
        out_root: Directory receiving the output files; it is passed to
            ``make_sure_path_exists`` before anything is written.
    """
    ctb_root = join(ctb_root, 'bracketed')
    fids = [
        f for f in listdir(ctb_root)
        if isfile(join(ctb_root, f)) and f.endswith('.fid')
    ]
    make_sure_path_exists(out_root)
    for f in fids:
        with open(join(ctb_root, f), encoding='GB2312') as src, \
                open(join(out_root, f), 'w') as out:
            in_s_tag = False
            try:
                for line in src:
                    if line.startswith('<S ID='):
                        in_s_tag = True
                    elif line.startswith('</S>'):
                        in_s_tag = False
                    elif in_s_tag:
                        out.write(line)
            except UnicodeDecodeError:
                # The last file throws encoding error at the very end, doesn't
                # affect sentences. Only that decoding failure is tolerated;
                # any other exception (and Ctrl-C) now propagates instead of
                # being hidden by a bare ``except``.
                pass
# NOTE(review): this print() looks like the final statement of a definition
# that begins before this chunk -- confirm its indentation against the full
# file before applying.
print()


# Command-line entry point: parse --input / --lang / --output, make sure the
# output folder exists, then convert each of the three dataset splits.
if __name__ == '__main__':
    parser = argparse.ArgumentParser(
        description=
        'Convert combined Penn Treebank files (.txt) to Stanford Dependency format (.conllx)'
    )
    # All three options are mandatory.
    parser.add_argument(
        "--input",
        required=True,
        help=
        'The folder containing train.txt/dev.txt/test.txt in bracketed format')
    parser.add_argument(
        "--lang",
        required=True,
        help='Which language? Use en for English, cn for Chinese')
    parser.add_argument(
        "--output",
        required=True,
        dest="output",
        help=
        'The folder where to store the output train.conllx/dev.conllx/test.conllx in Stanford '
        'Dependency format')
    args = parser.parse_args()
    make_sure_path_exists(args.output)
    # <input>/<split>.txt is converted to <output>/<split>.conllx, with the
    # requested language forwarded to convert().
    for f in ['train', 'dev', 'test']:
        convert(join(args.input, f + '.txt'), join(args.output, f + '.conllx'), args.lang)
'Which task (par, pos)? Use par for phrase structure parsing, pos for part-of-speech ' 'tagging') args = parser.parse_args() root_path = args.output task = args.task ext = 'txt' if task == 'par': training = list(range(2, 21 + 1)) development = [22] test = [23] elif task == 'pos': training = list(range(0, 18 + 1)) development = list(range(19, 21 + 1)) test = list(range(22, 24 + 1)) ext = 'tsv' else: eprint('Invalid task {}'.format(task)) exit(1) print('Importing ptb from nltk') from nltk.corpus import ptb print() make_sure_path_exists(root_path) combine(training, join(root_path, 'train.{}'.format(ext)), task) combine(development, join(root_path, 'dev.{}'.format(ext)), task) combine(test, join(root_path, 'test.{}'.format(ext)), task)