Exemplo n.º 1
0
def convert(ctb_root, out_root):
    """Extract the sentence lines of bracketed treebank files into ``.fid`` files.

    Every regular file in ``<ctb_root>/bracketed`` whose name ends in ``.nw``,
    ``.mz`` or ``.wb`` is read, and a file named after the part of its name
    before the first dot plus ``.fid`` is written to ``out_root``.  Only
    non-blank lines located between an opening ``<S ID=`` / ``<seg id=`` line
    and the matching ``</S>`` / ``</seg>`` line are copied; any other line
    starting with ``<`` is dropped.

    Args:
        ctb_root: Treebank root folder; its ``bracketed`` sub-folder is read.
        out_root: Destination folder.  It is handed to
            ``make_sure_path_exists`` before any file is written.
    """
    ctb_root = join(ctb_root, 'bracketed')
    # The suffix test must be grouped: written as ``isfile(...) and a or b or c``
    # the ``isfile`` check only guarded the first suffix, so directories ending
    # in '.mz' or '.wb' were selected and later crashed ``open``.
    suffixes = ('.nw', '.mz', '.wb')
    fids = [
        f for f in listdir(ctb_root)
        if isfile(join(ctb_root, f)) and f.endswith(suffixes)
    ]
    make_sure_path_exists(out_root)

    for f in fids:
        # Both files use the platform default encoding, as before.
        with open(join(ctb_root, f), 'r') as src, \
             open(join(out_root, f.split('.')[0] + '.fid'), 'w') as out:
            in_s_tag = False
            try:
                for line in src:
                    if line.startswith(('<S ID=', '<seg id=')):
                        in_s_tag = True
                    elif line.startswith(('</S>', '</seg>')):
                        in_s_tag = False
                    elif line.startswith('<'):
                        # Any other markup line is never part of a sentence.
                        continue
                    elif in_s_tag and len(line) > 1:
                        # ``len(line) > 1`` skips lines holding only a newline.
                        out.write(line)
            except UnicodeError:
                # Deliberate best effort: a file that cannot be fully decoded
                # (or re-encoded) keeps the sentences written so far.  Other
                # errors (I/O failures, KeyboardInterrupt) now propagate
                # instead of being silently swallowed.
                pass
Exemplo n.º 2
0
def convert_ctb8_to_bracketed(ctb_root, out_root):
    """Strip markup lines from the ``chtb*`` files of a treebank.

    Each regular file in ``<ctb_root>/bracketed`` whose name starts with
    ``chtb`` is copied to ``<out_root>/<name>.txt`` without the lines that
    begin with ``<``.  Input and output are both UTF-8.

    Args:
        ctb_root: Treebank root folder; its ``bracketed`` sub-folder is read.
        out_root: Destination folder, passed to ``make_sure_path_exists``
            before any file is written.
    """
    bracketed_dir = join(ctb_root, 'bracketed')

    # Collect the candidate file names before touching the output folder.
    selected = []
    for entry in listdir(bracketed_dir):
        if isfile(join(bracketed_dir, entry)) and entry.startswith('chtb'):
            selected.append(entry)

    make_sure_path_exists(out_root)

    for name in selected:
        source_path = join(bracketed_dir, name)
        target_path = join(out_root, name + '.txt')
        with open(source_path, encoding='utf-8') as reader, \
                open(target_path, 'w', encoding='utf-8') as writer:
            # Keep everything except lines that open with a markup tag.
            writer.writelines(
                row for row in reader if not row.startswith('<'))
Exemplo n.º 3
0
def convert(ctb_root, out_root):
    """Copy the sentence lines of every ``.fid`` file to ``out_root``.

    Each regular file ending in ``.fid`` inside ``<ctb_root>/bracketed`` is
    decoded as GB2312.  The lines found between a line starting with
    ``<S ID=`` and the next line starting with ``</S>`` are written to a file
    of the same name in ``out_root`` (opened with the platform default
    encoding).

    Args:
        ctb_root: Treebank root folder; its ``bracketed`` sub-folder is read.
        out_root: Destination folder, passed to ``make_sure_path_exists``
            before any file is written.
    """
    ctb_root = join(ctb_root, 'bracketed')
    fids = [
        f for f in listdir(ctb_root)
        if isfile(join(ctb_root, f)) and f.endswith('.fid')
    ]
    make_sure_path_exists(out_root)
    for f in fids:
        with open(join(ctb_root, f), encoding='GB2312') as src, \
                open(join(out_root, f), 'w') as out:
            in_s_tag = False
            try:
                for line in src:
                    if line.startswith('<S ID='):
                        in_s_tag = True
                    elif line.startswith('</S>'):
                        in_s_tag = False
                    elif in_s_tag:
                        out.write(line)
            except UnicodeError:
                # Deliberate best effort: when a file cannot be fully decoded
                # (or a line cannot be encoded for output), the sentences
                # written so far are kept and processing moves on to the next
                # file.  Unrelated errors (I/O failures, KeyboardInterrupt)
                # are no longer swallowed.
                pass
Exemplo n.º 4
0
def convert_ctb5_to_backeted(ctb_root, out_root):
    """Extract the bracketed sentences from the ``.fid`` files of a treebank.

    Each regular file ending in ``.fid`` inside ``<ctb_root>/bracketed`` is
    decoded as GB2312.  Lines located between a line starting with ``<S ID=``
    and the next line starting with ``</S>`` are copied to a file of the same
    name in ``out_root`` (opened with the platform default encoding).

    Args:
        ctb_root: Treebank root folder; its ``bracketed`` sub-folder is read.
        out_root: Destination folder, passed to ``make_sure_path_exists``
            before any file is written.
    """
    ctb_root = join(ctb_root, 'bracketed')
    fids = [
        f for f in listdir(ctb_root)
        if isfile(join(ctb_root, f)) and f.endswith('.fid')
    ]
    make_sure_path_exists(out_root)
    for f in fids:
        with open(join(ctb_root, f), encoding='GB2312') as src, \
                open(join(out_root, f), 'w') as out:
            in_s_tag = False
            try:
                for line in src:
                    if line.startswith('<S ID='):
                        in_s_tag = True
                    elif line.startswith('</S>'):
                        in_s_tag = False
                    elif in_s_tag:
                        out.write(line)
            except UnicodeError:
                # The last file throws encoding error at the very end, doesn't
                # affect sentences.  Only encoding problems are tolerated;
                # any other failure (I/O error, KeyboardInterrupt) propagates
                # instead of being silently discarded.
                pass
Exemplo n.º 5
0
    print()


if __name__ == '__main__':
    # Command-line entry point: convert the train/dev/test bracketed files
    # found in --input into .conllx files stored under --output.
    parser = argparse.ArgumentParser(
        description='Convert combined Penn Treebank files (.txt) to Stanford Dependency format (.conllx)')
    parser.add_argument(
        '--input', required=True,
        help='The folder containing train.txt/dev.txt/test.txt in bracketed format')
    parser.add_argument(
        '--lang', required=True,
        help='Which language? Use en for English, cn for Chinese')
    parser.add_argument(
        '--output', required=True, dest='output',
        help='The folder where to store the output train.conllx/dev.conllx/test.conllx in Stanford '
             'Dependency format')

    args = parser.parse_args()
    make_sure_path_exists(args.output)

    # One conversion per data split, always in train/dev/test order.
    for split in ('train', 'dev', 'test'):
        source_file = join(args.input, split + '.txt')
        target_file = join(args.output, split + '.conllx')
        convert(source_file, target_file, args.lang)
Exemplo n.º 6
0
        'Which task (par, pos)? Use par for phrase structure parsing, pos for part-of-speech '
        'tagging')

    # Read the command line: combined files go under --output and use the
    # 'txt' extension unless the selected task overrides it below.
    args = parser.parse_args()
    root_path = args.output
    task = args.task
    ext = 'txt'

    # Pick the ids that make up each split for the requested task.
    # NOTE(review): these are presumably treebank section numbers - confirm
    # against combine().
    if task == 'par':
        # 'par': 2-21 for training, 22 for development, 23 for test.
        training = list(range(2, 21 + 1))
        development = [22]
        test = [23]
    elif task == 'pos':
        # 'pos': 0-18 for training, 19-21 for development, 22-24 for test;
        # output files use the 'tsv' extension.
        training = list(range(0, 18 + 1))
        development = list(range(19, 21 + 1))
        test = list(range(22, 24 + 1))
        ext = 'tsv'
    else:
        # Unknown task: report it through eprint and stop with status 1.
        eprint('Invalid task {}'.format(task))
        exit(1)

    # The nltk import is deferred until the arguments have been validated.
    # NOTE(review): combine() presumably relies on this module-level `ptb`
    # name - confirm before moving or renaming it.
    print('Importing ptb from nltk')
    from nltk.corpus import ptb

    print()

    # Write one combined file per split: train.<ext>, dev.<ext>, test.<ext>.
    make_sure_path_exists(root_path)
    combine(training, join(root_path, 'train.{}'.format(ext)), task)
    combine(development, join(root_path, 'dev.{}'.format(ext)), task)
    combine(test, join(root_path, 'test.{}'.format(ext)), task)