Exemplo n.º 1
0
def vimdiff(*strings, **options):
    """Given a list of strings, visualizes the differences between them
    using vimdiff (or similar program). 
    The visualizer program is specified by the command_template keyword
    and the default is:

        vimdiff -gf %s

    where %s will be replaced with a list of filenames."""
    import sys
    assert len(strings) >= 2, "Must have at least two strings."
    file_objs = []
    for i, s in enumerate(strings):
        tempfile = keepable_tempfile(prefix='%d-' % i)
        tempfile.write(s)
        tempfile.flush()
        file_objs.append(tempfile)

    filenames = [tempfile.name for tempfile in file_objs]
    command_template = options.get('command_template', 'vimdiff -gf %s')
    command = command_template % ' '.join(filenames)
    import os
    print "Running:", command
    os.system(command)
Exemplo n.º 2
0
def parse_section_of_a_file(inobj, outobj, datadir, start=None, end=None,
    convert_from_trees=False, keep_temporary_section=False, **parse_args):
    """Generic function to parse just a portion of a file and put the
    output into outobj.  This function is the workhorse of ParseHog.parse.
    inobj and outobj are either file-like objects or filenames.
    start and end are 0-based line numbers (inclusive) and can be None
    to mean the beginning or end of the file, respectively."""
    # note that in Python 2, None is less than all integers and the empty
    # string is greater than all integers.  thus, if we change end to ""
    # when it is None, we can compare it with integers properly.
    if end is None:
        end = ""

    input_file = open_file_or_filename(inobj)
    if convert_from_trees:
        input_file = TreeConverter(input_file)
        parse_args['already_tokenized'] = True # set -K

    sliced_file = keepable_tempfile(keep=keep_temporary_section)

    # copy only the lines in [start, end] to the temporary file
    current_line_number = 0
    for line in input_file:
        if start <= current_line_number <= end:
            sliced_file.write(line)
        elif current_line_number > end:
            break  # past the requested section -- stop reading
        current_line_number += 1
    sliced_file.flush()

    # 'end == ""' (equality, not identity) means end was originally None
    if (start is None) or (end == ""):
        num_input_lines = None
    else:
        # inclusive range, so the count is end - start + 1
        # (was start - end + 1, which is always <= 1 and usually negative)
        num_input_lines = (end - start) + 1

    parser = ECParser()
    parser.parse_sgml_file(sliced_file, output=outobj, 
        datadir=datadir, **parse_args)
Exemplo n.º 3
0
def vimdiff(*strings, **options):
    """Given a list of strings, visualizes the differences between them
    using vimdiff (or similar program). 
    The visualizer program is specified by the command_template keyword
    and the default is:

        vimdiff -gf %s

    where %s will be replaced with a list of filenames."""
    import sys
    assert len(strings) >= 2, "Must have at least two strings."
    file_objs = []
    for i, s in enumerate(strings):
        tempfile = keepable_tempfile(prefix='%d-' % i)
        tempfile.write(s)
        tempfile.flush()
        file_objs.append(tempfile)

    filenames = [tempfile.name for tempfile in file_objs]
    command_template = options.get('command_template', 'vimdiff -gf %s')
    command = command_template % ' '.join(filenames)
    import os
    print "Running:", command
    os.system(command)
Exemplo n.º 4
0
def train(train_data, dev_data, output_dir, mode='parser',
    train_bin_dir=CURRENT_TRAIN_BIN, original_data=None, verbose=True,
    cat_alternative=None, keep_tempfiles=False):
    """Create a language model / parsing model in output_dir from
    train_data and dev_data.  train_bin_dir is the directory
    containing allScript and the training binaries.  We use original_data
    as our prototype for the model directory, and while most of its
    contents are unimportant, some files like terms.txt are relevant.
    
    To use cat_alternative, you'll need dmcc's version of allScript.
    This lets you specify zcat, bzcat, smartcat, etc. for reading in
    files."""
    if isinstance(train_data, basestring):
        train_data = [train_data]
    if isinstance(dev_data, basestring):
        dev_data = [dev_data]

    assert mode in ('parser', 'lm')
        
    for train_or_dev_filename in train_data + dev_data:
        assert os.path.exists(train_or_dev_filename), \
            "File %s doesn't exist." % train_or_dev_filename

    allScript = os.path.join(train_bin_dir, 'allScript')
    assert os.path.exists(allScript)
    if original_data is None:
        if mode == 'parser':
            original_data = DEFAULT_DATA
        else:
            original_data = DEFAULT_LM
    # output_dir = validate_and_cleanup_datadir_path(output_dir)

    import shutil, commands
    from iterextras import any
    from waterworks.Files import possibly_compressed_file
    # erase the output directory if it exists and remake it from our
    # original_data directory (which should be a clean training of WSJ or
    # switchboard -- it must have the right terms.txt, etc.)
    if output_dir != original_data:
        print "Removing", output_dir
        shutil.rmtree(output_dir, ignore_errors=True)
        print "Copying", original_data, "to", output_dir
        shutil.copytree(original_data, output_dir)

    def compressed_filename(filename):
        filename = filename.lower()
        return filename.endswith('.gz') or filename.endswith('.bz2')

    temp_files = []
    modelbase = "%s." % os.path.basename(output_dir)
    # if there are any compressed files in training, we combined all
    # training into one uncompressed file
    if any(train_data, compressed_filename):
        temp_train = keepable_tempfile(mode='w', prefix=modelbase,
                                       suffix='.train', keep=True, dir='/ltmp')
        print "Uncompressing and combining training data to", temp_train.name
        for filename in train_data:
            f = possibly_compressed_file(filename)
            for line in f:
                temp_train.write(line)
        temp_train.close()
        train_data = [temp_train.name]
        temp_files.append(temp_train)

    # same for dev files
    if any(dev_data, compressed_filename):
        temp_dev = keepable_tempfile(mode='w', prefix=modelbase,
                                     suffix='.dev', keep=True, dir='/ltmp')
        print "Uncompressing and combining dev data to", temp_dev.name
        for filename in dev_data:
            f = possibly_compressed_file(filename)
            for line in f:
                temp_dev.write(line)
        temp_dev.close()
        dev_data = [temp_dev.name]
        temp_files.append(temp_dev)

    # the repr()s will put quotes around lists of arguments
    cmd = ' '.join([allScript, 
                    '-' + mode, 
                    output_dir, 
                    repr(' '.join(train_data)), 
                    repr(' '.join(dev_data))])

    if verbose:
        print "Training command:", repr(cmd)
    
    status, output = commands.getstatusoutput(cmd)

    if verbose:
        print "Output:"
        print "-------"
        print output
        print "-------"

    # store training output
    f = file(os.path.join(output_dir, 'traininglog'), 'a')
    f.write(output)
    f.close()

    if not keep_tempfiles:
        print "Removing temporary training files..."
        for fileobj in temp_files:
            os.remove(fileobj.name)

    if status != 0:
        raise TrainingError("Training script exited with nonzero exit code.")

    warning_messages = ('Exit code: 134', 'Exit code: 137', 'segfault', 'abort',
                        'Could not find', "Assertion `pstStream' failed.")
    for message in warning_messages:
        if message.lower() in output.lower():
            raise TrainingError("Found a warning message in training " + \
                                "output: %r" % message)

    print "Done"
    return output