def vimdiff(*strings, **options):
    """Show the differences between two or more strings in a visual diff tool.

    Each string is written to its own temporary file (created with
    keepable_tempfile, using the string's position as the filename
    prefix) and the viewer is then run on those files through the shell.

    The viewer is chosen with the command_template keyword.  The default is

        vimdiff -gf %s

    where %s is replaced with the space-separated list of temporary
    filenames.  The filenames are not shell-quoted.

    Raises AssertionError if fewer than two strings are given."""
    # NOTE(review): vimdiff appears to be defined a second time later in
    # this file; if both live in the same module, the later definition is
    # the one that takes effect.
    import os

    assert len(strings) >= 2, "Must have at least two strings."

    # Hold on to every file object until the viewer has exited;
    # presumably keepable_tempfile removes the file once its object goes
    # away -- TODO confirm.
    temp_objs = []
    for index, text in enumerate(strings):
        temp_obj = keepable_tempfile(prefix='%d-' % index)
        temp_obj.write(text)
        # Make sure the contents are on disk before the viewer reads them.
        temp_obj.flush()
        temp_objs.append(temp_obj)

    filenames = [temp_obj.name for temp_obj in temp_objs]
    command_template = options.get('command_template', 'vimdiff -gf %s')
    command = command_template % ' '.join(filenames)

    print("Running: %s" % (command,))
    os.system(command)
def parse_section_of_a_file(inobj, outobj, datadir, start=None, end=None,
                            convert_from_trees=False,
                            keep_temporary_section=False, **parse_args):
    """Generic function to parse just a portion of a file and put the
    output into outobj.  This function is the workhorse of ParseHog.parse.

    The selected lines are copied into a temporary file, which is then
    handed to ECParser.parse_sgml_file.

    inobj and outobj are either file-like objects or filenames.

    start and end are zero-based line numbers and both are inclusive.
    Either can be None to mean the beginning or end of the file,
    respectively.

    If convert_from_trees is true, the input is wrapped in a
    TreeConverter and the parser is told the text is already tokenized.

    keep_temporary_section is passed to keepable_tempfile as its keep
    argument.  datadir and any extra keyword arguments are passed
    through to parse_sgml_file."""
    input_file = open_file_or_filename(inobj)
    if convert_from_trees:
        input_file = TreeConverter(input_file)
        parse_args['already_tokenized'] = True  # set -K

    sliced_file = keepable_tempfile(keep=keep_temporary_section)
    # The open-ended bounds are tested explicitly instead of relying on
    # how Python 2 happens to order None and strings against integers.
    for line_number, line in enumerate(input_file):
        if end is not None and line_number > end:
            # Past the requested range: stop reading the input.
            break
        if start is None or line_number >= start:
            sliced_file.write(line)
    # The parser reads the temporary file, so push the lines out first.
    sliced_file.flush()

    parser = ECParser()
    parser.parse_sgml_file(sliced_file, output=outobj, datadir=datadir,
                           **parse_args)
def vimdiff(*strings, **options):
    """Show the differences between two or more strings in a visual diff tool.

    Each string is written to its own temporary file (created with
    keepable_tempfile, using the string's position as the filename
    prefix) and the viewer is then run on those files through the shell.

    The viewer is chosen with the command_template keyword.  The default is

        vimdiff -gf %s

    where %s is replaced with the space-separated list of temporary
    filenames.  The filenames are not shell-quoted.

    Raises AssertionError if fewer than two strings are given."""
    # NOTE(review): another definition of vimdiff with the same behavior
    # appears earlier in this file; if both live in the same module this
    # later one wins, so the duplicate could be removed.
    import os

    assert len(strings) >= 2, "Must have at least two strings."

    # Hold on to every file object until the viewer has exited;
    # presumably keepable_tempfile removes the file once its object goes
    # away -- TODO confirm.
    temp_objs = []
    for index, text in enumerate(strings):
        temp_obj = keepable_tempfile(prefix='%d-' % index)
        temp_obj.write(text)
        # Make sure the contents are on disk before the viewer reads them.
        temp_obj.flush()
        temp_objs.append(temp_obj)

    filenames = [temp_obj.name for temp_obj in temp_objs]
    command_template = options.get('command_template', 'vimdiff -gf %s')
    command = command_template % ' '.join(filenames)

    print("Running: %s" % (command,))
    os.system(command)
def _combine_into_uncompressed_tempfile(filenames, label, prefix, suffix,
                                        temp_files):
    """Concatenate filenames (compressed or not) into a single
    uncompressed temporary file and return that file's name.

    label is only used in the progress message ('training' or 'dev').
    The temporary file object is appended to temp_files as soon as it
    exists so the caller can remove it later, even if copying fails."""
    from waterworks.Files import possibly_compressed_file

    # NOTE(review): the temporary directory is hard-coded to /ltmp.
    combined = keepable_tempfile(mode='w', prefix=prefix, suffix=suffix,
                                 keep=True, dir='/ltmp')
    temp_files.append(combined)
    try:
        print("Uncompressing and combining %s data to %s" %
              (label, combined.name))
        for filename in filenames:
            for line in possibly_compressed_file(filename):
                combined.write(line)
    finally:
        combined.close()
    return combined.name


def train(train_data, dev_data, output_dir, mode='parser',
          train_bin_dir=CURRENT_TRAIN_BIN, original_data=None, verbose=True,
          cat_alternative=None, keep_tempfiles=False):
    """Create a language model / parsing model in output_dir from
    train_data and dev_data.

    train_data and dev_data are each a filename or a sequence of
    filenames.  mode is 'parser' or 'lm'.  train_bin_dir is the
    directory containing allScript and the training binaries.

    We use original_data as our prototype for the model directory, and
    while most of its contents are unimportant, some files like
    terms.txt are relevant.  If original_data is None, DEFAULT_DATA
    (parser mode) or DEFAULT_LM (lm mode) is used.  Unless output_dir is
    the same as original_data, output_dir is erased and recreated as a
    copy of original_data.

    cat_alternative is meant for dmcc's version of allScript, which can
    read files through zcat, bzcat, smartcat, etc.
    NOTE(review): this function currently accepts cat_alternative but
    never uses it or passes it to allScript.

    If any training (or dev) file is compressed, all training (or dev)
    files are combined into one uncompressed temporary file, which is
    removed afterwards unless keep_tempfiles is true.

    The output of the training script is printed if verbose is true,
    appended to the 'traininglog' file in output_dir, and returned.

    Raises AssertionError if mode is invalid or an input file or
    allScript is missing, and TrainingError if the training script
    exits with a nonzero status or its output contains a known warning
    message."""
    if isinstance(train_data, basestring):
        train_data = [train_data]
    if isinstance(dev_data, basestring):
        dev_data = [dev_data]
    # Accept any sequence of filenames (tuples too), not just lists.
    train_data = list(train_data)
    dev_data = list(dev_data)

    assert mode in ('parser', 'lm')
    for train_or_dev_filename in train_data + dev_data:
        assert os.path.exists(train_or_dev_filename), \
            "File %s doesn't exist." % train_or_dev_filename
    allScript = os.path.join(train_bin_dir, 'allScript')
    assert os.path.exists(allScript)

    if original_data is None:
        if mode == 'parser':
            original_data = DEFAULT_DATA
        else:
            original_data = DEFAULT_LM

    import shutil
    import commands
    # Imported under another name so the builtin any() is not shadowed;
    # this one takes (sequence, predicate).
    from iterextras import any as any_match
    try:
        from shlex import quote as shell_quote
    except ImportError:
        from pipes import quote as shell_quote

    # erase the output directory if it exists and remake it from our
    # original_data directory (which should be a clean training of WSJ or
    # switchboard -- it must have the right terms.txt, etc.)
    if output_dir != original_data:
        print("Removing %s" % (output_dir,))
        shutil.rmtree(output_dir, ignore_errors=True)
        print("Copying %s to %s" % (original_data, output_dir))
        shutil.copytree(original_data, output_dir)

    def compressed_filename(filename):
        return filename.lower().endswith(('.gz', '.bz2'))

    modelbase = "%s." % os.path.basename(output_dir)
    temp_files = []
    try:
        # if there are any compressed files in training, we combine all
        # training into one uncompressed file
        if any_match(train_data, compressed_filename):
            train_data = [_combine_into_uncompressed_tempfile(
                train_data, 'training', modelbase, '.train', temp_files)]
        # same for dev files
        if any_match(dev_data, compressed_filename):
            dev_data = [_combine_into_uncompressed_tempfile(
                dev_data, 'dev', modelbase, '.dev', temp_files)]

        # Each list of filenames is passed to allScript as one
        # shell-quoted argument.  Proper shell quoting is used rather
        # than repr(), which would mangle unicode names (u'...') and
        # names containing quotes or backslashes.
        cmd = ' '.join([shell_quote(allScript),
                        '-' + mode,
                        shell_quote(output_dir),
                        shell_quote(' '.join(train_data)),
                        shell_quote(' '.join(dev_data))])
        if verbose:
            print("Training command: %r" % (cmd,))

        status, output = commands.getstatusoutput(cmd)
        if verbose:
            print("Output:")
            print("-------")
            print(output)
            print("-------")

        # store training output
        log = open(os.path.join(output_dir, 'traininglog'), 'a')
        try:
            log.write(output)
        finally:
            log.close()
    finally:
        # Clean up even when combining, training, or logging raised.
        if not keep_tempfiles:
            print("Removing temporary training files...")
            for fileobj in temp_files:
                os.remove(fileobj.name)

    if status != 0:
        raise TrainingError("Training script exited with nonzero exit code.")

    warning_messages = ('Exit code: 134', 'Exit code: 137', 'segfault',
                        'abort', 'Could not find',
                        "Assertion `pstStream' failed.")
    lowered_output = output.lower()
    for message in warning_messages:
        if message.lower() in lowered_output:
            raise TrainingError("Found a warning message in training "
                                "output: %r" % (message,))

    print("Done")
    return output