def __init__(self, filename):
    """Open a (possibly compressed) cvlm feature corpus and read its header.

    filename -- path to the corpus; may be gzip/bzip2 compressed.
    The first line is a key/value header which must contain an 'S' entry,
    stored as self.num_sentences.
    """
    initialize(self, locals())
    self.reader = iter(possibly_compressed_file(filename))
    # Use the next() builtin (as the other constructor variant in this file
    # does) rather than the Python-2-only .next() method.
    self.header = parse_kv_list(next(self.reader))
    assert 'S' in self.header
    self.num_sentences = self.header['S']
def __init__(self, filename):
    """Open a (possibly compressed) cvlm corpus and parse its header line.

    The header must carry an 'S' entry, which is stored as the sentence
    count in self.num_sentences.
    """
    initialize(self, locals())
    stream = possibly_compressed_file(filename)
    self.reader = iter(stream)
    header_line = next(self.reader)
    self.header = parse_kv_list(header_line)
    assert 'S' in self.header
    self.num_sentences = self.header['S']
def mapping_from_filename(this_class, filename):
    """Reads cvlm feature mapping from a filename.

    The expected format is that each line has an index followed by a tab
    followed by the feature name.  Returns a FeatureMapping."""
    mapping = this_class()
    for line in possibly_compressed_file(filename):
        # Split at most once so a feature name that itself contains a tab
        # doesn't raise ValueError during tuple unpacking.
        index, name = line.split('\t', 1)
        index = int(index)
        mapping[index] = name.strip()
    return mapping
# Merges two RerankerFeatureCorpus files: for each aligned (sentence, parse)
# pair it verifies that parse counts and gold/proposed/matched bracket counts
# agree (via warn_or_error, which prints in warn_only mode and raises
# ValueError otherwise), then copies every feature of corpus 2 into the
# corpus-1 parse with its index shifted by self.offset, writing the merged
# sentences in cvlm format to self.merged_corpus_filename.
# NOTE(review): this definition has been collapsed onto one physical line
# (indentation lost), so the exact nesting — e.g. whether
# display_index_every_K_items is guarded by self.verbose — cannot be
# recovered from here; code preserved verbatim. TODO: restore formatting
# from version control.
def make_merged_feature_values(self): def warn_or_error(message): if self.warn_only: print "Warning:", message else: raise ValueError(message) self.corpus1 = RerankerFeatureCorpus(self.corpus_filename1) self.corpus2 = RerankerFeatureCorpus(self.corpus_filename2) if len(self.corpus1) != len(self.corpus2): warn_or_error("Corpus 1 has %d sentences, corpus 2 has %d." % (len(self.corpus1), len(self.corpus2))) merged_corpus = possibly_compressed_file(self.merged_corpus_filename, 'w') merged_corpus.write(self.corpus1.cvlm_format_header()) sentence_iter = izip(self.corpus1, self.corpus2) if self.verbose: print 'Transforming corpora (%d sentences)' % len(self.corpus1) sentence_iter = display_index_every_K_items(sentence_iter, 50, format='Sentence %s\n') for sentence_index, (sentence1, sentence2) in enumerate(sentence_iter): if len(sentence1) != len(sentence2): warn_or_error("Sentence %d: Corpus 1 has %d parses, corpus " "2 has %d." % (sentence_index, len(sentence1), len(sentence2))) if sentence1.gold_brackets != sentence2.gold_brackets: warn_or_error("Sentence %d: Corpus 1 has %d gold brackets, " "corpus 2 has %d." % (sentence_index, sentence1.gold_brackets, sentence2.gold_brackets)) parse_iter = enumerate(izip(sentence1, sentence2)) for parse_index, (parse1, parse2) in parse_iter: if parse1.proposed_brackets != parse2.proposed_brackets: warn_or_error( "Sentence %d, parse %d: Corpus 1 has %d " "proposed brackets, corpus 2 has %d." % (sentence_index, parse_index, parse1.proposed_brackets, parse2.proposed_brackets)) if parse1.matched_brackets != parse2.matched_brackets: warn_or_error( "Sentence %d, parse %d: Corpus 1 has %d " "matched brackets, corpus 2 has %d." % (sentence_index, parse_index, parse1.matched_brackets, parse2.matched_brackets)) # add all features from parse2 to parse1 (after remapping) features = parse1.features for index, value in parse2.features.items(): features[index + self.offset] = value merged_corpus.write(sentence1.cvlm_format())
def weights_from_filename(this_class, filename):
    """Reads cvlm weight vectors from a filename.

    The expected format is that each line has an index followed by an
    equals sign followed by the feature weight (a float).  Returns a
    FeatureMapping."""
    weights = this_class()
    for line in possibly_compressed_file(filename):
        # Split at most once so a stray '=' later in the line (e.g. in
        # scientific notation junk) raises in float(), not in unpacking.
        index, weight = line.split('=', 1)
        index = int(index)
        weight = float(weight)
        weights[index] = weight
    return weights
# Near-duplicate of the other make_merged_feature_values definition in this
# file (only line-wrapping inside warn_or_error messages differed before the
# formatting collapse).  Behavior: align the two corpora sentence-by-sentence
# and parse-by-parse, warn-or-raise on any count mismatch, shift corpus 2's
# feature indices by self.offset into corpus 1's feature dict, and write the
# merged result in cvlm format.
# NOTE(review): collapsed onto one physical line (indentation lost); kept
# verbatim rather than guessing the original nesting.  TODO: deduplicate
# against the sibling definition once formatting is restored.
def make_merged_feature_values(self): def warn_or_error(message): if self.warn_only: print "Warning:", message else: raise ValueError(message) self.corpus1 = RerankerFeatureCorpus(self.corpus_filename1) self.corpus2 = RerankerFeatureCorpus(self.corpus_filename2) if len(self.corpus1) != len(self.corpus2): warn_or_error("Corpus 1 has %d sentences, corpus 2 has %d." % (len(self.corpus1), len(self.corpus2))) merged_corpus = possibly_compressed_file(self.merged_corpus_filename, 'w') merged_corpus.write(self.corpus1.cvlm_format_header()) sentence_iter = izip(self.corpus1, self.corpus2) if self.verbose: print 'Transforming corpora (%d sentences)' % len(self.corpus1) sentence_iter = display_index_every_K_items(sentence_iter, 50, format='Sentence %s\n') for sentence_index, (sentence1, sentence2) in enumerate(sentence_iter): if len(sentence1) != len(sentence2): warn_or_error("Sentence %d: Corpus 1 has %d parses, corpus " "2 has %d." % (sentence_index, len(sentence1), len(sentence2))) if sentence1.gold_brackets != sentence2.gold_brackets: warn_or_error("Sentence %d: Corpus 1 has %d gold brackets, " "corpus 2 has %d." % (sentence_index, sentence1.gold_brackets, sentence2.gold_brackets)) parse_iter = enumerate(izip(sentence1, sentence2)) for parse_index, (parse1, parse2) in parse_iter: if parse1.proposed_brackets != parse2.proposed_brackets: warn_or_error("Sentence %d, parse %d: Corpus 1 has %d " "proposed brackets, corpus 2 has %d." % (sentence_index, parse_index, parse1.proposed_brackets, parse2.proposed_brackets)) if parse1.matched_brackets != parse2.matched_brackets: warn_or_error("Sentence %d, parse %d: Corpus 1 has %d " "matched brackets, corpus 2 has %d." % (sentence_index, parse_index, parse1.matched_brackets, parse2.matched_brackets)) # add all features from parse2 to parse1 (after remapping) features = parse1.features for index, value in parse2.features.items(): features[index + self.offset] = value merged_corpus.write(sentence1.cvlm_format())
def write(self, filename):
    """Serialize this mapping to filename (possibly compressed), writing
    one "index<TAB>name" line per feature in index order."""
    output = possibly_compressed_file(filename, 'w')
    total = len(self)
    for feature_index in range(total):
        output.write('%d\t%s\n' % (feature_index, self[feature_index]))
    output.close()
# Drives the external allScript training pipeline.  Workflow: normalize
# string args to lists, check all input files and the allScript binary
# exist, clone original_data (a prototype model dir with terms.txt etc.)
# into output_dir, uncompress/concatenate any compressed train/dev files
# into /ltmp tempfiles, shell out to allScript via commands.getstatusoutput,
# append its output to output_dir/traininglog, delete tempfiles unless
# keep_tempfiles, and raise TrainingError on a nonzero exit or on known
# failure strings in the output.  Returns the captured training output.
# NOTE(review): cat_alternative is accepted but never used in the visible
# body — presumably consumed by dmcc's allScript variant; confirm.
# NOTE(review): this definition has been collapsed onto two physical lines
# (indentation lost, one statement split across the line break at
# `modelbase = "%s." / % os.path.basename(...)`); preserved verbatim.
# TODO: restore formatting from version control.
def train(train_data, dev_data, output_dir, mode='parser', train_bin_dir=CURRENT_TRAIN_BIN, original_data=None, verbose=True, cat_alternative=None, keep_tempfiles=False): """Create a language model / parsing model in output_dir from train_data and dev_data. train_bin_dir is the directory containing allScript and the training binaries. We use original_data as our prototype for the model directory, and while most of its contents are unimportant, some files like terms.txt are relevant. To use cat_alternative, you'll need dmcc's version of allScript. This lets you specify zcat, bzcat, smartcat, etc. for reading in files.""" if isinstance(train_data, basestring): train_data = [train_data] if isinstance(dev_data, basestring): dev_data = [dev_data] assert mode in ('parser', 'lm') for train_or_dev_filename in train_data + dev_data: assert os.path.exists(train_or_dev_filename), \ "File %s doesn't exist." % train_or_dev_filename allScript = os.path.join(train_bin_dir, 'allScript') assert os.path.exists(allScript) if original_data is None: if mode == 'parser': original_data = DEFAULT_DATA else: original_data = DEFAULT_LM # output_dir = validate_and_cleanup_datadir_path(output_dir) import shutil, commands from iterextras import any from waterworks.Files import possibly_compressed_file # erase the output directory if it exists and remake it from our # original_data directory (which should be a clean training of WSJ or # switchboard -- it must have the right terms.txt, etc.) if output_dir != original_data: print "Removing", output_dir shutil.rmtree(output_dir, ignore_errors=True) print "Copying", original_data, "to", output_dir shutil.copytree(original_data, output_dir) def compressed_filename(filename): filename = filename.lower() return filename.endswith('.gz') or filename.endswith('.bz2') temp_files = [] modelbase = "%s." 
% os.path.basename(output_dir) # if there are any compressed files in training, we combined all # training into one uncompressed file if any(train_data, compressed_filename): temp_train = keepable_tempfile(mode='w', prefix=modelbase, suffix='.train', keep=True, dir='/ltmp') print "Uncompressing and combining training data to", temp_train.name for filename in train_data: f = possibly_compressed_file(filename) for line in f: temp_train.write(line) temp_train.close() train_data = [temp_train.name] temp_files.append(temp_train) # same for dev files if any(dev_data, compressed_filename): temp_dev = keepable_tempfile(mode='w', prefix=modelbase, suffix='.dev', keep=True, dir='/ltmp') print "Uncompressing and combining dev data to", temp_dev.name for filename in dev_data: f = possibly_compressed_file(filename) for line in f: temp_dev.write(line) temp_dev.close() dev_data = [temp_dev.name] temp_files.append(temp_dev) # the repr()s will put quotes around lists of arguments cmd = ' '.join([allScript, '-' + mode, output_dir, repr(' '.join(train_data)), repr(' '.join(dev_data))]) if verbose: print "Training command:", repr(cmd) status, output = commands.getstatusoutput(cmd) if verbose: print "Output:" print "-------" print output print "-------" # store training output f = file(os.path.join(output_dir, 'traininglog'), 'a') f.write(output) f.close() if not keep_tempfiles: print "Removing temporary training files..." for fileobj in temp_files: os.remove(fileobj.name) if status != 0: raise TrainingError("Training script exited with nonzero exit code.") warning_messages = ('Exit code: 134', 'Exit code: 137', 'segfault', 'abort', 'Could not find', "Assertion `pstStream' failed.") for message in warning_messages: if message.lower() in output.lower(): raise TrainingError("Found a warning message in training " + \ "output: %r" % message) print "Done" return output