def eval(self, gold_file, output_file):
    """
    Run the external ``eval`` command and echo the value part of its output.

    The command line is ``self.cmd_start`` followed by
    ``eval gold-file:<gold_file> output-file:<output_file>`` and is executed
    through the shell.

    :param gold_file: Value substituted after ``gold-file:`` in the command.
    :param output_file: Value substituted after ``output-file:`` in the command.
    :return: The value returned by the process's ``wait()`` (its exit code).
    """

    def split_out(string):
        # Only the second colon-separated field of each output line is
        # echoed (e.g. the "value" of a "label: value" line).
        fields = string.split(':')
        # Blank lines are ignored, and so are lines without any colon: they
        # have no second field, and indexing it would raise IndexError from
        # inside the output callback.
        if string.strip() and len(fields) > 1:
            print(fields[1].strip())

    cmd = self.cmd_start + 'eval gold-file:{} output-file:{}'.format(gold_file, output_file)
    p = ProcessCommunicator(cmd, shell=True,
                            stdout_func=split_out,
                            stderr_func=watch_for_java_exception)
    exit_code = p.wait()
    return exit_code
def train(self, prefix, e, f):
    """
    Train the giza word alignments on the provided text files.

    :param prefix: Prefix for where the giza output files will be stored.
    :type prefix: path+prefix
    :param e: Path to the "e" file
    :type e: path
    :param f: Path to the "f" file
    :type f: path
    :return: The result of ``self.tf.aligned_sents()`` once the alignment
             output has been merged.
    :raises GizaAlignmentException: If the mgiza binary path is not
             configured or does not exist, or if mgiza exits with a
             non-zero return code.
    """
    GIZA_LOG.info("Starting mgiza training from scratch...")
    self.tf = GizaFiles(prefix, e, f)

    GIZA_LOG.info("Converting txt files to SNTS and VCB files...")
    self.tf.txt_to_snt(ev=Vocab(), fv=Vocab())

    # Now, do the aligning...
    exe = c.getpath('mgiza')
    if exe is None:
        raise GizaAlignmentException('Path to mgiza binary not defined.')
    elif not os.path.exists(exe):
        # Interpolate the offending path so the message does not contain a
        # literal "%s".
        raise GizaAlignmentException('Path to mgiza binary "%s" invalid.' % exe)

    elts = [exe,
            '-o', os.path.join(self.tf.prefix, self.tf.name),
            '-S', self.tf.e_vcb,
            '-T', self.tf.f_vcb,
            '-C', self.tf.ef_snt,
            '-CoocurrenceFile', self.tf.ef_cooc,
            '-hmmiterations', '5',
            '-model4iterations', '0',
            '-ncpus', '0']

    # The joined string is only used for logging; the process itself is
    # started from the argument list.
    cmd = ' '.join(elts)
    GIZA_LOG.debug('Command: "{}"'.format(cmd))

    p = ProcessCommunicator(elts)
    status = p.wait()
    GIZA_LOG.debug("Exit code: {}".format(str(status)))

    if status != 0:
        raise GizaAlignmentException(
            "mgiza exited abnormally with a return code of {}".format(str(status)))

    self.tf.merge_a3()
    # self.tf.clean()
    return self.tf.aligned_sents()
def svmlight_to_vectors(txt):
    """
    Convert a text file to vectors.

    Runs ``<mallet_bin> import-svmlight`` through the shell, writing the
    result to a freshly created temporary file.

    :param txt: Path to the text file.
    :return: Path of the temporary vector file. It is created with
             ``delete=False``, so the caller is responsible for removing it.
    :raises ClassifierException: If the conversion command exits with a
             non-zero status.
    """
    MAXENT_LOG.info("Attempting to convert {} to a vector file.".format(txt))

    # Only the path is needed here: create the file, then close it straight
    # away so the external command can write to it.
    ntf = NamedTemporaryFile(mode='w', delete=False)
    ntf.close()

    converted = False
    try:
        p = ProcessCommunicator('{} import-svmlight --input "{}" --output "{}"'.format(mallet_bin, txt, ntf.name),
                                stdout_func=MAXENT_LOG.info,
                                stderr_func=MAXENT_LOG.warn,
                                shell=True)
        converted = p.wait() == 0
    finally:
        # The temp file is not auto-deleted; remove it whenever the
        # conversion failed or raised so it is not left behind.
        if not converted:
            try:
                os.unlink(ntf.name)
            except OSError:
                # Best-effort cleanup: the original failure is what matters.
                pass

    if not converted:
        raise ClassifierException("SVMLight Conversion did not complete successfully.")

    MAXENT_LOG.debug("Successfully created temporary vector file {}".format(ntf.name))
    return ntf.name
def train_txt(txt_path, model_path):
    """
    Train a classifier from a svm-light format text file.

    The text file is first converted with ``svmlight_to_vectors``; the
    resulting intermediate vector file is removed again whether or not
    training succeeds.

    :param txt_path: Path to the svm-light format text file.
    :param model_path: Path passed to the trainer as ``--output-classifier``
                       and used to build the returned ``MalletMaxent``.
    :return: ``MalletMaxent(model_path)``.
    :raises ClassifierException: If the trainer exits with a non-zero status.
    """
    vectors = svmlight_to_vectors(txt_path)

    try:
        MAXENT_LOG.info("Attempting to train classifier {}".format(model_path))
        p = ProcessCommunicator([mallet_bin, 'train-classifier',
                                 '--input', vectors,
                                 '--trainer', 'MaxEntTrainer',
                                 '--output-classifier', model_path],
                                stdout_func=MAXENT_LOG.info,
                                stderr_func=MAXENT_LOG.info)
        exit_code = p.wait()
    finally:
        # The vectors file is only an intermediate artifact; delete it on
        # failure as well as on success so it is not leaked.
        os.unlink(vectors)

    if exit_code != 0:
        raise ClassifierException("Training the classifier did not complete. Check the logs.")

    MAXENT_LOG.debug("Success.")
    return MalletMaxent(model_path)
def resume(self, prefix, new_e, new_f):
    """
    "Force" align a new set of data using the old model, per the
    instructions at:

    http://www.kyloo.net/software/doku.php/mgiza:forcealignment

    :param prefix: Passed to ``GizaFiles`` for the new run's files.
    :param new_e: The new "e" text, passed to ``GizaFiles``.
    :param new_f: The new "f" text, passed to ``GizaFiles``.
    :return: The result of ``aligned_sents()`` on the new ``GizaFiles``.
    :raises GizaAlignmentException: If the mgiza binary path is not
             configured or does not exist, or if mgiza exits with a
             non-zero return code.
    """
    # Container for every file produced by this run.
    target = GizaFiles(prefix, new_e, new_f)

    # Start from the vocabularies of the previous model and grow them with
    # the words of the text that is about to be aligned.
    e_vocab = Vocab.load(self.tf.e_vcb)
    f_vocab = Vocab.load(self.tf.f_vcb)
    e_vocab.add_from_txt(target.e)
    f_vocab.add_from_txt(target.f)

    # Persist the extended vocabularies under the new run's file paths.
    e_vocab.dump(target.e_vcb)
    f_vocab.dump(target.f_vcb)

    # Write out the sentence files using the extended vocabularies.
    target.txt_to_snt(ev=e_vocab, fv=f_vocab)

    mgiza = c.getpath('mgiza')
    if mgiza is None:
        raise GizaAlignmentException('Path to mgiza binary not defined.')
    if not os.path.exists(mgiza):
        raise GizaAlignmentException('Path to mgiza binary "%s" invalid.' % mgiza)

    # Output location and iteration settings for the restarted run.
    argv = [mgiza,
            '-restart', '2',
            '-o', os.path.join(target.prefix, target.name),
            '-m2', '5']
    # Tables taken from the previously trained model.
    argv += ['-previoust', self.tf.t,
             '-previousa', self.tf.a,
             '-previousn', self.tf.n,
             '-previousd', self.tf.d3]
    # Inputs belonging to the new data.
    argv += ['-c', target.ef_snt,
             '-s', target.e_vcb,
             '-t', target.f_vcb,
             '-Coocurrencefile', target.ef_cooc]

    GIZA_LOG.debug('Command: "{}"'.format(' '.join(argv)))

    status = ProcessCommunicator(argv).wait()
    GIZA_LOG.debug("Exit status {}".format(str(status)))

    if status != 0:
        raise GizaAlignmentException(
            "mgiza exited abnormally with a return code of {}".format(str(status)))

    target.merge_a3()
    return target.aligned_sents()
def test(self, model_file, test_file, output_file):
    """
    Run the external ``test`` command.

    The command line is ``self.cmd_start`` followed by the ``test``
    sub-command and its ``model-name:``, ``test-file:`` and ``output-file:``
    arguments; it is executed through the shell. Standard output gets no
    handler, while standard error is passed to ``watch_for_java_exception``.

    :param model_file: Value substituted after ``model-name:``.
    :param test_file: Value substituted after ``test-file:``.
    :param output_file: Value substituted after ``output-file:``.
    :return: The value returned by the process's ``wait()`` (its exit code).
    """
    template = 'test model-name:{} test-file:{} output-file:{}'
    command = self.cmd_start + template.format(model_file, test_file, output_file)

    process = ProcessCommunicator(
        command,
        shell=True,
        stdout_func=None,
        stderr_func=watch_for_java_exception,
    )
    return process.wait()
def train(self, train_file, model_file):
    """
    Run the external ``train`` command.

    The command line is ``self.cmd_start`` followed by the ``train``
    sub-command and its ``train-file:`` and ``model-name:`` arguments; it is
    executed through the shell. Both standard output and standard error are
    handed to ``print``.

    :param train_file: Value substituted after ``train-file:``.
    :param model_file: Value substituted after ``model-name:``.
    :return: The value returned by the process's ``wait()`` (its exit code).
    """
    template = 'train train-file:{} model-name:{}'
    command = self.cmd_start + template.format(train_file, model_file)

    process = ProcessCommunicator(
        command,
        shell=True,
        stdout_func=print,
        stderr_func=print,
    )
    return process.wait()