def eval(self, gold_file, output_file):
    """
    Run the evaluator against a gold-standard file and a system output file.

    :param gold_file: Path to the gold-standard file.
    :param output_file: Path to the system output file.
    :return: Exit code of the evaluation subprocess.
    """
    def split_out(line):
        # For non-blank lines, echo only the portion after the first colon.
        if line.strip():
            print(line.split(':')[1].strip())

    cmd = self.cmd_start + 'eval gold-file:{} output-file:{}'.format(gold_file, output_file)
    proc = ProcessCommunicator(cmd,
                               shell=True,
                               stdout_func=split_out,
                               stderr_func=watch_for_java_exception)
    return proc.wait()
def train(self, prefix, e, f):
    """
    Train the giza word alignments on the provided text files.

    :param prefix: Prefix for where the giza output files will be stored.
    :type prefix: path+prefix
    :param e: Path to the "e" file
    :type e: path
    :param f: Path to the "f" file
    :type f: path
    :return: Aligned sentences from the freshly trained model.
    :raises GizaAlignmentException: If the mgiza binary is not configured,
        does not exist, or exits with a non-zero status.
    """
    GIZA_LOG.info("Starting mgiza training from scratch...")
    self.tf = GizaFiles(prefix, e, f)

    GIZA_LOG.info("Converting txt files to SNTS and VCB files...")
    self.tf.txt_to_snt(ev=Vocab(), fv=Vocab())

    # Now, do the aligning...
    exe = c.getpath('mgiza')
    if exe is None:
        raise GizaAlignmentException('Path to mgiza binary not defined.')
    elif not os.path.exists(exe):
        # BUG FIX: the "%s" placeholder was previously raised unformatted;
        # fill it in with the offending path (matching resume()).
        raise GizaAlignmentException('Path to mgiza binary "%s" invalid.' % exe)

    elts = [exe,
            '-o', os.path.join(self.tf.prefix, self.tf.name),
            '-S', self.tf.e_vcb,
            '-T', self.tf.f_vcb,
            '-C', self.tf.ef_snt,
            '-CoocurrenceFile', self.tf.ef_cooc,
            '-hmmiterations', '5',
            '-model4iterations', '0',
            '-ncpus', '0']

    GIZA_LOG.debug('Command: "{}"'.format(' '.join(elts)))

    p = ProcessCommunicator(elts)
    status = p.wait()
    GIZA_LOG.debug("Exit code: {}".format(str(status)))

    if status != 0:
        raise GizaAlignmentException("mgiza exited abnormally with a return code of {}".format(str(status)))

    self.tf.merge_a3()
    # self.tf.clean()
    return self.tf.aligned_sents()
def svmlight_to_vectors(txt):
    """
    Convert an svm-light format text file to a mallet vector file.

    :param txt: Path to the text file.
    :return: Path to the temporary vector file. The caller is responsible
        for deleting it when done (see train_txt).
    :raises ClassifierException: If the mallet conversion exits non-zero.
    """
    MAXENT_LOG.info("Attempting to convert {} to a vector file.".format(txt))

    # Create (and immediately close) a named temp file so mallet can write
    # to its path; delete=False keeps it around after close().
    ntf = NamedTemporaryFile(mode='w', delete=False)
    ntf.close()

    p = ProcessCommunicator('{} import-svmlight --input "{}" --output "{}"'.format(mallet_bin, txt, ntf.name),
                            stdout_func=MAXENT_LOG.info,
                            stderr_func=MAXENT_LOG.warning,  # .warn is a deprecated alias
                            shell=True)

    if p.wait() == 0:
        MAXENT_LOG.debug("Successfully created temporary vector file {}".format(ntf.name))
        return ntf.name
    else:
        # BUG FIX: previously the temp file leaked when conversion failed.
        os.unlink(ntf.name)
        raise ClassifierException("SVMLight Conversion did not complete successfully.")
def train_txt(txt_path, model_path):
    """
    Train a classifier from a svm-light format text file.

    :param txt_path: Path to the svm-light training file.
    :param model_path: Path at which to save the trained classifier.
    :return: A MalletMaxent instance wrapping the trained model.
    :raises ClassifierException: If training exits with a non-zero status.
    """
    vectors = svmlight_to_vectors(txt_path)

    MAXENT_LOG.info("Attempting to train classifier {}".format(model_path))
    p = ProcessCommunicator([mallet_bin, 'train-classifier',
                             '--input', vectors,
                             '--trainer', 'MaxEntTrainer',
                             '--output-classifier', model_path],
                            stdout_func=MAXENT_LOG.info,
                            stderr_func=MAXENT_LOG.info)

    try:
        if p.wait() == 0:
            MAXENT_LOG.debug("Success.")
            return MalletMaxent(model_path)
        else:
            raise ClassifierException("Training the classifier did not complete. Check the logs.")
    finally:
        # BUG FIX: clean up the temporary vector file on both success and
        # failure paths; previously it leaked when training failed.
        os.unlink(vectors)
def __init__(self, model):
    """
    Spin up a (non-blocking) Stanford tagger subprocess for the given model.

    :param model: Path to the model file.
    :type model: str
    """
    # -------------------------------------------
    # Sanity-check the configured paths first.
    # -------------------------------------------
    if tagger_dir is None:
        msg = 'Path to the stanford tagger .jar file is not defined.'
        TAG_LOG.critical(msg)
        raise TaggerError(msg)
    elif not os.path.exists(tagger_dir):
        raise TaggerError('Path to the stanford tagger "{}" is not found.'.format(tagger_dir))

    if java_bin is None:
        raise TaggerError("Path to java bin is undefined!")

    # -------------------------------------------
    # Classpath = the main tagger jar plus every jar bundled under lib/.
    # NOTE(review): ':' as classpath separator is Unix-only — confirm
    # Windows is out of scope.
    support_jars = glob.glob(os.path.join(tagger_dir, 'lib/*.jar'))
    classpath = ':'.join([tagger_jar] + support_jars)

    self.results_queue = []
    self.st = ProcessCommunicator(
        [java_bin, '-cp', classpath,
         'edu.stanford.nlp.tagger.maxent.MaxentTagger',
         '-model', model,
         '-sentenceDelimiter', 'newline',
         '-tokenize', 'false'],
        stderr_func=stanford_stderr_handler,
        stdout_func=lambda line: stanford_stdout_handler(line, self.results_queue),
        blocking=False)
class StanfordPOSTagger(object):
    """
    Instantiate a java VM to run the stanford tagger.
    """

    def __init__(self, model):
        """
        Launch the tagger subprocess (non-blocking) for the given model.

        :param model: Path to the model file.
        :type model: str
        """
        # -------------------------------------------
        # Do some error checking.
        # -------------------------------------------
        if tagger_dir is None:
            TAG_LOG.critical('Path to the stanford tagger .jar file is not defined.')
            raise TaggerError('Path to the stanford tagger .jar file is not defined.')
        if not os.path.exists(tagger_dir):
            raise TaggerError('Path to the stanford tagger "{}" is not found.'.format(tagger_dir))
        if java_bin is None:
            raise TaggerError("Path to java bin is undefined!")

        # Classpath = the tagger jar plus every jar bundled under lib/.
        support_jars = glob.glob(os.path.join(tagger_dir, 'lib/*.jar'))
        classpath = ':'.join([tagger_jar] + support_jars)

        self.results_queue = []
        self.st = ProcessCommunicator(
            [java_bin, '-cp', classpath,
             'edu.stanford.nlp.tagger.maxent.MaxentTagger',
             '-model', model,
             '-sentenceDelimiter', 'newline',
             '-tokenize', 'false'],
            stderr_func=stanford_stderr_handler,
            stdout_func=lambda line: stanford_stdout_handler(line, self.results_queue),
            blocking=False)

    def tag_tokenization(self, tokenization, **kwargs):
        """Tag the text of a tokenization object."""
        return self.tag(tokenization.text(), **kwargs)

    def tag(self, s, **kwargs):
        """
        Tag a single (newline-free) sentence string.

        :rtype: list[POSToken]
        """
        # Lowercase unless the caller explicitly disables it.
        if kwargs.get('lowercase', True):
            s = s.lower()

        self.st.stdin.write(bytes(s + '\r\n', encoding='utf-8'))

        # Try to flush out to stdin
        try:
            self.st.stdin.flush()
        except BrokenPipeError:
            raise CriticalTaggerError('The Stanford parser unexpectedly quit.')

        # Poll until the stdout handler deposits a result on the queue.
        while len(self.results_queue) == 0:
            time.sleep(0.25)

        return self.results_queue.pop()

    def close(self):
        """Terminate the tagger subprocess."""
        self.st.kill()
def resume(self, prefix, new_e, new_f):
    """
    "Force" align a new set of data using the old model, per the
    instructions at:

    http://www.kyloo.net/software/doku.php/mgiza:forcealignment
    """
    # A fresh GizaFiles container for the files we are about to create.
    new_gf = GizaFiles(prefix, new_e, new_f)

    # Extend the previously-trained vocabulary files with the new
    # text to align...
    prev_ev = Vocab.load(self.tf.e_vcb)
    prev_fv = Vocab.load(self.tf.f_vcb)
    prev_ev.add_from_txt(new_gf.e)
    prev_fv.add_from_txt(new_gf.f)

    # ...and dump the now-extended vocabs into the new filepaths.
    prev_ev.dump(new_gf.e_vcb)
    prev_fv.dump(new_gf.f_vcb)

    # Write out the sentence files against the extended vocabs.
    new_gf.txt_to_snt(ev=prev_ev, fv=prev_fv)

    exe = c.getpath('mgiza')
    if exe is None:
        raise GizaAlignmentException('Path to mgiza binary not defined.')
    elif not os.path.exists(exe):
        raise GizaAlignmentException('Path to mgiza binary "%s" invalid.' % exe)

    args = [exe,
            '-restart', '2',
            '-o', os.path.join(new_gf.prefix, new_gf.name),
            '-m2', '5',
            '-previoust', self.tf.t,
            '-previousa', self.tf.a,
            '-previousn', self.tf.n,
            '-previousd', self.tf.d3,
            '-c', new_gf.ef_snt,
            '-s', new_gf.e_vcb,
            '-t', new_gf.f_vcb,
            '-Coocurrencefile', new_gf.ef_cooc]

    GIZA_LOG.debug('Command: "{}"'.format(' '.join(args)))

    p = ProcessCommunicator(args)
    status = p.wait()
    GIZA_LOG.debug("Exit status {}".format(str(status)))

    if status != 0:
        raise GizaAlignmentException("mgiza exited abnormally with a return code of {}".format(str(status)))

    new_gf.merge_a3()
    return new_gf.aligned_sents()
def test(self, model_file, test_file, output_file):
    """
    Run a trained model over a test file, writing results to output_file.

    :param model_file: Path to the trained model.
    :param test_file: Path to the file to evaluate.
    :param output_file: Path where output is written.
    :return: Exit code of the subprocess.
    """
    cmd = self.cmd_start + 'test model-name:{} test-file:{} output-file:{}'.format(model_file, test_file, output_file)
    proc = ProcessCommunicator(cmd,
                               shell=True,
                               stdout_func=None,
                               stderr_func=watch_for_java_exception)
    return proc.wait()
def train(self, train_file, model_file):
    """
    Train a model from train_file, saving the result as model_file.

    :param train_file: Path to the training data.
    :param model_file: Path where the trained model is saved.
    :return: Exit code of the subprocess.
    """
    cmd = self.cmd_start + 'train train-file:{} model-name:{}'.format(train_file, model_file)
    proc = ProcessCommunicator(cmd,
                               shell=True,
                               stdout_func=print,
                               stderr_func=print)
    return proc.wait()