def em_step(self, iteration):
    """Run one EM iteration over the aligned parallel corpus.

    Reads the f/e/a files, extracts a hypergraph per sentence pair,
    accumulates expected rule counts and the corpus log-likelihood,
    writes the Viterbi tree for each sentence into a per-iteration
    output directory, then renormalizes the rule probability table.

    :param iteration: zero-based iteration index; output goes to
        ``<outputdir>/iter_NNN`` where NNN is ``iteration + 1``.
    """
    ffile = open(self.ffilename)
    efile = open(self.efilename)
    afile = open(self.afilename)
    try:
        alignments = Alignment.reader_pharaoh(ffile, efile, afile)
        dirname = os.path.join(self.outputdir,
                               'iter_%s' % str(iteration + 1).rjust(3, '0'))
        os.mkdir(dirname)
        if logger.level >= 1:
            logger.writeln('\niteration %s' % (iteration + 1))
        likelihood = 0
        starttime = time.time()
        for i, alignment in enumerate(alignments, 1):
            if i % FLAGS.emtrain_log_interval == 0:
                # Rate over the *last* interval: starttime is reset below,
                # so divide by the interval length, not the cumulative i.
                logger.writeln('%s sentences at %s secs/sent' %
                               (i,
                                (time.time() - starttime) /
                                FLAGS.emtrain_log_interval))
                starttime = time.time()
            extractor = Extractor(
                maxabslen=100000,
                maxlen=10000,
                minhole=1,
                maxvars=100000,
                lexical_weighter=self.lexical_weighter,
                forbid_adjacent=self.forbid_adjacent,
                maximize_derivation=self.maximize_derivation,
                require_aligned_terminal=self.require_aligned_terminal)
            hg = extractor.extract_hypergraph(alignment)
            if hg is None:
                # No hypergraph could be extracted for this pair; skip it.
                continue
            # Accumulate expected counts and corpus likelihood.
            self.compute_expected_counts(hg)
            likelihood += hg.root.inside
            treefilename = os.path.join(dirname,
                                        'tree_%s' % str(i).rjust(8, '0'))
            self.write_viterbi_tree(hg, treefilename)
    finally:
        # Close corpus files even if extraction raises (original leaked them).
        ffile.close()
        efile.close()
        afile.close()
    if logger.level >= 1:
        logger.writeln('likelihood: %s' % likelihood)
        logger.writeln('normalizing...')
    # Variational-Bayes Dirichlet-prior normalization of the count table.
    self.counter.normalize_vbdp(self.alpha, self.threshold)
    if logger.level >= 1:
        logger.writeln('prob table size: %s' % len(self.counter.prob))
def em_step(self, iteration):
    """Run one EM iteration over the aligned parallel corpus.

    Reads the f/e/a files, extracts a hypergraph per sentence pair
    (reporting progress via a PercentCounter), accumulates expected
    rule counts and the corpus log-likelihood, writes each Viterbi
    tree into a per-iteration output directory, then renormalizes
    the rule probability table.

    :param iteration: zero-based iteration index; output goes to
        ``<outputdir>/iter_NNN`` where NNN is ``iteration + 1``.
    """
    ffile = open(self.ffilename)
    efile = open(self.efilename)
    afile = open(self.afilename)
    try:
        alignments = Alignment.reader_pharaoh(ffile, efile, afile)
        percent_counter = PercentCounter(total=self.corpus_size)
        dirname = os.path.join(self.outputdir,
                               'iter_%s' % str(iteration + 1).rjust(3, '0'))
        os.mkdir(dirname)
        if logger.level >= 1:
            logger.writeln('\niteration %s' % (iteration + 1))
        likelihood = 0
        for i, alignment in enumerate(alignments):
            percent_counter.print_percent(i)
            extractor = Extractor(
                lexical_weighter=self.lexical_weighter,
                maximize_derivation=self.maximize_derivation)
            hg = extractor.extract_hypergraph(alignment)
            if hg is None:
                # No hypergraph could be extracted for this pair; skip it.
                continue
            # Accumulate expected counts and corpus likelihood.
            self.compute_expected_counts(hg)
            likelihood += hg.root.inside
            treefilename = os.path.join(dirname,
                                        'tree_%s' % str(i + 1).rjust(8, '0'))
            self.write_viterbi_tree(hg, treefilename)
    finally:
        # Close corpus files even if extraction raises (original leaked them).
        ffile.close()
        efile.close()
        afile.close()
    if logger.level >= 1:
        logger.writeln('likelihood: %s' % likelihood)
        logger.writeln('normalizing...')
    # Variational-Bayes Dirichlet-prior normalization of the count table.
    self.counter.normalize_vbdp(self.alpha, self.threshold)
    if logger.level >= 1:
        logger.writeln('prob table size: %s' % len(self.counter.prob))
#!/usr/bin/env python3
"""Show the extraction hypergraph for one sentence pair.

Usage: script.py <f-file> <e-file> <a-file> <n>

Reads the pharaoh-format parallel corpus and displays the hypergraph
extracted (with maximize_derivation=True) for the n-th (zero-based)
sentence pair.
"""
import sys

from extractor import Extractor
from alignment import Alignment
import hypergraph
import logger

if __name__ == '__main__':
    ffilename = sys.argv[1]
    efilename = sys.argv[2]
    afilename = sys.argv[3]
    n = int(sys.argv[4])
    # Context managers close the corpus files (original leaked them).
    with open(ffilename) as ffile, \
         open(efilename) as efile, \
         open(afilename) as afile:
        alignments = Alignment.reader_pharaoh(ffile, efile, afile)
        for i, alignment in enumerate(alignments):
            if i == n:
                extractor = Extractor(maximize_derivation=True)
                hg = extractor.extract_hypergraph(alignment)
                hg.show()
                # Stop reading once the target pair has been shown;
                # the original kept iterating the whole corpus.
                break