def m_step(self, all_word_paths):
    """M-step: convert the E-step's scored alignment paths into
    normalized per-source-element log-probabilities.

    all_word_paths is a list (one entry per word) of lists of
    (elements, log_score) pairs, where elements is a sequence of
    (a_element, b_element) aligned pairs.  All arithmetic is in log
    space.  The raw pseudocount table is appended to
    self.pseudocounts; the normalized table is returned as
    {a_element: {b_element: log_prob}}.
    """
    pseudocounts = {}
    for paths in all_word_paths:
        if not len(paths):
            continue
        # Per-word normalizer: log-sum of the finite path scores, so each
        # path contributes its share of this word's total probability mass.
        word_total = logSum([p[1] for p in paths if not isinf(p[1])])
        for elements, score in paths:
            scaled = score - word_total  # log of this path's fractional weight
            for src, tgt in elements:
                # Source elements absent from the current score table are
                # collapsed into a single None bucket.
                if src not in self.alignment_scores[-1]:
                    src = None
                row = pseudocounts.setdefault(src, {})
                prev = row.get(tgt)
                # An infinite (zero-probability) entry is overwritten rather
                # than log-added, matching the original accumulation rule.
                if prev is None or isinf(prev):
                    row[tgt] = scaled
                else:
                    row[tgt] = logAdd(scaled, prev)
    self.pseudocounts.append(pseudocounts)
    # Renormalize each source row so its log-counts sum to log(1).
    pseudoprobs = {}
    for src in pseudocounts:
        pseudoprobs[src] = {}
        row = pseudocounts[src]
        row_total = logSum([v for v in row.values() if not isinf(v)])
        for tgt in row:
            if isinf(row_total):
                # No finite mass for this source: every entry gets log(0).
                pseudoprobs[src][tgt] = log(0)
            else:
                pseudoprobs[src][tgt] = row[tgt] - row_total
    return pseudoprobs
def e_step(self, alignment_scores):
    """E-step: Viterbi-align every (a, b) pair under the current scores.

    Returns a list (one entry per alignable pair) of lists of
    (elements, log_score) tuples; also appends the total data
    log-likelihood to self.likelihood.  Pairs that yield no alignment
    paths are skipped entirely and contribute nothing to the likelihood.
    """
    all_word_paths = []
    likelihood = float('-inf')  # log(0): no probability mass accumulated yet
    # Log progress roughly 100 times per pass.  max(1, ...) guards the
    # modulus below against ZeroDivisionError when there are fewer than
    # 100 pairs; // keeps integer semantics on both Python 2 and 3.
    positions_to_log = max(1, len(self.ab_pairs) // 100)
    for ab_idx, ab_pair in enumerate(self.ab_pairs):
        if self.verbose and not ab_idx % positions_to_log:
            self.logfile = open(self.logfilename, 'a')
            self.logfile.write('\t'.join([str(s) for s in
                (self.iteration_number, time(), ab_idx, ab_pair)]) + '\n')
            self.logfile.close()
        a, b = ab_pair
        v = ViterbiAligner(a, b, alignment_scores)
        word_paths = []
        aligned_paths = v.get_all_paths()
        if not len(aligned_paths):
            # Unalignable pair: skip (no zero-probability term is added).
            continue
        for path in aligned_paths:
            # Keep only (elements, score); holding the full path objects
            # makes memory use skyrocket.
            elements, score = path.get_elements(), path.get_score()
            word_paths.append((elements, score))
            if isinf(likelihood):
                likelihood = score
            else:
                likelihood = logAdd(likelihood, score)
        all_word_paths.append(word_paths)
        del v
    self.likelihood.append(likelihood)
    return all_word_paths
# NOTE(review): this is a verbatim duplicate of the e_step defined earlier in
# the class; the later definition shadows the earlier one.  Consider deleting
# one of the two.
def e_step(self, alignment_scores):
    """E-step: Viterbi-align every (a, b) pair under the current scores.

    Returns a list (one entry per alignable pair) of lists of
    (elements, log_score) tuples; also appends the total data
    log-likelihood to self.likelihood.  Pairs that yield no alignment
    paths are skipped entirely and contribute nothing to the likelihood.
    """
    all_word_paths = []
    likelihood = float('-inf')  # log(0): no probability mass accumulated yet
    # Log progress roughly 100 times per pass.  max(1, ...) guards the
    # modulus below against ZeroDivisionError when there are fewer than
    # 100 pairs; // keeps integer semantics on both Python 2 and 3.
    positions_to_log = max(1, len(self.ab_pairs) // 100)
    for ab_idx, ab_pair in enumerate(self.ab_pairs):
        if self.verbose and not ab_idx % positions_to_log:
            self.logfile = open(self.logfilename, 'a')
            self.logfile.write('\t'.join([str(s) for s in
                (self.iteration_number, time(), ab_idx, ab_pair)]) + '\n')
            self.logfile.close()
        a, b = ab_pair
        v = ViterbiAligner(a, b, alignment_scores)
        word_paths = []
        aligned_paths = v.get_all_paths()
        if not len(aligned_paths):
            # Unalignable pair: skip (no zero-probability term is added).
            continue
        for path in aligned_paths:
            # Keep only (elements, score); holding the full path objects
            # makes memory use skyrocket.
            elements, score = path.get_elements(), path.get_score()
            word_paths.append((elements, score))
            if isinf(likelihood):
                likelihood = score
            else:
                likelihood = logAdd(likelihood, score)
        all_word_paths.append(word_paths)
        del v
    self.likelihood.append(likelihood)
    return all_word_paths