def alternative_update_subst(unigram_tbl, bigram_tbl, link_tbl, subst_tbl,
                             xe_gap, max_iter, prev_xe=np.inf):
    """Run EM iterations that re-estimate only the substitution table.

    The GMM link table is held fixed; each iteration collects expected
    substitution counts and updates ``subst_tbl`` until convergence.

    :param unigram_tbl: unigram table of the language model
    :param bigram_tbl: bigram table of the language model
    :param link_tbl: fixed (k, n) link table; only its width n is used for
        the cross entropy, and its normalization is computed once up front
    :param subst_tbl: substitution table to be re-estimated
    :param xe_gap: relative cross-entropy gap used as the stop criterion
    :param max_iter: maximum number of EM iterations
    :param prev_xe: cross entropy seeding the first gap computation
    :return: (subst_tbl, x_entropy, prb_cf, normalization_factor)
    """
    c = 1
    # Only the sequence length n is needed; the row count is unused.
    _, n = link_tbl.shape
    # link_tbl is constant in this loop, so its normalization is loop-invariant.
    normalization_factor = np.sum(logsumexp(link_tbl, axis=0))
    while True:
        # Only the substitution counts are used; the general count table
        # returned by em_iter_count is discarded.
        _, cnt_subst_tbl, prb_cf = em_iter_count(unigram_tbl, bigram_tbl,
                                                 link_tbl, subst_tbl,
                                                 only_subst_tbl=True)
        subst_tbl = em_iter_update_subst(cnt_subst_tbl)
        x_entropy = cross_entropy([prb_cf], [n])
        eprint('iter-subst {} cross entropy is {}, gap {},'
               ' logP(c) {}'.format(c, x_entropy,
                                    abs(1.0 - x_entropy / prev_xe), prb_cf))
        if time_to_stop(c, max_iter, prev_xe, x_entropy, xe_gap):
            break
        elif np.isnan(x_entropy):
            eprint('program end in iter {} caused by nan'.format(c))
            break
        else:
            prev_xe = x_entropy
            c += 1
    return subst_tbl, x_entropy, prb_cf, normalization_factor
def alternative_update_gmm(line, unigram_tbl, bigram_tbl, link_tbl, subst_tbl,
                           xe_gap, max_iter, prev_xe=np.inf):
    """Run EM iterations that re-estimate only the GMM link table.

    The substitution table is held fixed; each iteration collects expected
    counts, refits the GMM on ``line``, and repeats until convergence.

    :param line: sequence of feature vectors for one line
    :param unigram_tbl: unigram table of the language model
    :param bigram_tbl: bigram table of the language model
    :param link_tbl: current link table, replaced each iteration
    :param subst_tbl: fixed substitution table
    :param xe_gap: relative cross-entropy gap used as the stop criterion
    :param max_iter: maximum number of EM iterations
    :param prev_xe: cross entropy seeding the first gap computation
    :return: (link_tbl, gmm, weights, x_entropy, prb_cf)
    """
    iteration = 1
    while True:
        # The substitution counts are not needed in this phase.
        counts, _, prb_cf = em_iter_count(unigram_tbl, bigram_tbl,
                                          link_tbl, subst_tbl)
        link_tbl, gmm, weights = em_iter_update(counts, line)
        ll_gmm = gmm_log_likelihood(link_tbl, weights)
        x_entropy = cross_entropy([prb_cf], [len(line)])
        gap = abs(1.0 - x_entropy / prev_xe)
        eprint('iter-GMM {} cross entropy is {}, gap {},'
               ' logP(c) {}, logP_GMM(c) {}'.format(
                   iteration, x_entropy, gap, prb_cf, ll_gmm))
        if time_to_stop(iteration, max_iter, prev_xe, x_entropy, xe_gap):
            break
        if np.isnan(x_entropy):
            eprint('program end in iter {} caused by nan'.format(iteration))
            break
        prev_xe = x_entropy
        iteration += 1
    return link_tbl, gmm, weights, x_entropy, prb_cf
def em_decipher_alternative(line, unigram_tbl, bigram_tbl, link_tbl, subst_tbl,
                            xe_gap=1e-8, max_iter=300):
    """Alternate EM training between the GMM and the substitution table.

    Each outer round first trains the GMM (link table) to convergence with
    the substitution table fixed, then trains the substitution table with
    the link table fixed, until either phase's stop criterion triggers.

    :param line: sequence of feature vectors for one line
    :param unigram_tbl: unigram table of the language model
    :param bigram_tbl: bigram table of the language model
    :param link_tbl: initial link table
    :param subst_tbl: initial substitution table
    :param xe_gap: relative cross-entropy gap for the outer GMM stop test
    :param max_iter: maximum iterations for both the inner phases and the
        outer alternation loop
    :return: (link_tbl, subst_tbl, gmm, weights, x_entropy, prb_cf_gmm)
    """
    prev_xe_gmm = np.inf
    prev_xe_subst = np.inf
    t = 1
    while True:
        eprint('start training GMM')
        # NOTE(review): the inner GMM phase uses a literal 1e-8 gap rather
        # than this function's xe_gap parameter, and its prev_xe is reset to
        # np.inf every round — confirm both are intentional.
        link_tbl, gmm, weights, x_entropy, prb_cf_gmm = alternative_update_gmm(
            line, unigram_tbl, bigram_tbl, link_tbl, subst_tbl, 1e-8,
            max_iter, prev_xe=np.inf)
        if time_to_stop(t, max_iter, prev_xe_gmm, x_entropy, xe_gap):
            break
        else:
            prev_xe_gmm = x_entropy
        eprint('start training subst')
        # The subst phase uses a looser gap (1e-5) than the GMM phase, both
        # for the inner loop and for the outer stop test below.
        subst_tbl, x_entropy, prb_cf_subst, normalization_factor = \
            alternative_update_subst(
                unigram_tbl, bigram_tbl, link_tbl, subst_tbl, 1e-5,
                max_iter, prev_xe=prev_xe_subst)
        # keep the real prb_cf for later use
        prb_cf_subst += normalization_factor
        if time_to_stop(t, max_iter, prev_xe_subst, x_entropy, 1e-5):
            break
        else:
            prev_xe_subst = x_entropy
        t += 1
    return link_tbl, subst_tbl, gmm, weights, x_entropy, prb_cf_gmm
def em_decipher(line, unigram_tbl, bigram_tbl, link_tbl, subst_tbl, xe_gap=1e-8, max_iter=0): """ EM on a line of features. EM iterations stop if matches one of the following conditions: 1) reach the max_iter 2) current cross entropy / last cross entropy >= xe_gap :return: final link_tbl, gmm model, cross entropy, and log likelihood """ # prepare hyper-parameters prev_xe = np.inf # start training c = 1 while True: cnt_tbl, cnt_subst_tbl, prb_cf = em_iter_count(unigram_tbl, bigram_tbl, link_tbl, subst_tbl) _, gmm, weights = em_iter_update(cnt_tbl, line) # subst_tbl = em_iter_update_subst(cnt_subst_tbl) ll_gmm = gmm_log_likelihood(link_tbl, weights) x_entropy = cross_entropy([prb_cf], [len(line)]) eprint('iter {} cross entropy is {}, gap {},' ' logP(c) {}, logP_GMM(c) {}'.format( c, x_entropy, abs(1.0 - x_entropy / prev_xe), prb_cf, ll_gmm)) if time_to_stop(c, max_iter, prev_xe, x_entropy, xe_gap): break elif np.isnan(x_entropy): eprint('program end in iter {} caused by nan'.format(c)) break else: prev_xe = x_entropy c += 1 return link_tbl, subst_tbl, gmm, weights, x_entropy, prb_cf
def em_gmm(line, link_tbl, weights, xe_gap=1e-8, max_iter=300,
           cov_type='fix', scaling_factor=0.1):
    """
    EM on a line of features, re-estimating only the GMM. EM iterations
    stop if matches one of the following conditions:
    1) reach the max_iter
    2) current cross entropy / last cross entropy >= xe_gap

    :param line: sequence of feature vectors for one line
    :param link_tbl: current link table, replaced each iteration
    :param weights: current GMM component weights
    :param xe_gap: relative cross-entropy gap used as the stop criterion
    :param max_iter: maximum number of EM iterations
    :param cov_type: covariance type forwarded to em_iter_update
    :param scaling_factor: covariance scaling forwarded to em_iter_update
    :return: final link_tbl, gmm model, weights, and cross entropy
    """
    # prepare hyper-parameters
    prev_xe = np.inf
    # start training
    c = 1
    while True:
        cnt_tbl, prb_cf = em_iter_gmm_count(link_tbl, weights)
        link_tbl, gmm, weights = em_iter_update(cnt_tbl, line,
                                                cov_type=cov_type,
                                                scaling_factor=scaling_factor)
        x_entropy = cross_entropy([prb_cf], [len(line)])
        # Report the relative gap |1 - xe/prev_xe|, matching the other EM
        # drivers in this file (previously the raw ratio was printed).
        eprint('iter {} cross entropy is {}, gap {}, logP(c) {}'.format(
            c, x_entropy, abs(1.0 - x_entropy / prev_xe), prb_cf))
        if time_to_stop(c, max_iter, prev_xe, x_entropy, xe_gap):
            break
        elif np.isnan(x_entropy):
            eprint('program end in iter {} caused by nan'.format(c))
            break
        else:
            prev_xe = x_entropy
            c += 1
    return link_tbl, gmm, weights, x_entropy