def fit(self):
    # build a one-hot count table from the gold labels
    cnt_tbl = np.zeros((self.k, self.n))
    for i, y in enumerate(self.targets):
        cnt_tbl[y, i] = 1
    eprint('initialize using gold labels')
    self.gmm, self.weights = gmm_update(self.features, cnt_tbl,
                                        cov_type='fix',
                                        scaling_fix_cov=0.1)
    self.link_tbl = gmm_assign(self.gmm, self.features)
    if self.use_em:
        eprint('continue training with LM-GMM')
        self.link_tbl, self.gmm, self.weights, self.xe, self.ll = \
            em_decipher(self.features, self.unigram_tbl,
                        self.bigram_tbl, self.link_tbl)
    if self.unigram_tbl is not None and self.bigram_tbl is not None:
        _, _, prb_cf = em_forward_backward(self.features, self.unigram_tbl,
                                           self.bigram_tbl, self.link_tbl)
        eprint('log likelihood of LM-GMM is {}'.format(prb_cf))
        self.ll = prb_cf
        self.xe = cross_entropy([prb_cf], [self.n])

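# Illustration (hypothetical values) of the gold-label count table built
# in fit() above: with k=3 clusters and targets [0, 2, 1], cnt_tbl is the
# one-hot matrix
#     [[1, 0, 0],
#      [0, 0, 1],
#      [0, 1, 0]]
# i.e. column i puts all its mass on the gold cluster of observation i.
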
def em_restart(line, unigram_tbl, bigram_tbl, weighted_tbl_init_function,
               subst_tbl_init_function, restart=10,
               use_alternative_update=False):
    """
    EM with random restarts.

    :param weighted_tbl_init_function: A function generating a weighted
        table used to initialize the Gaussian distribution parameters.
        Let k = #clusters and n = #observations; the table should have
        shape (k, n). Each cell can be any positive number; the ratios
        between values indicate the importance of each observation for
        composing each cluster.
    :param subst_tbl_init_function: A function generating the initial
        substitution table.
    :return: the best link_tbl, substitution table, gmm model, weights,
        cross entropy and log likelihood over all restarts.
    """
    best_link_tbl = None
    best_subst_tbl = None
    best_gmm = None
    best_xe = np.inf
    best_weights = None
    best_ll = None
    eprint('start training...')
    for i in range(restart + 1):
        eprint('init parameters')
        init = weighted_tbl_init_function()
        gmm, weights = gmm_update(line, init, cov_type='fix')
        link_tbl = gmm_assign(gmm, line)
        if i > 0:
            eprint('random restart --- {} restarts remaining, '
                   'best cross entropy so far is {}'.format(
                       restart - i, best_xe))
        subst_init_tbl = subst_tbl_init_function()
        if use_alternative_update:
            decipher_func = em_decipher_alternative
        else:
            decipher_func = em_decipher
        link_tbl, subst_tbl, gmm, weights, xe, ll = decipher_func(
            line, unigram_tbl, bigram_tbl, link_tbl, subst_init_tbl)
        if np.isnan(xe):
            # skip NaN results
            continue
        if xe < best_xe:
            best_ll = ll
            best_xe = xe
            best_link_tbl = link_tbl
            best_subst_tbl = subst_tbl
            best_gmm = gmm
            best_weights = weights
    eprint('with {} restarts, '
           'the best cross entropy is {}, '
           'the best log likelihood is {}'.format(restart, best_xe, best_ll))
    return (best_link_tbl, best_subst_tbl, best_gmm, best_weights,
            best_xe, best_ll)

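# Example usage of em_restart (a minimal sketch; the shapes and the two
# init functions below are illustrative assumptions, not part of the
# original code):
#
#     k, n = 26, 100                                 # clusters, observations
#     weighted_init = lambda: np.random.rand(k, n)   # positive (k, n) table
#     subst_init = lambda: np.random.rand(k, k)      # initial subst. table
#     (link_tbl, subst_tbl, gmm, weights, xe, ll) = em_restart(
#         line, unigram_tbl, bigram_tbl, weighted_init, subst_init,
#         restart=10)
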
def em_iter_update(cnt_tbl, line, cov_type='fix', scaling_factor=0.1):
    """ given counts, update parameters """
    # Only the ratio between pdfs matters, so we column-normalize the
    # (log-space) count table before exponentiating.
    # TODO: verify against the GMM implementation in sklearn.
    # TODO: sklearn should have column normalization,
    # TODO: but no column normalization should also be correct.
    normalized_tbl = cnt_tbl - logsumexp(cnt_tbl, axis=0)[np.newaxis, :]
    weighted_tbl = np.exp(normalized_tbl)
    # weighted_tbl = np.exp(cnt_tbl)  # un-normalized alternative (see TODO)
    gmm, weights = gmm_update(line, weighted_tbl, cov_type=cov_type,
                              scaling_fix_cov=scaling_factor)
    link_tbl = gmm_assign(gmm, line)
    return link_tbl, gmm, weights

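# Sketch of the column normalization performed in em_iter_update (values
# are illustrative; logsumexp is scipy.special.logsumexp): for log counts
#
#     cnt_tbl = np.log([[1., 3.],
#                       [1., 1.]])
#
# the normalized np.exp(cnt_tbl - logsumexp(cnt_tbl, axis=0)) is
#
#     [[0.5, 0.75],
#      [0.5, 0.25]]
#
# i.e. each column sums to one, turning counts into per-observation
# cluster posteriors while leaving the within-column ratios unchanged.
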
def em_gmm_restart(line, link_tbl_init_function, restart=10, cov_type='fix',
                   scaling_factor=0.1):
    """
    EM with random restarts.

    :return: the best link_tbl, gmm model, weights and cross entropy
        over all restarts.
    """
    best_link_tbl = None
    best_gmm = None
    best_xe = np.inf
    best_weights = None
    eprint('start training...')
    for i in range(restart + 1):
        eprint('init parameters')
        gmm, weights = gmm_update(line, link_tbl_init_function(),
                                  cov_type=cov_type,
                                  scaling_fix_cov=scaling_factor)
        link_tbl = gmm_assign(gmm, line)
        if i > 0:
            eprint('random restart --- {} restarts remaining, '
                   'best cross entropy so far is {}'.format(
                       restart - i, best_xe))
        link_tbl, gmm, weights, xe = em_gmm(line, link_tbl, weights,
                                            cov_type=cov_type,
                                            scaling_factor=scaling_factor)
        if xe < best_xe:
            best_xe = xe
            best_link_tbl = link_tbl
            best_gmm = gmm
            best_weights = weights
    eprint('with {} restarts, '
           'the best cross entropy is {}'.format(restart, best_xe))
    return best_link_tbl, best_gmm, best_weights, best_xe

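# Example usage of em_gmm_restart (a minimal sketch; the data and init
# function are illustrative assumptions):
#
#     k, n, d = 26, 100, 5                 # clusters, observations, dims
#     line = np.random.randn(n, d)         # observation features
#     init = lambda: np.random.rand(k, n)  # positive (k, n) weight table
#     link_tbl, gmm, weights, xe = em_gmm_restart(line, init, restart=5)
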
def estimate_gmm(feature_bins, feature_zs, col_sizes, k, ignore=-1,
                 cov_type='spherical', scaling_factor=1.0):
    """ estimate a GMM from gold assignments, optionally ignoring one row """
    n_row = len(feature_bins)
    features = []
    weighted_tbl = []
    for row in range(n_row):
        if row != ignore:
            # one-hot weights from the gold assignments of this row
            w_tbl = np.zeros((k, col_sizes[row]))
            features += list(feature_bins[row, :col_sizes[row]])
            w_tbl[feature_zs[row, :col_sizes[row]],
                  range(col_sizes[row])] = 1.0
            weighted_tbl.append(w_tbl)
    features = np.asarray(features)
    weighted_tbl = np.concatenate(weighted_tbl, axis=1)
    gmm, weights = gmm_update(features, weighted_tbl, cov_type=cov_type,
                              scaling_fix_cov=scaling_factor)
    return gmm, np.log(weights)

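# Example usage of estimate_gmm (hypothetical shapes): rows hold padded
# feature sequences, feature_zs their gold cluster ids, and one row can
# be held out with ignore:
#
#     n_rows, max_len, d, k = 4, 20, 5, 10
#     feature_bins = np.random.randn(n_rows, max_len, d)
#     feature_zs = np.random.randint(0, k, size=(n_rows, max_len))
#     col_sizes = [20, 18, 20, 15]
#     gmm, log_weights = estimate_gmm(feature_bins, feature_zs, col_sizes,
#                                     k, ignore=0)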