예제 #1
0
def alternative_update_subst(unigram_tbl,
                             bigram_tbl,
                             link_tbl,
                             subst_tbl,
                             xe_gap,
                             max_iter,
                             prev_xe=np.inf):
    """
    Repeatedly re-estimate the substitution table.

    Every pass calls ``em_iter_count`` with ``only_subst_tbl=True``, turns the
    returned substitution counts into a new table via
    ``em_iter_update_subst`` and logs the cross entropy. The loop ends when
    ``time_to_stop`` says so or when the cross entropy becomes NaN.

    :param unigram_tbl: passed through to ``em_iter_count``
    :param bigram_tbl: passed through to ``em_iter_count``
    :param link_tbl: passed through to ``em_iter_count``; never reassigned
        here. Must unpack into exactly two dimensions.
    :param subst_tbl: starting substitution table, replaced on every pass
    :param xe_gap: threshold forwarded to ``time_to_stop``
    :param max_iter: iteration bound forwarded to ``time_to_stop``
    :param prev_xe: cross entropy to compare the first pass against
    :return: subst_tbl, cross entropy and prb_cf of the last pass, and the
        sum over ``logsumexp(link_tbl, axis=0)``
    """
    iteration = 1
    # Only the second dimension is needed; unpacking keeps the 2-D check.
    _, n_cols = link_tbl.shape
    normalization_factor = np.sum(logsumexp(link_tbl, axis=0))

    while True:
        _, subst_counts, prb_cf = em_iter_count(
            unigram_tbl, bigram_tbl, link_tbl, subst_tbl,
            only_subst_tbl=True)
        subst_tbl = em_iter_update_subst(subst_counts)
        x_entropy = cross_entropy([prb_cf], [n_cols])

        # Relative change against the previous pass (1.0 when prev_xe is inf).
        rel_gap = abs(1.0 - x_entropy / prev_xe)
        message = ('iter-subst {} cross entropy is {}, gap {},'
                   ' logP(c) {}').format(iteration, x_entropy, rel_gap,
                                         prb_cf)
        eprint(message)

        if time_to_stop(iteration, max_iter, prev_xe, x_entropy, xe_gap):
            break
        if np.isnan(x_entropy):
            eprint('program end in iter {} caused by nan'.format(iteration))
            break

        prev_xe = x_entropy
        iteration += 1

    return subst_tbl, x_entropy, prb_cf, normalization_factor
예제 #2
0
def alternative_update_gmm(line,
                           unigram_tbl,
                           bigram_tbl,
                           link_tbl,
                           subst_tbl,
                           xe_gap,
                           max_iter,
                           prev_xe=np.inf):
    """
    Repeatedly re-estimate the link table / GMM while subst_tbl stays fixed.

    Every pass calls ``em_iter_count``, feeds the returned counts and ``line``
    to ``em_iter_update`` and logs the cross entropy together with the value
    of ``gmm_log_likelihood``. The loop ends when ``time_to_stop`` says so or
    when the cross entropy becomes NaN.

    :param line: passed to ``em_iter_update``; its length is the count handed
        to ``cross_entropy``
    :param unigram_tbl: passed through to ``em_iter_count``
    :param bigram_tbl: passed through to ``em_iter_count``
    :param link_tbl: starting link table, replaced on every pass
    :param subst_tbl: passed through to ``em_iter_count``; never reassigned
        here
    :param xe_gap: threshold forwarded to ``time_to_stop``
    :param max_iter: iteration bound forwarded to ``time_to_stop``
    :param prev_xe: cross entropy to compare the first pass against
    :return: link_tbl, gmm, weights, cross entropy and prb_cf of the last pass
    """
    iteration = 1
    while True:
        # The substitution counts are not used in this loop.
        counts, _, prb_cf = em_iter_count(unigram_tbl, bigram_tbl, link_tbl,
                                          subst_tbl)
        link_tbl, gmm, weights = em_iter_update(counts, line)
        ll_gmm = gmm_log_likelihood(link_tbl, weights)
        x_entropy = cross_entropy([prb_cf], [len(line)])

        # Relative change against the previous pass (1.0 when prev_xe is inf).
        rel_gap = abs(1.0 - x_entropy / prev_xe)
        message = ('iter-GMM {} cross entropy is {}, gap {},'
                   ' logP(c) {}, logP_GMM(c) {}').format(
                       iteration, x_entropy, rel_gap, prb_cf, ll_gmm)
        eprint(message)

        if time_to_stop(iteration, max_iter, prev_xe, x_entropy, xe_gap):
            break
        if np.isnan(x_entropy):
            eprint('program end in iter {} caused by nan'.format(iteration))
            break

        prev_xe = x_entropy
        iteration += 1

    return link_tbl, gmm, weights, x_entropy, prb_cf
예제 #3
0
def em_decipher_alternative(line,
                            unigram_tbl,
                            bigram_tbl,
                            link_tbl,
                            subst_tbl,
                            xe_gap=1e-8,
                            max_iter=300):
    """
    Alternate between a GMM training stage and a substitution training stage.

    Each round first runs ``alternative_update_gmm`` and then
    ``alternative_update_subst``. After either stage ``time_to_stop`` is
    consulted with the round counter and that stage's previous cross entropy;
    a truthy answer ends the whole procedure.

    NOTE(review): the inner GMM stage is always called with a fixed 1e-8
    threshold and ``prev_xe=np.inf``; ``xe_gap`` is only used for the outer
    GMM check. The substitution stage uses a fixed 1e-5 for both. Confirm
    this asymmetry is intended.

    :param line: forwarded to ``alternative_update_gmm``
    :param unigram_tbl: forwarded to both stages
    :param bigram_tbl: forwarded to both stages
    :param link_tbl: starting link table, replaced by every GMM stage
    :param subst_tbl: starting substitution table, replaced by every
        substitution stage
    :param xe_gap: threshold for the outer check after the GMM stage
    :param max_iter: bound forwarded to both stages and to the outer checks
    :return: link_tbl, subst_tbl, gmm, weights, the cross entropy of
        whichever stage ran last, and prb_cf of the latest GMM stage
    """
    gmm_inner_gap = 1e-8
    subst_gap = 1e-5

    last_gmm_xe = np.inf
    last_subst_xe = np.inf

    round_no = 1
    while True:
        eprint('start training GMM')
        gmm_stage = alternative_update_gmm(line,
                                           unigram_tbl,
                                           bigram_tbl,
                                           link_tbl,
                                           subst_tbl,
                                           gmm_inner_gap,
                                           max_iter,
                                           prev_xe=np.inf)
        link_tbl, gmm, weights, x_entropy, prb_cf_gmm = gmm_stage

        if time_to_stop(round_no, max_iter, last_gmm_xe, x_entropy, xe_gap):
            break
        last_gmm_xe = x_entropy

        eprint('start training subst')
        subst_stage = alternative_update_subst(unigram_tbl,
                                               bigram_tbl,
                                               link_tbl,
                                               subst_tbl,
                                               subst_gap,
                                               max_iter,
                                               prev_xe=last_subst_xe)
        subst_tbl, x_entropy, prb_cf_subst, normalization_factor = subst_stage

        # Add the normalization term back onto the substitution-stage prb_cf.
        # NOTE(review): the adjusted value is not read again or returned.
        prb_cf_subst += normalization_factor

        if time_to_stop(round_no, max_iter, last_subst_xe, x_entropy,
                        subst_gap):
            break
        last_subst_xe = x_entropy

        round_no += 1

    return link_tbl, subst_tbl, gmm, weights, x_entropy, prb_cf_gmm
예제 #4
0
def em_decipher(line,
                unigram_tbl,
                bigram_tbl,
                link_tbl,
                subst_tbl,
                xe_gap=1e-8,
                max_iter=0):
    """
    EM on a line of features.

    Every pass calls ``em_iter_count`` and ``em_iter_update`` and logs the
    cross entropy plus the value of ``gmm_log_likelihood``. Iterations stop
    when ``time_to_stop(c, max_iter, prev_xe, x_entropy, xe_gap)`` is truthy
    or when the cross entropy becomes NaN.

    NOTE(review): the link table returned by ``em_iter_update`` is discarded
    and the substitution update is disabled, so neither ``link_tbl`` nor
    ``subst_tbl`` is reassigned in this function. Confirm that is intended.

    :param line: passed to ``em_iter_update``; its length is the count handed
        to ``cross_entropy``
    :param unigram_tbl: passed through to ``em_iter_count``
    :param bigram_tbl: passed through to ``em_iter_count``
    :param link_tbl: passed to ``em_iter_count`` and ``gmm_log_likelihood``
    :param subst_tbl: passed through to ``em_iter_count``
    :param xe_gap: threshold forwarded to ``time_to_stop``
    :param max_iter: iteration bound forwarded to ``time_to_stop``
    :return: link_tbl, subst_tbl, gmm model, weights, cross entropy, and
        prb_cf of the last pass
    """
    last_xe = np.inf
    iteration = 1

    while True:
        # The substitution counts are computed but not applied:
        # em_iter_update_subst(cnt_subst_tbl) is disabled in this variant.
        counts, _, prb_cf = em_iter_count(unigram_tbl, bigram_tbl, link_tbl,
                                          subst_tbl)
        # The first return value (an updated link table) is dropped here.
        _, gmm, weights = em_iter_update(counts, line)
        ll_gmm = gmm_log_likelihood(link_tbl, weights)
        x_entropy = cross_entropy([prb_cf], [len(line)])

        # Relative change against the previous pass (1.0 when last_xe is inf).
        rel_gap = abs(1.0 - x_entropy / last_xe)
        message = ('iter {} cross entropy is {}, gap {},'
                   ' logP(c) {}, logP_GMM(c) {}').format(
                       iteration, x_entropy, rel_gap, prb_cf, ll_gmm)
        eprint(message)

        if time_to_stop(iteration, max_iter, last_xe, x_entropy, xe_gap):
            break
        if np.isnan(x_entropy):
            eprint('program end in iter {} caused by nan'.format(iteration))
            break

        last_xe = x_entropy
        iteration += 1

    return link_tbl, subst_tbl, gmm, weights, x_entropy, prb_cf
예제 #5
0
def em_gmm(line,
           link_tbl,
           weights,
           xe_gap=1e-8,
           max_iter=300,
           cov_type='fix',
           scaling_factor=0.1):
    """
    EM on a line of features using only the GMM (no n-gram tables).

    Every pass calls ``em_iter_gmm_count`` on the current link table and
    weights, re-estimates them with ``em_iter_update`` and logs the cross
    entropy. Iterations stop when
    ``time_to_stop(c, max_iter, prev_xe, x_entropy, xe_gap)`` is truthy or
    when the cross entropy becomes NaN.

    :param line: passed to ``em_iter_update``; its length is the count handed
        to ``cross_entropy``
    :param link_tbl: starting link table, replaced on every pass
    :param weights: starting weights, replaced on every pass
    :param xe_gap: threshold forwarded to ``time_to_stop``
    :param max_iter: iteration bound forwarded to ``time_to_stop``
    :param cov_type: forwarded to ``em_iter_update``
    :param scaling_factor: forwarded to ``em_iter_update``
    :return: final link_tbl, gmm model, weights, and cross entropy
    """
    # prepare hyper-parameters
    prev_xe = np.inf

    # start training
    c = 1
    while True:
        cnt_tbl, prb_cf = em_iter_gmm_count(link_tbl, weights)
        link_tbl, gmm, weights = em_iter_update(cnt_tbl,
                                                line,
                                                cov_type=cov_type,
                                                scaling_factor=scaling_factor)
        x_entropy = cross_entropy([prb_cf], [len(line)])
        # Log the relative change, not the raw ratio: with prev_xe == inf the
        # ratio is 0, which would read as "no gap" on the very first pass.
        gap = abs(1.0 - x_entropy / prev_xe)
        eprint('iter {} cross entropy is {}, gap {}, logP(c) {}'.format(
            c, x_entropy, gap, prb_cf))
        if time_to_stop(c, max_iter, prev_xe, x_entropy, xe_gap):
            break
        elif np.isnan(x_entropy):
            eprint('program end in iter {} caused by nan'.format(c))
            break
        else:
            prev_xe = x_entropy
            c += 1

    return link_tbl, gmm, weights, x_entropy