def do_accumilate_posterior_obs(accumilation_dict, obs, aj, ei, posterior_unigram_val): # these are actual counts in log space!! if isinstance(obs, str) and (not isinstance(aj, tuple)) and isinstance(ei, str): if ('count_obs', obs) in accumilation_dict: accumilation_dict[('count_obs', obs)] = lu.logadd(accumilation_dict[('count_obs', obs)], posterior_unigram_val) else: accumilation_dict[('count_obs', obs)] = posterior_unigram_val if ('count_state', aj) in accumilation_dict: accumilation_dict[('count_state', aj)] = lu.logadd(accumilation_dict[('count_state', aj)], posterior_unigram_val) else: accumilation_dict[('count_state', aj)] = posterior_unigram_val if ('count_emission', obs, ei) in accumilation_dict: accumilation_dict[('count_emission', obs, ei)] = lu.logadd(accumilation_dict[('count_emission', obs, ei)], posterior_unigram_val) else: accumilation_dict[('count_emission', obs, ei)] = posterior_unigram_val # doing total counts ... if ('any_emission_from', ei) in accumilation_dict: accumilation_dict[('any_emission_from', ei)] = lu.logadd(accumilation_dict[('any_emission_from', ei)], posterior_unigram_val) else: accumilation_dict[('any_emission_from', ei)] = posterior_unigram_val return accumilation_dict else: print 'obs must be string, aj must be str, ei must be string' exit()
def do_accumilate_posterior_obs(accumilation_dict, obs, aj, ei, posterior_unigram_val): # these are actual counts in log space!! if isinstance(obs, basestring) and (not isinstance(aj, tuple)) and isinstance(ei, basestring): if ('count_obs', obs) in accumilation_dict: accumilation_dict[('count_obs', obs)] = lu.logadd(accumilation_dict[('count_obs', obs)], posterior_unigram_val) else: accumilation_dict[('count_obs', obs)] = posterior_unigram_val if ('count_state', aj) in accumilation_dict: accumilation_dict[('count_state', aj)] = lu.logadd(accumilation_dict[('count_state', aj)], posterior_unigram_val) else: accumilation_dict[('count_state', aj)] = posterior_unigram_val if ('count_emission', obs, ei) in accumilation_dict: accumilation_dict[('count_emission', obs, ei)] = lu.logadd(accumilation_dict[('count_emission', obs, ei)], posterior_unigram_val) else: accumilation_dict[('count_emission', obs, ei)] = posterior_unigram_val # doing total counts ... if ('any_emission_from', ei) in accumilation_dict: accumilation_dict[('any_emission_from', ei)] = lu.logadd(accumilation_dict[('any_emission_from', ei)], posterior_unigram_val) else: accumilation_dict[('any_emission_from', ei)] = posterior_unigram_val return accumilation_dict else: print 'obs must be string, aj must be str, ei must be string' exit()
def get_backwards(obs, trelis, alpha_pi, source_len=None): n = len(obs) - 1 # index of last word beta_pi = {(n, (BOUNDRY_STATE, BOUNDRY_STATE)): 0.0} S = alpha_pi[(n, (BOUNDRY_STATE, BOUNDRY_STATE))] # from line 13 in pseudo code p_unigrams = {} p_obs = {} p_trans = {} for k in range(n, 0, -1): for v in trelis[k]: pb = beta_pi[(k, v)] aj = v[0] source_token = v[1] posterior_unigram_val = beta_pi[(k, v)] + alpha_pi[(k, v)] - S p_obs = do_accumilate_posterior_obs(p_obs, obs[k], aj, source_token, posterior_unigram_val) p_unigrams = do_append_posterior_unigrams(p_unigrams, k, v, posterior_unigram_val) for u in trelis[k - 1]: # print 'reverse transition', 'k', k, 'u', u, '->', 'v', v aj_1 = u[0] q = get_jump_transition(aj, aj_1, source_len) target_token = obs[k] e = get_emission(target_token, source_token) p = q + e beta_p = pb + p new_pi_key = (k - 1, u) if new_pi_key not in beta_pi: # implements lines 16 beta_pi[new_pi_key] = beta_p else: beta_pi[new_pi_key] = lu.logadd(beta_pi[new_pi_key], beta_p) posterior_bigram_val = alpha_pi[(k - 1, u)] + p + beta_pi[(k, v)] - S p_trans = do_accumilate_posterior_bigrams_jump(p_trans, aj, aj_1, posterior_bigram_val, source_len) return p_unigrams, p_trans, p_obs, S, beta_pi
def get_jump_mle(alignments_split, source_split):
    """Count alignment-position jumps for the MLE jump model.

    For every consecutive alignment pair (a[i], a[i-1]) in each sentence,
    log-adds one count (log(1) == 0.0) under the jump key derived from the
    pair and the source-sentence length.  Returns the dict of log counts.
    """
    jcounts = {}
    neg_inf = float('-inf')
    for alignment, source in zip(alignments_split, source_split):
        sent_len = len(source)
        # zip over the sequence and itself shifted gives the same bigrams
        # as indexing (a[i], a[i-1]) for i = 1 .. len(a)-1
        for cur, prev in zip(alignment[1:], alignment[:-1]):
            key = jump_key(cur, prev, sent_len)
            jcounts[key] = lu.logadd(jcounts.get(key, neg_inf), 0.0)
    return jcounts
def do_accumilate_posterior_bigrams_jump(accumilation_dict, aj, aj_1, posterior_bigram_val, sent_length): # these are actual counts in log space!! if not isinstance(aj, tuple) or isinstance(aj_1, tuple): jkey = jump_key(aj, aj_1, sent_length) accumilation_dict[jkey] = lu.logadd(accumilation_dict.get(jkey, float('-inf')), posterior_bigram_val) return accumilation_dict else: print 'aj and aj_1 should be str ### or int', aj, aj_1 exit()
def accumilate(accumilator, addition):
    """Fold the entries of `addition` into `accumilator` in place and return it.

    Float values are treated as log-space counts and combined with logadd
    (a missing key counts as log(0) == -inf); set values are unioned into
    the existing set for that key.  Entries of any other type are ignored.
    """
    for key, value in addition.iteritems():
        if isinstance(value, float):
            # log-space count: log-add into the running total
            accumilator[key] = lu.logadd(accumilator.get(key, float('-inf')), value)
        elif isinstance(value, set):
            # key set: create on first sight, then union
            accumilator.setdefault(key, set([])).update(value)
    return accumilator
def do_accumilate_posterior_bigrams(accumilation_dict, aj, aj_1, posterior_bigram_val): # these are actual counts in log space!! if not isinstance(aj, tuple) or isinstance(aj_1, tuple): if ('count_transition', aj, aj_1) not in accumilation_dict: accumilation_dict[('count_transition', aj, aj_1)] = posterior_bigram_val else: accumilation_dict[('count_transition', aj, aj_1)] = lu.logadd(accumilation_dict[('count_transition', aj, aj_1)], posterior_bigram_val) if ('any_transition_from', aj_1) not in accumilation_dict: accumilation_dict[('any_transition_from', aj_1)] = posterior_bigram_val else: accumilation_dict[('any_transition_from', aj_1)] = lu.logadd(accumilation_dict[('any_transition_from', aj_1)], posterior_bigram_val) return accumilation_dict else: print 'aj and aj_1 should be str ### or int', aj, aj_1 exit()
def do_accumilate_posterior_bigrams_jump(accumilation_dict, aj, aj_1, posterior_bigram_val, sent_length): # these are actual counts in log space!! if not isinstance(aj, tuple) or isinstance(aj_1, tuple): jkey = jump_key(aj, aj_1) jiip_key = jump_iip_key(sent_length, aj_1) accumilation_dict[jkey] = lu.logadd(accumilation_dict.get(jkey, float('-inf')), posterior_bigram_val) accumilation_dict[jiip_key] = accumilation_dict.get(jiip_key, set([])) accumilation_dict[jiip_key].add(jkey) return accumilation_dict else: print 'aj and aj_1 should be str ### or int', aj, aj_1 exit()
def get_jump_mle(alignments_split, source_split):
    """Count alignment jumps and build their per-(length, prev) normalizers.

    First pass: for every consecutive alignment pair log-add one count
    (log(1) == 0.0) under jump_key(j1, j0) and index that key by
    jump_iip_key(sentence_length, j0).  Second pass: for each index bucket,
    log-sum the member counts to get the denominator.  Returns
    (jump_counts, jiip), both in log space.
    """
    jump_counts = {}
    jump_keys_by_sentence_len = {}
    neg_inf = float('-inf')
    for alignment, source in zip(alignments_split, source_split):
        slen = len(source)
        # consecutive (current, previous) alignment pairs
        for j1, j0 in zip(alignment[1:], alignment[:-1]):
            jkey = jump_key(j1, j0)
            jump_counts[jkey] = lu.logadd(jump_counts.get(jkey, neg_inf), 0.0)
            jump_keys_by_sentence_len.setdefault(jump_iip_key(slen, j0), set([])).add(jkey)
    jiip = {}
    for jiip_key, jkeys in jump_keys_by_sentence_len.iteritems():
        # normalizer = log-sum of every jump count in this bucket
        jiip[jiip_key] = lu.logadd_of_list([jump_counts[jk] for jk in jkeys])
    return jump_counts, jiip
def get_jump_mle(alignments_split, source_split):
    """Collect MLE jump counts and their per-(sentence-length, prev-position) totals.

    Returns (jump_counts, jiip): jump_counts maps jump_key(j1, j0) to a
    log-space count; jiip maps jump_iip_key(len, j0) to the log-sum of all
    jump counts that share that bucket (the normalizing denominator).
    """
    jump_counts = {}
    keys_by_bucket = {}
    for a, s in zip(alignments_split, source_split):
        for i in range(1, len(a)):
            j1, j0 = a[i], a[i - 1]
            jkey = jump_key(j1, j0)
            # one more observation of this jump: log-add log(1) == 0.0
            jump_counts[jkey] = lu.logadd(jump_counts.get(jkey, float("-inf")), 0.0)
            keys_by_bucket.setdefault(jump_iip_key(len(s), j0), set([])).add(jkey)
    # denominator per bucket: log-sum of the member jump counts
    jiip = dict((bucket, lu.logadd_of_list([jump_counts[jk] for jk in jkeys]))
                for bucket, jkeys in keys_by_bucket.iteritems())
    return jump_counts, jiip
def get_jump_transition(current_state, prev_state, sent_length):
    '''
    This method implements eq (5) in the Vogel & Ney paper (HMM-Based Word Alignment in Statistical Translation)
    it returns the probability P(aj | aj-1, L)
    '''
    # Reads the module-level `jump_counts` (log-space counts) and the
    # memo dict `jump_denoms`; returns a log probability.
    jkey = jump_key(current_state, prev_state, sent_length)
    if jkey in jump_counts:
        # TODO: using a normal distribution to get probability of jump widths might be much faster!
        if (prev_state, sent_length) in jump_denoms:
            # denominator already computed for this (prev, length) pair
            denom = jump_denoms[(prev_state, sent_length)]
        else:
            # log-sum the counts of jumping from prev_state to every source
            # position; missing counts contribute log(0) == -inf.
            # NOTE(review): range(sent_length) covers positions 0..L-1 —
            # confirm the position indexing convention matches jump_key's.
            denom = float('-inf')
            for l in range(sent_length):
                jl_key = jump_key(l, prev_state, sent_length)
                denom = lu.logadd(denom, jump_counts.get(jl_key, float('-inf')))
            # cache so each (prev, length) denominator is computed once
            jump_denoms[(prev_state, sent_length)] = denom
        return jump_counts[jkey] - denom
    else:
        # unseen jump: fixed low log probability as a floor
        return -100.00
def get_backwards(obs, trelis, alpha_pi): n = len(obs) - 1 # index of last word beta_pi = {(n, (BOUNDRY_STATE, BOUNDRY_STATE)): 0.0} S = alpha_pi[(n, (BOUNDRY_STATE, BOUNDRY_STATE))] # from line 13 in pseudo code p_unigrams = {} p_obs = {} p_trans = {} for k in range(n, 0, -1): for v in trelis[k]: pb = beta_pi[(k, v)] aj = v[0] source_token = v[1] posterior_unigram_val = beta_pi[(k, v)] + alpha_pi[(k, v)] - S p_obs = do_accumilate_posterior_obs(p_obs, obs[k], aj, source_token, posterior_unigram_val) p_unigrams = do_append_posterior_unigrams(p_unigrams, k, v, posterior_unigram_val) for u in trelis[k - 1]: #print 'reverse transition', 'k', k, 'u', u, '->', 'v', v aj_1 = u[0] q = get_transition(aj, aj_1) target_token = obs[k] e = get_emission(target_token, source_token) p = q + e beta_p = pb + p new_pi_key = (k - 1, u) if new_pi_key not in beta_pi: # implements lines 16 beta_pi[new_pi_key] = beta_p else: beta_pi[new_pi_key] = lu.logadd(beta_pi[new_pi_key], beta_p) #print 'beta ', new_pi_key, '=', beta_pi[new_pi_key], exp(beta_pi[new_pi_key]) posterior_bigram_val = alpha_pi[(k - 1, u)] + p + beta_pi[(k, v)] - S #posterior_bigram_val = "%.3f" % (exp(alpha_pi[(k - 1, u)] + p + beta_pi[(k, v)] - S)) p_trans = do_accumilate_posterior_bigrams( p_trans, aj, aj_1, posterior_bigram_val) return p_unigrams, p_trans, p_obs, S, beta_pi
def accumilate(accumilator, addition):
    """Log-add every entry of `addition` into `accumilator` in place and return it.

    A key missing from `accumilator` is treated as having count log(0) == -inf.
    """
    neg_inf = float('-inf')
    for key in addition:
        accumilator[key] = lu.logadd(accumilator.get(key, neg_inf), addition[key])
    return accumilator