def marginal_likelihood(e_corpus: Corpus, f_corpus: Corpus, model: ConditionalModel):
    PL, PM, PAj, PFj = model.components
    ll = 0.0
    for e_snt, f_snt in zip(e_corpus.itersentences(), f_corpus.itersentences()):
        # observations
        l = e_snt.shape[0]
        m = f_snt.shape[0]
        log_pl = np.log(PL.generate(l))
        log_pm = np.log(PM.generate(m))
        # P(f|e) = \prod_j P(f_j|e)
        #        = \prod_j \sum_i P(f_j,a_j=i|e)
        log_pf_e = 0.0
        for j, f in enumerate(f_snt):
            # P(f_j|e) = \sum_i P(f_j,a_j=i|e)
            pfj_e = 0.0  # contribution of this French word
            for i, e in enumerate(e_snt):
                # P(f_j,a_j=i|e) = P(a_j=i) P(f_j|e_i, l, m)
                pfj_e += PAj.generate((j, i), e_snt, 0, l, m) * PFj.generate((j, f), (j, i), e_snt, 0, l, m)
            # P(f|e) = \prod_j P(f_j|e)
            log_pf_e += np.log(pfj_e)
        # \sum_{f,e} P(l)P(m)P(f|e,l,m)
        ll += log_pl + log_pm + log_pf_e
    return -ll / e_corpus.n_sentences()
def get_joint_ibm1(e_corpus: Corpus, f_corpus: Corpus):
    PL = cat.LengthDistribution()
    PM = cat.LengthDistribution()
    PZ = cat.ClusterDistribution(1)
    PEi = cat.UnigramMixture(1, e_corpus.vocab_size())
    PAj = cat.UniformAlignment()
    PFj = cat.BrownLexical(e_corpus.vocab_size(), f_corpus.vocab_size())
    return JointModel(PL, PM, PZ, PEi, PAj, PFj)
def main(e_path, f_path):
    e_corpus = Corpus(open(e_path), null='<null>')
    f_corpus = Corpus(open(f_path))
    model = get_ibm1(e_corpus, f_corpus)
    EM(e_corpus, f_corpus, model, iterations=10)
    map_decoder(e_corpus, f_corpus, model,
                partial(print_map, e_corpus=e_corpus, f_corpus=f_corpus))
def main(e_path, f_path):
    e_corpus = Corpus(open(e_path), null='<null>')
    f_corpus = Corpus(open(f_path))
    model = get_ibm1(e_corpus, f_corpus)
    EM(e_corpus, f_corpus, model, iterations=10)
    from lola.io import print_lola_format
    map_decoder(e_corpus, f_corpus, model,
                partial(print_lola_format,
                        e_corpus=e_corpus,
                        f_corpus=f_corpus,
                        ostream=sys.stdout))
def read_component(e_corpus: Corpus, f_corpus: Corpus, args, line: str, i: int, state: Config):
    """
    Instantiate components. If you contribute a new component, make sure to construct it here.

    :param e_corpus: data we condition on (English)
    :param f_corpus: data we generate (French)
    :param args: command-line arguments
    :param line: the configuration line being parsed
    :param i: line number (used in error messages)
    :param state: Config object where components are registered
    """
    try:
        cfg, [name, _] = util.re_sub('^([^:]+)(:)', '', line)
    except:
        raise ValueError('In line %d, expected component name: %s' % (i, line))
    if state.has_component(name):
        raise ValueError('Duplicate component name in line %d: %s' % (i, name))
    cfg, component_type = util.re_key_value('type', cfg, optional=False, dtype=str)
    if component_type == 'BrownLexical':
        state.add_component(name, BrownLexical(e_corpus, f_corpus, name=name))
    elif component_type == 'UniformAlignment':
        state.add_component(name, UniformAlignment(name=name))
    elif component_type == 'VogelJump':
        state.add_component(name, VogelJump(e_corpus.max_len(), name=name))
    elif component_type == 'LexMLP':
        state.add_component(name, MLPComponent.construct(e_corpus, f_corpus, name, cfg))
    elif component_type == 'LexLR':
        state.add_component(name, LRComponent.construct(e_corpus, f_corpus, name, cfg))
    else:
        raise ValueError('I do not know this type of generative component: %s' % component_type)
def print_map(s: int, z: int, a: 'np.array', pz: float, pa: 'np.array',
              e_corpus: Corpus, f_corpus: Corpus, ostream=sys.stdout):
    e_snt = e_corpus.sentence(s)
    f_snt = f_corpus.sentence(s)
    tokens = []
    for j, (i, p) in enumerate(zip(a, pa)):
        tokens.append('%d:%s|%d:%s|%.2f' % (j + 1,
                                            f_corpus.translate(f_snt[j]),
                                            i,
                                            e_corpus.translate(e_snt[i]),
                                            p))
    print('%d|%.2f ||| %s' % (z, pz, ' '.join(tokens)), file=ostream)
def get_joint_ibm1z(e_corpus: Corpus, f_corpus: Corpus, n_clusters=1, cluster_unigrams=True, alpha=1.0):
    PL = cat.LengthDistribution()
    PM = cat.LengthDistribution()
    if not cluster_unigrams:
        PZ = cat.ClusterDistribution(n_clusters)
    else:
        PZ = cat.ClusterUnigrams(n_clusters)
    PEi = cat.UnigramMixture(n_clusters, e_corpus.vocab_size(), alpha)
    PAj = cat.UniformAlignment()
    PFj = cat.MixtureOfBrownLexical(n_clusters, e_corpus.vocab_size(), f_corpus.vocab_size(), alpha)
    return JointModel(PL, PM, PZ, PEi, PAj, PFj)
def read_corpora(training_path: str, test_path: str, generating: bool,
                 min_count: int, max_count: int) -> (CorpusView, CorpusView):
    """
    Return training and test data.

    :param training_path: path to training corpus
    :param test_path: path to test corpus (or None)
    :param generating: whether this is the side we are generating (French)
    :param min_count: minimum frequency for a word to be retained in the vocabulary
    :param max_count: maximum frequency for a word to be retained in the vocabulary
    :return: training view and test view
    """
    if test_path is None:  # no test corpus
        if generating:
            corpus = Corpus(training_path, min_count=min_count, max_count=max_count)
        else:  # we are conditioning on this corpus
            corpus = Corpus(training_path, null='<NULL>', min_count=min_count, max_count=max_count)
        return corpus, None
    else:
        # read training data
        with open(training_path, 'r') as fi:
            lines = fi.readlines()
        n_training = len(lines)
        # read test data
        with open(test_path, 'r') as fi:
            lines.extend(fi.readlines())
        n_test = len(lines) - n_training
        # create a big corpus with everything
        if generating:
            corpus = Corpus(lines, min_count=min_count, max_count=max_count)
        else:  # we are conditioning on this corpus
            corpus = Corpus(lines, null='<NULL>', min_count=min_count, max_count=max_count)
        # return two different views: the training view and the test view
        return CorpusView(corpus, 0, n_training), CorpusView(corpus, n_training, n_test)
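# Illustrative use of read_corpora (a sketch only; the file names and the frequency
# thresholds below are placeholders, not values prescribed by the toolkit):
#
#   f_training, f_test = read_corpora('training.fr', 'test.fr', generating=True, min_count=1, max_count=100000)
#   e_training, e_test = read_corpora('training.en', 'test.en', generating=False, min_count=1, max_count=100000)
#
# Each call handles one side of the parallel data: the conditioning (English) side gets the
# '<NULL>' token, and training and test share a single underlying Corpus (hence a single
# vocabulary), exposed as two CorpusView objects.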
def marginal_likelihood(e_corpus: Corpus, f_corpus: Corpus, model: JointModel):
    PL, PM, PZ, PEi, PAj, PFj = model.components
    n_clusters = PZ.n_clusters
    ll = 0.0
    for e_snt, f_snt in zip(e_corpus.itersentences(), f_corpus.itersentences()):
        # observations
        l = e_snt.shape[0]
        m = f_snt.shape[0]
        log_pl = np.log(PL.generate(l))
        log_pm = np.log(PM.generate(m))
        # 0-order alignments
        # P(f,e) = \sum_z P(z) P(e|z) P(f|z,e)
        log_pfe = -np.inf  # contribution of this sentence
        for z in range(n_clusters):
            # contribution of the cluster
            log_pz = np.log(PZ.generate(z, l, m))
            # compute the contribution of the entire English sentence
            # P(e|z) = \prod_i P(e_i|z)
            log_pe_z = 0.0
            for i, e in enumerate(e_snt):
                log_pe_z += np.log(PEi.generate((i, e), z, l, m))
            # P(f|z,e) = \prod_j P(f_j|z,e)
            #          = \prod_j \sum_i P(f_j,a_j=i|z,e)
            log_pf_ze = 0.0
            for j, f in enumerate(f_snt):
                # P(f_j|z,e) = \sum_i P(f_j,a_j=i|z,e)
                pfj_ze = 0.0  # contribution of this French word
                for i, e in enumerate(e_snt):
                    pfj_ze += PAj.generate((j, i), e_snt, z, l, m) * PFj.generate((j, f), (j, i), e_snt, z, l, m)
                # P(f|z,e) = \prod_j P(f_j|z,e)
                log_pf_ze += np.log(pfj_ze)
            # \sum_z P(z) P(e|z) P(f|z,e)
            log_pfe = np.logaddexp(log_pfe, log_pz + log_pe_z + log_pf_ze)
        # \sum_{f,e} P(l)P(m)P(f,e|l,m)
        ll += log_pl + log_pm + log_pfe
    return -ll / e_corpus.n_sentences()
def map_decoder(e_corpus: Corpus, f_corpus: Corpus, model: JointModel, callback):
    """
    :param e_corpus: English data
    :param f_corpus: French data
    :param model: components
    :param callback: called for each sentence in the parallel corpus
        callable(s, z, a, p(z|f,e), p(a|z,f,e))
    """
    n_clusters = model.PZ.n_clusters
    for s, (e_snt, f_snt) in enumerate(zip(e_corpus.itersentences(), f_corpus.itersentences())):
        log_pz_fe, log_post_a = log_posterior(e_snt, f_snt, model)
        # Here we get the best path for each cluster
        best_paths_z = log_post_a.argmax(2)  # shape: (n_clusters, m)
        # Now we find out which path is the best one across clusters
        best_z = 0
        best_log_prob = -np.inf
        for z in range(n_clusters):
            # p(z,a|f,e) = p(z|f,e) p(a|z,f,e)
            path_log_prob = log_pz_fe[z] + np.sum([log_post_a[z, j, i] for j, i in enumerate(best_paths_z[z])])
            if path_log_prob > best_log_prob:  # update if better
                best_log_prob = path_log_prob
                best_z = z
        # best posterior probabilities: p(a|z,f,e)
        best_log_pa_zfe = np.array([log_post_a[best_z, j, i] for j, i in enumerate(best_paths_z[best_z])])
        # communicate the finding
        callback(s, best_z, best_paths_z[best_z], np.exp(log_pz_fe[best_z]), np.exp(best_log_pa_zfe))
def map_decoder(e_corpus: Corpus, f_corpus: Corpus, model: ConditionalModel, callback):
    """
    :param e_corpus: English data
    :param f_corpus: French data
    :param model: components
    :param callback: called for each sentence in the parallel corpus
        callable(s, a, p(a|f,e))
    """
    for s, (e_snt, f_snt) in enumerate(zip(e_corpus.itersentences(), f_corpus.itersentences())):
        log_post_a = log_posterior(e_snt, f_snt, model)
        # Here we get the best path
        best_path = log_post_a.argmax(1)  # shape: (m,)
        # best posterior probabilities: p(a_j|f,e)
        best_posterior = np.array([log_post_a[j, i] for j, i in enumerate(best_path)])
        # communicate the finding
        callback(s, best_path, np.exp(best_posterior))
def print_lola_format(sid, alignments, posterior, e_corpus: Corpus, f_corpus: Corpus, ostream):
    """
    Print alignment in a human readable format.

    :param sid: sentence id
    :param alignments: alignments (sequence of a_j values for each j)
    :param posterior: posterior p(a_j|f,e)
    :param e_corpus: data we condition on
    :param f_corpus: data we generate
    :param ostream: where to write alignments to
    """
    e_snt = e_corpus.sentence(sid)
    f_snt = f_corpus.sentence(sid)
    # in printing we make the French sentence 1-based by convention
    # we keep the English sentence 0-based because of the NULL token
    print(' '.join(['{0}:{1}|{2}:{3}|{4:.2f}'.format(j + 1,
                                                     f_corpus.translate(f_snt[j]),
                                                     i,
                                                     e_corpus.translate(e_snt[i]),
                                                     p)
                    for j, (i, p) in enumerate(zip(alignments, posterior))]),
          file=ostream)
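# Example of a line produced by print_lola_format (the tokens and probabilities are made up,
# purely to illustrate the format): each French position j (1-based) is paired with the
# English position i (0-based, 0 being the NULL token) it aligns to, followed by the
# posterior p(a_j=i|f,e):
#
#   1:le|1:the|0.93 2:chien|2:dog|0.88 3:aboie|3:barks|0.75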
def EM(e_corpus: Corpus, f_corpus: Corpus, model: ConditionalModel, iterations=5):
    """
    Generative story:

        l ~ P(L)
        m ~ P(M)
        a_j ~ P(A_j | l)          for j=1..m
        f_j ~ P(F_j | e_{a_j})    for j=1..m

    :param e_corpus: English data
    :param f_corpus: French data
    :param model: a conditional model
    :param iterations: EM iterations
    """
    PL, PM, PAj, PFj = model.components

    logging.info('Iteration %d Likelihood %f', 0, marginal_likelihood(e_corpus, f_corpus, model))

    for iteration in range(1, iterations + 1):
        # E-step
        for s, (e_snt, f_snt) in enumerate(zip(e_corpus.itersentences(), f_corpus.itersentences())):
            # get the posterior P(a|f,e)
            post_a = posterior(e_snt, f_snt, model)
            l = e_snt.shape[0]
            m = f_snt.shape[0]
            # gather expected counts for (f_j, e_i): p(a_j=i|f,e)
            for j, f in enumerate(f_snt):
                for i, e in enumerate(e_snt):
                    PAj.observe((j, i), e_snt, 0, l, m, post_a[j, i])
                    PFj.observe((j, f), (j, i), e_snt, 0, l, m, post_a[j, i])
        # M-step
        model.update()
        logging.info('Iteration %d Likelihood %f', iteration, marginal_likelihood(e_corpus, f_corpus, model))
def __init__(self, e_corpus: Corpus, f_corpus: Corpus,
             name: str = "lexlr",
             rng=np.random.RandomState(1234),
             hidden=[100],
             learning_rate=0.1,
             max_iterations=100,
             patience=10,
             patience_increase=2,
             improvement_threshold=0.995):
    """
    :param e_corpus: data we condition on
    :param f_corpus: data we generate
    :param name: name of the component
    :param rng: numpy random state
    :param hidden: dimensionality of hidden layers
    :param learning_rate: initial learning rate
    :param max_iterations: maximum number of updates
    :param patience: minimum number of updates
    :param patience_increase:
    :param improvement_threshold:
    """
    super(LRComponent, self).__init__(name, LexEventSpace(e_corpus, f_corpus))

    # TODO: generalise to batches?
    self._corpus_size = e_corpus.n_sentences()
    self._learning_rate = learning_rate
    self._max_iterations = max_iterations
    self._patience = patience
    self._patience_increase = patience_increase
    self._improvement_threshold = improvement_threshold

    # The event space determines the input and output dimensionality
    vE, vF = self.event_space.shape

    # TODO: Featurize(event_space)
    # for now my features are (English word identity concatenated with French word identity)
    # TODO: create a better matrix where we have vE * vF rows but we have d1 + d2 + d3 columns
    #  where d1 is the E embedding, d2 is the F embedding and d3 is whatever else
    self._X = np.zeros((vE * vF, vE + vF), dtype=theano.config.floatX)
    for e, f in product(range(vE), range(vF)):
        self._X[e * vF + f, e] = 1.0
        self._X[e * vF + f, vE + f] = 1.0

    # Create MLP
    builder = NNBuilder(rng)
    # ... the embedding layer
    builder.add_layer(vE + vF, hidden[0])
    # ... additional hidden layers
    for di, do in zip(hidden, hidden[1:]):
        builder.add_layer(di, do)
    # The Logistic Regression adds the final scoring layer and is responsible
    # for normalisation over vF classes
    self._nn = LR(builder, vE, vF)  # type: MLP

    # Create Theano variables for the MLP input
    nn_input = T.matrix('mlp_input')
    # ... and the expected output
    nn_expected = T.matrix('mlp_expected')
    learning_rate = T.scalar('learning_rate')

    # Learning rate and momentum hyperparameter values
    # Again, for non-toy problems these values can make a big difference
    # as to whether the network (quickly) converges on a good local minimum.
    # learning_rate = 0.01
    momentum = 0

    # Create a theano function for computing the MLP's output given some input
    self._nn_output = theano.function([nn_input], self._nn.output(nn_input))
    # Create a function for computing the cost of the network given an input
    cost = -self._nn.expected_logprob(nn_input, nn_expected)
    # Create a theano function for training the network
    self._train = theano.function([nn_input, nn_expected, learning_rate],
                                  # cost function
                                  cost,
                                  updates=gradient_updates_momentum(cost, self._nn.params,
                                                                    learning_rate, momentum))

    # table to store the CPDs (output of LR reshaped into a (vE, vF) matrix)
    self._cpds = self._nn_output(self._X).reshape(self.event_space.shape)
    # table to gather expected counts
    self._counts = np.zeros(self.event_space.shape, dtype=theano.config.floatX)
    self._i = 0
def __init__(self, e_corpus: Corpus, f_corpus: Corpus,
             name: str = "lexmlp",
             rng=np.random.RandomState(1234),
             hidden=[100],
             learning_rate=0.1,
             max_iterations=100,
             patience=10,
             patience_increase=2,
             improvement_threshold=0.995):
    """
    :param e_corpus: data we condition on
    :param f_corpus: data we generate
    :param name: name of the component
    :param rng: numpy random state
    :param hidden: dimensionality of hidden layers
    :param learning_rate: initial learning rate
    :param max_iterations: maximum number of updates
    :param patience: minimum number of updates
    :param patience_increase:
    :param improvement_threshold:
    """
    self._corpus_size = e_corpus.n_sentences()
    self._learning_rate = learning_rate
    self._max_iterations = max_iterations
    self._patience = patience
    self._patience_increase = patience_increase
    self._improvement_threshold = improvement_threshold

    # The event space determines the input and output dimensionality
    self.n_input, self.n_output = e_corpus.vocab_size(), f_corpus.vocab_size()
    # Input for the classifiers
    self._X = np.identity(self.n_input, dtype=theano.config.floatX)

    # Create MLP
    builder = NNBuilder(rng)
    # ... the embedding layer
    builder.add_layer(self.n_input, hidden[0])
    # ... additional hidden layers
    for di, do in zip(hidden, hidden[1:]):
        builder.add_layer(di, do)
    # The MLP adds a softmax layer over n_classes
    self._mlp = MLP(builder, n_classes=f_corpus.vocab_size())  # type: MLP

    # Create Theano variables for the MLP input
    mlp_input = T.matrix('mlp_input')
    # ... and the expected output
    mlp_expected = T.matrix('mlp_expected')
    learning_rate = T.scalar('learning_rate')

    # Learning rate and momentum hyperparameter values
    # Again, for non-toy problems these values can make a big difference
    # as to whether the network (quickly) converges on a good local minimum.
    # learning_rate = 0.01
    momentum = 0

    # Create a theano function for computing the MLP's output given some input
    self._mlp_output = theano.function([mlp_input], self._mlp.output(mlp_input))
    # Create a function for computing the cost of the network given an input
    cost = -self._mlp.expected_logprob(mlp_input, mlp_expected)
    # Create a theano function for training the network
    self._train = theano.function([mlp_input, mlp_expected, learning_rate],
                                  # cost function
                                  cost,
                                  updates=gradient_updates_momentum(cost, self._mlp.params,
                                                                    learning_rate, momentum))

    # table to store the CPDs (output of MLP)
    self._cpds = self._mlp_output(self._X)
    # table to gather expected counts
    self._counts = np.zeros((self.n_input, self.n_output), dtype=theano.config.floatX)
    self._i = 0
def __init__(self, e_corpus: Corpus, f_corpus: Corpus,
             name: str = "lexmlp",
             rng=np.random.RandomState(1234),
             hidden=[100],
             learning_rate=0.1,
             max_iterations=100,
             patience=10,
             patience_increase=2,
             improvement_threshold=0.995):
    """
    :param e_corpus: data we condition on
    :param f_corpus: data we generate
    :param name: name of the component
    :param rng: numpy random state
    :param hidden: dimensionality of hidden layers
    :param learning_rate: initial learning rate
    :param max_iterations: maximum number of updates
    :param patience: minimum number of updates
    :param patience_increase:
    :param improvement_threshold:
    """
    super(MLPComponent, self).__init__(name, LexEventSpace(e_corpus, f_corpus))

    # TODO: generalise to batches?
    self._corpus_size = e_corpus.n_sentences()
    self._learning_rate = learning_rate
    self._max_iterations = max_iterations
    self._patience = patience
    self._patience_increase = patience_increase
    self._improvement_threshold = improvement_threshold

    # The event space determines the input and output dimensionality
    self.n_input, self.n_output = self.event_space.shape
    # Input for the classifiers (TODO: should depend on the event space more closely)
    self._X = np.identity(self.n_input, dtype=theano.config.floatX)

    # Create MLP
    builder = NNBuilder(rng)
    # ... the embedding layer
    builder.add_layer(self.n_input, hidden[0])
    # ... additional hidden layers
    for di, do in zip(hidden, hidden[1:]):
        builder.add_layer(di, do)
    # ... and the output layer (a softmax layer)
    # builder.add_layer(hidden[-1], self.n_output, activation=T.nnet.softmax)
    # The MLP adds the softmax layer over n_classes
    self._mlp = MLP(builder, n_classes=self.n_output)  # type: MLP

    # Create Theano variables for the MLP input
    mlp_input = T.matrix('mlp_input')
    # ... and the expected output
    mlp_expected = T.matrix('mlp_expected')
    learning_rate = T.scalar('learning_rate')

    # Learning rate and momentum hyperparameter values
    # Again, for non-toy problems these values can make a big difference
    # as to whether the network (quickly) converges on a good local minimum.
    # learning_rate = 0.01
    momentum = 0

    # Create a theano function for computing the MLP's output given some input
    self._mlp_output = theano.function([mlp_input], self._mlp.output(mlp_input))
    # Create a function for computing the cost of the network given an input
    cost = -self._mlp.expected_logprob(mlp_input, mlp_expected)
    # Create a theano function for training the network
    self._train = theano.function([mlp_input, mlp_expected, learning_rate],
                                  # cost function
                                  cost,
                                  updates=gradient_updates_momentum(cost, self._mlp.params,
                                                                    learning_rate, momentum))

    # table to store the CPDs (output of MLP)
    self._cpds = self._mlp_output(self._X)
    # table to gather expected counts
    self._counts = np.zeros(self.event_space.shape, dtype=theano.config.floatX)
    self._i = 0
def get_ibm1(e_corpus: Corpus, f_corpus: Corpus):
    PL = cat.LengthDistribution()
    PM = cat.LengthDistribution()
    PAj = cat.UniformAlignment()
    PFj = cat.BrownLexical(e_corpus.vocab_size(), f_corpus.vocab_size())
    return ConditionalModel(PL, PM, PAj, PFj)
def EM(e_corpus: Corpus, f_corpus: Corpus, model: JointModel, iterations=5):
    """
    Generative story:

        l ~ P(L)
        m ~ P(M)
        z ~ P(Z)
        e_i ~ P(E_i | z)            for i=1..l
        a_j ~ P(A_j | l)            for j=1..m
        f_j ~ P(F_j | e_{a_j}, z)   for j=1..m

    Joint distribution:

        P(F,E,A,Z,L,M) = P(L)P(M)P(Z)P(E|Z)P(A|L,M)P(F|E,A,Z,L,M)

    We make the following independence assumptions:

        P(e|z) = \prod_i P(e_i|z)
        P(f,a|e,z,l,m) = \prod_j P(a_j|l,m) P(f_j|e_{a_j},z)

    The EM algorithm depends on 2 posterior computations:

    [1] P(z|f,e,l,m) = P(z)P(e|z)P(f|e,z) / P(f,e)

        where
            P(e|z) = \prod_i P(e_i|z)
            P(f|e,z) = \sum_a P(f,a|e,z) = \prod_j \sum_i P(a_j=i) P(f_j|e_i,z)
            P(f,e) = \sum_z \sum_a P(f,e,z,a|l,m)
                   = \sum_z P(z) P(e|z) P(f|e,z)
                   = \sum_z P(z) P(e|z) \prod_j \sum_i P(a_j=i) P(f_j|e_i,z)

    and

    [2] P(a|f,e,z) = P(a,z,f,e,l,m) / P(f,e,z,l,m)

                         P(z)P(e|z)P(a|l,m)P(f|e,a,z)
                   = ------------------------------------
                     \sum_a P(z)P(e|z)P(a|l,m)P(f|e,a,z)

                         P(z)P(e|z)P(a|l,m)P(f|e,a,z)
                   = ------------------------------------
                     P(z)P(e|z) \sum_a P(a|l,m)P(f|e,a,z)

                     P(z)P(e|z) \prod_j P(a_j|l,m)P(f_j|e_{a_j},z)
                   = -------------------------------------------------
                     P(z)P(e|z) \prod_j \sum_i P(a_j=i|l,m)P(f_j|e_i,z)

                                 P(a_j|l,m)P(f_j|e_{a_j},z)
                   = \prod_j ---------------------------------
                             \sum_i P(a_j=i|l,m)P(f_j|e_i,z)

                   = \prod_j P(a_j|f,e,z)

        where
                               P(a_j|l,m)P(f_j|e_{a_j},z)
            P(a_j|f,e,z) = ---------------------------------
                           \sum_i P(a_j=i|l,m)P(f_j|e_i,z)

    Note that the choice of parameterisation is independent of the EM algorithm in this method.
    For example,

        P(a_j|l,m) can be
            * uniform (IBM1)
            * categorical (IBM2)

        P(f_j|e_{a_j}, z) can be
            * categorical and independent of z, i.e. P(f_j|e_{a_j}, z) = P(f_j|e_{a_j})
            * categorical
            * PoE: P(f_j|e_{a_j}, z) \propto P(f_j|e_{a_j}) P(f_j|z)
            * all of the above using an MLP or LR instead of categorical distributions

        We can also have P(a_j|l,m) P(f_j|e_{a_j}, z) modelled by a single LR (with MLP-induced features).

    :param e_corpus: English data
    :param f_corpus: French data
    :param model: all components
    :param iterations: EM iterations
    """
    PL, PM, PZ, PEi, PAj, PFj = model.components
    n_clusters = PZ.n_clusters

    logging.info('Iteration %d Likelihood %f', 0, marginal_likelihood(e_corpus, f_corpus, model))

    for iteration in range(1, iterations + 1):
        # E-step
        for s, (e_snt, f_snt) in enumerate(zip(e_corpus.itersentences(), f_corpus.itersentences())):
            # get the factorised posterior: P(z|f,e) and P(a|z,f,e)
            post_z, post_a = posterior(e_snt, f_snt, model)
            l = e_snt.shape[0]
            m = f_snt.shape[0]
            for z in range(n_clusters):
                # gather expected count for z: p(z|f,e)
                PZ.observe(z, l, m, post_z[z])
                # gather expected counts for (z, e_i): p(z|f,e)
                for i, e in enumerate(e_snt):
                    PEi.observe((i, e), z, l, m, post_z[z])
                # gather expected counts for (f_j, e_i): p(a_j=i|f,e,z)
                for j, f in enumerate(f_snt):
                    for i, e in enumerate(e_snt):
                        PAj.observe((j, i), e_snt, z, l, m, post_a[z, j, i])
                        PFj.observe((j, f), (j, i), e_snt, z, l, m, post_a[z, j, i])
        # M-step
        model.update()
        logging.info('Iteration %d Likelihood %f', iteration, marginal_likelihood(e_corpus, f_corpus, model))
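# A minimal end-to-end sketch for the joint (cluster-conditioned) model, mirroring the `main`
# entry point of the conditional IBM1 above. It only uses names that appear in this listing
# (get_joint_ibm1z, the JointModel variants of EM and map_decoder, print_map); the function
# name `main_joint` and the hyperparameter values (n_clusters=3, iterations=10) are
# illustrative choices, not prescribed by the toolkit.
def main_joint(e_path, f_path):
    e_corpus = Corpus(open(e_path), null='<null>')
    f_corpus = Corpus(open(f_path))
    # a cluster-conditioned IBM1 with a small number of clusters (illustrative value)
    model = get_joint_ibm1z(e_corpus, f_corpus, n_clusters=3)
    EM(e_corpus, f_corpus, model, iterations=10)
    # decode the MAP cluster and alignment for each sentence pair and print them
    map_decoder(e_corpus, f_corpus, model,
                partial(print_map, e_corpus=e_corpus, f_corpus=f_corpus, ostream=sys.stdout))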