Example #1
def marginal_likelihood(e_corpus: Corpus, f_corpus: Corpus, model: ConditionalModel):

    PL, PM, PAj, PFj = model.components
    ll = 0.0
    for e_snt, f_snt in zip(e_corpus.itersentences(), f_corpus.itersentences()):
        # observations
        l = e_snt.shape[0]
        m = f_snt.shape[0]
        log_pl = np.log(PL.generate(l))
        log_pm = np.log(PM.generate(m))

        # P(f|e) = \prod_j P(f_j|e)
        #          = \prod_j \sum_i P(f_j,a_j=i|e)
        log_pf_e = 0.0
        for j, f in enumerate(f_snt):
            # P(f_j|e) = \sum_i P(f_j,a_j=i|e)
            pfj_e = 0.0  # contribution of this French word
            for i, e in enumerate(e_snt):
                # P(f_j, a_j=i | e) = P(a_j=i) P(f_j|e_i, l, m)
                pfj_e += PAj.generate((j, i), e_snt, 0, l, m) * PFj.generate((j, f), (j, i), e_snt, 0, l, m)
            # P(f|e) = \prod_j P(f_j|e)
            log_pf_e += np.log(pfj_e)
        # accumulate \log P(l) P(m) P(f|e,l,m) over sentence pairs
        ll += log_pl + log_pm + log_pf_e
    # average negative log-likelihood per sentence
    return -ll / e_corpus.n_sentences()
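The inner sum over English positions is accumulated in probability space, which is fine for a sum of at most l terms; if the lexical probabilities are small enough to underflow, the same loop can be kept entirely in log space. A minimal sketch of that variant, reusing the PAj/PFj call signatures from above:

import numpy as np

def log_pfj_given_e(e_snt, PAj, PFj, l, m, j, f):
    # log P(f_j|e) = log \sum_i P(a_j=i) P(f_j|e_i), accumulated with logaddexp
    acc = -np.inf  # log(0): the identity element for logaddexp
    for i in range(l):
        acc = np.logaddexp(
            acc,
            np.log(PAj.generate((j, i), e_snt, 0, l, m)) +
            np.log(PFj.generate((j, f), (j, i), e_snt, 0, l, m)))
    return acc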
Example #2
def get_joint_ibm1(e_corpus: Corpus, f_corpus: Corpus):
    PL = cat.LengthDistribution()
    PM = cat.LengthDistribution()
    PZ = cat.ClusterDistribution(1)
    PEi = cat.UnigramMixture(1, e_corpus.vocab_size())
    PAj = cat.UniformAlignment()
    PFj = cat.BrownLexical(e_corpus.vocab_size(), f_corpus.vocab_size())
    return JointModel(PL, PM, PZ, PEi, PAj, PFj)
Example #3
from functools import partial


def main(e_path, f_path):

    e_corpus = Corpus(open(e_path), null='<null>')
    f_corpus = Corpus(open(f_path))

    model = get_ibm1(e_corpus, f_corpus)

    EM(e_corpus, f_corpus, model, iterations=10)

    map_decoder(e_corpus, f_corpus, model,
                partial(print_map, e_corpus=e_corpus, f_corpus=f_corpus))
Example #4
import sys
from functools import partial


def main(e_path, f_path):

    e_corpus = Corpus(open(e_path), null='<null>')
    f_corpus = Corpus(open(f_path))

    model = get_ibm1(e_corpus, f_corpus)

    EM(e_corpus, f_corpus, model, iterations=10)
    from lola.io import print_lola_format
    map_decoder(e_corpus, f_corpus, model,
                partial(print_lola_format,
                        e_corpus=e_corpus,
                        f_corpus=f_corpus,
                        ostream=sys.stdout))
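Either variant of main can be exposed as a script with a minimal entry point; the sketch below assumes the two command-line arguments are paths to the English and French sides of the parallel corpus:

if __name__ == '__main__':
    main(sys.argv[1], sys.argv[2])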
Example #5
def read_component(e_corpus: Corpus, f_corpus: Corpus, args, line: str, i: int, state: Config):
    """
    Instantiate components.
    If you contribute a new component, make sure to construct it here.

    :param e_corpus: data we condition on (English)
    :param f_corpus: data we generate (French)
    :param args: additional arguments (unused here)
    :param line: configuration line being parsed
    :param i: line number (for error reporting)
    :param state: Config object the component is added to
    :return:
    """
    try:
        cfg, [name, _] = util.re_sub('^([^:]+)(:)', '', line)
    except Exception:
        raise ValueError('In line %d, expected component name: %s' % (i, line))

    if state.has_component(name):
        raise ValueError('Duplicate component name in line %d: %s' % (i, name))

    cfg, component_type = util.re_key_value('type', cfg, optional=False, dtype=str)

    if component_type == 'BrownLexical':
        state.add_component(name, BrownLexical(e_corpus, f_corpus, name=name))
    elif component_type == 'UniformAlignment':
        state.add_component(name, UniformAlignment(name=name))
    elif component_type == 'VogelJump':
        state.add_component(name, VogelJump(e_corpus.max_len(), name=name))
    elif component_type == "LexMLP":
        state.add_component(name, MLPComponent.construct(e_corpus, f_corpus, name, cfg))
    elif component_type == "LexLR":
        state.add_component(name, LRComponent.construct(e_corpus, f_corpus, name, cfg))
    else:
        raise ValueError("I do not know this type of generative component: %s" % component_type)
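The regex ^([^:]+)(:) strips a leading component name, and util.re_key_value then reads a mandatory type entry from the remainder, with any leftover key=value pairs handed to the component's construct method. So a configuration line plausibly looks like the following (the names are invented; only the type values come from the branches above):

lexical: type=BrownLexical
jump: type=VogelJump
lexmlp: type=LexMLP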
Example #6
def print_map(s: int,
              z: int,
              a: 'np.array',
              pz: float,
              pa: 'np.array',
              e_corpus: Corpus,
              f_corpus: Corpus,
              ostream=sys.stdout):

    e_snt = e_corpus.sentence(s)
    f_snt = f_corpus.sentence(s)
    tokens = []
    for j, (i, p) in enumerate(zip(a, pa)):
        tokens.append('%d:%s|%d:%s|%.2f' % (j + 1, f_corpus.translate(f_snt[j]),
                                            i, e_corpus.translate(e_snt[i]), p))
    print('%d|%.2f ||| %s' % (z, pz, ' '.join(tokens)), file=ostream)
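For reference, print_map writes one line per sentence pair: the MAP cluster and its posterior, then one j:f_j|i:e_i|p triple per French position (French positions are 1-based, English 0-based because of the NULL token). An illustrative line with invented words and probabilities:

0|0.73 ||| 1:le|1:the|0.92 2:chat|2:cat|0.88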
Example #7
def get_joint_ibm1z(e_corpus: Corpus,
                    f_corpus: Corpus,
                    n_clusters=1,
                    cluster_unigrams=True,
                    alpha=1.0):
    PL = cat.LengthDistribution()
    PM = cat.LengthDistribution()
    if cluster_unigrams:
        PZ = cat.ClusterUnigrams(n_clusters)
    else:
        PZ = cat.ClusterDistribution(n_clusters)
    PEi = cat.UnigramMixture(n_clusters, e_corpus.vocab_size(), alpha)
    PAj = cat.UniformAlignment()
    PFj = cat.MixtureOfBrownLexical(n_clusters, e_corpus.vocab_size(),
                                    f_corpus.vocab_size(), alpha)
    return JointModel(PL, PM, PZ, PEi, PAj, PFj)
Example #8
def read_corpora(training_path: str, test_path: str, generating: bool,
                 min_count: int, max_count: int) -> (CorpusView, CorpusView):
    """
    Return training and test data.

    :param training_path: path to training corpus
    :param test_path: path to test corpus (or None)
    :param generating: whether this is the side we are generating (French)
    :param min_count: minimum frequency for word to be retained in the vocabulary
    :param max_count: maximum frequency for word to be retained in the vocabulary
    :return: Training view and test view
    """
    if test_path is None:  # no test corpus
        if generating:
            corpus = Corpus(training_path,
                            min_count=min_count,
                            max_count=max_count)
        else:  # we are conditioning on this corpus
            corpus = Corpus(training_path,
                            null='<NULL>',
                            min_count=min_count,
                            max_count=max_count)
        return corpus, None
    else:
        # read training data
        with open(training_path, 'r') as fi:
            lines = fi.readlines()
        n_training = len(lines)
        # read test data
        with open(test_path, 'r') as fi:
            lines.extend(fi.readlines())
        n_test = len(lines) - n_training
        # create a big corpus with everything
        if generating:
            corpus = Corpus(lines, min_count=min_count, max_count=max_count)
        else:  # we are conditioning on this corpus
            corpus = Corpus(lines,
                            null='<NULL>',
                            min_count=min_count,
                            max_count=max_count)
        # return two different views: the training view and the test view
        return (CorpusView(corpus, 0, n_training),
                CorpusView(corpus, n_training, n_test))
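A hedged usage sketch: the side we condition on (English) gets the <NULL> token, the side we generate (French) does not; the paths and count thresholds below are hypothetical:

e_training, e_test = read_corpora('train.en', 'test.en', generating=False,
                                  min_count=1, max_count=100000)
f_training, f_test = read_corpora('train.fr', 'test.fr', generating=True,
                                  min_count=1, max_count=100000)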
Example #9
def marginal_likelihood(e_corpus: Corpus, f_corpus: Corpus, model: JointModel):

    PL, PM, PZ, PEi, PAj, PFj = model.components
    n_clusters = PZ.n_clusters
    ll = 0.0
    for e_snt, f_snt in zip(e_corpus.itersentences(),
                            f_corpus.itersentences()):
        # observations
        l = e_snt.shape[0]
        m = f_snt.shape[0]
        log_pl = np.log(PL.generate(l))
        log_pm = np.log(PM.generate(m))
        # 0-order alignments
        # P(f,e) = \sum_z P(z) P(e|z) P(f|z,e)
        log_pfe = -np.inf  # contribution of this sentence
        for z in range(n_clusters):
            # contribution of the cluster
            log_pz = np.log(PZ.generate(z, l, m))
            # compute the contribution of the entire English sentence
            log_pe_z = 0.0
            # P(e|z) = \prod_i P(e_i|z)
            for i, e in enumerate(e_snt):
                log_pe_z += np.log(PEi.generate((i, e), z, l, m))

            # P(f|z,e) = \prod_j P(f_j|z,e)
            #          = \prod_j \sum_i P(f_j,a_j=i|z,e)
            log_pf_ze = 0.0
            for j, f in enumerate(f_snt):
                # P(f_j|z,e) = \sum_i P(f_j,a_j=i|z,e)
                pfj_ze = 0.0  # contribution of this French word
                for i, e in enumerate(e_snt):
                    pfj_ze += PAj.generate(
                        (j, i), e_snt, z, l, m) * PFj.generate(
                            (j, f), (j, i), e_snt, z, l, m)
                # P(f|z,e) = \prod_j P(f_j|z,e)
                log_pf_ze += np.log(pfj_ze)
            # \sum_z P(z) P(e|z) P(f|z,e)
            log_pfe = np.logaddexp(log_pfe, log_pz + log_pe_z + log_pf_ze)
        # accumulate \log P(l) P(m) P(f,e|l,m) over sentence pairs
        ll += log_pl + log_pm + log_pfe
    # average negative log-likelihood per sentence
    return -ll / e_corpus.n_sentences()
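Initialising the accumulator at -np.inf works because log 0 is the identity element for np.logaddexp, so the first cluster's term is taken over unchanged. A quick check:

import numpy as np
assert np.logaddexp(-np.inf, np.log(0.3)) == np.log(0.3)
acc = np.logaddexp(-np.inf, np.log(0.2))   # first cluster
acc = np.logaddexp(acc, np.log(0.5))       # second cluster
assert np.isclose(acc, np.log(0.7))        # log(0.2 + 0.5)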
Example #10
def map_decoder(e_corpus: Corpus, f_corpus: Corpus, model: JointModel,
                callback):
    """

    :param e_corpus: English data
    :param f_corpus: French data
    :param model: components
    :param callback: called for each sentence in the parallel corpus
        callable(s, z, a, p(z|f,e), p(a|z,f,e))
    """

    n_clusters = model.PZ.n_clusters
    # decode each sentence pair
    for s, (e_snt, f_snt) in enumerate(
            zip(e_corpus.itersentences(), f_corpus.itersentences())):

        log_pz_fe, log_post_a = log_posterior(e_snt, f_snt, model)

        # Here we get the best path for each cluster
        best_paths_z = log_post_a.argmax(2)  # shape: (n_clusters, m)

        # Now we find out which path is the best one across clusters
        best_z = 0
        best_log_prob = -np.inf
        for z in range(n_clusters):
            # p(z,a|f,e) = p(z|f,e) p(a|z,f,e)
            path_log_prob = log_pz_fe[z] + np.sum(
                [log_post_a[z, j, i] for j, i in enumerate(best_paths_z[z])])
            if path_log_prob > best_log_prob:  # update if better
                best_log_prob = path_log_prob
                best_z = z

        # best posterior probabilities: p(a_j|best_z, f, e)
        best_log_pa_zfe = np.array(
            [log_post_a[best_z, j, i] for j, i in enumerate(best_paths_z[best_z])])

        # communicate the finding
        callback(s, best_z, best_paths_z[best_z], np.exp(log_pz_fe[best_z]),
                 np.exp(best_log_pa_zfe))
Example #11
def map_decoder(e_corpus: Corpus, f_corpus: Corpus, model: ConditionalModel, callback):
    """

    :param e_corpus: English data
    :param f_corpus: French data
    :param model: components
    :param callback: called for each sentence in the parallel corpus
        callable(s, a, p(a|f,e))
    """

    # decode each sentence pair
    for s, (e_snt, f_snt) in enumerate(zip(e_corpus.itersentences(), f_corpus.itersentences())):

        log_post_a = log_posterior(e_snt, f_snt, model)

        # best alignment path: a_j = argmax_i p(a_j=i|f,e)
        best_path = log_post_a.argmax(1)  # shape: (m,)

        # best posterior probabilities: p(a_j|f,e)
        best_posterior = np.array([log_post_a[j, i] for j, i in enumerate(best_path)])

        # communicate the finding
        callback(s, best_path, np.exp(best_posterior))
Example #12
def print_lola_format(sid, alignments, posterior, e_corpus: Corpus, f_corpus: Corpus, ostream):
    """
    Print alignment in a human readable format.

    :param e_corpus: data we condition on
    :param f_corpus: data we generate
    :param sid: sentence id
    :param alignments: alignments (sequence of a_j values for each j)
    :param posterior: posterior p(a_j|f,e)
    :param ostream: where to write alignments to
    :return:
    """
    e_snt = e_corpus.sentence(sid)
    f_snt = f_corpus.sentence(sid)
    # in printing we make the French sentence 1-based by convention
    # we keep the English sentence 0-based because of the NULL token
    print(' '.join(['{0}:{1}|{2}:{3}|{4:.2f}'.format(j + 1,
                                                 f_corpus.translate(f_snt[j]),
                                                 i,
                                                 e_corpus.translate(e_snt[i]),
                                                 p)
                    for j, (i, p) in enumerate(zip(alignments, posterior))]),
          file=ostream)
Example #13
def print_lola_format(sid, alignments, posterior, e_corpus: Corpus,
                      f_corpus: Corpus, ostream):
    """
    Print alignment in a human readable format.

    :param e_corpus: data we condition on
    :param f_corpus: data we generate
    :param sid: sentence id
    :param alignments: alignments (sequence of a_j values for each j)
    :param posterior: posterior p(a_j|f,e)
    :param ostream: where to write alignments to
    :return:
    """
    e_snt = e_corpus.sentence(sid)
    f_snt = f_corpus.sentence(sid)
    # in printing we make the French sentence 1-based by convention
    # we keep the English sentence 0-based because of the NULL token
    print(' '.join([
        '{0}:{1}|{2}:{3}|{4:.2f}'.format(j + 1, f_corpus.translate(f_snt[j]),
                                         i, e_corpus.translate(e_snt[i]), p)
        for j, (i, p) in enumerate(zip(alignments, posterior))
    ]),
          file=ostream)
Example #14
def EM(e_corpus: Corpus, f_corpus: Corpus, model: ConditionalModel, iterations=5):
    """
    Generative story:

        l ~ P(L)
        m ~ P(M)
        a_j ~ P(A_j | l) for j=1..m
        f_j ~ P(F_j | e_{a_j}) for j=1..m

    :param e_corpus: English data
    :param f_corpus: French data
    :param model: a conditional model
    :param iterations: EM iterations
    """

    PL, PM, PAj, PFj = model.components

    logging.info('Iteration %d Likelihood %f', 0, marginal_likelihood(e_corpus, f_corpus, model))

    for iteration in range(1, iterations + 1):
        # E-step
        for s, (e_snt, f_snt) in enumerate(zip(e_corpus.itersentences(), f_corpus.itersentences())):
            # get the posterior P(a|f,e)
            post_a = posterior(e_snt, f_snt, model)
            l = e_snt.shape[0]
            m = f_snt.shape[0]

            # gather expected counts for (f_j, e_i): p(a_j=i|f,e)
            for j, f in enumerate(f_snt):
                for i, e in enumerate(e_snt):
                    PAj.observe((j, i), e_snt, 0, l, m, post_a[j, i])
                    PFj.observe((j, f), (j, i), e_snt, 0, l, m, post_a[j, i])

        # M-step
        model.update()

        logging.info('Iteration %d Likelihood %f', iteration, marginal_likelihood(e_corpus, f_corpus, model))
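The posterior call above is not shown on this page; under the generative story in the docstring it reduces to P(a_j=i|f,e) being proportional to P(a_j=i|l,m) P(f_j|e_i), so a minimal sketch consistent with the component signatures used here could be:

import numpy as np

def posterior(e_snt, f_snt, model):
    # Sketch: returns post_a with shape (m, l) where post_a[j, i] = P(a_j=i|f,e)
    PL, PM, PAj, PFj = model.components
    l, m = e_snt.shape[0], f_snt.shape[0]
    post_a = np.zeros((m, l))
    for j, f in enumerate(f_snt):
        for i, e in enumerate(e_snt):
            post_a[j, i] = PAj.generate((j, i), e_snt, 0, l, m) * \
                PFj.generate((j, f), (j, i), e_snt, 0, l, m)
        post_a[j] /= post_a[j].sum()  # normalise over English positions
    return post_a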
Example #15
def read_component(e_corpus: Corpus, f_corpus: Corpus, args, line: str, i: int,
                   state: Config):
    """
    Instantiate components.
    If you contribute a new component, make sure to construct it here.

    :param e_corpus: data we condition on (English)
    :param f_corpus: data we generate (French)
    :param args: additional arguments (unused here)
    :param line: configuration line being parsed
    :param i: line number (for error reporting)
    :param state: Config object the component is added to
    :return:
    """
    try:
        cfg, [name, _] = util.re_sub('^([^:]+)(:)', '', line)
    except Exception:
        raise ValueError('In line %d, expected component name: %s' % (i, line))

    if state.has_component(name):
        raise ValueError('Duplicate component name in line %d: %s' % (i, name))

    cfg, component_type = util.re_key_value('type',
                                            cfg,
                                            optional=False,
                                            dtype=str)

    if component_type == 'BrownLexical':
        state.add_component(name, BrownLexical(e_corpus, f_corpus, name=name))
    elif component_type == 'UniformAlignment':
        state.add_component(name, UniformAlignment(name=name))
    elif component_type == 'VogelJump':
        state.add_component(name, VogelJump(e_corpus.max_len(), name=name))
    elif component_type == "LexMLP":
        state.add_component(
            name, MLPComponent.construct(e_corpus, f_corpus, name, cfg))
    elif component_type == "LexLR":
        state.add_component(
            name, LRComponent.construct(e_corpus, f_corpus, name, cfg))
    else:
        raise ValueError(
            "I do not know this type of generative component: %s" %
            component_type)
Example #16
    def __init__(self, e_corpus: Corpus,
                 f_corpus: Corpus,
                 name: str = "lexlr",
                 rng=np.random.RandomState(1234),
                 hidden=[100],
                 learning_rate=0.1,
                 max_iterations=100,
                 patience=10,
                 patience_increase=2,
                 improvement_threshold=0.995):
        """

        :param e_corpus: data we condition on
        :param f_corpus: data we generate
        :param name: name of the component
        :param rng: numpy random state
        :param hidden: dimensionality of hidden layers
        :param learning_rate: initial learning rate
        :param max_iterations: maximum number of updates
        :param patience: minimum number of updates
        :param patience_increase: factor by which patience grows when a significant improvement is seen
        :param improvement_threshold: relative improvement considered significant for early stopping
        """
        super(LRComponent, self).__init__(name, LexEventSpace(e_corpus, f_corpus))

        # TODO: generalise to batches?
        self._corpus_size = e_corpus.n_sentences()
        self._learning_rate = learning_rate
        self._max_iterations = max_iterations
        self._patience = patience
        self._patience_increase = patience_increase
        self._improvement_threshold = improvement_threshold

        # The event space determines the input and output dimensionality
        vE, vF = self.event_space.shape
        # TODO: Featurize(event_space)
        # for now my features are (English word identity concatenated with French word identity)
        # TODO: create a better matrix where we have
        # vE * vF rows but we have d1 + d2 + d3 columns where d1 is the E embedding, d2 is the F embedding and d3 is whatever else
        self._X = np.zeros((vE * vF, vE + vF), dtype=theano.config.floatX)
        for e, f in product(range(vE), range(vF)):
            self._X[e * vF + f, e] = 1.0
            self._X[e * vF + f, vE + f] = 1.0

        # Create MLP
        builder = NNBuilder(rng)
        # ... the embedding layer
        builder.add_layer(vE + vF, hidden[0])
        # ... additional hidden layers
        for di, do in zip(hidden, hidden[1:]):
            builder.add_layer(di, do)
        # The Logistic Regression adds the final scoring layer and is responsible for normalisation over vF classes
        self._nn = LR(builder, vE, vF)  # type: MLP

        # Create Theano variables for the MLP input
        nn_input = T.matrix('mlp_input')
        # ... and the expected output
        nn_expected = T.matrix('mlp_expected')
        learning_rate = T.scalar('learning_rate')

        # Momentum hyperparameter (the learning rate is the symbolic input above).
        # For non-toy problems these values can make a big difference as to
        # whether the network (quickly) converges on a good local minimum.
        momentum = 0

        # Create a theano function for computing the MLP's output given some input
        self._nn_output = theano.function([nn_input], self._nn.output(nn_input))
        # Create a function for computing the cost of the network given an input
        cost = - self._nn.expected_logprob(nn_input, nn_expected)
        # Create a theano function for training the network
        self._train = theano.function([nn_input, nn_expected, learning_rate],
                                      # cost function
                                      cost,
                                      updates=gradient_updates_momentum(cost,
                                                                        self._nn.params,
                                                                        learning_rate,
                                                                        momentum))

        # table to store the CPDs (output of LR reshaped into a (vE, vF) matrix)
        self._cpds = self._nn_output(self._X).reshape(self.event_space.shape)
        # table to gather expected counts
        self._counts = np.zeros(self.event_space.shape, dtype=theano.config.floatX)
        self._i = 0
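The feature matrix self._X concatenates one-hot encodings of the English and French word: row e * vF + f carries a 1 at column e and a 1 at column vE + f. A tiny worked example with vE=2, vF=3:

import numpy as np
from itertools import product

vE, vF = 2, 3
X = np.zeros((vE * vF, vE + vF))
for e, f in product(range(vE), range(vF)):
    X[e * vF + f, e] = 1.0
    X[e * vF + f, vE + f] = 1.0
# the row for the pair (e=1, f=2) sits at index 1 * 3 + 2 = 5
assert (X[5] == [0, 1, 0, 0, 1]).all()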
Example #17
    def __init__(self, e_corpus: Corpus,
                 f_corpus: Corpus,
                 name: str = "lexmlp",
                 rng=np.random.RandomState(1234),
                 hidden=[100],
                 learning_rate=0.1,
                 max_iterations=100,
                 patience=10,
                 patience_increase=2,
                 improvement_threshold=0.995):
        """

        :param e_corpus: data we condition on
        :param f_corpus: data we generate
        :param name: name of the component
        :param rng: numpy random state
        :param hidden: dimensionality of hidden layers
        :param learning_rate: initial learning rate
        :param max_iterations: maximum number of updates
        :param patience: minimum number of updates
        :param patience_increase: factor by which patience grows when a significant improvement is seen
        :param improvement_threshold: relative improvement considered significant for early stopping
        """

        self._corpus_size = e_corpus.n_sentences()
        self._learning_rate = learning_rate
        self._max_iterations = max_iterations
        self._patience = patience
        self._patience_increase = patience_increase
        self._improvement_threshold = improvement_threshold

        # The event space determines the input and output dimensionality
        self.n_input, self.n_output = e_corpus.vocab_size(), f_corpus.vocab_size()
        # Input for the classifiers
        self._X = np.identity(self.n_input, dtype=theano.config.floatX)

        # Create MLP
        builder = NNBuilder(rng)
        # ... the embedding layer
        builder.add_layer(self.n_input, hidden[0])
        # ... additional hidden layers
        for di, do in zip(hidden, hidden[1:]):
            builder.add_layer(di, do)
        # The MLP adds a softmax layer over n_classes
        self._mlp = MLP(builder, n_classes=f_corpus.vocab_size())  # type: MLP

        # Create Theano variables for the MLP input
        mlp_input = T.matrix('mlp_input')
        # ... and the expected output
        mlp_expected = T.matrix('mlp_expected')
        learning_rate = T.scalar('learning_rate')

        # Momentum hyperparameter (the learning rate is the symbolic input above).
        # For non-toy problems these values can make a big difference as to
        # whether the network (quickly) converges on a good local minimum.
        momentum = 0

        # Create a theano function for computing the MLP's output given some input
        self._mlp_output = theano.function([mlp_input], self._mlp.output(mlp_input))

        # Create a function for computing the cost of the network given an input
        cost = - self._mlp.expected_logprob(mlp_input, mlp_expected)

        # Create a theano function for training the network
        self._train = theano.function([mlp_input, mlp_expected, learning_rate],
                                      # cost function
                                      cost,
                                      updates=gradient_updates_momentum(cost,
                                                                        self._mlp.params,
                                                                        learning_rate,
                                                                        momentum))

        # table to store the CPDs (output of MLP)
        self._cpds = self._mlp_output(self._X)
        # table to gather expected counts
        self._counts = np.zeros((self.n_input, self.n_output), dtype=theano.config.floatX)
        self._i = 0
Example #18
    def __init__(self,
                 e_corpus: Corpus,
                 f_corpus: Corpus,
                 name: str = "lexmlp",
                 rng=np.random.RandomState(1234),
                 hidden=[100],
                 learning_rate=0.1,
                 max_iterations=100,
                 patience=10,
                 patience_increase=2,
                 improvement_threshold=0.995):
        """

        :param e_corpus: data we condition on
        :param f_corpus: data we generate
        :param name: name of the component
        :param rng: numpy random state
        :param hidden: dimensionality of hidden layers
        :param learning_rate: initial learning rate
        :param max_iterations: maximum number of updates
        :param patience: minimum number of updates
        :param patience_increase: factor by which patience grows when a significant improvement is seen
        :param improvement_threshold: relative improvement considered significant for early stopping
        """
        super(MLPComponent, self).__init__(name,
                                           LexEventSpace(e_corpus, f_corpus))

        # TODO: generalise to batches?
        self._corpus_size = e_corpus.n_sentences()
        self._learning_rate = learning_rate
        self._max_iterations = max_iterations
        self._patience = patience
        self._patience_increase = patience_increase
        self._improvement_threshold = improvement_threshold

        # The event space determines the input and output dimensionality
        self.n_input, self.n_output = self.event_space.shape
        # Input for the classifiers (TODO: should depend on the event space more closely)
        self._X = np.identity(self.n_input, dtype=theano.config.floatX)

        # Create MLP
        builder = NNBuilder(rng)
        # ... the embedding layer
        builder.add_layer(self.n_input, hidden[0])
        # ... additional hidden layers
        for di, do in zip(hidden, hidden[1:]):
            builder.add_layer(di, do)
        # The MLP adds the softmax output layer over n_classes
        self._mlp = MLP(builder, n_classes=self.n_output)  # type: MLP

        # Create Theano variables for the MLP input
        mlp_input = T.matrix('mlp_input')
        # ... and the expected output
        mlp_expected = T.matrix('mlp_expected')
        learning_rate = T.scalar('learning_rate')

        # Momentum hyperparameter (the learning rate is the symbolic input above).
        # For non-toy problems these values can make a big difference as to
        # whether the network (quickly) converges on a good local minimum.
        momentum = 0

        # Create a theano function for computing the MLP's output given some input
        self._mlp_output = theano.function([mlp_input],
                                           self._mlp.output(mlp_input))

        # Create a function for computing the cost of the network given an input
        cost = -self._mlp.expected_logprob(mlp_input, mlp_expected)
        # Create a theano function for training the network
        self._train = theano.function(
            [mlp_input, mlp_expected, learning_rate],
            # cost function
            cost,
            updates=gradient_updates_momentum(cost, self._mlp.params,
                                              learning_rate, momentum))

        # table to store the CPDs (output of MLP)
        self._cpds = self._mlp_output(self._X)
        # table to gather expected counts
        self._counts = np.zeros(self.event_space.shape,
                                dtype=theano.config.floatX)
        self._i = 0
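The gradient_updates_momentum helper used by all three constructors is not shown on this page; a plausible definition, following the classic Theano tutorial pattern of one velocity shared variable per parameter, is:

import theano
import theano.tensor as T

def gradient_updates_momentum(cost, params, learning_rate, momentum):
    # Sketch: SGD with momentum expressed as Theano update pairs
    updates = []
    for param in params:
        # velocity: a shared variable with the parameter's shape, initialised to zero
        velocity = theano.shared(param.get_value() * 0.,
                                 broadcastable=param.broadcastable)
        updates.append((param, param - learning_rate * velocity))
        updates.append((velocity,
                        momentum * velocity + (1. - momentum) * T.grad(cost, param)))
    return updates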
Example #19
def get_ibm1(e_corpus: Corpus, f_corpus: Corpus):
    PL = cat.LengthDistribution()
    PM = cat.LengthDistribution()
    PAj = cat.UniformAlignment()
    PFj = cat.BrownLexical(e_corpus.vocab_size(), f_corpus.vocab_size())
    return ConditionalModel(PL, PM, PAj, PFj)
Example #20
def EM(e_corpus: Corpus, f_corpus: Corpus, model: JointModel, iterations=5):
    """
    Generative story:

        l ~ P(L)
        m ~ P(M)
        z ~ P(Z)
        e_i ~ P(E_i | z) for i=1..l
        a_j ~ P(A_j | l) for j=1..m
        f_j ~ P(F_j | e_{a_j}, z) for j=1..m

    Joint distribution:
        P(F,E,A,Z,L,M) = P(L)P(M)P(Z)P(E|Z)P(A|L,M)P(F|E,A,Z,L,M)

    We make the following independence assumptions:

        P(e|z) = \prod_i P(e_i|z)
        P(f,a|e,z,l,m) = \prod_j P(a_j|l,m)P(f_j|e_{a_j},z)

    The EM algorithm depends on 2 posterior computations:

        [1] P(z|f,e,l,m) = P(z)P(e|z)P(f|e,z)/P(f,e)
        where
            P(e|z) = \prod_i P(e_i|z)
            P(f|e,z) = \sum_a P(f,a|e,z) = \prod_j \sum_i P(a_j=i)P(f_j|e_i,z)
            P(f,e) = \sum_z \sum_a P(f,e,z,a|l,m)
                = \sum_z P(z)P(e|z) P(f|e,z)
                = \sum_z P(z)P(e|z) \prod_j \sum_i P(a_j=i) P(f_j|e_i,z)
        and
        [2] P(a|f,e,z) = P(a,z,f,e|l,m) / P(z,f,e|l,m)
            =    P(z)P(e|z)P(a|l,m)P(f|e,a,z)
              ------------------------------------
              \sum_a P(z)P(e|z)P(a|l,m)P(f|e,a,z)
            =    P(z)P(e|z)P(a|l,m)P(f|e,a,z)
              ------------------------------------
              P(z)P(e|z)\sum_a P(a|l,m)P(f|e,a,z)
            =   P(z)P(e|z)\prod_j P(a_j|l,m)P(f_j|e_{a_j},z)
              -------------------------------------------------
              P(z)P(e|z)\prod_j\sum_i P(a_j=i|l,m)P(f_j|e_i,z)
            = \prod_j     P(a_j|l,m)P(f_j|e_{a_j},z)
                       -------------------------------
                       \sum_i P(a_j=i|l,m)P(f_j|e_i,z)
            = \prod_j P(a_j|f, e, z)
        where
            P(a_j|f,e,z) =    P(a_j|l,m)P(f_j|e_{a_j},z)
                           -------------------------------
                           \sum_i P(a_j=i|l,m)P(f_j|e_i,z)

    Note that the choice of parameterisation is independent of the EM algorithm in this method.
    For example,
        P(a_j|l,m) can be
            * uniform (IBM1)
            * categorical (IBM2)
        P(f_j|e_{a_j}, z) can be
            * categorical and independent of z, i.e. P(f_j|e_{a_j}, z) = P(f_j|e_{a_j})
            * categorical
            * PoE: P(f_j|e_{a_j}, z) \propto P(f_j|e_{a_j}) P(f_j|z)
            * all of the above using MLP or LR instead of categorical distributions
        we can also have P(a_j|l,m)P(f_j|e_{a_j}, z) modelled by a single LR (with MLP-induced features).

    :param e_corpus: English data
    :param f_corpus: French data
    :param model: all components
    :param iterations: EM iterations
    """

    PL, PM, PZ, PEi, PAj, PFj = model.components
    n_clusters = PZ.n_clusters

    logging.info('Iteration %d Likelihood %f', 0,
                 marginal_likelihood(e_corpus, f_corpus, model))

    for iteration in range(1, iterations + 1):
        # E-step
        for s, (e_snt, f_snt) in enumerate(
                zip(e_corpus.itersentences(), f_corpus.itersentences())):
            # get the factorised posterior: P(z|f,e) and P(a|z,f,e)
            post_z, post_a = posterior(e_snt, f_snt, model)
            l = e_snt.shape[0]
            m = f_snt.shape[0]
            for z in range(n_clusters):
                # gather expected count for z: p(z|f, e)
                PZ.observe(z, l, m, post_z[z])
                # gather expected counts for (z, e_i): p(z|f, e)
                for i, e in enumerate(e_snt):
                    PEi.observe((i, e), z, l, m, post_z[z])
                # gather expected counts for (f_j, e_i): p(a_j=i|f,e,z)
                for j, f in enumerate(f_snt):
                    for i, e in enumerate(e_snt):
                        PAj.observe((j, i), e_snt, z, l, m, post_a[z, j, i])
                        PFj.observe((j, f), (j, i), e_snt, z, l, m,
                                    post_a[z, j, i])

        # M-step
        model.update()

        logging.info('Iteration %d Likelihood %f', iteration,
                     marginal_likelihood(e_corpus, f_corpus, model))
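As in the conditional case, the factorised posterior used above is not on this page; a sketch that follows equations [1] and [2] from the docstring, with the same component signatures as marginal_likelihood:

import numpy as np

def posterior(e_snt, f_snt, model):
    # Sketch: returns (post_z, post_a) with shapes (n_clusters,) and (n_clusters, m, l)
    PL, PM, PZ, PEi, PAj, PFj = model.components
    n_clusters = PZ.n_clusters
    l, m = e_snt.shape[0], f_snt.shape[0]
    log_joint = np.zeros(n_clusters)     # log P(z) P(e|z) P(f|e,z)
    post_a = np.zeros((n_clusters, m, l))
    for z in range(n_clusters):
        log_w = np.log(PZ.generate(z, l, m))
        for i, e in enumerate(e_snt):
            log_w += np.log(PEi.generate((i, e), z, l, m))
        for j, f in enumerate(f_snt):
            for i, e in enumerate(e_snt):
                post_a[z, j, i] = PAj.generate((j, i), e_snt, z, l, m) * \
                    PFj.generate((j, f), (j, i), e_snt, z, l, m)
            total = post_a[z, j].sum()
            post_a[z, j] /= total        # equation [2]: P(a_j=i|f,e,z)
            log_w += np.log(total)       # accumulates log P(f|e,z)
        log_joint[z] = log_w
    # equation [1]: P(z|f,e) by normalising the joint in log space
    post_z = np.exp(log_joint - np.logaddexp.reduce(log_joint))
    return post_z, post_a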