Example #1
    def __init__(self, source_char_seq, source_sample_matrix, source_char_aux,
                 source_word_mask, samples, model, data_stream,
                 config, testing_model, n_best=1, track_n_models=1,
                 normalize=True, **kwargs):
        # TODO: change config structure
        super(BleuTester, self).__init__(**kwargs)
        self.source_char_seq = source_char_seq
        self.source_sample_matrix = source_sample_matrix
        self.source_char_aux = source_char_aux
        self.source_word_mask = source_word_mask
        self.samples = samples
        self.model = model
        self.data_stream = data_stream
        self.config = config
        self.testing_model = testing_model
        self.n_best = n_best
        self.track_n_models = track_n_models
        self.normalize = normalize
        self.verbose = True

        # Helpers
        self.vocab = data_stream.dataset.dictionary
        self.src_ivocab = {v: k for k, v in self.vocab.items()}
        self.unk_sym = data_stream.dataset.unk_token
        self.eos_sym = data_stream.dataset.eos_token
        self.unk_idx = self.vocab[self.unk_sym]
        self.eos_idx = self.vocab[self.eos_sym]
        self.beam_search = BeamSearch(samples=samples)
        self.multibleu_cmd = ['perl', self.config['bleu_script'],
                              self.config['test_set_grndtruth'], '<']
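For context, a multibleu_cmd list like the one above is typically consumed by piping the decoded hypotheses into the BLEU script through a subprocess. A minimal sketch of that pattern, assuming the standard multi-bleu.perl interface; the hypotheses variable and the parsing of the score line are illustrative, not part of the original class:

from subprocess import Popen, PIPE

mb_subprocess = Popen(multibleu_cmd, stdin=PIPE, stdout=PIPE)
for hyp in hypotheses:  # decoded sentences, one string per source line
    mb_subprocess.stdin.write((hyp + '\n').encode('utf-8'))
mb_subprocess.stdin.close()
# multi-bleu.perl prints a line such as "BLEU = 27.53, 61.1/33.9/..."
score_line = mb_subprocess.stdout.readline().decode('utf-8')
bleu_score = float(score_line.split(',')[0].split('=')[1])
mb_subprocess.terminate()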
Example #2
def test_beam_search():
    """Test beam search using the model from the reverse_words demo.

    Ideally this test would use a trained model, but so far it uses only
    a randomly initialized one, so it does not really test the ability
    to find the best output sequence, only the correctness of the
    returned costs.

    """
    rng = numpy.random.RandomState(1234)
    alphabet_size = 20
    beam_size = 10
    length = 15

    reverser = WordReverser(10, alphabet_size)
    reverser.weights_init = reverser.biases_init = IsotropicGaussian(0.5)
    reverser.initialize()

    inputs = tensor.lmatrix('inputs')
    samples, = VariableFilter(bricks=[reverser.generator], name="outputs")(
        ComputationGraph(reverser.generate(inputs)))

    input_vals = numpy.tile(rng.randint(alphabet_size, size=(length,)),
                            (beam_size, 1)).T

    search = BeamSearch(10, samples)
    results, mask, costs = search.search({inputs: input_vals},
                                         0, 3 * length)

    true_costs = reverser.cost(
        input_vals, numpy.ones((length, beam_size), dtype=floatX),
        results, mask).eval()
    true_costs = (true_costs * mask).sum(axis=0)
    assert_allclose(costs, true_costs, rtol=1e-5)
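When search returns arrays, as here, the mask marks the valid time steps of each beam entry. A short sketch of trimming the arrays into per-hypothesis lists, mirroring the as_lists check in Examples #11 and #12 (reusing the names returned above):

trimmed = []
for i in range(results.shape[1]):      # one column per beam entry
    valid = int(mask[:, i].sum())      # number of valid time steps
    trimmed.append(list(results[:valid, i]))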
Example #3
    def __init__(self,
                 eol_symbol,
                 beam_size,
                 x,
                 x_mask,
                 samples,
                 phoneme_dict=None,
                 black_list=None,
                 language_model=False):
        if black_list is None:
            self.black_list = []
        else:
            self.black_list = black_list
        self.x = x
        self.x_mask = x_mask
        self.eol_symbol = eol_symbol
        self.beam_size = beam_size
        if language_model:
            lm = TrigramLanguageModel()
            ind_to_word = dict(enumerate(lm.unigrams))
            self.beam_search = BeamSearchLM(lm, 1., ind_to_word, beam_size,
                                            samples)
        else:
            self.beam_search = BeamSearch(beam_size, samples)
        self.beam_search.compile()
        self.phoneme_dict = phoneme_dict
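Once compile() has run, the wrapped search is invoked with concrete input arrays. A hedged sketch of such a call as a method of this class; x_val, x_mask_val and max_length are assumed arguments, not part of the original snippet:

    def decode(self, x_val, x_mask_val, max_length):
        # Illustrative only: run the compiled beam search on one utterance.
        outputs, costs = self.beam_search.search(
            input_values={self.x: x_val, self.x_mask: x_mask_val},
            eol_symbol=self.eol_symbol,
            max_length=max_length)
        return outputs, costs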
Example #4
    def __init__(self,
                 source_sentence,
                 samples,
                 model,
                 data_stream,
                 config,
                 n_best=1,
                 track_n_models=1,
                 trg_ivocab=None,
                 normalize=True,
                 store_full_main_loop=False,
                 **kwargs):
        # TODO: change config structure
        super(BleuValidator, self).__init__(**kwargs)
        self.store_full_main_loop = store_full_main_loop
        self.source_sentence = source_sentence
        self.samples = samples
        self.model = model
        self.data_stream = data_stream
        self.config = config
        self.n_best = n_best
        self.track_n_models = track_n_models
        self.normalize = normalize
        self.verbose = config.get('val_set_out', None)

        # Helpers
        #self.vocab = data_stream.dataset.dictionary
        self.trg_ivocab = trg_ivocab
        #self.unk_sym = data_stream.dataset.unk_token
        #self.eos_sym = data_stream.dataset.eos_token
        #self.unk_idx = self.vocab[self.unk_sym]
        #self.eos_idx = self.vocab[self.eos_sym]
        self.unk_idx = 0  # fs439: TODO hardcoded
        self.eos_idx = 2  # fs439: TODO hardcoded
        self.best_models = []
        self.val_bleu_curve = []
        self.beam_search = BeamSearch(samples=samples)
        self.multibleu_cmd = (self.config['bleu_script'] %
                              self.config['val_set_grndtruth']).split()
        print("BLEU command: %s" % self.multibleu_cmd)

        # Create saving directory if it does not exist
        if not os.path.exists(self.config['saveto']):
            os.makedirs(self.config['saveto'])

        if self.config['reload']:
            try:
                bleu_score = numpy.load(
                    os.path.join(self.config['saveto'], 'val_bleu_scores.npz'))
                self.val_bleu_curve = bleu_score['bleu_scores'].tolist()

                # Track n best previous bleu scores
                for i, bleu in enumerate(
                        sorted(self.val_bleu_curve, reverse=True)):
                    if i < self.track_n_models:
                        self.best_models.append(ModelInfo(bleu))
                logger.info("BleuScores Reloaded")
            except Exception:
                logger.info("BleuScores not Found")
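The reload branch above expects a val_bleu_scores.npz file written during training; the matching save is a single numpy.savez call. A sketch of that counterpart, mirroring the _save_model pattern shown in Example #32:

numpy.savez(os.path.join(self.config['saveto'], 'val_bleu_scores.npz'),
            bleu_scores=self.val_bleu_curve)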
Example #5
    def __init__(self,
                 source_sentence,
                 initial_state_context,
                 samples,
                 model,
                 data_stream,
                 config,
                 src_vocab=None,
                 trg_vocab=None,
                 n_best=1,
                 track_n_models=1,
                 normalize=True,
                 **kwargs):
        super(MeteorValidator, self).__init__(**kwargs)
        self.source_sentence = source_sentence
        self.initial_context = initial_state_context

        self.src_vocab = src_vocab
        self.trg_vocab = trg_vocab

        self.samples = samples
        self.model = model
        self.data_stream = data_stream
        self.config = config
        self.n_best = n_best
        self.track_n_models = track_n_models
        self.normalize = normalize
        self.verbose = config.get('val_set_out', None)

        # Helpers
        self.best_models = []
        self.val_meteor_curve = []
        self.beam_search = BeamSearch(samples=samples)

        # Info for Meteor
        self.target_language = self.config['target_lang']
        self.meteor_directory = self.config['meteor_directory']

        # Create save directory if it does not exist
        if not os.path.exists(self.config['saveto']):
            os.makedirs(self.config['saveto'])

        if self.config['reload']:
            try:
                meteor_score = numpy.load(
                    os.path.join(self.config['saveto'],
                                 'val_meteor_scores.npz'))
                self.val_meteor_curve = meteor_score['meteor_scores'].tolist()

                # Track n best previous meteor scores
                for i, meteor in enumerate(
                        sorted(self.val_meteor_curve, reverse=True)):
                    if i < self.track_n_models:
                        self.best_models.append(ModelInfo(meteor,
                                                          key='METEOR'))
                logger.info("MeteorScores Reloaded")
            except Exception:
                logger.info("MeteorScores not Found")
Example #6
    def __init__(self,
                 source_sentence,
                 target_prefix,
                 samples,
                 model,
                 data_stream,
                 config,
                 src_vocab=None,
                 trg_vocab=None,
                 n_best=1,
                 track_n_models=1,
                 normalize=True,
                 **kwargs):
        super(BleuValidator, self).__init__(**kwargs)
        self.source_sentence = source_sentence
        self.target_prefix = target_prefix

        self.src_vocab = src_vocab
        self.trg_vocab = trg_vocab

        self.samples = samples
        self.model = model
        self.data_stream = data_stream
        self.config = config
        self.n_best = n_best
        self.track_n_models = track_n_models
        self.normalize = normalize
        self.verbose = config.get('val_set_out', None)

        # Helpers
        self.best_models = []
        self.val_bleu_curve = []
        self.beam_search = BeamSearch(samples=samples)
        self.multibleu_cmd = [
            'perl', self.config['bleu_script'],
            self.config['val_set_grndtruth'], '<'
        ]

        # Create save directory if it does not exist
        if not os.path.exists(self.config['saveto']):
            os.makedirs(self.config['saveto'])

        if self.config['reload']:
            try:
                bleu_score = numpy.load(
                    os.path.join(self.config['saveto'], 'val_bleu_scores.npz'))
                self.val_bleu_curve = bleu_score['bleu_scores'].tolist()

                # Track n best previous bleu scores
                for i, bleu in enumerate(
                        sorted(self.val_bleu_curve, reverse=True)):
                    if i < self.track_n_models:
                        self.best_models.append(ModelInfo(bleu, key='BLEU'))
                logger.info("BleuScores Reloaded")
            except Exception:
                logger.info("BleuScores not Found")
Example #7
    def init_beam_search(self, beam_size):
        """Compile beam search and set the beam size.

        See Blocks issue #500.

        """
        self.beam_size = beam_size
        generated = self.get_generate_graph()
        samples, = VariableFilter(applications=[self.generator.generate],
                                  name="outputs")(ComputationGraph(
                                      generated['outputs']))
        self._beam_search = BeamSearch(beam_size, samples)
        self._beam_search.compile()
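A typical call site compiles once and then searches repeatedly. A hedged sketch, where recognizer, inputs, input_vals and eos_label stand in for names from the surrounding model that this snippet does not show:

recognizer.init_beam_search(beam_size=10)
outputs, costs = recognizer._beam_search.search(
    input_values={inputs: input_vals},
    eol_symbol=eos_label,
    max_length=3 * input_vals.shape[0])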
Example #8
    def __init__(self,
                 samples,
                 model,
                 data_stream,
                 config,
                 n_best=1,
                 track_n_models=1,
                 normalize=True,
                 **kwargs):
        # TODO: change config structure
        super(F1Validator, self).__init__(**kwargs)
        self.samples = samples
        self.model = model
        self.data_stream = data_stream
        self.config = config
        self.n_best = n_best
        self.track_n_models = track_n_models
        self.normalize = normalize
        self.verbose = config.get('val_set_out', None)

        # Helpers
        self.vocab = config["src_vocab"]
        self.unk_sym = config["unk_token"]
        self.eos_sym = config["eos_token"]
        self.trg_vocab = config["trg_vocab"]
        self.trg_ivocab = {v: k for k, v in self.trg_vocab.items()}
        self.trg_eos_idx = self.trg_vocab[config["eos_token"]]
        self.unk_idx = self.vocab[self.unk_sym]
        self.eos_idx = self.vocab[self.eos_sym]
        self.best_models = []
        self.val_f1_curve = []
        self.beam_search = BeamSearch(samples=samples)

        # Create saving directory if it does not exist
        if not os.path.exists(self.config['saveto']):
            os.makedirs(self.config['saveto'])

        if self.config['reload']:
            try:
                f1_score = numpy.load(
                    os.path.join(self.config['saveto'], 'val_f1_scores.npz'))
                self.val_f1_curve = f1_score['f1_scores'].tolist()

                # Track n best previous f1 scores
                for i, f1 in enumerate(sorted(self.val_f1_curve,
                                              reverse=True)):
                    if i < self.track_n_models:
                        self.best_models.append(ModelInfo(f1))
                logger.info("F1Scores Reloaded")
            except Exception:
                logger.info("F1Scores not Found")
Example #9
    def __init__(self, source_char_seq, source_sample_matrix, source_char_aux,
                 source_word_mask, samples, model, data_stream,
                 config, n_best=1, track_n_models=1,
                 normalize=True, **kwargs):
        # TODO: change config structure
        super(BleuValidator, self).__init__(**kwargs)
        self.source_char_seq = source_char_seq
        self.source_sample_matrix = source_sample_matrix
        self.source_char_aux = source_char_aux
        self.source_word_mask = source_word_mask
        self.samples = samples
        self.model = model
        self.data_stream = data_stream
        self.config = config
        self.n_best = n_best
        self.track_n_models = track_n_models
        self.normalize = normalize
        self.verbose = config.get('val_set_out', None)

        # Helpers
        self.vocab = data_stream.dataset.dictionary
        self.src_ivocab = {v: k for k, v in self.vocab.items()}
        self.unk_sym = data_stream.dataset.unk_token
        self.eos_sym = data_stream.dataset.eos_token
        self.unk_idx = self.vocab[self.unk_sym]
        self.eos_idx = self.vocab[self.eos_sym]
        self.best_models = []
        self.val_bleu_curve = []
        self.beam_search = BeamSearch(samples=samples)
        self.multibleu_cmd = ['perl', self.config['bleu_script'],
                              self.config['val_set_grndtruth'], '<']

        # Create saving directory if it does not exist
        if not os.path.exists(self.config['saveto']):
            os.makedirs(self.config['saveto'])

        if self.config['reload']:
            try:
                bleu_score = numpy.load(os.path.join(self.config['saveto'],
                                                     'val_bleu_scores.npz'))
                self.val_bleu_curve = bleu_score['bleu_scores'].tolist()
                # Track n best previous bleu scores
                for i, bleu in enumerate(
                        sorted([list(v.values())[0] for v in self.val_bleu_curve], reverse=True)):
                    if i < self.track_n_models:
                        self.best_models.append(ModelInfo(bleu, self.config['saveto']))
                logger.info("BleuScores Reloaded")
            except Exception:
                logger.info("BleuScores not Found")
Example #10
    def init_beam_search(self, beam_size):
        """Compile beam search and set the beam size.

        See Blocks issue #500.

        """
        if hasattr(self, '_beam_search') and self.beam_size == beam_size:
            # Only recompile if the user wants a different beam size
            return
        self.beam_size = beam_size
        generated = self.get_generate_graph(use_mask=False, n_steps=3)
        cg = ComputationGraph(generated.values())
        samples, = VariableFilter(applications=[self.generator.generate],
                                  name="outputs")(cg)
        self._beam_search = BeamSearch(beam_size, samples)
        self._beam_search.compile()
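Because of the guard at the top of the method, repeated calls with the same beam size are cheap; only a size change triggers recompilation. Illustration (model is an assumed instance):

model.init_beam_search(8)    # compiles the search functions
model.init_beam_search(8)    # returns immediately, nothing to do
model.init_beam_search(16)   # beam size changed, so it recompiles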
Example #11
def test_beam_search():
    """Test beam search using the model similar to the reverse_words demo.

    Ideally this test would use a trained model, but so far it uses only
    a randomly initialized one, so it does not really test the ability
    to find the best output sequence, only the correctness of the
    returned costs.

    """
    rng = numpy.random.RandomState(1234)
    alphabet_size = 20
    beam_size = 10
    length = 15

    simple_generator = SimpleGenerator(10, alphabet_size, seed=1234)
    simple_generator.weights_init = IsotropicGaussian(0.5)
    simple_generator.biases_init = IsotropicGaussian(0.5)
    simple_generator.initialize()

    inputs = tensor.lmatrix('inputs')
    samples, = VariableFilter(
            applications=[simple_generator.generator.generate],
            name="outputs")(
        ComputationGraph(simple_generator.generate(inputs)))

    input_vals = numpy.tile(rng.randint(alphabet_size, size=(length,)),
                            (beam_size, 1)).T

    search = BeamSearch(samples)
    results, mask, costs = search.search(
        {inputs: input_vals}, 0, 3 * length, as_arrays=True)
    # Just check sum
    assert results.sum() == 2816

    true_costs = simple_generator.cost(
        input_vals, numpy.ones((length, beam_size),
                               dtype=theano.config.floatX),
        results, mask).eval()
    true_costs = (true_costs * mask).sum(axis=0)
    assert_allclose(costs.sum(axis=0), true_costs, rtol=1e-5)

    # Test `as_lists=True`
    results2, costs2 = search.search({inputs: input_vals},
                                     0, 3 * length)
    for i in range(len(results2)):
        assert results2[i] == list(results.T[i, :mask.T[i].sum()])
Example #12
def test_beam_search():
    """Test beam search using the model similar to the reverse_words demo.

    Ideally this test would use a trained model, but so far it uses only
    a randomly initialized one, so it does not really test the ability
    to find the best output sequence, only the correctness of the
    returned costs.

    """
    rng = numpy.random.RandomState(1234)
    alphabet_size = 20
    beam_size = 10
    length = 15

    simple_generator = SimpleGenerator(10, alphabet_size, seed=1234)
    simple_generator.weights_init = IsotropicGaussian(0.5)
    simple_generator.biases_init = IsotropicGaussian(0.5)
    simple_generator.initialize()

    inputs = tensor.lmatrix('inputs')
    samples, = VariableFilter(
        applications=[simple_generator.generator.generate],
        name="outputs")(ComputationGraph(simple_generator.generate(inputs)))

    input_vals = numpy.tile(rng.randint(alphabet_size, size=(length, )),
                            (beam_size, 1)).T

    search = BeamSearch(samples)
    results, mask, costs = search.search({inputs: input_vals},
                                         0,
                                         3 * length,
                                         as_arrays=True)
    # Just check sum
    assert results.sum() == 2816

    true_costs = simple_generator.cost(
        input_vals, numpy.ones((length, beam_size),
                               dtype=theano.config.floatX), results,
        mask).eval()
    true_costs = (true_costs * mask).sum(axis=0)
    assert_allclose(costs.sum(axis=0), true_costs, rtol=1e-5)

    # Test `as_lists=True`
    results2, costs2 = search.search({inputs: input_vals}, 0, 3 * length)
    for i in range(len(results2)):
        assert results2[i] == list(results.T[i, :mask.T[i].sum()])
Example #13
        def generate(input_):
            """Generate output sequences for an input sequence.

            Encapsulates most of the difference between sampling and beam
            search.

            Returns
            -------
            outputs : list of lists
                Trimmed output sequences.
            costs : list
                The negative log-likelihood of generating the respective
                sequences.

            """
            if mode == "beam_search":
                samples, = VariableFilter(
                    bricks=[reverser.generator], name="outputs")(
                        ComputationGraph(generated[1]))
                # NOTE: this will recompile beam search functions
                # every time the user presses Enter. Do not create
                # a new `BeamSearch` object every time if
                # speed is important for you.
                beam_search = BeamSearch(input_.shape[1], samples)
                outputs, _, costs = beam_search.search(
                    {chars: input_}, char2code['</S>'],
                    3 * input_.shape[0])
            else:
                _1, outputs, _2, _3, costs = (
                    model.get_theano_function()(input_))
                costs = costs.T

            outputs = list(outputs.T)
            costs = list(costs)
            for i in range(len(outputs)):
                outputs[i] = list(outputs[i])
                try:
                    true_length = outputs[i].index(char2code['</S>']) + 1
                except ValueError:
                    true_length = len(outputs[i])
                outputs[i] = outputs[i][:true_length]
                if mode == "sample":
                    costs[i] = costs[i][:true_length].sum()
            return outputs, costs
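A sketch of how generate() is typically driven, as in the reverse_words demo; code2char, the inverse of the char2code mapping used above, is assumed to be available:

outputs, costs = generate(input_)
for output, cost in sorted(zip(outputs, costs), key=lambda pair: pair[1]):
    print(cost, ''.join(code2char[code] for code in output))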
Example #14
    def __init__(self, eol_symbol, beam_size, x, x_mask, samples,
                 phoneme_dict=None, black_list=None, language_model=False):
        if black_list is None:
            self.black_list = []
        else:
            self.black_list = black_list
        self.x = x
        self.x_mask = x_mask
        self.eol_symbol = eol_symbol
        self.beam_size = beam_size
        if language_model:
            lm = TrigramLanguageModel()
            ind_to_word = dict(enumerate(lm.unigrams))
            self.beam_search = BeamSearchLM(
                lm, 1., ind_to_word, beam_size, samples)
        else:
            self.beam_search = BeamSearch(beam_size, samples)
        self.beam_search.compile()
        self.phoneme_dict = phoneme_dict
Example #15
    def __init__(self,
                 eol_symbol,
                 beam_size,
                 x,
                 x_mask,
                 samples,
                 phoneme_dict=None,
                 black_list=None):
        if black_list is None:
            self.black_list = []
        else:
            self.black_list = black_list
        self.x = x
        self.x_mask = x_mask
        self.eol_symbol = eol_symbol
        self.beam_size = beam_size
        self.beam_search = BeamSearch(beam_size, samples)
        self.beam_search.compile()
        self.phoneme_dict = phoneme_dict
Example #16
    def __init__(
        self,
        source_sentence,
        samples,
        model,
        data_stream,
        config,
        n_best=1,
        track_n_models=1,
        trg_ivocab=None,
        src_eos_idx=-1,
        trg_eos_idx=-1,
        **kwargs
    ):
        super(BleuValidator, self).__init__(**kwargs)
        self.source_sentence = source_sentence
        self.samples = samples
        self.model = model
        self.data_stream = data_stream
        self.config = config
        self.n_best = n_best
        self.track_n_models = track_n_models
        self.verbose = config.get("val_set_out", None)

        self.src_eos_idx = src_eos_idx
        self.trg_eos_idx = trg_eos_idx

        # Helpers
        self.vocab = data_stream.dataset.dictionary
        self.trg_ivocab = trg_ivocab
        self.unk_sym = data_stream.dataset.unk_token
        self.eos_sym = data_stream.dataset.eos_token
        self.unk_idx = self.vocab[self.unk_sym]
        self.eos_idx = self.src_eos_idx  # self.vocab[self.eos_sym]
        self.best_models = []
        self.val_bleu_curve = []
        self.beam_search = BeamSearch(beam_size=self.config["beam_size"], samples=samples)
        self.multibleu_cmd = ["perl", self.config["bleu_script"], self.config["val_set_grndtruth"], "<"]

        # Create saving directory if it does not exist
        if not os.path.exists(self.config["saveto"]):
            os.makedirs(self.config["saveto"])

        if self.config["reload"]:
            try:
                bleu_score = numpy.load(os.path.join(self.config["saveto"], "val_bleu_scores.npz"))
                self.val_bleu_curve = bleu_score["bleu_scores"].tolist()

                # Track n best previous bleu scores
                for i, bleu in enumerate(sorted(self.val_bleu_curve, reverse=True)):
                    if i < self.track_n_models:
                        self.best_models.append(ModelInfo(bleu))
                logger.info("BleuScores Reloaded")
            except Exception:
                logger.info("BleuScores not Found")
Example #17
    def __init__(self, source_sentence, samples, model, data_stream, ground_truth, config,
                 val_out=None, val_best_out=None, n_best=1, normalize=True, **kwargs):
        # TODO: change config structure
        super(BleuEvaluator, self).__init__(**kwargs)
        self.source_sentence = source_sentence
        self.samples = samples
        self.model = model
        self.data_stream = data_stream
        self.config = config
        self.n_best = n_best
        self.normalize = normalize
        self.val_out = val_out
        self.val_best_out = val_out and val_best_out
        self.bleu_scores = []

        self.trg_ivocab = None
        self.unk_id = config['unk_id']
        self.eos_id = config['eos_id']
        self.beam_search = BeamSearch(samples=samples)
        self.multibleu_cmd = ['perl', self.config['bleu_script'], ground_truth, '<']
Example #18
        def generate(input_):
            """Generate output sequences for an input sequence.

            Encapsulates most of the difference between sampling and beam
            search.

            Returns
            -------
            outputs : list of lists
                Trimmed output sequences.
            costs : list
                The negative log-likelihood of generating the respective
                sequences.

            """
            if mode == "beam_search":
                samples, = VariableFilter(
                    bricks=[reverser.generator], name="outputs")(
                        ComputationGraph(generated[1]))
                # NOTE: this will recompile beam search functions
                # every time the user presses Enter. Do not create
                # a new `BeamSearch` object every time if
                # speed is important for you.
                beam_search = BeamSearch(input_.shape[1], samples)
                outputs, costs = beam_search.search(
                    {chars: input_}, char2code['</S>'],
                    3 * input_.shape[0])
            else:
                _1, outputs, _2, _3, costs = (
                    model.get_theano_function()(input_))
                outputs = list(outputs.T)
                costs = list(costs.T)
                for i in range(len(outputs)):
                    outputs[i] = list(outputs[i])
                    try:
                        true_length = outputs[i].index(char2code['</S>']) + 1
                    except ValueError:
                        true_length = len(outputs[i])
                    outputs[i] = outputs[i][:true_length]
                    costs[i] = costs[i][:true_length].sum()
            return outputs, costs
Example #19
    def init_beam_search(self, beam_size):
        """Compile beam search and set the beam size.

        See Blocks issue #500.

        """
        self.beam_size = beam_size
        generated = self.get_generate_graph()
        samples, = VariableFilter(
            applications=[self.generator.generate], name="outputs")(
            ComputationGraph(generated['outputs']))
        self._beam_search = BeamSearch(beam_size, samples)
        self._beam_search.compile()
Example #20
    def __init__(self, eol_symbol, beam_size, x, x_mask, samples,
                 phoneme_dict=None, black_list=None):
        if black_list is None:
            self.black_list = []
        else:
            self.black_list = black_list
        self.x = x
        self.x_mask = x_mask
        self.eol_symbol = eol_symbol
        self.beam_size = beam_size
        self.beam_search = BeamSearch(beam_size, samples)
        self.beam_search.compile()
        self.phoneme_dict = phoneme_dict
Example #21
    def set_up_decoder(self, nmt_model_path):
        """This method uses the NMT configuration in ``self.config`` to
        initialize the NMT model. This method basically corresponds to
        ``blocks.machine_translation.main``.

        Args:
            nmt_model_path (string):  Path to the NMT model file (.npz)
        """
        self.nmt_model = NMTModel(self.config)
        self.nmt_model.set_up()
        loader = LoadNMTUtils(nmt_model_path, self.config['saveto'],
                              self.nmt_model.search_model)
        loader.load_weights()
        self.src_sparse_feat_map = self.config['src_sparse_feat_map'] \
            if self.config['src_sparse_feat_map'] else FlatSparseFeatMap()
        if self.config['trg_sparse_feat_map']:
            self.trg_sparse_feat_map = self.config['trg_sparse_feat_map']
            self.beam_search = SparseBeamSearch(
                samples=self.nmt_model.samples,
                trg_sparse_feat_map=self.trg_sparse_feat_map)
        else:
            self.trg_sparse_feat_map = FlatSparseFeatMap()
            self.beam_search = BeamSearch(samples=self.nmt_model.samples)
Example #22
    def __init__(self, source_sentence, samples, model, data_stream,
                 config, n_best=1, track_n_models=1, trg_ivocab=None,
                 normalize=True, store_full_main_loop=False, **kwargs):
        # TODO: change config structure
        super(BleuValidator, self).__init__(**kwargs)
        self.store_full_main_loop = store_full_main_loop
        self.source_sentence = source_sentence
        self.samples = samples
        self.model = model
        self.data_stream = data_stream
        self.config = config
        self.n_best = n_best
        self.track_n_models = track_n_models
        self.normalize = normalize
        self.verbose = config.get('val_set_out', None)

        # Helpers
        #self.vocab = data_stream.dataset.dictionary
        self.trg_ivocab = trg_ivocab
        #self.unk_sym = data_stream.dataset.unk_token
        #self.eos_sym = data_stream.dataset.eos_token
        #self.unk_idx = self.vocab[self.unk_sym]
        #self.eos_idx = self.vocab[self.eos_sym]
        self.unk_idx = 0 # fs439: TODO hardcoded
        self.eos_idx = 2 # fs439: TODO hardcoded
        self.best_models = []
        self.val_bleu_curve = []
        self.beam_search = BeamSearch(samples=samples)
        self.multibleu_cmd = (self.config['bleu_script'] % self.config['val_set_grndtruth']).split()
        print("BLEU command: %s" % self.multibleu_cmd)

        # Create saving directory if it does not exist
        if not os.path.exists(self.config['saveto']):
            os.makedirs(self.config['saveto'])

        if self.config['reload']:
            try:
                bleu_score = numpy.load(os.path.join(self.config['saveto'],
                                        'val_bleu_scores.npz'))
                self.val_bleu_curve = bleu_score['bleu_scores'].tolist()

                # Track n best previous bleu scores
                for i, bleu in enumerate(
                        sorted(self.val_bleu_curve, reverse=True)):
                    if i < self.track_n_models:
                        self.best_models.append(ModelInfo(bleu))
                logger.info("BleuScores Reloaded")
            except Exception:
                logger.info("BleuScores not Found")
Example #23
    def set_up_decoder(self, nmt_model_path):
        """This method uses the NMT configuration in ``self.config`` to
        initialize the NMT model. This method basically corresponds to
        ``blocks.machine_translation.main``.

        Args:
            nmt_model_path (string):  Path to the NMT model file (.npz)
        """
        self.nmt_model = NMTModel(self.config)
        self.nmt_model.set_up()
        loader = LoadNMTUtils(nmt_model_path,
                              self.config['saveto'],
                              self.nmt_model.search_model)
        loader.load_weights()
        self.src_sparse_feat_map = self.config['src_sparse_feat_map'] \
            if self.config['src_sparse_feat_map'] else FlatSparseFeatMap()
        if self.config['trg_sparse_feat_map']:
            self.trg_sparse_feat_map = self.config['trg_sparse_feat_map']
            self.beam_search = SparseBeamSearch(
                samples=self.nmt_model.samples,
                trg_sparse_feat_map=self.trg_sparse_feat_map)
        else:
            self.trg_sparse_feat_map = FlatSparseFeatMap()
            self.beam_search = BeamSearch(samples=self.nmt_model.samples)
Example #24
def load_params_and_get_beam_search(exp_config):

    encoder = BidirectionalEncoder(exp_config['src_vocab_size'],
                                   exp_config['enc_embed'],
                                   exp_config['enc_nhids'])

    # let user specify the target transition class name in config,
    # eval it and pass to decoder
    target_transition_name = exp_config.get(
        'target_transition', 'GRUInitialStateWithInitialStateSumContext')
    target_transition = eval(target_transition_name)

    decoder = InitialContextDecoder(exp_config['trg_vocab_size'],
                                    exp_config['dec_embed'],
                                    exp_config['dec_nhids'],
                                    exp_config['enc_nhids'] * 2,
                                    exp_config['context_dim'],
                                    target_transition)

    # Create Theano variables
    logger.info('Creating theano variables')
    sampling_input = tensor.lmatrix('source')
    sampling_context = tensor.matrix('context_input')

    logger.info("Building sampling model")
    sampling_representation = encoder.apply(sampling_input,
                                            tensor.ones(sampling_input.shape))

    generated = decoder.generate(sampling_input, sampling_representation,
                                 sampling_context)
    _, samples = VariableFilter(
        bricks=[decoder.sequence_generator], name="outputs")(ComputationGraph(
            generated[1]))  # generated[1] is next_outputs

    beam_search = BeamSearch(samples=samples)

    # Set the parameters
    logger.info("Creating Model...")
    model = Model(generated)
    logger.info("Loading parameters from model: {}".format(
        exp_config['saved_parameters']))

    # load the parameter values from an .npz file
    param_values = LoadNMT.load_parameter_values(
        exp_config['saved_parameters'])
    LoadNMT.set_model_parameters(model, param_values)

    return beam_search, sampling_input, sampling_context
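A hedged sketch of decoding one sentence with the returned triple; source_seq, context_vector and trg_eos_idx are assumed to come from the surrounding pipeline, and the tiling convention follows the other examples in this collection:

beam_search, sampling_input, sampling_context = \
    load_params_and_get_beam_search(exp_config)

input_ = numpy.tile(source_seq, (exp_config['beam_size'], 1))
context_ = numpy.tile(context_vector, (exp_config['beam_size'], 1))
trans, costs = beam_search.search(
    input_values={sampling_input: input_,
                  sampling_context: context_},
    max_length=3 * len(source_seq),
    eol_symbol=trg_eos_idx,
    ignore_first_eol=True)
best_translation = trans[numpy.argmin(costs)]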
Example #25
    def init_beam_search(self, beam_size):
        """Compile beam search and set the beam size.

        See Blocks issue #500.

        """
        if hasattr(self, '_beam_search') and self.beam_size == beam_size:
            # Only recompile if the user wants a different beam size
            return
        self.beam_size = beam_size
        generated = self.get_generate_graph(use_mask=False, n_steps=3)
        cg = ComputationGraph(generated.values())
        samples, = VariableFilter(
            applications=[self.generator.generate], name="outputs")(cg)
        self._beam_search = BeamSearch(beam_size, samples)
        self._beam_search.compile()
Example #26
    def set_up_decoder(self, nmt_specs):
        """This method sets up a list of NMT models and BeamSearch
        instances, one for each model in the ensemble. Note that we do
        not use the ``BeamSearch.search`` method for ensemble decoding
        directly.

        Args:
            nmt_specs (list):  List of (nmt_model_path, nmt_config)
                               tuples, one entry per model in the ensemble
        """
        self.nmt_models = []
        self.beam_searches = []
        for nmt_model_path, nmt_config in nmt_specs:
            nmt_model = NMTModel(nmt_config)
            nmt_model.set_up()
            loader = LoadNMTUtils(nmt_model_path, nmt_config['saveto'],
                                  nmt_model.search_model)
            loader.load_weights()
            self.nmt_models.append(nmt_model)
            self.beam_searches.append(BeamSearch(samples=nmt_model.samples))
Example #27
def load_params_and_get_beam_search(exp_config):

    encoder = BidirectionalEncoder(exp_config['src_vocab_size'],
                                   exp_config['enc_embed'],
                                   exp_config['enc_nhids'])

    decoder = Decoder(exp_config['trg_vocab_size'], exp_config['dec_embed'],
                      exp_config['dec_nhids'], exp_config['enc_nhids'] * 2)

    # Create Theano variables
    logger.info('Creating theano variables')
    sampling_input = tensor.lmatrix('source')

    # Get beam search
    logger.info("Building sampling model")
    sampling_representation = encoder.apply(sampling_input,
                                            tensor.ones(sampling_input.shape))
    generated = decoder.generate(sampling_input, sampling_representation)

    _, samples = VariableFilter(
        bricks=[decoder.sequence_generator], name="outputs")(ComputationGraph(
            generated[1]))  # generated[1] is next_outputs
    beam_search = BeamSearch(samples=samples)

    # Set the parameters
    logger.info("Creating Model...")
    model = Model(generated)
    logger.info("Loading parameters from model: {}".format(
        exp_config['saved_parameters']))

    # load the parameter values from an .npz file if the `saved_parameters` field is present in the config
    param_values = LoadNMT.load_parameter_values(
        exp_config['saved_parameters'],
        brick_delimiter=exp_config.get('brick_delimiter', None))
    LoadNMT.set_model_parameters(model, param_values)

    return beam_search, sampling_input
Example #28
class BlocksNMTVanillaDecoder(Decoder):
    """Adaptor class for blocks.search.BeamSearch. We implement the
    ``Decoder`` class but ignore functionality for predictors or
    heuristics. Instead, we pass through decoding directly to the 
    blocks beam search module. This is fast, but breaks with the
    predictor framework. It can only be used for pure single system
    NMT decoding. Note that this decoder supports sparse feat maps
    on both source and target side.
    """
    
    def __init__(self, nmt_model_path, config, decoder_args):
        """Set up the NMT model used by the decoder.
        
        Args:
            nmt_model_path (string):  Path to the NMT model file (.npz)
            config (dict): NMT configuration
            decoder_args (object): Decoder configuration passed through
                                   from configuration API.
        """
        super(BlocksNMTVanillaDecoder, self).__init__(decoder_args)
        self.config = config
        self.set_up_decoder(nmt_model_path)
        self.src_eos = self.src_sparse_feat_map.word2dense(utils.EOS_ID)
    
    def set_up_decoder(self, nmt_model_path):
        """This method uses the NMT configuration in ``self.config`` to
        initialize the NMT model. This method basically corresponds to 
        ``blocks.machine_translation.main``.
        
        Args:
            nmt_model_path (string):  Path to the NMT model file (.npz)
        """
        self.nmt_model = NMTModel(self.config)
        self.nmt_model.set_up()
        loader = LoadNMTUtils(nmt_model_path,
                              self.config['saveto'],
                              self.nmt_model.search_model)
        loader.load_weights()
        self.src_sparse_feat_map = self.config['src_sparse_feat_map'] \
                if self.config['src_sparse_feat_map'] else FlatSparseFeatMap()
        if self.config['trg_sparse_feat_map']:
            self.trg_sparse_feat_map = self.config['trg_sparse_feat_map']
            self.beam_search = SparseBeamSearch(
                                 samples=self.nmt_model.samples, 
                                 trg_sparse_feat_map=self.trg_sparse_feat_map) 
        else:
            self.trg_sparse_feat_map = FlatSparseFeatMap()
            self.beam_search = BeamSearch(samples=self.nmt_model.samples)
    
    def decode(self, src_sentence):
        """Decodes a single source sentence with the original blocks
        beam search decoder. Does not use predictors. Note that the
        score breakdowns in returned hypotheses are only on the 
        sentence level, not on the word level. For finer grained NMT
        scores you need to use the nmt predictor. ``src_sentence`` is a
        list of source word ids representing the source sentence without
        <S> or </S> symbols. As blocks expects to see </S>, this method
        adds it automatically.
        
        Args:
            src_sentence (list): List of source word ids without <S> or
                                 </S> which make up the source sentence
        
        Returns:
            list. A list of ``Hypothesis`` instances ordered by their
            score.
        """
        seq = self.src_sparse_feat_map.words2dense(utils.oov_to_unk(
                src_sentence,
                self.config['src_vocab_size'])) + [self.src_eos]
        if self.src_sparse_feat_map.dim > 1: # sparse src feats
            input_ = np.transpose(
                            np.tile(seq, (self.config['beam_size'], 1, 1)),
                            (2,0,1))
        else: # word ids on the source side
            input_ = np.tile(seq, (self.config['beam_size'], 1))
        trans, costs = self.beam_search.search(
                    input_values={self.nmt_model.sampling_input: input_},
                    max_length=3*len(src_sentence),
                    eol_symbol=utils.EOS_ID,
                    ignore_first_eol=True)
        hypos = []
        max_len = 0
        for idx in xrange(len(trans)):
            max_len = max(max_len, len(trans[idx]))
            hypo = Hypothesis(trans[idx], -costs[idx])
            hypo.score_breakdown = len(trans[idx]) * [[(0.0,1.0)]]
            hypo.score_breakdown[0] = [(-costs[idx],1.0)]
            hypos.append(hypo)
        self.apply_predictors_count = max_len * self.config['beam_size']
        return hypos
    
    def has_predictors(self):
        """Always returns true. """
        return True
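Hypothetical driver code for the adaptor class above; nmt_model_path, config, decoder_args and src_sentence are whatever the surrounding framework provides:

decoder = BlocksNMTVanillaDecoder(nmt_model_path, config, decoder_args)
hypos = decoder.decode(src_sentence)  # src_sentence: word ids without <S>/</S>
best_hypo = hypos[0]                  # ordered by score per the docstring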
Example #29
    def __init__(self,
                 source_sentence,
                 samples,
                 model,
                 data_stream,
                 config,
                 n_best=1,
                 track_n_models=1,
                 trg_ivocab=None,
                 normalize=True,
                 **kwargs):
        # TODO: change config structure
        super(BleuValidator, self).__init__(**kwargs)
        self.source_sentence = source_sentence
        self.samples = samples
        self.model = model
        self.data_stream = data_stream
        self.config = config
        self.n_best = n_best
        self.track_n_models = track_n_models
        self.normalize = normalize
        self.verbose = config.get(
            'val_set_out',
            None)  # TODO: set this to a file and True for a sentence output

        # Helpers
        '''
        self.vocab = data_stream.dataset.dictionary
        self.trg_ivocab = trg_ivocab
        self.unk_sym = data_stream.dataset.unk_token
        self.eos_sym = data_stream.dataset.eos_token
        self.unk_idx = self.vocab[self.unk_sym]
        self.eos_idx = self.vocab[self.eos_sym]
        self.best_models = []
        self.val_bleu_curve = []
        self.beam_search = BeamSearch(samples=samples)
        self.multibleu_cmd = ['perl', self.config['bleu_script'],
                              self.config['val_set_grndtruth'], '<']
        '''
        self.beam_search = BeamSearch(samples=samples)
        self.eow_idx = 2  # TODO: this is a hack

        # Create saving directory if it does not exist
        if not os.path.exists(self.config['saveto']):
            os.makedirs(self.config['saveto'])

        #if self.config['reload']:
        if False:
            try:
                bleu_score = numpy.load(
                    os.path.join(self.config['saveto'], 'val_bleu_scores.npz'))
                self.val_bleu_curve = bleu_score['bleu_scores'].tolist()

                # Track n best previous bleu scores
                for i, bleu in enumerate(
                        sorted(self.val_bleu_curve, reverse=True)):
                    if i < self.track_n_models:
                        self.best_models.append(ModelInfo(bleu))
                logger.info("BleuScores Reloaded")
            except Exception:
                logger.info("BleuScores not Found")
Example #30
    encoder.push_initialization_config()  # push_initialization_config is a method already defined in Initializable
    decoder.push_initialization_config()
    encoder.bidir.prototype.weights_init = Orthogonal()
    decoder.transition.weights_init = Orthogonal()
    encoder.initialize()
    decoder.initialize()

    sampling_representation = encoder.apply(source_sentence,
                                            tensor.ones(source_sentence.shape))
    generated = decoder.generate(source_sentence, sampling_representation)  # modified here to add the functions.
    search_model = Model(generated)

    _, samples = VariableFilter(bricks=[decoder.sequence_generator], name="outputs")(ComputationGraph(generated[1]))

    weights = VariableFilter(bricks=[decoder.sequence_generator], name="weights")(cg.variables)
    getAlignment = function([source_sentence, target_sentence], weights)
    beam_search = BeamSearch(samples=samples)

    
    saveTo = "/Users/lqy/Documents/search_model_fr2en_backup/"
    load_model = loadNMTfromFile(saveTo)
    model = load_model.load_to(search_model)
    nmt = translateSentence(config=config, model=search_model, data_stream=validate_stream,
                            hook_samples=config['hook_samples'],
                            every_n_batches=config['sampling_freq'],
                            src_vocab_size=config['src_vocab_size'])

    nmt.initialValue(tr_stream)

    unk_idx = config['unk_id']
    src_eos_idx = config['src_vocab_size'] - 1
    trg_eos_idx = config['trg_vocab_size'] - 1
Example #31
    def decode(self, src_sentence):
        """This is a generalization to NMT ensembles of 
        ``BeamSearch.search``.
        
        Args:
            src_sentence (list): List of source word ids without <S> or
                                 </S> which make up the source sentence
        
        Returns:
            list. A list of ``Hypothesis`` instances ordered by their
            score.
        """
        for search in self.beam_searches:
            if not search.compiled:
                search.compile()
        seq = self.src_sparse_feat_map.words2dense(
            utils.oov_to_unk(src_sentence,
                             self.src_vocab_size)) + [self.src_eos]
        if self.src_sparse_feat_map.dim > 1:  # sparse src feats
            input_ = np.transpose(np.tile(seq, (self.beam_size, 1, 1)),
                                  (2, 0, 1))
        else:  # word ids on the source side
            input_ = np.tile(seq, (self.beam_size, 1))

        contexts_and_states = []
        for sys_idx in xrange(self.n_networks):
            contexts, states, _ = \
                self.beam_searches[sys_idx].compute_initial_states_and_contexts(
                            {self.nmt_models[sys_idx].sampling_input: input_})
            contexts_and_states.append(
                (contexts, states, self.beam_searches[sys_idx]))

        # This array will store all generated outputs, including those from
        # previous step and those from already finished sequences.
        all_outputs = states['outputs'][None, :]
        all_masks = np.ones_like(all_outputs, dtype=config.floatX)
        all_costs = np.zeros_like(all_outputs, dtype=config.floatX)

        for i in range(3 * len(src_sentence)):
            if all_masks[-1].sum() == 0:
                break
            logprobs_lst = []
            for contexts, states, search in contexts_and_states:
                logprobs_lst.append(search.compute_logprobs(contexts, states))

            logprobs = np.sum(logprobs_lst, axis=0)
            next_costs = (all_costs[-1, :, None] +
                          logprobs * all_masks[-1, :, None])
            (finished, ) = np.where(all_masks[-1] == 0)
            next_costs[finished, :utils.EOS_ID] = np.inf
            next_costs[finished, utils.EOS_ID + 1:] = np.inf

            # The `i == 0` is required because at the first step the beam
            # size is effectively only 1.
            (indexes, outputs), chosen_costs = BeamSearch._smallest(
                next_costs, self.beam_size, only_first_row=i == 0)

            all_outputs = all_outputs[:, indexes]
            all_masks = all_masks[:, indexes]
            all_costs = all_costs[:, indexes]

            # Rearrange everything
            for contexts, states, search in contexts_and_states:
                for name in states:
                    states[name] = states[name][indexes]
                states.update(
                    search.compute_next_states(contexts, states, outputs))

            all_outputs = np.vstack([all_outputs, outputs[None, :]])
            all_costs = np.vstack([all_costs, chosen_costs[None, :]])
            mask = outputs != utils.EOS_ID
            if i == 0:
                mask[:] = 1
            all_masks = np.vstack([all_masks, mask[None, :]])

        all_outputs = all_outputs[1:]
        all_masks = all_masks[:-1]
        all_costs = all_costs[1:] - all_costs[:-1]
        result = all_outputs, all_masks, all_costs
        trans, costs = BeamSearch.result_to_lists(result)
        hypos = []
        max_len = 0
        for idx in xrange(len(trans)):
            max_len = max(max_len, len(trans[idx]))
            hypo = Hypothesis(trans[idx], -costs[idx])
            hypo.score_breakdown = len(trans[idx]) * [[(0.0, 1.0)]]
            hypo.score_breakdown[0] = [(-costs[idx], 1.0)]
            hypos.append(hypo)
        self.apply_predictors_count = max_len * self.beam_size
        return hypos
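The ensemble combination above works because the search operates on negative log-probabilities: summing them across models is equivalent to multiplying the models' probabilities. A two-line check of that identity:

import numpy as np

p1, p2 = 0.6, 0.3  # probabilities assigned by two ensemble members
assert np.isclose(-np.log(p1) - np.log(p2), -np.log(p1 * p2))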
Example #32
class IMT_F1_Validator(SimpleExtension, SamplingBase):
    """Implements early stopping based on IMT F1 score."""
    def __init__(self,
                 source_sentence,
                 target_prefix,
                 samples,
                 model,
                 data_stream,
                 config,
                 src_vocab=None,
                 trg_vocab=None,
                 n_best=1,
                 track_n_models=1,
                 normalize=True,
                 **kwargs):
        super(IMT_F1_Validator, self).__init__(**kwargs)
        self.source_sentence = source_sentence
        self.target_prefix = target_prefix

        self.src_vocab = src_vocab
        self.trg_vocab = trg_vocab

        self.samples = samples
        self.model = model
        self.data_stream = data_stream
        self.config = config
        self.n_best = n_best
        self.track_n_models = track_n_models
        self.normalize = normalize
        self.verbose = config.get('val_set_out', None)

        # Helpers
        self.best_models = []
        self.val_imt_f1_curve = []
        self.beam_search = BeamSearch(samples=samples)

        # Info for Meteor
        self.target_language = self.config['target_lang']

        # Create save directory if it does not exist
        if not os.path.exists(self.config['saveto']):
            os.makedirs(self.config['saveto'])

        if self.config['reload']:
            try:
                imt_f1_score = numpy.load(
                    os.path.join(self.config['saveto'],
                                 'val_imt_f1_scores.npz'))
                self.val_imt_f1_curve = imt_f1_score['imt_f1_scores'].tolist()

                # Track n best previous f1_bad scores
                for i, imt_f1_val in enumerate(
                        sorted(self.val_imt_f1_curve, reverse=True)):
                    if i < self.track_n_models:
                        self.best_models.append(
                            ModelInfo(imt_f1_val, key='IMT_F1'))
                logger.info("IMT_F1_Scores Reloaded")
            except Exception:
                logger.info("IMT_F1_Scores not found")

    def do(self, which_callback, *args):

        # Track validation burn in
        if self.main_loop.status['iterations_done'] <= self.config[
                'val_burn_in']:
            return

        # Evaluate the model
        imt_f1_score = self._evaluate_model()
        # add an entry to the log
        self.main_loop.log.current_row[
            'validation_set_imt_f1_score'] = imt_f1_score
        # save if necessary
        self._save_model(imt_f1_score)

    # TODO: if we are evaluating both BLEU and METEOR, we shouldn't need to translate twice!!
    def _evaluate_model(self):
        # Set in the superclass -- SamplingBase
        if not hasattr(self, 'target_dataset'):
            self._initialize_dataset_info()

        self.unk_sym = '<UNK>'
        self.eos_sym = '</S>'
        self.unk_idx = self.trg_vocab[self.unk_sym]
        self.eos_idx = self.trg_vocab[self.eos_sym]

        logger.info("Started Validation: ")
        val_start_time = time.time()

        ref_file = self.config['val_set_grndtruth']

        trg_hyp_file = tempfile.NamedTemporaryFile(delete=False)

        if self.verbose:
            ftrans = codecs.open(self.config['val_set_out'],
                                 'w',
                                 encoding='utf8')

        total_cost = 0.0
        with codecs.open(trg_hyp_file.name, 'w', encoding='utf8') as hyps_out:
            for i, line in enumerate(self.data_stream.get_epoch_iterator()):
                """
                Load the sentence, retrieve the sample, write to file
                """

                # TODO: the section with beam search and translation is shared by all validators
                # WORKING: switch this to IMT prefix validation
                # Note that the indices of source and target in the datastream are hard-coded
                # currently our datastream is (source,target,prefix,suffix)
                seq = self._oov_to_unk(line[0], self.config['src_vocab_size'],
                                       self.unk_idx)

                target_prefix = line[2]

                input_ = numpy.tile(seq, (self.config['beam_size'], 1))
                prefix_input_ = numpy.tile(target_prefix,
                                           (self.config['beam_size'], 1))

                # draw sample, checking to ensure we don't get an empty string back
                # beam search param names come from WHERE??
                trans, costs = self.beam_search.search(
                    input_values={self.source_sentence: input_,
                                  self.target_prefix: prefix_input_},
                    max_length=3 * len(seq),
                    eol_symbol=self.eos_idx,
                    ignore_first_eol=False)

                # normalize costs according to the sequence lengths
                if self.normalize:
                    lengths = numpy.array([len(s) for s in trans])
                    costs = costs / lengths

                nbest_idx = numpy.argsort(costs)[:self.n_best]
                for j, best in enumerate(nbest_idx):
                    try:
                        total_cost += costs[best]
                        trans_out = trans[best]

                        # convert idx to words
                        trans_out = self._idx_to_word(trans_out,
                                                      self.trg_ivocab)

                    except ValueError:
                        logger.info(
                            "Can NOT find a translation for line: {}".format(
                                i + 1))
                        trans_out = '<UNK>'

                    if j == 0:
                        # Write to subprocess and file if it exists
                        hyps_out.write(trans_out.decode('utf8') + '\n')
                        if self.verbose:
                            print(trans_out.decode('utf8'), file=ftrans)

                if i != 0 and i % 100 == 0:
                    logger.info(
                        "Translated {} lines of validation set...".format(i))

            logger.info("Total cost of the validation: {}".format(total_cost))

            self.data_stream.reset()
            if self.verbose:
                ftrans.close()

        imt_f1_score, imt_precision, imt_recall = imt_f1_from_files(
            trg_hyp_file.name, ref_file)

        logger.info("IMT F1 Validation Took: {} minutes".format(
            float(time.time() - val_start_time) / 60.))
        logger.info("IMT F1: {}, Precision: {}, Recall: {}".format(
            imt_f1_score, imt_precision, imt_recall))

        return imt_f1_score

    def _is_valid_to_save(self, imt_f1_score):
        if not self.best_models or min(
                self.best_models,
                key=operator.attrgetter('score')).score < imt_f1_score:
            return True
        return False

    def _save_model(self, imt_f1_score):
        if self._is_valid_to_save(imt_f1_score):
            model = ModelInfo(imt_f1_score,
                              self.config['saveto'],
                              key='IMT_F1')

            # Manage n-best model list first
            if len(self.best_models) >= self.track_n_models:
                old_model = self.best_models[0]
                if old_model.path and os.path.isfile(old_model.path):
                    logger.info("Deleting old model %s" % old_model.path)
                    os.remove(old_model.path)
                self.best_models.remove(old_model)

            self.best_models.append(model)
            self.best_models.sort(key=operator.attrgetter('score'))

            # Save the model here
            s = signal.signal(signal.SIGINT, signal.SIG_IGN)
            logger.info("Saving new model {}".format(model.path))

            SaveLoadUtils.save_parameter_values(
                self.main_loop.model.get_parameter_values(), model.path)
            numpy.savez(os.path.join(self.config['saveto'],
                                     'val_imt_f1_scores.npz'),
                        imt_f1_scores=self.val_imt_f1_curve)
            signal.signal(signal.SIGINT, s)
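
All of the validators in this listing rank beam hypotheses the same way: when `normalize` is set, each hypothesis cost is divided by its length, and the n-best are then taken via argsort. A minimal numpy sketch of that ranking step, with made-up values:

import numpy

trans = [[4, 9, 2], [4, 9, 12, 2]]            # hypothetical beam outputs
costs = numpy.array([6.0, 7.2])               # summed per-step costs
lengths = numpy.array([len(s) for s in trans])
normalized = costs / lengths                  # [2.0, 1.8]: longer hypothesis wins
nbest_idx = numpy.argsort(normalized)[:1]
assert nbest_idx[0] == 1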
Example #33
0
    def decode(self, src_sentence):
        """This is a generalization to NMT ensembles of 
        ``BeamSearch.search``.
        
        Args:
            src_sentence (list): List of source word ids without <S> or
                                 </S> which make up the source sentence
        
        Returns:
            list. A list of ``Hypothesis`` instances ordered by their
            score.
        """
        for search in self.beam_searches:
            if not search.compiled:
                search.compile()
        seq = self.src_sparse_feat_map.words2dense(utils.oov_to_unk(
                src_sentence,
                self.src_vocab_size)) + [self.src_eos]
        if self.src_sparse_feat_map.dim > 1: # sparse src feats
            input_ = np.transpose(
                            np.tile(seq, (self.beam_size, 1, 1)),
                            (2,0,1))
        else: # word ids on the source side
            input_ = np.tile(seq, (self.beam_size, 1))

        contexts_and_states = []
        for sys_idx in xrange(self.n_networks):
            contexts, states, _ = \
                self.beam_searches[sys_idx].compute_initial_states_and_contexts(
                            {self.nmt_models[sys_idx].sampling_input: input_})
            contexts_and_states.append((contexts, 
                                        states, 
                                        self.beam_searches[sys_idx]))

        # This array will store all generated outputs, including those from
        # previous step and those from already finished sequences.
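        # Note: `states` below leaks out of the loop above; assuming every
        # ensemble member starts from the same initial output symbol, any
        # member's initial outputs are interchangeable here.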
        all_outputs = states['outputs'][None, :]
        all_masks = np.ones_like(all_outputs, dtype=config.floatX)
        all_costs = np.zeros_like(all_outputs, dtype=config.floatX)

        for i in range(3*len(src_sentence)):
            if all_masks[-1].sum() == 0:
                break
            logprobs_lst = []
            for contexts, states, search in contexts_and_states:
                logprobs_lst.append(search.compute_logprobs(contexts, states))
            
            logprobs = np.sum(logprobs_lst, axis=0)
            next_costs = (all_costs[-1, :, None] +
                          logprobs * all_masks[-1, :, None])
            (finished,) = np.where(all_masks[-1] == 0)
            next_costs[finished, :utils.EOS_ID] = np.inf
            next_costs[finished, utils.EOS_ID + 1:] = np.inf

            # The `i == 0` is required because at the first step the beam
            # size is effectively only 1.
            (indexes, outputs), chosen_costs = BeamSearch._smallest(
                next_costs, self.beam_size, only_first_row=i == 0)

            all_outputs = all_outputs[:, indexes]
            all_masks = all_masks[:, indexes]
            all_costs = all_costs[:, indexes]
            
            # Rearrange everything
            for contexts, states, search in contexts_and_states:
                for name in states:
                    states[name] = states[name][indexes]
                states.update(search.compute_next_states(contexts, 
                                                         states, 
                                                         outputs))
            
            all_outputs = np.vstack([all_outputs, outputs[None, :]])
            all_costs = np.vstack([all_costs, chosen_costs[None, :]])
            mask = outputs != utils.EOS_ID
            if i == 0:
                mask[:] = 1
            all_masks = np.vstack([all_masks, mask[None, :]])

        all_outputs = all_outputs[1:]
        all_masks = all_masks[:-1]
        all_costs = all_costs[1:] - all_costs[:-1]
        result = all_outputs, all_masks, all_costs
        trans, costs = BeamSearch.result_to_lists(result)
        hypos = []
        max_len = 0
        for idx in xrange(len(trans)):
            max_len = max(max_len, len(trans[idx]))
            hypo = Hypothesis(trans[idx], -costs[idx])
            hypo.score_breakdown = len(trans[idx]) * [[(0.0,1.0)]]
            hypo.score_breakdown[0] = [(-costs[idx],1.0)]
            hypos.append(hypo)
        self.apply_predictors_count = max_len * self.beam_size
        return hypos
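
The core of the ensemble generalization above is that per-model log-probabilities are combined by summation before the beam is pruned. A self-contained numpy sketch of that combination and pruning step, with made-up numbers (the real code obtains `logprobs` from each model's `compute_logprobs`):

import numpy as np

beam_size = 2
# hypothetical per-model negative log-probabilities (beam_size x vocab)
logprobs_lst = [np.array([[1.0, 2.0, 0.5, 3.0],
                          [0.2, 1.5, 2.5, 0.8]]),
                np.array([[0.9, 1.1, 0.7, 2.0],
                          [0.4, 1.0, 2.0, 1.2]])]
logprobs = np.sum(logprobs_lst, axis=0)       # element-wise sum over models
prev_costs = np.array([5.0, 4.0])             # accumulated beam costs
next_costs = prev_costs[:, None] + logprobs   # candidate costs, beam x vocab
# prune back to beam_size over the flattened candidate matrix
best = np.argsort(next_costs.flatten())[:beam_size]
indexes, outputs = np.unravel_index(best, next_costs.shape)
assert list(indexes) == [1, 1] and list(outputs) == [0, 3]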
Example #34
0
class SpeechRecognizer(Initializable):
    """Encapsulate all reusable logic.

    This class plays a few roles: (a) it is a top brick that knows
    how to combine the bottom, bidirectional and recognizer networks,
    (b) it holds the input variables and can build whole computation
    graphs starting from them, and (c) it hides compilation of Theano
    functions and initialization of beam search. I find it simpler to
    have it all in one place for research code.

    Parameters
    ----------
    All parameters define the structure and the dimensions of the model.
    Typically the brick receives everything from the "net" section of the config.

    """
    def __init__(self, recordings_source, labels_source, eos_label,
                 num_features, num_phonemes,
                 dim_dec, dims_bidir, dims_bottom,
                 enc_transition, dec_transition,
                 use_states_for_readout,
                 attention_type,
                 lm=None, character_map=None,
                 subsample=None,
                 dims_top=None,
                 prior=None, conv_n=None,
                 bottom_activation=None,
                 post_merge_activation=None,
                 post_merge_dims=None,
                 dim_matcher=None,
                 embed_outputs=True,
                 dec_stack=1,
                 conv_num_filters=1,
                 data_prepend_eos=True,
                 energy_normalizer=None,  # softmax is the default set in SequenceContentAndConvAttention
                 **kwargs):
        if bottom_activation is None:
            bottom_activation = Tanh()
        if post_merge_activation is None:
            post_merge_activation = Tanh()
        super(SpeechRecognizer, self).__init__(**kwargs)
        self.recordings_source = recordings_source
        self.labels_source = labels_source
        self.eos_label = eos_label
        self.data_prepend_eos = data_prepend_eos

        self.rec_weights_init = None
        self.initial_states_init = None

        self.enc_transition = enc_transition
        self.dec_transition = dec_transition
        self.dec_stack = dec_stack

        if dim_matcher is None:
            dim_matcher = dim_dec

        # The bottom part, before BiRNN
        if dims_bottom:
            bottom = MLP([bottom_activation] * len(dims_bottom),
                         [num_features] + dims_bottom,
                         name="bottom")
        else:
            bottom = Identity(name='bottom')

        # BiRNN
        if not subsample:
            subsample = [1] * len(dims_bidir)
        encoder = Encoder(self.enc_transition, dims_bidir,
                          dims_bottom[-1] if len(dims_bottom) else num_features,
                          subsample)

        # The top part, on top of BiRNN but before the attention
        if dims_top:
            top = MLP([Tanh()],
                      [2 * dims_bidir[-1]] + dims_top + [2 * dims_bidir[-1]], name="top")
        else:
            top = Identity(name='top')

        if dec_stack == 1:
            transition = self.dec_transition(
                dim=dim_dec, activation=Tanh(), name="transition")
        else:
            transitions = [self.dec_transition(dim=dim_dec,
                                               activation=Tanh(),
                                               name="transition_{}".format(trans_level))
                           for trans_level in xrange(dec_stack)]
            transition = RecurrentStack(transitions=transitions,
                                        skip_connections=True)
        # Choose attention mechanism according to the configuration
        if attention_type == "content":
            attention = SequenceContentAttention(
                state_names=transition.apply.states,
                attended_dim=2 * dims_bidir[-1], match_dim=dim_matcher,
                name="cont_att")
        elif attention_type == "content_and_conv":
            attention = SequenceContentAndConvAttention(
                state_names=transition.apply.states,
                conv_n=conv_n,
                conv_num_filters=conv_num_filters,
                attended_dim=2 * dims_bidir[-1], match_dim=dim_matcher,
                prior=prior,
                energy_normalizer=energy_normalizer,
                name="conv_att")
        else:
            raise ValueError("Unknown attention type {}"
                             .format(attention_type))
        if embed_outputs:
            feedback = LookupFeedback(num_phonemes + 1, dim_dec)
        else:
            feedback = OneOfNFeedback(num_phonemes + 1)
        if lm:
            # In case we use LM it is Readout that is responsible
            # for normalization.
            emitter = LMEmitter()
        else:
            emitter = SoftmaxEmitter(initial_output=num_phonemes, name="emitter")
        readout_config = dict(
            readout_dim=num_phonemes,
            source_names=(transition.apply.states if use_states_for_readout else [])
                         + [attention.take_glimpses.outputs[0]],
            emitter=emitter,
            feedback_brick=feedback,
            name="readout")
        if post_merge_dims:
            readout_config['merged_dim'] = post_merge_dims[0]
            readout_config['post_merge'] = InitializableSequence([
                Bias(post_merge_dims[0]).apply,
                post_merge_activation.apply,
                MLP([post_merge_activation] * (len(post_merge_dims) - 1) + [Identity()],
                    # MLP was not designed to support Maxout as an activation
                    # (because Maxout, in a way, is not one). However, a
                    # single-layer Maxout network works with the trick below.
                    # For a deeper Maxout network one has to use the
                    # Sequence brick.
                    [d//getattr(post_merge_activation, 'num_pieces', 1)
                     for d in post_merge_dims] + [num_phonemes]).apply,
            ],
                name='post_merge')
        readout = Readout(**readout_config)

        language_model = None
        if lm:
            lm_weight = lm.pop('weight', 0.0)
            normalize_am_weights = lm.pop('normalize_am_weights', True)
            normalize_lm_weights = lm.pop('normalize_lm_weights', False)
            normalize_tot_weights = lm.pop('normalize_tot_weights', False)
            am_beta = lm.pop('am_beta', 1.0)
            if normalize_am_weights + normalize_lm_weights + normalize_tot_weights < 1:
                logger.warn("Beam search is prone to fail with no log-prob normalization")
            language_model = LanguageModel(nn_char_map=character_map, **lm)
            readout = ShallowFusionReadout(lm_costs_name='lm_add',
                                           lm_weight=lm_weight,
                                           normalize_am_weights=normalize_am_weights,
                                           normalize_lm_weights=normalize_lm_weights,
                                           normalize_tot_weights=normalize_tot_weights,
                                           am_beta=am_beta,
                                           **readout_config)

        generator = SequenceGenerator(
            readout=readout, transition=transition, attention=attention,
            language_model=language_model,
            name="generator")

        # Remember child bricks
        self.encoder = encoder
        self.bottom = bottom
        self.top = top
        self.generator = generator
        self.children = [encoder, top, bottom, generator]

        # Create input variables
        self.recordings = tensor.tensor3(self.recordings_source)
        self.recordings_mask = tensor.matrix(self.recordings_source + "_mask")
        self.labels = tensor.lmatrix(self.labels_source)
        self.labels_mask = tensor.matrix(self.labels_source + "_mask")
        self.batch_inputs = [self.recordings, self.recordings_mask,
                             self.labels, self.labels_mask]
        self.single_recording = tensor.matrix(self.recordings_source)
        self.single_transcription = tensor.lvector(self.labels_source)

    def push_initialization_config(self):
        super(SpeechRecognizer, self).push_initialization_config()
        if self.rec_weights_init:
            rec_weights_config = {'weights_init': self.rec_weights_init,
                                  'recurrent_weights_init': self.rec_weights_init}
            global_push_initialization_config(self,
                                              rec_weights_config,
                                              BaseRecurrent)
        if self.initial_states_init:
            global_push_initialization_config(self,
                                              {'initial_states_init': self.initial_states_init})

    @application
    def cost(self, recordings, recordings_mask, labels, labels_mask):
        bottom_processed = self.bottom.apply(recordings)
        encoded, encoded_mask = self.encoder.apply(
            input_=bottom_processed,
            mask=recordings_mask)
        encoded = self.top.apply(encoded)
        return self.generator.cost_matrix(
            labels, labels_mask,
            attended=encoded, attended_mask=encoded_mask)

    @application
    def generate(self, recordings):
        encoded, encoded_mask = self.encoder.apply(
            input_=self.bottom.apply(recordings))
        encoded = self.top.apply(encoded)
        return self.generator.generate(
            n_steps=recordings.shape[0], batch_size=recordings.shape[1],
            attended=encoded,
            attended_mask=encoded_mask,
            as_dict=True)

    def load_params(self, path):
        generated = self.get_generate_graph()
        param_values = load_parameter_values(path)
        SpeechModel(generated['outputs']).set_parameter_values(param_values)

    def get_generate_graph(self):
        result = self.generate(self.recordings)
        return result

    def get_cost_graph(self, batch=True):
        if batch:
            return self.cost(
                self.recordings, self.recordings_mask,
                self.labels, self.labels_mask)
        recordings = self.single_recording[:, None, :]
        labels = self.single_transcription[:, None]
        return self.cost(
            recordings, tensor.ones_like(recordings[:, :, 0]),
            labels, None)

    def analyze(self, recording, transcription):
        """Compute cost and aligment for a recording/transcription pair."""
        if not hasattr(self, "_analyze"):
            cost = self.get_cost_graph(batch=False)
            cg = ComputationGraph(cost)
            energies = VariableFilter(
                bricks=[self.generator], name="energies")(cg)
            energies_output = [energies[0][:, 0, :] if energies
                               else tensor.zeros((self.single_transcription.shape[0],
                                                  self.single_recording.shape[0]))]
            states, = VariableFilter(
                applications=[self.encoder.apply], roles=[OUTPUT],
                name="encoded")(cg)
            ctc_matrix_output = []
            # Temporarily disabled for compatibility with LM code
            # if len(self.generator.readout.source_names) == 1:
            #    ctc_matrix_output = [
            #        self.generator.readout.readout(weighted_averages=states)[:, 0, :]]
            weights, = VariableFilter(
                bricks=[self.generator], name="weights")(cg)
            self._analyze = theano.function(
                [self.single_recording, self.single_transcription],
                [cost[:, 0], weights[:, 0, :]] + energies_output + ctc_matrix_output)
        return self._analyze(recording, transcription)

    def init_beam_search(self, beam_size):
        """Compile beam search and set the beam size.

        See Blocks issue #500.

        """
        self.beam_size = beam_size
        generated = self.get_generate_graph()
        samples, = VariableFilter(
            applications=[self.generator.generate], name="outputs")(
            ComputationGraph(generated['outputs']))
        self._beam_search = BeamSearch(beam_size, samples)
        self._beam_search.compile()

    def beam_search(self, recording, char_discount=0.0):
        if not hasattr(self, '_beam_search'):
            self.init_beam_search(self.beam_size)
        input_ = recording[:,numpy.newaxis,:]
        outputs, search_costs = self._beam_search.search(
            {self.recordings: input_}, self.eos_label, input_.shape[0] / 3,
            ignore_first_eol=self.data_prepend_eos,
            char_discount=char_discount)
        return outputs, search_costs

    def __getstate__(self):
        state = dict(self.__dict__)
        for attr in ['_analyze', '_beam_search']:
            state.pop(attr, None)
        return state

    def __setstate__(self, state):
        self.__dict__.update(state)
        # To use bricks used on a GPU first on a CPU later
        try:
            emitter = self.generator.readout.emitter
            del emitter._theano_rng
        except AttributeError:
            pass
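
The __getstate__/__setstate__ pair above keeps the recognizer picklable by dropping the compiled `_analyze` and `_beam_search` attributes; both are rebuilt lazily on first use. The same pattern in miniature (the class and attribute names below are illustrative, not from the snippet):

import pickle

class LazyCompiled(object):
    def expensive(self):
        # stand-in for compiling a Theano function on first use
        if not hasattr(self, '_compiled'):
            self._compiled = lambda x: x * 2
        return self._compiled

    def __getstate__(self):
        state = dict(self.__dict__)
        state.pop('_compiled', None)   # drop the unpicklable artifact
        return state

obj = LazyCompiled()
obj.expensive()
restored = pickle.loads(pickle.dumps(obj))
assert not hasattr(restored, '_compiled')   # recompiled on next use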
Example #35
0
class BleuValidator(SimpleExtension, SamplingBase):
    def __init__(
        self,
        source_sentence,
        samples,
        model,
        data_stream,
        config,
        n_best=1,
        track_n_models=1,
        trg_ivocab=None,
        src_eos_idx=-1,
        trg_eos_idx=-1,
        **kwargs
    ):
        super(BleuValidator, self).__init__(**kwargs)
        self.source_sentence = source_sentence
        self.samples = samples
        self.model = model
        self.data_stream = data_stream
        self.config = config
        self.n_best = n_best
        self.track_n_models = track_n_models
        self.verbose = config.get("val_set_out", None)

        self.src_eos_idx = src_eos_idx
        self.trg_eos_idx = trg_eos_idx

        # Helpers
        self.vocab = data_stream.dataset.dictionary
        self.trg_ivocab = trg_ivocab
        self.unk_sym = data_stream.dataset.unk_token
        self.eos_sym = data_stream.dataset.eos_token
        self.unk_idx = self.vocab[self.unk_sym]
        self.eos_idx = self.src_eos_idx  # self.vocab[self.eos_sym]
        self.best_models = []
        self.val_bleu_curve = []
        self.beam_search = BeamSearch(beam_size=self.config["beam_size"], samples=samples)
        self.multibleu_cmd = ["perl", self.config["bleu_script"], self.config["val_set_grndtruth"], "<"]

        # Create saving directory if it does not exist
        if not os.path.exists(self.config["saveto"]):
            os.makedirs(self.config["saveto"])

        if self.config["reload"]:
            try:
                bleu_score = numpy.load(os.path.join(self.config["saveto"], "val_bleu_scores.npz"))
                self.val_bleu_curve = bleu_score["bleu_scores"].tolist()

                # Track n best previous bleu scores
                for i, bleu in enumerate(sorted(self.val_bleu_curve, reverse=True)):
                    if i < self.track_n_models:
                        self.best_models.append(ModelInfo(bleu))
                logger.info("BleuScores Reloaded")
            except:
                logger.info("BleuScores not Found")

    def do(self, which_callback, *args):

        # Track validation burn in
        if self.main_loop.status["iterations_done"] <= self.config["val_burn_in"]:
            return

        # Get current model parameters
        self.model.set_param_values(self.main_loop.model.get_param_values())

        # Evaluate and save if necessary
        self._save_model(self._evaluate_model())

    def _evaluate_model(self):

        logger.info("Started Validation: ")
        val_start_time = time.time()
        mb_subprocess = Popen(self.multibleu_cmd, stdin=PIPE, stdout=PIPE)
        total_cost = 0.0

        # Get target vocabulary
        if not self.trg_ivocab:
            sources = self._get_attr_rec(self.main_loop, "data_stream")
            trg_vocab = sources.data_streams[1].dataset.dictionary
            self.trg_ivocab = {v: k for k, v in trg_vocab.items()}

        if self.verbose:
            ftrans = open(self.config["val_set_out"], "w")

        for i, line in enumerate(self.data_stream.get_epoch_iterator()):
            """
            Load the sentence, retrieve the sample, write to file
            """

            line[0][-1] = self.src_eos_idx
            seq = self._oov_to_unk(line[0])
            input_ = numpy.tile(seq, (self.config["beam_size"], 1))

            # draw sample, checking to ensure we don't get an empty string back
            trans, costs = self.beam_search.search(
                input_values={self.source_sentence: input_},
                max_length=3 * len(seq),
                eol_symbol=self.trg_eos_idx,
                ignore_first_eol=True,
            )

            nbest_idx = numpy.argsort(costs)[: self.n_best]
            for j, best in enumerate(nbest_idx):
                try:
                    total_cost += costs[best]
                    trans_out = trans[best]

                    # convert idx to words
                    trans_out = self._idx_to_word(trans_out[:-1], self.trg_ivocab)

                except ValueError:
                    print "Can NOT find a translation for line: {}".format(i + 1)
                    trans_out = "<UNK>"

                if j == 0:
                    # Write to subprocess and file if it exists
                    print >> mb_subprocess.stdin, trans_out
                    if self.verbose:
                        print >> ftrans, trans_out

            if i != 0 and i % 100 == 0:
                print "Translated {} lines of validation set...".format(i)

            mb_subprocess.stdin.flush()

        print "Total cost of the validation: {}".format(total_cost)
        self.data_stream.reset()
        if self.verbose:
            ftrans.close()

        # send end of file, read output.
        mb_subprocess.stdin.close()
        stdout = mb_subprocess.stdout.readline()
        print "output ", stdout
        out_parse = re.match(r"BLEU = [-.0-9]+", stdout)
        logger.info("Validation Took: {} minutes".format(float(time.time() - val_start_time) / 60.0))
        assert out_parse is not None

        # extract the score
        bleu_score = float(out_parse.group()[6:])
        self.val_bleu_curve.append(bleu_score)
        print bleu_score
        mb_subprocess.terminate()

        return bleu_score

    def _is_valid_to_save(self, bleu_score):
        if not self.best_models or min(self.best_models, key=operator.attrgetter("bleu_score")).bleu_score < bleu_score:
            return True
        return False

    def _save_model(self, bleu_score):
        if self._is_valid_to_save(bleu_score):
            model = ModelInfo(bleu_score, self.config["saveto"])

            # Manage n-best model list first
            if len(self.best_models) >= self.track_n_models:
                old_model = self.best_models[0]
                if old_model.path and os.path.isfile(old_model.path):
                    logger.info("Deleting old model %s" % old_model.path)
                    os.remove(old_model.path)
                self.best_models.remove(old_model)

            self.best_models.append(model)
            self.best_models.sort(key=operator.attrgetter("bleu_score"))

            # Save the model here
            s = signal.signal(signal.SIGINT, signal.SIG_IGN)
            logger.info("Saving new model {}".format(model.path))
            numpy.savez(model.path, **self.main_loop.model.get_param_values())
            numpy.savez(os.path.join(self.config["saveto"], "val_bleu_scores.npz"), bleu_scores=self.val_bleu_curve)
            signal.signal(signal.SIGINT, s)
Example #36
0
def test_beam_search_smallest():
    a = numpy.array([[3, 6, 4], [1, 2, 7]])
    ind, mins = BeamSearch._smallest(a, 2)
    assert numpy.all(numpy.array(ind) == numpy.array([[1, 1], [0, 1]]))
    assert numpy.all(mins == [1, 2])
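
The test above pins down what BeamSearch._smallest returns: the row/column indices and the values of the k smallest entries of a cost matrix, in ascending order. An equivalent numpy-only sketch that satisfies the same assertions:

import numpy

def smallest(matrix, k):
    # indices and values of the k smallest entries, smallest first
    flat = matrix.flatten()
    args = numpy.argpartition(flat, k)[:k]
    args = args[numpy.argsort(flat[args])]
    return numpy.unravel_index(args, matrix.shape), flat[args]

(rows, cols), mins = smallest(numpy.array([[3, 6, 4], [1, 2, 7]]), 2)
assert list(rows) == [1, 1] and list(cols) == [0, 1]
assert list(mins) == [1, 2]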
Example #37
0
class BleuValidator(SimpleExtension, SamplingBase):
    # TODO: a lot has changed in NMT, sync accordingly
    """Implements early stopping based on BLEU score."""

    def __init__(self, source_sentence, samples, model, data_stream,
                 config, n_best=1, track_n_models=1,
                 normalize=True, **kwargs):
        # TODO: change config structure
        super(BleuValidator, self).__init__(**kwargs)
        self.source_sentence = source_sentence
        self.samples = samples
        self.model = model
        self.data_stream = data_stream
        self.config = config
        self.n_best = n_best
        self.track_n_models = track_n_models
        self.normalize = normalize
        self.verbose = config.get('val_set_out', None)

        # Helpers
        self.vocab = data_stream.dataset.dictionary
        self.unk_sym = data_stream.dataset.unk_token
        self.eos_sym = data_stream.dataset.eos_token
        self.unk_idx = self.vocab[self.unk_sym]
        self.eos_idx = self.vocab[self.eos_sym]
        self.best_models = []
        self.val_bleu_curve = []
        self.beam_search = BeamSearch(samples=samples)
        self.multibleu_cmd = ['perl', self.config['bleu_script'],
                              self.config['val_set_grndtruth'], '<']

        # Create saving directory if it does not exist
        if not os.path.exists(self.config['saveto']):
            os.makedirs(self.config['saveto'])

        if self.config['reload']:
            try:
                bleu_score = numpy.load(os.path.join(self.config['saveto'],
                                        'val_bleu_scores.npz'))
                self.val_bleu_curve = bleu_score['bleu_scores'].tolist()

                # Track n best previous bleu scores
                for i, bleu in enumerate(
                        sorted(self.val_bleu_curve, reverse=True)):
                    if i < self.track_n_models:
                        self.best_models.append(ModelInfo(bleu))
                logger.info("BleuScores Reloaded")
            except:
                logger.info("BleuScores not Found")

    def do(self, which_callback, *args):

        # Track validation burn in
        if self.main_loop.status['iterations_done'] <= \
                self.config['val_burn_in']:
            return

        # Evaluate and save if necessary
        self._save_model(self._evaluate_model())

    def _evaluate_model(self):

        logger.info("Started Validation: ")
        val_start_time = time.time()
        mb_subprocess = Popen(self.multibleu_cmd, stdin=PIPE, stdout=PIPE)
        total_cost = 0.0

        # Get target vocabulary
        sources = self._get_attr_rec(self.main_loop, 'data_stream')
        trg_vocab = sources.data_streams[1].dataset.dictionary
        self.trg_ivocab = {v: k for k, v in trg_vocab.items()}
        trg_eos_sym = sources.data_streams[1].dataset.eos_token
        self.trg_eos_idx = trg_vocab[trg_eos_sym]

        if self.verbose:
            ftrans = open(self.config['val_set_out'], 'w')

        for i, line in enumerate(self.data_stream.get_epoch_iterator()):
            """
            Load the sentence, retrieve the sample, write to file
            """

            seq = self._oov_to_unk(
                line[0], self.config['src_vocab_size'], self.unk_idx)
            input_ = numpy.tile(seq, (self.config['beam_size'], 1))

            # draw sample, checking to ensure we don't get an empty string back
            trans, costs = \
                self.beam_search.search(
                    input_values={self.source_sentence: input_},
                    max_length=3*len(seq), eol_symbol=self.trg_eos_idx,
                    ignore_first_eol=True)

            # normalize costs according to the sequence lengths
            if self.normalize:
                lengths = numpy.array([len(s) for s in trans])
                costs = costs / lengths

            nbest_idx = numpy.argsort(costs)[:self.n_best]
            for j, best in enumerate(nbest_idx):
                try:
                    total_cost += costs[best]
                    trans_out = trans[best]

                    # convert idx to words
                    trans_out = self._idx_to_word(trans_out, self.trg_ivocab)

                except ValueError:
                    logger.info(
                        "Can NOT find a translation for line: {}".format(i+1))
                    trans_out = '<UNK>'

                if j == 0:
                    # Write to subprocess and file if it exists
                    print(trans_out, file=mb_subprocess.stdin)
                    if self.verbose:
                        print(trans_out, file=ftrans)

            if i != 0 and i % 100 == 0:
                logger.info(
                    "Translated {} lines of validation set...".format(i))

            mb_subprocess.stdin.flush()

        logger.info("Total cost of the validation: {}".format(total_cost))
        self.data_stream.reset()
        if self.verbose:
            ftrans.close()

        # send end of file, read output.
        mb_subprocess.stdin.close()
        stdout = mb_subprocess.stdout.readline()
        logger.info(stdout)
        out_parse = re.match(r'BLEU = [-.0-9]+', stdout)
        logger.info("Validation Took: {} minutes".format(
            float(time.time() - val_start_time) / 60.))
        assert out_parse is not None

        # extract the score
        bleu_score = float(out_parse.group()[6:])
        self.val_bleu_curve.append(bleu_score)
        logger.info(bleu_score)
        mb_subprocess.terminate()

        return bleu_score

    def _is_valid_to_save(self, bleu_score):
        if not self.best_models or min(self.best_models,
           key=operator.attrgetter('bleu_score')).bleu_score < bleu_score:
            return True
        return False

    def _save_model(self, bleu_score):
        if self._is_valid_to_save(bleu_score):
            model = ModelInfo(bleu_score, self.config['saveto'])

            # Manage n-best model list first
            if len(self.best_models) >= self.track_n_models:
                old_model = self.best_models[0]
                if old_model.path and os.path.isfile(old_model.path):
                    logger.info("Deleting old model %s" % old_model.path)
                    os.remove(old_model.path)
                self.best_models.remove(old_model)

            self.best_models.append(model)
            self.best_models.sort(key=operator.attrgetter('bleu_score'))

            # Save the model here
            s = signal.signal(signal.SIGINT, signal.SIG_IGN)
            logger.info("Saving new model {}".format(model.path))
            numpy.savez(
                model.path, **self.main_loop.model.get_parameter_dict())
            numpy.savez(
                os.path.join(self.config['saveto'], 'val_bleu_scores.npz'),
                bleu_scores=self.val_bleu_curve)
            signal.signal(signal.SIGINT, s)
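
The BLEU extraction at the end of _evaluate_model depends on the first line printed by multi-bleu.perl. A small sketch of that parse (the stdout string below is a fabricated example of the script's output format):

import re

stdout = 'BLEU = 27.34, 58.1/33.0/20.4/13.1 (BP=1.000, ratio=1.020)'
out_parse = re.match(r'BLEU = [-.0-9]+', stdout)
bleu_score = float(out_parse.group()[6:])   # drop the leading 'BLEU ='
assert bleu_score == 27.34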
Example #38
0
    def set_up_decoder(self, nmt_model_path):
        """This method uses the NMT configuration in ``self.config`` to
        initialize the NMT model. This method basically corresponds to 
        ``blocks.machine_translation.main``.
        
        Args:
            nmt_model_path (string):  Path to the NMT model file (.npz)
        """
        # Create Theano variables
        logging.info('Creating theano variables')
        source_sentence = tensor.lmatrix('source')
        source_sentence_mask = tensor.matrix('source_mask')
        target_sentence = tensor.lmatrix('target')
        target_sentence_mask = tensor.matrix('target_mask')
        sampling_input = tensor.lmatrix('input')

        # Construct model
        logging.info('Building RNN encoder-decoder')
        encoder = BidirectionalEncoder(self.config['src_vocab_size'],
                                       self.config['enc_embed'],
                                       self.config['enc_nhids'])
        decoder = Decoder(self.config['trg_vocab_size'],
                          self.config['dec_embed'], self.config['dec_nhids'],
                          self.config['enc_nhids'] * 2)
        cost = decoder.cost(
            encoder.apply(source_sentence, source_sentence_mask),
            source_sentence_mask, target_sentence, target_sentence_mask)

        logging.info('Creating computational graph')
        cg = ComputationGraph(cost)

        # Initialize model (TODO: do i really need this?)
        logging.info('Initializing model')
        encoder.weights_init = decoder.weights_init = IsotropicGaussian(
            self.config['weight_scale'])
        encoder.biases_init = decoder.biases_init = Constant(0)
        encoder.push_initialization_config()
        decoder.push_initialization_config()
        encoder.bidir.prototype.weights_init = Orthogonal()
        decoder.transition.weights_init = Orthogonal()
        encoder.initialize()
        decoder.initialize()

        # apply dropout for regularization (TODO: remove?)
        if self.config['dropout'] < 1.0:
            # dropout is applied to the output of maxout in ghog
            logging.info('Applying dropout')
            dropout_inputs = [
                x for x in cg.intermediary_variables
                if x.name == 'maxout_apply_output'
            ]
            cg = apply_dropout(cg, dropout_inputs, self.config['dropout'])

        # Apply weight noise for regularization (TODO: remove?)
        if self.config['weight_noise_ff'] > 0.0:
            logging.info('Applying weight noise to ff layers')
            enc_params = Selector(encoder.lookup).get_params().values()
            enc_params += Selector(encoder.fwd_fork).get_params().values()
            enc_params += Selector(encoder.back_fork).get_params().values()
            dec_params = Selector(
                decoder.sequence_generator.readout).get_params().values()
            dec_params += Selector(
                decoder.sequence_generator.fork).get_params().values()
            dec_params += Selector(decoder.state_init).get_params().values()
            cg = apply_noise(cg, enc_params + dec_params,
                             self.config['weight_noise_ff'])

        # Print shapes
        shapes = [param.get_value().shape for param in cg.parameters]
        logging.info("Parameter shapes: ")
        for shape, count in Counter(shapes).most_common():
            logging.info('    {:15}: {}'.format(shape, count))
        logging.info("Total number of parameters: {}".format(len(shapes)))

        # Print parameter names
        enc_dec_param_dict = merge(
            Selector(encoder).get_parameters(),
            Selector(decoder).get_parameters())
        logging.info("Parameter names: ")
        for name, value in enc_dec_param_dict.items():
            logging.info('    {:15}: {}'.format(value.get_value().shape, name))
        logging.info("Total number of parameters: {}".format(
            len(enc_dec_param_dict)))

        # Set up training model
        logging.info("Building model")

        # Set extensions
        logging.info("Initializing extensions")

        # Set up beam search and sampling computation graphs if necessary
        logging.info("Building sampling model")
        sampling_representation = encoder.apply(
            sampling_input, tensor.ones(sampling_input.shape))
        generated = decoder.generate(sampling_input, sampling_representation)
        search_model = Model(generated)
        _, samples = VariableFilter(
            bricks=[decoder.sequence_generator], name="outputs")(
                ComputationGraph(generated[1]))  # generated[1] is next_outputs

        # Compare with blocks.machine_translation.BleuValidator.__init__
        self.source_sentence = sampling_input
        self.samples = samples
        self.model = search_model
        self.normalize = True
        self.verbose = self.config.get('val_set_out', None)

        # Reload model if necessary
        if self.config['reload']:
            loader = LoadNMT(nmt_model_path, self.config['saveto'],
                             search_model)
            loader.load_weights()

        self.best_models = []
        self.val_bleu_curve = []
        self.beam_search = BeamSearch(samples=samples)
Example #39
0
decoder.transition.weights_init = Orthogonal()
encoder.initialize()
decoder.initialize()

logger.info("Building sampling model")
sampling_representation = encoder.apply(sampling_input, tensor.ones(sampling_input.shape))
generated = decoder.generate(sampling_input, sampling_representation)
search_model = Model(generated)

params = search_model.get_parameter_dict()
param_values = SaveLoadUtils().load_parameter_values(os.path.join(config['saveto'], 'params.npz'))
for k in params:
    params[k].set_value(param_values[k])

_, samples = VariableFilter(bricks=[decoder.sequence_generator], name="outputs")(ComputationGraph(generated[1]))
beam_search = BeamSearch(samples=samples)

# Read from standard input
stream = get_stdin_stream(**config)

vocab = get_vocab(config['trg_vocab'], config['trg_vocab_size'], config['unk_id'], config['eos_id'], config['bos_id'])
inv_vocab = {v: k for k, v in vocab.iteritems()}

unk_id = config['unk_id']
eos_id = config['eos_id']

for sample in stream.get_epoch_iterator():
    seq = sample[0]
    input_ = np.tile(seq, (config['beam_size'], 1))

    trans, costs = beam_search.search(
        # (arguments reconstructed by analogy with the other
        # BeamSearch.search calls in this listing; the snippet ends here)
        input_values={sampling_input: input_},
        max_length=3 * len(seq),
        eol_symbol=eos_id,
        ignore_first_eol=True)
Example #40
0
def main(config,
         tr_stream,
         dev_stream,
         use_bokeh=False,
         slim_iteration_state=False,
         switch_controller=None,
         reset_epoch=False):
    """This method largely corresponds to the ``main`` method in the
    original Blocks implementation in blocks-examples and most of the
    code is copied from there. Following modifications have been made:
    
    - Support fixing word embedding during training
    - Dropout fix https://github.com/mila-udem/blocks-examples/issues/46
    - If necessary, add the exp3s extension
    
    Args:
        config (dict): NMT config
        tr_stream (DataStream): Training data stream
        dev_stream (DataStream): Validation data stream
        use_bokeh (bool): Whether to use bokeh for plotting
        slim_iteration_state (bool): Whether to store the full iteration
                                     state or only the epoch iterator
                                     without data stream state
        switch_controller (SourceSwitchController): Controlling strategy
                                                    if monolingual data
                                                    is used as well
        reset_epoch (bool): Set epoch_started in main loop status to
                            false. Sometimes required if you change
                            training parameters such as 
                            mono_data_integration
    """

    nmt_model = NMTModel(config)
    nmt_model.set_up()

    # Set extensions
    logging.info("Initializing extensions")
    extensions = [
        FinishAfter(after_n_batches=config['finish_after']),
        TrainingDataMonitoring([nmt_model.cost], after_batch=True),
        Printing(after_batch=True),
        CheckpointNMT(config['saveto'],
                      slim_iteration_state,
                      every_n_batches=config['save_freq'])
    ]

    # Add early stopping based on bleu
    if config['bleu_script'] is not None:
        logging.info("Building bleu validator")
        extensions.append(
            BleuValidator(nmt_model.sampling_input,
                          samples=nmt_model.samples,
                          config=config,
                          model=nmt_model.search_model,
                          data_stream=dev_stream,
                          normalize=config['normalized_bleu'],
                          store_full_main_loop=config['store_full_main_loop'],
                          every_n_batches=config['bleu_val_freq']))

    if switch_controller:
        switch_controller.beam_search = BeamSearch(samples=nmt_model.samples)
        switch_controller.src_sentence = nmt_model.sampling_input
        extensions.append(switch_controller)

    # Reload model if necessary
    if config['reload']:
        extensions.append(
            LoadNMT(config['saveto'], slim_iteration_state, reset_epoch))

    # Plot cost in bokeh if necessary
    if use_bokeh and BOKEH_AVAILABLE:
        extensions.append(
            Plot('Decoding cost',
                 channels=[['decoder_cost_cost']],
                 after_batch=True))

    # Add an extension for correct handling of SIGTERM and SIGINT
    extensions.append(AlwaysEpochInterrupt(every_n_batches=1))

    # Set up training algorithm
    logging.info("Initializing training algorithm")
    # https://github.com/mila-udem/blocks-examples/issues/46
    train_params = nmt_model.cg.parameters
    # fs439: fix embeddings?
    if config['fix_embeddings']:
        train_params = []
        embedding_params = [
            'softmax1', 'softmax0', 'maxout_bias', 'embeddings', 'lookuptable',
            'transform_feedback'
        ]
        for p in nmt_model.cg.parameters:
            add_param = True
            for ann in p.tag.annotations:
                if ann.name in embedding_params:
                    logging.info("Do not train %s due to annotation %s" %
                                 (p, ann))
                    add_param = False
                    break
            if add_param:
                train_params.append(p)
    # Change cost=cost to cg.outputs[0] ?
    algorithm = GradientDescent(cost=nmt_model.cg.outputs[0]
                                if config['dropout'] < 1.0 else nmt_model.cost,
                                parameters=train_params,
                                step_rule=CompositeRule([
                                    StepClipping(config['step_clipping']),
                                    eval(config['step_rule'])()
                                ]))

    # Initialize main loop
    logging.info("Initializing main loop")
    main_loop = MainLoop(model=nmt_model.training_model,
                         algorithm=algorithm,
                         data_stream=tr_stream,
                         extensions=extensions)

    # Reset epoch
    if reset_epoch:
        main_loop.status['epoch_started'] = False

    # Train!
    main_loop.run()
Example #42
0
def main(mode, config, use_bokeh=False):

    # Construct model
    logger.info('Building RNN encoder-decoder')
    encoder = BidirectionalEncoder(
        config['src_vocab_size'], config['enc_embed'], config['enc_nhids'])
    decoder = Decoder(
        config['trg_vocab_size'], config['dec_embed'], config['dec_nhids'],
        config['enc_nhids'] * 2)

    if mode == "train":

        # Create Theano variables
        logger.info('Creating theano variables')
        source_sentence = tensor.lmatrix('source')
        source_sentence_mask = tensor.matrix('source_mask')
        target_sentence = tensor.lmatrix('target')
        target_sentence_mask = tensor.matrix('target_mask')
        sampling_input = tensor.lmatrix('input')

        # Get training and development set streams
        tr_stream = get_tr_stream(**config)
        dev_stream = get_dev_stream(**config)

        # Get cost of the model
        cost = decoder.cost(
            encoder.apply(source_sentence, source_sentence_mask),
            source_sentence_mask, target_sentence, target_sentence_mask)

        logger.info('Creating computational graph')
        cg = ComputationGraph(cost)

        # Initialize model
        logger.info('Initializing model')
        encoder.weights_init = decoder.weights_init = IsotropicGaussian(
            config['weight_scale'])
        encoder.biases_init = decoder.biases_init = Constant(0)
        encoder.push_initialization_config()
        decoder.push_initialization_config()
        encoder.bidir.prototype.weights_init = Orthogonal()
        decoder.transition.weights_init = Orthogonal()
        encoder.initialize()
        decoder.initialize()

        # apply dropout for regularization
        if config['dropout'] < 1.0:
            # dropout is applied to the output of maxout in ghog
            logger.info('Applying dropout')
            dropout_inputs = [x for x in cg.intermediary_variables
                              if x.name == 'maxout_apply_output']
            cg = apply_dropout(cg, dropout_inputs, config['dropout'])

        # Apply weight noise for regularization
        if config['weight_noise_ff'] > 0.0:
            logger.info('Applying weight noise to ff layers')
            enc_params = Selector(encoder.lookup).get_params().values()
            enc_params += Selector(encoder.fwd_fork).get_params().values()
            enc_params += Selector(encoder.back_fork).get_params().values()
            dec_params = Selector(
                decoder.sequence_generator.readout).get_params().values()
            dec_params += Selector(
                decoder.sequence_generator.fork).get_params().values()
            dec_params += Selector(decoder.state_init).get_params().values()
            cg = apply_noise(
                cg, enc_params+dec_params, config['weight_noise_ff'])

        # Print shapes
        shapes = [param.get_value().shape for param in cg.parameters]
        logger.info("Parameter shapes: ")
        for shape, count in Counter(shapes).most_common():
            logger.info('    {:15}: {}'.format(shape, count))
        logger.info("Total number of parameters: {}".format(len(shapes)))

        # Print parameter names
        enc_dec_param_dict = merge(Selector(encoder).get_parameters(),
                                   Selector(decoder).get_parameters())
        logger.info("Parameter names: ")
        for name, value in enc_dec_param_dict.items():
            logger.info('    {:15}: {}'.format(value.get_value().shape, name))
        logger.info("Total number of parameters: {}"
                    .format(len(enc_dec_param_dict)))

        # Set up training model
        logger.info("Building model")
        training_model = Model(cost)

        # Set extensions
        logger.info("Initializing extensions")
        extensions = [
            FinishAfter(after_n_batches=config['finish_after']),
            TrainingDataMonitoring([cost], after_batch=True),
            Printing(after_batch=True),
            CheckpointNMT(config['saveto'],
                          every_n_batches=config['save_freq'])
        ]

        # Set up beam search and sampling computation graphs if necessary
        if config['hook_samples'] >= 1 or config['bleu_script'] is not None:
            logger.info("Building sampling model")
            sampling_representation = encoder.apply(
                sampling_input, tensor.ones(sampling_input.shape))
            generated = decoder.generate(
                sampling_input, sampling_representation)
            search_model = Model(generated)
            _, samples = VariableFilter(
                bricks=[decoder.sequence_generator], name="outputs")(
                    ComputationGraph(generated[1]))

        # Add sampling
        if config['hook_samples'] >= 1:
            logger.info("Building sampler")
            extensions.append(
                Sampler(model=search_model, data_stream=tr_stream,
                        hook_samples=config['hook_samples'],
                        every_n_batches=config['sampling_freq'],
                        src_vocab_size=config['src_vocab_size']))

        # Add early stopping based on bleu
        if config['bleu_script'] is not None:
            logger.info("Building bleu validator")
            extensions.append(
                BleuValidator(sampling_input, samples=samples, config=config,
                              model=search_model, data_stream=dev_stream,
                              normalize=config['normalized_bleu'],
                              every_n_batches=config['bleu_val_freq']))

        # Reload model if necessary
        if config['reload']:
            extensions.append(LoadNMT(config['saveto']))

        # Plot cost in bokeh if necessary
        if use_bokeh and BOKEH_AVAILABLE:
            extensions.append(
                Plot('Cs-En', channels=[['decoder_cost_cost']],
                     after_batch=True))

        # Set up training algorithm
        logger.info("Initializing training algorithm")
        algorithm = GradientDescent(
            cost=cost, parameters=cg.parameters,
            step_rule=CompositeRule([StepClipping(config['step_clipping']),
                                     eval(config['step_rule'])()])
        )

        # Initialize main loop
        logger.info("Initializing main loop")
        main_loop = MainLoop(
            model=training_model,
            algorithm=algorithm,
            data_stream=tr_stream,
            extensions=extensions
        )

        # Train!
        main_loop.run()

    elif mode == 'translate':

        # Create Theano variables
        logger.info('Creating theano variables')
        sampling_input = tensor.lmatrix('source')

        # Get test set stream
        test_stream = get_dev_stream(
            config['test_set'], config['src_vocab'],
            config['src_vocab_size'], config['unk_id'])
        ftrans = open(config['test_set'] + '.trans.out', 'w')

        # Helper utilities
        sutils = SamplingBase()
        unk_idx = config['unk_id']
        src_eos_idx = config['src_vocab_size'] - 1
        trg_eos_idx = config['trg_vocab_size'] - 1

        # Get beam search
        logger.info("Building sampling model")
        sampling_representation = encoder.apply(
            sampling_input, tensor.ones(sampling_input.shape))
        generated = decoder.generate(sampling_input, sampling_representation)
        _, samples = VariableFilter(
            bricks=[decoder.sequence_generator], name="outputs")(
                ComputationGraph(generated[1]))  # generated[1] is next_outputs
        beam_search = BeamSearch(samples=samples)

        logger.info("Loading the model..")
        model = Model(generated)
        loader = LoadNMT(config['saveto'])
        loader.set_model_parameters(model, loader.load_parameters())

        # Get target vocabulary
        trg_vocab = _ensure_special_tokens(
            pickle.load(open(config['trg_vocab'], 'rb')), bos_idx=0,
            eos_idx=trg_eos_idx, unk_idx=unk_idx)
        trg_ivocab = {v: k for k, v in trg_vocab.items()}

        logger.info("Started translation: ")
        total_cost = 0.0

        for i, line in enumerate(test_stream.get_epoch_iterator()):

            seq = sutils._oov_to_unk(
                line[0], config['src_vocab_size'], unk_idx)
            input_ = numpy.tile(seq, (config['beam_size'], 1))

            # draw sample, checking to ensure we don't get an empty string back
            trans, costs = \
                beam_search.search(
                    input_values={sampling_input: input_},
                    max_length=3*len(seq), eol_symbol=trg_eos_idx,
                    ignore_first_eol=True)

            # normalize costs according to the sequence lengths
            if config['normalized_bleu']:
                lengths = numpy.array([len(s) for s in trans])
                costs = costs / lengths

            best = numpy.argsort(costs)[0]
            try:
                total_cost += costs[best]
                trans_out = trans[best]

                # convert idx to words
                trans_out = sutils._idx_to_word(trans_out, trg_ivocab)

            except ValueError:
                logger.info(
                    "Can NOT find a translation for line: {}".format(i+1))
                trans_out = '<UNK>'

            print(trans_out, file=ftrans)

            if i != 0 and i % 100 == 0:
                logger.info(
                    "Translated {} lines of test set...".format(i))

        logger.info("Total cost of the test: {}".format(total_cost))
        ftrans.close()
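
Nearly every snippet in this listing funnels source ids through an _oov_to_unk helper before tiling them across the beam. Its behaviour reduces to clamping out-of-vocabulary ids to the <UNK> id; a minimal sketch:

def oov_to_unk(seq, vocab_size, unk_idx):
    # ids at or above vocab_size fall outside the vocabulary
    return [idx if idx < vocab_size else unk_idx for idx in seq]

assert oov_to_unk([3, 7, 12], vocab_size=10, unk_idx=1) == [3, 7, 1]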
Example #43
0
class BleuValidator(SimpleExtension, SamplingBase):
    """Implements early stopping based on BLEU score."""
    def __init__(self,
                 source_sentence,
                 target_prefix,
                 samples,
                 model,
                 data_stream,
                 config,
                 src_vocab=None,
                 trg_vocab=None,
                 n_best=1,
                 track_n_models=1,
                 normalize=True,
                 **kwargs):
        super(BleuValidator, self).__init__(**kwargs)
        self.source_sentence = source_sentence
        self.target_prefix = target_prefix

        self.src_vocab = src_vocab
        self.trg_vocab = trg_vocab

        self.samples = samples
        self.model = model
        self.data_stream = data_stream
        self.config = config
        self.n_best = n_best
        self.track_n_models = track_n_models
        self.normalize = normalize
        self.verbose = config.get('val_set_out', None)

        # Helpers
        self.best_models = []
        self.val_bleu_curve = []
        self.beam_search = BeamSearch(samples=samples)
        self.multibleu_cmd = [
            'perl', self.config['bleu_script'],
            self.config['val_set_grndtruth'], '<'
        ]

        # Create save directory if it does not exist
        if not os.path.exists(self.config['saveto']):
            os.makedirs(self.config['saveto'])

        if self.config['reload']:
            try:
                bleu_score = numpy.load(
                    os.path.join(self.config['saveto'], 'val_bleu_scores.npz'))
                self.val_bleu_curve = bleu_score['bleu_scores'].tolist()

                # Track n best previous bleu scores
                for i, bleu in enumerate(
                        sorted(self.val_bleu_curve, reverse=True)):
                    if i < self.track_n_models:
                        self.best_models.append(ModelInfo(bleu, key='BLEU'))
                logger.info("BleuScores Reloaded")
            except:
                logger.info("BleuScores not Found")

    def do(self, which_callback, *args):

        # Track validation burn in
        if self.main_loop.status['iterations_done'] <= self.config[
                'val_burn_in']:
            return

        # Evaluate the model
        bleu_score = self._evaluate_model()
        # add an entry to the log
        self.main_loop.log.current_row[
            'validation_set_bleu_score'] = bleu_score
        # save if necessary
        self._save_model(bleu_score)

    def _evaluate_model(self):
        # Set in the superclass -- SamplingBase
        if not hasattr(self, 'target_dataset'):
            self._initialize_dataset_info()

        #         self.unk_sym = self.target_dataset.unk_token
        #         self.eos_sym = self.target_dataset.eos_token

        self.unk_sym = '<UNK>'
        self.eos_sym = '</S>'
        self.unk_idx = self.trg_vocab[self.unk_sym]
        self.eos_idx = self.trg_vocab[self.eos_sym]

        logger.info("Started Validation: ")
        val_start_time = time.time()
        mb_subprocess = Popen(self.multibleu_cmd, stdin=PIPE, stdout=PIPE)
        total_cost = 0.0

        if self.verbose:
            ftrans = open(self.config['val_set_out'], 'w')

        logger.info('Length of dev stream: {}'.format(
            len(list(self.data_stream.get_epoch_iterator()))))
        for i, line in enumerate(self.data_stream.get_epoch_iterator()):
            # Load the sentence, retrieve the sample, write to file.
            # Note that the positions of source and target in the datastream
            # are hard-coded; currently our datastream is
            # (source, target, prefix, suffix).
            seq = self._oov_to_unk(line[0], self.config['src_vocab_size'],
                                   self.unk_idx)

            target_prefix = line[2]

            input_ = numpy.tile(seq, (self.config['beam_size'], 1))
            prefix_input_ = numpy.tile(target_prefix,
                                       (self.config['beam_size'], 1))

            # draw sample, checking to ensure we don't get an empty string back
            # (keyword names must match the signature of BeamSearch.search)
            trans, costs = self.beam_search.search(
                input_values={self.source_sentence: input_,
                              self.target_prefix: prefix_input_},
                max_length=3 * len(seq),
                eol_symbol=self.eos_idx,
                ignore_first_eol=False)

            # normalize costs according to the sequence lengths
            if self.normalize:
                lengths = numpy.array([len(s) for s in trans])
                costs = costs / lengths

            nbest_idx = numpy.argsort(costs)[:self.n_best]

            for j, best in enumerate(nbest_idx):
                try:
                    total_cost += costs[best]
                    trans_out = trans[best]

                    # convert idx to words
                    # print('input_seq: {}'.format(seq))
                    # print('input_prefix: {}'.format(target_prefix))
                    # print('trans_out_raw: {}'.format(trans_out))
                    trans_out = self._idx_to_word(trans_out, self.trg_ivocab)
                    # print('trans_out_text: {}'.format(trans_out))

                except ValueError:
                    logger.info(
                        "Can NOT find a translation for line: {}".format(i + 1))
                    trans_out = '<UNK>'

                if j == 0:
                    # Write to subprocess and file if it exists
                    print(trans_out, file=mb_subprocess.stdin)
                    if self.verbose:
                        print(trans_out, file=ftrans)

            if i != 0 and i % 100 == 0:
                logger.info(
                    "Translated {} lines of validation set...".format(i))

            mb_subprocess.stdin.flush()

        logger.info("Total cost of the validation: {}".format(total_cost))
        self.data_stream.reset()
        if self.verbose:
            ftrans.close()

        # send end of file, read output.
        mb_subprocess.stdin.close()
        stdout = mb_subprocess.stdout.readline()
        logger.info(stdout)
        out_parse = re.match(r'BLEU = [-.0-9]+', stdout)
        logger.info("Validation Took: {} minutes".format(
            float(time.time() - val_start_time) / 60.))
        assert out_parse is not None

        # extract the score
        bleu_score = float(out_parse.group()[6:])
        self.val_bleu_curve.append(bleu_score)
        logger.info(bleu_score)
        mb_subprocess.terminate()

        return bleu_score

    def _is_valid_to_save(self, bleu_score):
        if not self.best_models or min(
                self.best_models,
                key=operator.attrgetter('score')).score < bleu_score:
            return True
        return False

    def _save_model(self, bleu_score):
        if self._is_valid_to_save(bleu_score):
            model = ModelInfo(bleu_score, self.config['saveto'], key='BLEU')

            # Manage n-best model list first
            if len(self.best_models) >= self.track_n_models:
                old_model = self.best_models[0]
                if old_model.path and os.path.isfile(old_model.path):
                    logger.info("Deleting old model %s" % old_model.path)
                    os.remove(old_model.path)
                self.best_models.remove(old_model)

            self.best_models.append(model)
            self.best_models.sort(key=operator.attrgetter('score'))

            # Save the model here
            s = signal.signal(signal.SIGINT, signal.SIG_IGN)
            logger.info("Saving new model {}".format(model.path))

            SaveLoadUtils.save_parameter_values(
                self.main_loop.model.get_parameter_values(), model.path)
            numpy.savez(os.path.join(self.config['saveto'],
                                     'val_bleu_scores.npz'),
                        bleu_scores=self.val_bleu_curve)
            signal.signal(signal.SIGINT, s)
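
A minimal sketch of how this extension might be attached to a Blocks main loop; `algorithm`, `train_stream`, `dev_stream`, `search_model`, the vocabularies and the validation frequency are assumptions, not shown in the snippet above.

from blocks.main_loop import MainLoop

# Hypothetical wiring -- every name on the right-hand side is assumed
# to be built elsewhere in the training script.
bleu_validator = BleuValidator(
    source_sentence, target_prefix,
    samples=samples,
    model=search_model,
    data_stream=dev_stream,
    config=config,
    src_vocab=src_vocab,
    trg_vocab=trg_vocab,
    normalize=True,
    every_n_batches=config['bleu_val_freq'])  # SimpleExtension kwarg

main_loop = MainLoop(model=training_model, algorithm=algorithm,
                     data_stream=train_stream,
                     extensions=[bleu_validator])
main_loop.run()
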
Example #44
0
class SpeechRecognizer(Initializable):
    """Encapsulate all reusable logic.

    This class plays a few roles: (a) it is a top brick that knows
    how to combine the bottom, bidirectional and recognizer networks;
    (b) it owns the input variables and can build whole computation
    graphs starting from them; (c) it hides compilation of Theano
    functions and initialization of beam search. I find it simpler to
    have it all in one place for research code.

    Parameters
    ----------
    All parameters define the structure and the dimensions of the
    model. Typically everything is received from the "net" section of
    the config.

    """
    def __init__(
        self,
        recordings_source,
        labels_source,
        eos_label,
        num_features,
        num_phonemes,
        dim_dec,
        dims_bidir,
        dims_bottom,
        enc_transition,
        dec_transition,
        use_states_for_readout,
        attention_type,
        lm=None,
        character_map=None,
        subsample=None,
        dims_top=None,
        prior=None,
        conv_n=None,
        bottom_activation=None,
        post_merge_activation=None,
        post_merge_dims=None,
        dim_matcher=None,
        embed_outputs=True,
        dec_stack=1,
        conv_num_filters=1,
        data_prepend_eos=True,
        energy_normalizer=None,  # softmax is the default set in SequenceContentAndConvAttention
        **kwargs):
        if bottom_activation is None:
            bottom_activation = Tanh()
        if post_merge_activation is None:
            post_merge_activation = Tanh()
        super(SpeechRecognizer, self).__init__(**kwargs)
        self.recordings_source = recordings_source
        self.labels_source = labels_source
        self.eos_label = eos_label
        self.data_prepend_eos = data_prepend_eos

        self.rec_weights_init = None
        self.initial_states_init = None

        self.enc_transition = enc_transition
        self.dec_transition = dec_transition
        self.dec_stack = dec_stack

        if dim_matcher is None:
            dim_matcher = dim_dec

        # The bottom part, before BiRNN
        if dims_bottom:
            bottom = MLP([bottom_activation] * len(dims_bottom),
                         [num_features] + dims_bottom,
                         name="bottom")
        else:
            bottom = Identity(name='bottom')

        # BiRNN
        if not subsample:
            subsample = [1] * len(dims_bidir)
        encoder = Encoder(
            self.enc_transition, dims_bidir,
            dims_bottom[-1] if len(dims_bottom) else num_features, subsample)

        # The top part, on top of BiRNN but before the attention
        if dims_top:
            top = MLP([Tanh()],
                      [2 * dims_bidir[-1]] + dims_top + [2 * dims_bidir[-1]],
                      name="top")
        else:
            top = Identity(name='top')

        if dec_stack == 1:
            transition = self.dec_transition(dim=dim_dec,
                                             activation=Tanh(),
                                             name="transition")
        else:
            transitions = [
                self.dec_transition(dim=dim_dec,
                                    activation=Tanh(),
                                    name="transition_{}".format(trans_level))
                for trans_level in xrange(dec_stack)
            ]
            transition = RecurrentStack(transitions=transitions,
                                        skip_connections=True)
        # Choose attention mechanism according to the configuration
        if attention_type == "content":
            attention = SequenceContentAttention(
                state_names=transition.apply.states,
                attended_dim=2 * dims_bidir[-1],
                match_dim=dim_matcher,
                name="cont_att")
        elif attention_type == "content_and_conv":
            attention = SequenceContentAndConvAttention(
                state_names=transition.apply.states,
                conv_n=conv_n,
                conv_num_filters=conv_num_filters,
                attended_dim=2 * dims_bidir[-1],
                match_dim=dim_matcher,
                prior=prior,
                energy_normalizer=energy_normalizer,
                name="conv_att")
        else:
            raise ValueError(
                "Unknown attention type {}".format(attention_type))
        if embed_outputs:
            feedback = LookupFeedback(num_phonemes + 1, dim_dec)
        else:
            feedback = OneOfNFeedback(num_phonemes + 1)
        if lm:
            # In case we use LM it is Readout that is responsible
            # for normalization.
            emitter = LMEmitter()
        else:
            emitter = SoftmaxEmitter(initial_output=num_phonemes,
                                     name="emitter")
        readout_config = dict(readout_dim=num_phonemes,
                              source_names=(transition.apply.states if
                                            use_states_for_readout else []) +
                              [attention.take_glimpses.outputs[0]],
                              emitter=emitter,
                              feedback_brick=feedback,
                              name="readout")
        if post_merge_dims:
            readout_config['merged_dim'] = post_merge_dims[0]
            readout_config['post_merge'] = InitializableSequence(
                [
                    Bias(post_merge_dims[0]).apply,
                    post_merge_activation.apply,
                    MLP(
                        [post_merge_activation] *
                        (len(post_merge_dims) - 1) + [Identity()],
                        # MLP was not designed to support Maxout as an
                        # activation (because Maxout, in a way, is not one).
                        # However, a single-layer Maxout network works with
                        # the trick below; for deeper Maxout networks one
                        # has to use the Sequence brick.
                        [
                            d //
                            getattr(post_merge_activation, 'num_pieces', 1)
                            for d in post_merge_dims
                        ] + [num_phonemes]).apply,
                ],
                name='post_merge')
        readout = Readout(**readout_config)

        language_model = None
        if lm:
            lm_weight = lm.pop('weight', 0.0)
            normalize_am_weights = lm.pop('normalize_am_weights', True)
            normalize_lm_weights = lm.pop('normalize_lm_weights', False)
            normalize_tot_weights = lm.pop('normalize_tot_weights', False)
            am_beta = lm.pop('am_beta', 1.0)
            if normalize_am_weights + normalize_lm_weights + normalize_tot_weights < 1:
                logger.warn(
                    "Beam search is prone to fail with no log-prob normalization"
                )
            language_model = LanguageModel(nn_char_map=character_map, **lm)
            readout = ShallowFusionReadout(
                lm_costs_name='lm_add',
                lm_weight=lm_weight,
                normalize_am_weights=normalize_am_weights,
                normalize_lm_weights=normalize_lm_weights,
                normalize_tot_weights=normalize_tot_weights,
                am_beta=am_beta,
                **readout_config)

        generator = SequenceGenerator(readout=readout,
                                      transition=transition,
                                      attention=attention,
                                      language_model=language_model,
                                      name="generator")

        # Remember child bricks
        self.encoder = encoder
        self.bottom = bottom
        self.top = top
        self.generator = generator
        self.children = [encoder, top, bottom, generator]

        # Create input variables
        self.recordings = tensor.tensor3(self.recordings_source)
        self.recordings_mask = tensor.matrix(self.recordings_source + "_mask")
        self.labels = tensor.lmatrix(self.labels_source)
        self.labels_mask = tensor.matrix(self.labels_source + "_mask")
        self.batch_inputs = [
            self.recordings, self.recordings_mask, self.labels,
            self.labels_mask
        ]
        self.single_recording = tensor.matrix(self.recordings_source)
        self.single_transcription = tensor.lvector(self.labels_source)

    def push_initialization_config(self):
        super(SpeechRecognizer, self).push_initialization_config()
        if self.rec_weights_init:
            rec_weights_config = {
                'weights_init': self.rec_weights_init,
                'recurrent_weights_init': self.rec_weights_init
            }
            global_push_initialization_config(self, rec_weights_config,
                                              BaseRecurrent)
        if self.initial_states_init:
            global_push_initialization_config(
                self, {'initial_states_init': self.initial_states_init})

    @application
    def cost(self, recordings, recordings_mask, labels, labels_mask):
        bottom_processed = self.bottom.apply(recordings)
        encoded, encoded_mask = self.encoder.apply(input_=bottom_processed,
                                                   mask=recordings_mask)
        encoded = self.top.apply(encoded)
        return self.generator.cost_matrix(labels,
                                          labels_mask,
                                          attended=encoded,
                                          attended_mask=encoded_mask)

    @application
    def generate(self, recordings):
        encoded, encoded_mask = self.encoder.apply(
            input_=self.bottom.apply(recordings))
        encoded = self.top.apply(encoded)
        return self.generator.generate(n_steps=recordings.shape[0],
                                       batch_size=recordings.shape[1],
                                       attended=encoded,
                                       attended_mask=encoded_mask,
                                       as_dict=True)

    def load_params(self, path):
        generated = self.get_generate_graph()
        param_values = load_parameter_values(path)
        SpeechModel(generated['outputs']).set_parameter_values(param_values)

    def get_generate_graph(self):
        result = self.generate(self.recordings)
        return result

    def get_cost_graph(self, batch=True):
        if batch:
            return self.cost(self.recordings, self.recordings_mask,
                             self.labels, self.labels_mask)
        recordings = self.single_recording[:, None, :]
        labels = self.single_transcription[:, None]
        return self.cost(recordings, tensor.ones_like(recordings[:, :, 0]),
                         labels, None)

    def analyze(self, recording, transcription):
        """Compute cost and aligment for a recording/transcription pair."""
        if not hasattr(self, "_analyze"):
            cost = self.get_cost_graph(batch=False)
            cg = ComputationGraph(cost)
            energies = VariableFilter(bricks=[self.generator],
                                      name="energies")(cg)
            energies_output = [
                energies[0][:, 0, :] if energies else tensor.zeros(
                    (self.single_transcription.shape[0],
                     self.single_recording.shape[0]))
            ]
            states, = VariableFilter(applications=[self.encoder.apply],
                                     roles=[OUTPUT],
                                     name="encoded")(cg)
            ctc_matrix_output = []
            # Temporarily disabled for compatibility with LM code
            # if len(self.generator.readout.source_names) == 1:
            #    ctc_matrix_output = [
            #        self.generator.readout.readout(weighted_averages=states)[:, 0, :]]
            weights, = VariableFilter(bricks=[self.generator],
                                      name="weights")(cg)
            self._analyze = theano.function(
                [self.single_recording, self.single_transcription],
                [cost[:, 0], weights[:, 0, :]] + energies_output +
                ctc_matrix_output)
        return self._analyze(recording, transcription)

    def init_beam_search(self, beam_size):
        """Compile beam search and set the beam size.

        See Blocks issue #500.

        """
        self.beam_size = beam_size
        generated = self.get_generate_graph()
        samples, = VariableFilter(applications=[self.generator.generate],
                                  name="outputs")(ComputationGraph(
                                      generated['outputs']))
        self._beam_search = BeamSearch(beam_size, samples)
        self._beam_search.compile()

    def beam_search(self, recording, char_discount=0.0):
        if not hasattr(self, '_beam_search'):
            self.init_beam_search(self.beam_size)
        input_ = recording[:, numpy.newaxis, :]
        outputs, search_costs = self._beam_search.search(
            {self.recordings: input_},
            self.eos_label,
            input_.shape[0] / 3,
            ignore_first_eol=self.data_prepend_eos,
            char_discount=char_discount)
        return outputs, search_costs

    def __getstate__(self):
        state = dict(self.__dict__)
        for attr in ['_analyze', '_beam_search']:
            state.pop(attr, None)
        return state

    def __setstate__(self, state):
        self.__dict__.update(state)
        # To use bricks used on a GPU first on a CPU later
        try:
            emitter = self.generator.readout.emitter
            del emitter._theano_rng
        except AttributeError:
            pass
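
For completeness, a hedged sketch of driving the recognizer above at test time; `recognizer` and the `features` array (time x num_features) are assumptions.

import numpy

# Compile the search once, then decode a single utterance.  The class
# adds the batch axis itself (recording[:, numpy.newaxis, :]).
recognizer.init_beam_search(beam_size=10)
outputs, search_costs = recognizer.beam_search(features, char_discount=0.0)
best = outputs[numpy.argmin(search_costs)]  # lowest-cost phoneme sequence
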
Example #45
0
class SpeechRecognizer(Initializable):
    """Encapsulate all reusable logic.

    This class plays a few roles: (a) it is a top brick that knows
    how to combine the bottom, bidirectional and recognizer networks;
    (b) it owns the input variables and can build whole computation
    graphs starting from them; (c) it hides compilation of Theano
    functions and initialization of beam search. I find it simpler to
    have it all in one place for research code.

    Parameters
    ----------
    All parameters define the structure and the dimensions of the
    model. Typically everything is received from the "net" section of
    the config.

    """

    def __init__(self,
                 input_dims,
                 input_num_chars,
                 eos_label,
                 num_phonemes,
                 dim_dec, dims_bidir,
                 enc_transition, dec_transition,
                 use_states_for_readout,
                 attention_type,
                 criterion,
                 bottom,
                 lm=None, character_map=None,
                 bidir=True,
                 subsample=None,
                 dims_top=None,
                 prior=None, conv_n=None,
                 post_merge_activation=None,
                 post_merge_dims=None,
                 dim_matcher=None,
                 embed_outputs=True,
                 dim_output_embedding=None,
                 dec_stack=1,
                 conv_num_filters=1,
                 data_prepend_eos=True,
                 # softmax is the default set in SequenceContentAndConvAttention
                 energy_normalizer=None,
                 # for speech this is the approximate phoneme duration in frames
                 max_decoded_length_scale=1,
                 **kwargs):

        if post_merge_activation is None:
            post_merge_activation = Tanh()
        super(SpeechRecognizer, self).__init__(**kwargs)
        self.eos_label = eos_label
        self.data_prepend_eos = data_prepend_eos

        self.rec_weights_init = None
        self.initial_states_init = None

        self.enc_transition = enc_transition
        self.dec_transition = dec_transition
        self.dec_stack = dec_stack

        self.criterion = criterion

        self.max_decoded_length_scale = max_decoded_length_scale

        if dim_matcher is None:
            dim_matcher = dim_dec

        # The bottom part, before BiRNN
        bottom_class = bottom.pop('bottom_class')
        bottom = bottom_class(
            input_dims=input_dims, input_num_chars=input_num_chars,
            name='bottom',
            **bottom)

        # BiRNN
        if not subsample:
            subsample = [1] * len(dims_bidir)
        encoder = Encoder(self.enc_transition, dims_bidir,
                          bottom.get_dim(bottom.apply.outputs[0]),
                          subsample, bidir=bidir)
        dim_encoded = encoder.get_dim(encoder.apply.outputs[0])

        # The top part, on top of BiRNN but before the attention
        if dims_top:
            top = MLP([Tanh()],
                      [dim_encoded] + dims_top + [dim_encoded], name="top")
        else:
            top = Identity(name='top')

        if dec_stack == 1:
            transition = self.dec_transition(
                dim=dim_dec, activation=Tanh(), name="transition")
        else:
            transitions = [self.dec_transition(dim=dim_dec,
                                               activation=Tanh(),
                                               name="transition_{}".format(trans_level))
                           for trans_level in xrange(dec_stack)]
            transition = RecurrentStack(transitions=transitions,
                                        skip_connections=True)
        # Choose attention mechanism according to the configuration
        if attention_type == "content":
            attention = SequenceContentAttention(
                state_names=transition.apply.states,
                attended_dim=dim_encoded, match_dim=dim_matcher,
                name="cont_att")
        elif attention_type == "content_and_conv":
            attention = SequenceContentAndConvAttention(
                state_names=transition.apply.states,
                conv_n=conv_n,
                conv_num_filters=conv_num_filters,
                attended_dim=dim_encoded, match_dim=dim_matcher,
                prior=prior,
                energy_normalizer=energy_normalizer,
                name="conv_att")
        else:
            raise ValueError("Unknown attention type {}"
                             .format(attention_type))
        if embed_outputs:
            feedback = LookupFeedback(num_phonemes + 1,
                                      dim_dec if
                                      dim_output_embedding is None
                                      else dim_output_embedding)
        else:
            feedback = OneOfNFeedback(num_phonemes + 1)
        if criterion['name'] == 'log_likelihood':
            emitter = SoftmaxEmitter(initial_output=num_phonemes, name="emitter")
            if lm:
                # In case we use LM it is Readout that is responsible
                # for normalization.
                emitter = LMEmitter()
        elif criterion['name'].startswith('mse'):
            emitter = RewardRegressionEmitter(
                criterion['name'], eos_label, num_phonemes,
                criterion.get('min_reward', -1.0),
                name="emitter")
        else:
            raise ValueError("Unknown criterion {}".format(criterion['name']))
        readout_config = dict(
            readout_dim=num_phonemes,
            source_names=(transition.apply.states if use_states_for_readout else [])
                         + [attention.take_glimpses.outputs[0]],
            emitter=emitter,
            feedback_brick=feedback,
            name="readout")
        if post_merge_dims:
            readout_config['merged_dim'] = post_merge_dims[0]
            readout_config['post_merge'] = InitializableSequence([
                Bias(post_merge_dims[0]).apply,
                post_merge_activation.apply,
                MLP([post_merge_activation] * (len(post_merge_dims) - 1) + [Identity()],
                    # MLP was not designed to support Maxout as an
                    # activation (because Maxout, in a way, is not one).
                    # However, a single-layer Maxout network works with
                    # the trick below; for deeper Maxout networks one
                    # has to use the Sequence brick.
                    [d//getattr(post_merge_activation, 'num_pieces', 1)
                     for d in post_merge_dims] + [num_phonemes]).apply,
            ],
                name='post_merge')
        readout = Readout(**readout_config)

        language_model = None
        if lm and lm.get('path'):
            lm_weight = lm.pop('weight', 0.0)
            normalize_am_weights = lm.pop('normalize_am_weights', True)
            normalize_lm_weights = lm.pop('normalize_lm_weights', False)
            normalize_tot_weights = lm.pop('normalize_tot_weights', False)
            am_beta = lm.pop('am_beta', 1.0)
            if normalize_am_weights + normalize_lm_weights + normalize_tot_weights < 1:
                logger.warn("Beam search is prone to fail with no log-prob normalization")
            language_model = LanguageModel(nn_char_map=character_map, **lm)
            readout = ShallowFusionReadout(lm_costs_name='lm_add',
                                           lm_weight=lm_weight,
                                           normalize_am_weights=normalize_am_weights,
                                           normalize_lm_weights=normalize_lm_weights,
                                           normalize_tot_weights=normalize_tot_weights,
                                           am_beta=am_beta,
                                           **readout_config)

        generator = SequenceGenerator(
            readout=readout, transition=transition, attention=attention,
            language_model=language_model,
            name="generator")

        # Remember child bricks
        self.encoder = encoder
        self.bottom = bottom
        self.top = top
        self.generator = generator
        self.children = [encoder, top, bottom, generator]

        # Create input variables
        self.inputs = self.bottom.batch_inputs
        self.inputs_mask = self.bottom.mask

        self.labels = tensor.lmatrix('labels')
        self.labels_mask = tensor.matrix("labels_mask")

        self.single_inputs = self.bottom.single_inputs
        self.single_labels = tensor.lvector('labels')
        self.n_steps = tensor.lscalar('n_steps')

    def push_initialization_config(self):
        super(SpeechRecognizer, self).push_initialization_config()
        if self.rec_weights_init:
            rec_weights_config = {'weights_init': self.rec_weights_init,
                                  'recurrent_weights_init': self.rec_weights_init}
            global_push_initialization_config(self,
                                              rec_weights_config,
                                              BaseRecurrent)
        if self.initial_states_init:
            global_push_initialization_config(self,
                                              {'initial_states_init': self.initial_states_init})

    @application
    def cost(self, **kwargs):
        # pop inputs we know about
        inputs_mask = kwargs.pop('inputs_mask')
        labels = kwargs.pop('labels')
        labels_mask = kwargs.pop('labels_mask')

        # the rest is for bottom
        bottom_processed = self.bottom.apply(**kwargs)
        encoded, encoded_mask = self.encoder.apply(
            input_=bottom_processed,
            mask=inputs_mask)
        encoded = self.top.apply(encoded)
        return self.generator.cost_matrix(
            labels, labels_mask,
            attended=encoded, attended_mask=encoded_mask)

    @application
    def generate(self, **kwargs):
        inputs_mask = kwargs.pop('inputs_mask')
        n_steps = kwargs.pop('n_steps')

        encoded, encoded_mask = self.encoder.apply(
            input_=self.bottom.apply(**kwargs),
            mask=inputs_mask)
        encoded = self.top.apply(encoded)
        return self.generator.generate(
            n_steps=n_steps if n_steps is not None else self.n_steps,
            batch_size=encoded.shape[1],
            attended=encoded,
            attended_mask=encoded_mask,
            as_dict=True)

    def load_params(self, path):
        generated = self.get_generate_graph()
        with open(path, 'r') as src:
            param_values = load_parameters(src)
        Model(generated['outputs']).set_parameter_values(param_values)

    def get_generate_graph(self, use_mask=True, n_steps=None):
        inputs_mask = None
        if use_mask:
            inputs_mask = self.inputs_mask
        bottom_inputs = self.inputs
        return self.generate(n_steps=n_steps,
                             inputs_mask=inputs_mask,
                             **bottom_inputs)

    def get_cost_graph(self, batch=True,
                       prediction=None, prediction_mask=None):

        if batch:
            inputs = self.inputs
            inputs_mask = self.inputs_mask
            groundtruth = self.labels
            groundtruth_mask = self.labels_mask
        else:
            inputs, inputs_mask = self.bottom.single_to_batch_inputs(
                self.single_inputs)
            groundtruth = self.single_labels[:, None]
            groundtruth_mask = None

        if prediction is None:
            prediction = groundtruth
        if prediction_mask is None:
            prediction_mask = groundtruth_mask

        cost = self.cost(inputs_mask=inputs_mask,
                         labels=prediction,
                         labels_mask=prediction_mask,
                         **inputs)
        cost_cg = ComputationGraph(cost)
        if self.criterion['name'].startswith("mse"):
            placeholder, = VariableFilter(theano_name='groundtruth')(cost_cg)
            cost_cg = cost_cg.replace({placeholder: groundtruth})
        return cost_cg

    def analyze(self, inputs, groundtruth, prediction=None):
        """Compute cost and aligment."""

        input_values_dict = dict(inputs)
        input_values_dict['groundtruth'] = groundtruth
        if prediction is not None:
            input_values_dict['prediction'] = prediction
        if not hasattr(self, "_analyze"):
            input_variables = list(self.single_inputs.values())
            input_variables.append(self.single_labels.copy(name='groundtruth'))

            prediction_variable = tensor.lvector('prediction')
            if prediction is not None:
                input_variables.append(prediction_variable)
                cg = self.get_cost_graph(
                    batch=False, prediction=prediction_variable[:, None])
            else:
                cg = self.get_cost_graph(batch=False)
            cost = cg.outputs[0]

            weights, = VariableFilter(
                bricks=[self.generator], name="weights")(cg)

            energies = VariableFilter(
                bricks=[self.generator], name="energies")(cg)
            energies_output = [energies[0][:, 0, :] if energies
                               else tensor.zeros_like(weights)]

            states, = VariableFilter(
                applications=[self.encoder.apply], roles=[OUTPUT],
                name="encoded")(cg)

            ctc_matrix_output = []
            # Temporarily disabled for compatibility with LM code
            # if len(self.generator.readout.source_names) == 1:
            #    ctc_matrix_output = [
            #        self.generator.readout.readout(weighted_averages=states)[:, 0, :]]

            self._analyze = theano.function(
                input_variables,
                [cost[:, 0], weights[:, 0, :]] + energies_output + ctc_matrix_output,
                on_unused_input='warn')
        return self._analyze(**input_values_dict)

    def init_beam_search(self, beam_size):
        """Compile beam search and set the beam size.

        See Blocks issue #500.

        """
        if hasattr(self, '_beam_search') and self.beam_size == beam_size:
            # Only recompile if the user wants a different beam size
            return
        self.beam_size = beam_size
        generated = self.get_generate_graph(use_mask=False, n_steps=3)
        cg = ComputationGraph(generated.values())
        samples, = VariableFilter(
            applications=[self.generator.generate], name="outputs")(cg)
        self._beam_search = BeamSearch(beam_size, samples)
        self._beam_search.compile()

    def beam_search(self, inputs, **kwargs):
        # When a recognizer is unpickled, self.beam_size is available
        # but beam search has to be recompiled.
        self.init_beam_search(self.beam_size)
        inputs = dict(inputs)
        max_length = int(self.bottom.num_time_steps(**inputs) /
                         self.max_decoded_length_scale)
        search_inputs = {}
        for var in self.inputs.values():
            search_inputs[var] = inputs.pop(var.name)[:, numpy.newaxis, ...]
        if inputs:
            raise ValueError(
                'Unknown inputs passed to beam search: {}'.format(
                    list(inputs.keys())))
        outputs, search_costs = self._beam_search.search(
            search_inputs, self.eos_label,
            max_length,
            ignore_first_eol=self.data_prepend_eos,
            **kwargs)
        return outputs, search_costs

    def init_generate(self):
        generated = self.get_generate_graph(use_mask=False)
        cg = ComputationGraph(generated['outputs'])
        self._do_generate = cg.get_theano_function()

    def sample(self, inputs, n_steps=None):
        if not hasattr(self, '_do_generate'):
            self.init_generate()
        batch, unused_mask = self.bottom.single_to_batch_inputs(inputs)
        batch['n_steps'] = n_steps if n_steps is not None \
            else int(self.bottom.num_time_steps(**batch) /
                     self.max_decoded_length_scale)
        return self._do_generate(**batch)[0]

    def __getstate__(self):
        state = dict(self.__dict__)
        for attr in ['_analyze', '_beam_search']:
            state.pop(attr, None)
        return state

    def __setstate__(self, state):
        self.__dict__.update(state)
        # To use bricks used on a GPU first on a CPU later
        try:
            emitter = self.generator.readout.emitter
            del emitter._theano_rng
        except AttributeError:
            pass
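
This variant takes a dict of named inputs instead of a raw array; a sketch under the assumption that the bottom brick exposes a single 'recordings' source.

recognizer.init_beam_search(beam_size=10)
# Keys must match the names of recognizer.inputs; the maximum output
# length is num_time_steps / max_decoded_length_scale.
outputs, search_costs = recognizer.beam_search({'recordings': features})

# Plain sampling from the generation graph uses the same dict interface.
sampled_outputs = recognizer.sample({'recordings': features})
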
Example #46
0
    def __init__(self, 
                 source_sentence, 
                 samples, 
                 model, 
                 data_stream,
                 config, 
                 n_best=1, 
                 track_n_models=1,
                 normalize=True, 
                 store_full_main_loop=False, 
                 **kwargs):
        """Creates a new extension which adds model selection based on
        the BLEU score to the training main loop.
        
        Args:
            source_sentence (Variable): Input variable to the sampling
                                        computation graph
            samples (Variable): Samples variable of the CG
            model (NMTModel): See the model module
            data_stream (DataStream): Data stream to the development 
                                      set
            config (dict): NMT configuration
            n_best (int): beam size
            track_n_models (int): Number of n-best models for which to 
                                  create checkpoints.
            normalize (boolean): Enables length normalization
            store_full_main_loop (boolean): Stores the iteration state
                                            in the old style of
                                            Blocks 0.1. Not recommended
        """
        super(BleuValidator, self).__init__(**kwargs)
        self.store_full_main_loop = store_full_main_loop
        self.source_sentence = source_sentence
        self.samples = samples
        self.model = model
        self.data_stream = data_stream
        self.config = config
        self.n_best = n_best
        self.track_n_models = track_n_models
        self.normalize = normalize
        self.best_models = []
        self.val_bleu_curve = []
        self.multibleu_cmd = (self.config['bleu_script'] % self.config['val_set_grndtruth']).split()
        logging.debug("BLEU command: %s" % self.multibleu_cmd)

        self.src_sparse_feat_map = config['src_sparse_feat_map'] if config['src_sparse_feat_map'] \
                                                                 else FlatSparseFeatMap()
        if config['trg_sparse_feat_map']:
            self.trg_sparse_feat_map = config['trg_sparse_feat_map']
            self.beam_search = SparseBeamSearch(
                                 samples=samples, 
                                 trg_sparse_feat_map=self.trg_sparse_feat_map) 
        else:
            self.trg_sparse_feat_map = FlatSparseFeatMap()
            self.beam_search = BeamSearch(samples=samples)
        
        # Create saving directory if it does not exist
        if not os.path.exists(self.config['saveto']):
            os.makedirs(self.config['saveto'])

        if self.config['reload']:
            try:
                bleu_score = numpy.load(os.path.join(self.config['saveto'],
                                        'val_bleu_scores.npz'))
                self.val_bleu_curve = bleu_score['bleu_scores'].tolist()
                # Track n best previous bleu scores
                for i, bleu in enumerate(
                        sorted(self.val_bleu_curve, reverse=True)):
                    if i < self.track_n_models:
                        self.best_models.append(ModelInfo(bleu))
                logging.info("BleuScores Reloaded")
            except Exception:
                logging.info("BleuScores not Found")
Example #47
0
class BeamSearchEvaluator(object):
    def __init__(self, eol_symbol, beam_size, x, x_mask, samples,
                 phoneme_dict=None, black_list=None):
        if black_list is None:
            self.black_list = []
        else:
            self.black_list = black_list
        self.x = x
        self.x_mask = x_mask
        self.eol_symbol = eol_symbol
        self.beam_size = beam_size
        self.beam_search = BeamSearch(beam_size, samples)
        self.beam_search.compile()
        self.phoneme_dict = phoneme_dict

    def evaluate(self, data_stream, train=False, file_pred=None,
                 file_targets=None):
        loss = 0.
        num_examples = 0
        iterator = data_stream.get_epoch_iterator()
        if train:
            print 'Train evaluation started'
        i = 0
        for inputs in iterator:
            inputs = dict(zip(data_stream.sources, inputs))
            x_mask_val = inputs['features_mask']
            x_val = inputs['features']
            y_val = inputs['phonemes']
            y_mask_val = inputs['phonemes_mask']
            for batch_ind in xrange(inputs['features'].shape[1]):
                if x_val.ndim == 2:
                    input_beam = numpy.tile(x_val[:, batch_ind][:, None],
                        (1, self.beam_size))
                else:
                    input_beam = numpy.tile(x_val[:, batch_ind, :][:, None, :],
                                            (1, self.beam_size, 1))
                input_mask_beam = numpy.tile(x_mask_val[:, batch_ind][:, None],
                                             (1, self.beam_size))
                predictions, _ = self.beam_search.search(
                    {self.x: input_beam,
                     self.x_mask: input_mask_beam},
                    self.eol_symbol, 100)
                predictions = [self.phoneme_dict[phone_ind] for phone_ind
                             in predictions[0]
                             if self.phoneme_dict[phone_ind] not in
                             self.black_list][1:-1]

                targets = y_val[:sum(y_mask_val[:, batch_ind]), batch_ind]
                targets = [self.phoneme_dict[phone_ind] for phone_ind
                             in targets
                             if self.phoneme_dict[phone_ind] not in
                             self.black_list][1:-1]
                predictions = [x[0] for x in groupby(predictions)]
                targets = [x[0] for x in groupby(targets)]
                i += 1
                if file_pred:
                    file_pred.write(' '.join(predictions) + '(%d)\n' % i)
                if file_targets:
                    file_targets.write(' '.join(targets) + '(%d)\n' % i)

                loss += Evaluation.wer([predictions], [targets])
                num_examples += 1

            print '.. found sequence example:', ' '.join(predictions)
            print '.. real output was:       ', ' '.join(targets)
            if train:
                break
        if train:
            print 'Train evaluation finished'
        per = loss.sum() / num_examples
        return {'per': per}
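
A sketch of invoking the evaluator on a development stream; `eos_idx`, `dev_stream` and `ind_to_phoneme` are assumed to come from the surrounding script.

evaluator = BeamSearchEvaluator(eol_symbol=eos_idx, beam_size=10,
                                x=x, x_mask=x_mask, samples=samples,
                                phoneme_dict=ind_to_phoneme)
# evaluate() returns {'per': ...}, the phoneme error rate over the stream.
with open('dev.pred', 'w') as fpred, open('dev.gold', 'w') as fgold:
    metrics = evaluator.evaluate(dev_stream, file_pred=fpred,
                                 file_targets=fgold)
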
Example #48
0
class BleuValidator(SimpleExtension):
    """Implements early stopping based on BLEU score. This class is 
    still very similar to the ``BleuValidator`` in the NMT Blocks
    example.
    
    TODO: Refactor, make this more similar to the rest of SGNMT, use
    vanilla_decoder.py
    """

    def __init__(self, 
                 source_sentence, 
                 samples, 
                 model, 
                 data_stream,
                 config, 
                 n_best=1, 
                 track_n_models=1,
                 normalize=True, 
                 store_full_main_loop=False, 
                 **kwargs):
        """Creates a new extension which adds model selection based on
        the BLEU score to the training main loop.
        
        Args:
            source_sentence (Variable): Input variable to the sampling
                                        computation graph
            samples (Variable): Samples variable of the CG
            model (NMTModel): See the model module
            data_stream (DataStream): Data stream to the development 
                                      set
            config (dict): NMT configuration
            n_best (int): beam size
            track_n_models (int): Number of n-best models for which to 
                                  create checkpoints.
            normalize (boolean): Enables length normalization
            store_full_main_loop (boolean): Stores the iteration state
                                            in the old style of
                                            Blocks 0.1. Not recommended
        """
        super(BleuValidator, self).__init__(**kwargs)
        self.store_full_main_loop = store_full_main_loop
        self.source_sentence = source_sentence
        self.samples = samples
        self.model = model
        self.data_stream = data_stream
        self.config = config
        self.n_best = n_best
        self.track_n_models = track_n_models
        self.normalize = normalize
        self.best_models = []
        self.val_bleu_curve = []
        self.multibleu_cmd = (self.config['bleu_script'] % self.config['val_set_grndtruth']).split()
        logging.debug("BLEU command: %s" % self.multibleu_cmd)

        self.src_sparse_feat_map = config['src_sparse_feat_map'] if config['src_sparse_feat_map'] \
                                                                 else FlatSparseFeatMap()
        if config['trg_sparse_feat_map']:
            self.trg_sparse_feat_map = config['trg_sparse_feat_map']
            self.beam_search = SparseBeamSearch(
                                 samples=samples, 
                                 trg_sparse_feat_map=self.trg_sparse_feat_map) 
        else:
            self.trg_sparse_feat_map = FlatSparseFeatMap()
            self.beam_search = BeamSearch(samples=samples)
        
        # Create saving directory if it does not exist
        if not os.path.exists(self.config['saveto']):
            os.makedirs(self.config['saveto'])

        if self.config['reload']:
            try:
                bleu_score = numpy.load(os.path.join(self.config['saveto'],
                                        'val_bleu_scores.npz'))
                self.val_bleu_curve = bleu_score['bleu_scores'].tolist()
                # Track n best previous bleu scores
                for i, bleu in enumerate(
                        sorted(self.val_bleu_curve, reverse=True)):
                    if i < self.track_n_models:
                        self.best_models.append(ModelInfo(bleu))
                logging.info("BleuScores Reloaded")
            except Exception:
                logging.info("BleuScores not Found")

    def do(self, which_callback, *args):
        """Decodes the dev set and stores checkpoints in case the BLEU
        score has improved.
        """
        if self.main_loop.status['iterations_done'] <= \
                self.config['val_burn_in']:
            return
        self._save_model(self._evaluate_model())

    def _evaluate_model(self):
        """Evaluate model and store checkpoints. """
        logging.info("Started Validation: ")
        val_start_time = time.time()
        mb_subprocess = Popen(self.multibleu_cmd, stdin=PIPE, stdout=PIPE)
        total_cost = 0.0
        ftrans = open(os.path.join(self.config['saveto'],
                                   'validation_out.txt'), 'w')
        for i, line in enumerate(self.data_stream.get_epoch_iterator()):
            seq = self.src_sparse_feat_map.words2dense(utils.oov_to_unk(
                line[0], self.config['src_vocab_size']))
            if self.src_sparse_feat_map.dim > 1: # sparse src feats
                input_ = numpy.transpose(
                             numpy.tile(seq, (self.config['beam_size'], 1, 1)),
                             (2,0,1))
            else: # word ids on the source side
                input_ = numpy.tile(seq, (self.config['beam_size'], 1))
            # draw sample, checking to ensure we don't get an empty string back
            trans, costs = \
                self.beam_search.search(
                    input_values={self.source_sentence: input_},
                    max_length=3*len(seq), eol_symbol=utils.EOS_ID,
                    ignore_first_eol=True)
            # normalize costs according to the sequence lengths
            if self.normalize:
                lengths = numpy.array([len(s) for s in trans])
                costs = costs / lengths

            nbest_idx = numpy.argsort(costs)[:self.n_best]
            for j, best in enumerate(nbest_idx):
                try:
                    total_cost += costs[best]
                    # do not rebind `trans` -- it is indexed again for the
                    # remaining n-best entries
                    trans_best = trans[best]
                    if trans_best and trans_best[-1] == utils.EOS_ID:
                        trans_best = trans_best[:-1]
                    trans_out = ' '.join([str(w) for w in trans_best])
                except ValueError:
                    logging.info(
                        "Can NOT find a translation for line: {}".format(i+1))
                    trans_out = '<UNK>'
                if j == 0:
                    # Write to subprocess and file if it exists
                    print(trans_out, file=mb_subprocess.stdin)
                    print(trans_out, file=ftrans)
            if i != 0 and i % 100 == 0:
                logging.info(
                    "Translated {} lines of validation set...".format(i))

            mb_subprocess.stdin.flush()
        logging.info("Total cost of the validation: {}".format(total_cost))
        self.data_stream.reset()
        ftrans.close()
        # send end of file, read output.
        mb_subprocess.stdin.close()
        stdout = mb_subprocess.stdout.readline()
        logging.info(stdout)
        out_parse = re.match(r'BLEU = [-.0-9]+', stdout)
        logging.info("Validation Took: {} minutes".format(
            float(time.time() - val_start_time) / 60.))
        assert out_parse is not None
        # extract the score
        bleu_score = float(out_parse.group()[6:])
        self.val_bleu_curve.append(bleu_score)
        logging.info(bleu_score)
        mb_subprocess.terminate()
        return bleu_score

    def _is_valid_to_save(self, bleu_score):
        if not self.best_models or min(self.best_models,
           key=operator.attrgetter('bleu_score')).bleu_score < bleu_score:
            return True
        return False

    def save_parameter_values(self, param_values, path):
        ''' This method is copied from blocks.machine_translation.checkpoint '''
        param_values = {name.replace("/", "-"): param
                        for name, param in param_values.items()}
        numpy.savez(path, **param_values)

    def _save_model(self, bleu_score):
        if self._is_valid_to_save(bleu_score):
            model = ModelInfo(bleu_score, self.config['saveto'])
            # Manage n-best model list first
            if len(self.best_models) >= self.track_n_models:
                old_model = self.best_models[0]
                if old_model.path and os.path.isfile(old_model.path):
                    logging.info("Deleting old model %s" % old_model.path)
                    os.remove(old_model.path)
                self.best_models.remove(old_model)
            self.best_models.append(model)
            self.best_models.sort(key=operator.attrgetter('bleu_score'))
            # Save the model here
            s = signal.signal(signal.SIGINT, signal.SIG_IGN)
            # fs439: introduce store_full_main_loop and 
            # storing best_bleu_params_* files
            if self.store_full_main_loop:
                logging.info("Saving full main loop model {}".format(model.path))
                numpy.savez(model.path, 
                            **self.main_loop.model.get_parameter_dict())
            else:
                logging.info("Saving model parameters {}".format(model.path))
                params_to_save = self.main_loop.model.get_parameter_values()
                self.save_parameter_values(params_to_save, model.path)
            numpy.savez(
                os.path.join(self.config['saveto'], 'val_bleu_scores.npz'),
                bleu_scores=self.val_bleu_curve)
            signal.signal(signal.SIGINT, s)
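
`ModelInfo` is used but not defined in these snippets; a minimal stand-in consistent with its usage here might look like the sketch below (the attribute names and the checkpoint naming pattern are assumptions; the variant in Example #43 additionally takes a `key` argument and exposes a `.score` attribute).

import os


class ModelInfo(object):
    """Holds a BLEU score and the checkpoint path it belongs to."""

    def __init__(self, bleu_score, path=None):
        self.bleu_score = bleu_score
        self.path = self._generate_path(path)

    def _generate_path(self, path):
        # Naming pattern is an assumption, e.g. best_bleu_params_26.42.npz
        if path is None:
            return None
        return os.path.join(path,
                            'best_bleu_params_%.2f.npz' % self.bleu_score)
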
Example #49
0
class BlocksNMTVanillaDecoder(Decoder):
    """Adaptor class for blocks.search.BeamSearch. We implement the
    ``Decoder`` class but ignore functionality for predictors or
    heuristics. Instead, we pass through decoding directly to the 
    blocks beam search module. This is fast, but breaks with the
    predictor framework. It can only be used for pure single system
    NMT decoding. Note that this decoder supports sparse feat maps
    on both source and target side.
    """
    def __init__(self, nmt_model_path, config, decoder_args):
        """Set up the NMT model used by the decoder.
        
        Args:
            nmt_model_path (string):  Path to the NMT model file (.npz)
            config (dict): NMT configuration
            decoder_args (object): Decoder configuration passed through
                                   from configuration API.
        """
        super(BlocksNMTVanillaDecoder, self).__init__(decoder_args)
        self.config = config
        self.set_up_decoder(nmt_model_path)
        self.src_eos = self.src_sparse_feat_map.word2dense(utils.EOS_ID)

    def set_up_decoder(self, nmt_model_path):
        """This method uses the NMT configuration in ``self.config`` to
        initialize the NMT model. This method basically corresponds to 
        ``blocks.machine_translation.main``.
        
        Args:
            nmt_model_path (string):  Path to the NMT model file (.npz)
        """
        self.nmt_model = NMTModel(self.config)
        self.nmt_model.set_up()
        loader = LoadNMTUtils(nmt_model_path, self.config['saveto'],
                              self.nmt_model.search_model)
        loader.load_weights()
        self.src_sparse_feat_map = self.config['src_sparse_feat_map'] \
                if self.config['src_sparse_feat_map'] else FlatSparseFeatMap()
        if self.config['trg_sparse_feat_map']:
            self.trg_sparse_feat_map = self.config['trg_sparse_feat_map']
            self.beam_search = SparseBeamSearch(
                samples=self.nmt_model.samples,
                trg_sparse_feat_map=self.trg_sparse_feat_map)
        else:
            self.trg_sparse_feat_map = FlatSparseFeatMap()
            self.beam_search = BeamSearch(samples=self.nmt_model.samples)

    def decode(self, src_sentence):
        """Decodes a single source sentence with the original blocks
        beam search decoder. Does not use predictors. Note that the
        score breakdowns in returned hypotheses are only on the 
        sentence level, not on the word level. For finer grained NMT
        scores you need to use the nmt predictor. ``src_sentence`` is a
        list of source word ids representing the source sentence without
        <S> or </S> symbols. As blocks expects to see </S>, this method
        adds it automatically.
        
        Args:
            src_sentence (list): List of source word ids without <S> or
                                 </S> which make up the source sentence
        
        Returns:
            list. A list of ``Hypothesis`` instances ordered by their
            score.
        """
        seq = self.src_sparse_feat_map.words2dense(
            utils.oov_to_unk(src_sentence,
                             self.config['src_vocab_size'])) + [self.src_eos]
        if self.src_sparse_feat_map.dim > 1:  # sparse src feats
            input_ = np.transpose(
                np.tile(seq, (self.config['beam_size'], 1, 1)), (2, 0, 1))
        else:  # word ids on the source side
            input_ = np.tile(seq, (self.config['beam_size'], 1))
        trans, costs = self.beam_search.search(
            input_values={self.nmt_model.sampling_input: input_},
            max_length=3 * len(src_sentence),
            eol_symbol=utils.EOS_ID,
            ignore_first_eol=True)
        hypos = []
        max_len = 0
        for idx in xrange(len(trans)):
            max_len = max(max_len, len(trans[idx]))
            hypo = Hypothesis(trans[idx], -costs[idx])
            hypo.score_breakdown = len(trans[idx]) * [[(0.0, 1.0)]]
            hypo.score_breakdown[0] = [(-costs[idx], 1.0)]
            hypos.append(hypo)
        self.apply_predictors_count = max_len * self.config['beam_size']
        return hypos

    def has_predictors(self):
        """Always returns true. """
        return True
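
A sketch of using this adaptor end to end; the model path, `config` dict and `decoder_args` are assumptions, and the `Hypothesis` fields come from the surrounding SGNMT framework.

decoder = BlocksNMTVanillaDecoder('models/params.npz', config, decoder_args)
# Source word ids without <S>/</S>; decode() appends </S> itself.
hypos = decoder.decode(src_word_ids)
best = max(hypos, key=lambda h: h.total_score)  # attribute name assumed
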
Example #50
0
class BeamSearchEvaluator(object):
    def __init__(self,
                 eol_symbol,
                 beam_size,
                 x,
                 x_mask,
                 samples,
                 phoneme_dict=None,
                 black_list=None,
                 language_model=False):
        if black_list is None:
            self.black_list = []
        else:
            self.black_list = black_list
        self.x = x
        self.x_mask = x_mask
        self.eol_symbol = eol_symbol
        self.beam_size = beam_size
        if language_model:
            lm = TrigramLanguageModel()
            ind_to_word = dict(enumerate(lm.unigrams))
            self.beam_search = BeamSearchLM(lm, 1., ind_to_word, beam_size,
                                            samples)
        else:
            self.beam_search = BeamSearch(beam_size, samples)
        self.beam_search.compile()
        self.phoneme_dict = phoneme_dict

    def evaluate(self,
                 data_stream,
                 train=False,
                 file_pred=None,
                 file_targets=None):
        loss = 0.
        num_examples = 0
        iterator = data_stream.get_epoch_iterator()
        if train:
            print 'Train evaluation started'
        i = 0
        for inputs in iterator:
            inputs = dict(zip(data_stream.sources, inputs))
            x_mask_val = inputs['features_mask']
            x_val = inputs['features']
            y_val = inputs['phonemes']
            y_mask_val = inputs['phonemes_mask']
            for batch_ind in xrange(inputs['features'].shape[1]):
                if x_val.ndim == 2:
                    input_beam = numpy.tile(x_val[:, batch_ind][:, None],
                                            (1, self.beam_size))
                else:
                    input_beam = numpy.tile(x_val[:, batch_ind, :][:, None, :],
                                            (1, self.beam_size, 1))
                input_mask_beam = numpy.tile(x_mask_val[:, batch_ind][:, None],
                                             (1, self.beam_size))
                predictions, _ = self.beam_search.search(
                    {
                        self.x: input_beam,
                        self.x_mask: input_mask_beam
                    }, self.eol_symbol, 100)
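                # Map ids to phoneme symbols, drop black-listed ones, and
                # strip the boundary tokens at either end.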
                predictions = [
                    self.phoneme_dict[phone_ind]
                    for phone_ind in predictions[0]
                    if self.phoneme_dict[phone_ind] not in self.black_list
                ][1:-1]

                targets = y_val[:sum(y_mask_val[:, batch_ind]), batch_ind]
                targets = [
                    self.phoneme_dict[phone_ind] for phone_ind in targets
                    if self.phoneme_dict[phone_ind] not in self.black_list
                ][1:-1]
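                # Collapse runs of identical phonemes before scoring.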
                predictions = [x[0] for x in groupby(predictions)]
                targets = [x[0] for x in groupby(targets)]
                i += 1
                if file_pred:
                    file_pred.write(' '.join(predictions) + '(%d)\n' % i)
                if file_targets:
                    file_targets.write(' '.join(targets) + '(%d)\n' % i)

                loss += Evaluation.wer([predictions], [targets])
                num_examples += 1

            print '.. found sequence example:', ' '.join(predictions)
            print '.. real output was:       ', ' '.join(targets)
            if train:
                break
        if train:
            print 'Train evaluation finished'
        per = loss.sum() / num_examples
        return {'per': per}
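
The evaluation above collapses immediate repetitions with ``groupby`` before
scoring, and reports ``per`` as the accumulated per-utterance WER divided by
the number of utterances. A minimal sketch of the collapsing step (the
phoneme strings are made up):

from itertools import groupby

def collapse(seq):
    # Keep one symbol per run of identical symbols.
    return [sym for sym, _ in groupby(seq)]

print(collapse(['sil', 'hh', 'hh', 'ah', 'l', 'l', 'ow', 'sil']))
# -> ['sil', 'hh', 'ah', 'l', 'ow', 'sil']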
Example #51
0
class BlocksVanillaDecoder(cam.sgnmt.decoding.core.Decoder):
    """Adaptor class for blocks.search.BeamSearch. We implement the
    ``Decoder`` class but ignore functionality for predictors or
    heuristics. Instead, we pass through decoding directly to the 
    blocks beam search module. This is fast, but breaks with the
    predictor framework. It can only be used for pure single system
    NMT decoding.
    """
    def __init__(self, nmt_model_path, config):
        """Set up the NMT model used by the decoder.
        
        Args:
            nmt_model_path (string):  Path to the NMT model file (.npz)
            config (dict): NMT configuration
        """
        super(BlocksVanillaDecoder, self).__init__()
        self.config = config
        self.set_up_decoder(nmt_model_path)

    def set_up_decoder(self, nmt_model_path):
        """This method uses the NMT configuration in ``self.config`` to
        initialize the NMT model. This method basically corresponds to 
        ``blocks.machine_translation.main``.
        
        Args:
            nmt_model_path (string):  Path to the NMT model file (.npz)
        """
        # Create Theano variables
        logging.info('Creating theano variables')
        source_sentence = tensor.lmatrix('source')
        source_sentence_mask = tensor.matrix('source_mask')
        target_sentence = tensor.lmatrix('target')
        target_sentence_mask = tensor.matrix('target_mask')
        sampling_input = tensor.lmatrix('input')

        # Construct model
        logging.info('Building RNN encoder-decoder')
        encoder = BidirectionalEncoder(self.config['src_vocab_size'],
                                       self.config['enc_embed'],
                                       self.config['enc_nhids'])
        decoder = Decoder(self.config['trg_vocab_size'],
                          self.config['dec_embed'], self.config['dec_nhids'],
                          self.config['enc_nhids'] * 2)
        cost = decoder.cost(
            encoder.apply(source_sentence, source_sentence_mask),
            source_sentence_mask, target_sentence, target_sentence_mask)

        logging.info('Creating computational graph')
        cg = ComputationGraph(cost)

        # Initialize model (TODO: do I really need this?)
        logging.info('Initializing model')
        encoder.weights_init = decoder.weights_init = IsotropicGaussian(
            self.config['weight_scale'])
        encoder.biases_init = decoder.biases_init = Constant(0)
        encoder.push_initialization_config()
        decoder.push_initialization_config()
        encoder.bidir.prototype.weights_init = Orthogonal()
        decoder.transition.weights_init = Orthogonal()
        encoder.initialize()
        decoder.initialize()

        # apply dropout for regularization (TODO: remove?)
        if self.config['dropout'] < 1.0:
            # dropout is applied to the output of maxout in GroundHog
            logging.info('Applying dropout')
            dropout_inputs = [
                x for x in cg.intermediary_variables
                if x.name == 'maxout_apply_output'
            ]
            cg = apply_dropout(cg, dropout_inputs, self.config['dropout'])

        # Apply weight noise for regularization (TODO: remove?)
        if self.config['weight_noise_ff'] > 0.0:
            logging.info('Applying weight noise to ff layers')
            enc_params = Selector(encoder.lookup).get_params().values()
            enc_params += Selector(encoder.fwd_fork).get_params().values()
            enc_params += Selector(encoder.back_fork).get_params().values()
            dec_params = Selector(
                decoder.sequence_generator.readout).get_params().values()
            dec_params += Selector(
                decoder.sequence_generator.fork).get_params().values()
            dec_params += Selector(decoder.state_init).get_params().values()
            cg = apply_noise(cg, enc_params + dec_params,
                             self.config['weight_noise_ff'])

        # Print shapes
        shapes = [param.get_value().shape for param in cg.parameters]
        logging.info("Parameter shapes: ")
        for shape, count in Counter(shapes).most_common():
            logging.info('    {:15}: {}'.format(shape, count))
        logging.info("Total number of parameters: {}".format(len(shapes)))

        # Print parameter names
        enc_dec_param_dict = merge(
            Selector(encoder).get_parameters(),
            Selector(decoder).get_parameters())
        logging.info("Parameter names: ")
        for name, value in enc_dec_param_dict.items():
            logging.info('    {:15}: {}'.format(value.get_value().shape, name))
        logging.info("Total number of parameters: {}".format(
            len(enc_dec_param_dict)))

        # Set up training model
        logging.info("Building model")

        # Set extensions
        logging.info("Initializing extensions")

        # Set up beam search and sampling computation graphs if necessary
        logging.info("Building sampling model")
        sampling_representation = encoder.apply(
            sampling_input, tensor.ones(sampling_input.shape))
        generated = decoder.generate(sampling_input, sampling_representation)
        search_model = Model(generated)
        _, samples = VariableFilter(
            bricks=[decoder.sequence_generator], name="outputs")(
                ComputationGraph(generated[1]))  # generated[1] is next_outputs

        # Compare with blocks.machine_translation.BleuValidator.__init__
        self.source_sentence = sampling_input
        self.samples = samples
        self.model = search_model
        self.normalize = True
        self.verbose = self.config.get('val_set_out', None)

        # Reload model if necessary
        if self.config['reload']:
            loader = LoadNMT(nmt_model_path, self.config['saveto'],
                             search_model)
            loader.load_weights()

        self.best_models = []
        self.val_bleu_curve = []
        self.beam_search = BeamSearch(samples=samples)

    def decode(self, src_sentence):
        """Decodes a single source sentence with the original blocks
        beam search decoder. Does not use predictors. Note that the
        score breakdowns in returned hypotheses are only on the 
        sentence level, not on the word level. For finer grained NMT
        scores you need to use the nmt predictor. ``src_sentence`` is a
        list of source word ids representing the source sentence without
        <S> or </S> symbols. As blocks expects to see </S>, this method
        adds it automatically.
        
        Args:
            src_sentence (list): List of source word ids without <S> or
                                 </S> which make up the source sentence
        
        Returns:
            list. A list of ``Hypothesis`` instances ordered by their
            score.
        """
        seq = self._oov_to_unk(src_sentence, self.config['src_vocab_size'],
                               utils.UNK_ID) + [utils.EOS_ID]
        input_ = np.tile(seq, (self.config['beam_size'], 1))
        trans, costs = self.beam_search.search(
            input_values={self.source_sentence: input_},
            max_length=3 * len(src_sentence),
            eol_symbol=utils.EOS_ID,
            ignore_first_eol=True)
        hypos = []
        max_len = 0
        for idx in xrange(len(trans)):
            max_len = max(max_len, len(trans[idx]))
            hypo = Hypothesis(trans[idx], -costs[idx])
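            # blocks reports only a sentence-level cost, so attribute the
            # whole score to the first token and pad the rest with zeros.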
            hypo.score_breakdown = len(trans[idx]) * [[(0.0, 1.0)]]
            hypo.score_breakdown[0] = [(-costs[idx], 1.0)]
            hypos.append(hypo)
        self.apply_predictors_count = max_len * self.config['beam_size']
        return hypos

    def _oov_to_unk(self, seq, vocab_size, unk_idx):
        return [x if x < vocab_size else unk_idx for x in seq]

    def has_predictors(self):
        """Always returns true. """
        return True
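
``_oov_to_unk`` simply clips out-of-vocabulary ids to the unknown symbol
before decoding. A tiny sketch (the vocabulary size and ids are made up):

def oov_to_unk(seq, vocab_size, unk_idx):
    return [x if x < vocab_size else unk_idx for x in seq]

print(oov_to_unk([3, 15, 30005, 7], vocab_size=30000, unk_idx=1))
# -> [3, 15, 1, 7]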
Example #52
0
class BleuEvaluator(SimpleExtension, SamplingBase):
    def __init__(self, source_sentence, samples, model, data_stream, ground_truth, config,
                 val_out=None, val_best_out=None, n_best=1, normalize=True, **kwargs):
        # TODO: change config structure
        super(BleuEvaluator, self).__init__(**kwargs)
        self.source_sentence = source_sentence
        self.samples = samples
        self.model = model
        self.data_stream = data_stream
        self.config = config
        self.n_best = n_best
        self.normalize = normalize
        self.val_out = val_out
        self.val_best_out = val_out and val_best_out
        self.bleu_scores = []

        self.trg_ivocab = None
        self.unk_id = config['unk_id']
        self.eos_id = config['eos_id']
        self.beam_search = BeamSearch(samples=samples)
        self.multibleu_cmd = ['perl', self.config['bleu_script'], ground_truth, '<']

    def do(self, which_callback, *args):
        # Track validation burn in
        if self.main_loop.status['iterations_done'] <= self.config['val_burn_in']:
            return

        self._evaluate_model()

    def _evaluate_model(self):
        logger.info("Started Validation: ")
        val_start_time = time.time()
        mb_subprocess = Popen(self.multibleu_cmd, stdin=PIPE, stdout=PIPE)
        total_cost = 0.0

        if self.trg_ivocab is None:
            sources = self._get_attr_rec(self.main_loop, 'data_stream')
            trg_vocab = sources.data_streams[1].dataset.dictionary
            self.trg_ivocab = {v: k for k, v in trg_vocab.items()}

        if self.val_out:
            output_file = open(self.val_out, 'w')

        for i, line in enumerate(self.data_stream.get_epoch_iterator()):
            """
            Load the sentence, retrieve the sample, write to file
            """

            seq = self._oov_to_unk(line[0], self.config['src_vocab_size'], self.unk_id)
            input_ = numpy.tile(seq, (self.config['beam_size'], 1))

            # draw sample, checking to ensure we don't get an empty string back
            trans, costs = self.beam_search.search(
                input_values={self.source_sentence: input_},
                max_length=3 * len(seq), eol_symbol=self.eos_id,
                ignore_first_eol=True)

            # normalize costs according to the sequence lengths
            if self.normalize:
                lengths = numpy.array([len(s) for s in trans])
                costs = costs / lengths

            nbest_idx = numpy.argsort(costs)[:self.n_best]
            for j, best in enumerate(nbest_idx):
                try:
                    total_cost += costs[best]
                    trans_out = trans[best]

                    # keeping eos tokens reduces BLEU score
                    if self.config['remove_eos']:
                        trans_out = [idx for idx in trans_out if idx != self.eos_id]
                    # however keeping unk tokens might be a good idea (avoids brevity penalty)
                    if self.config['remove_unk']:
                        trans_out = [idx for idx in trans_out if idx != self.unk_id]

                    # convert idx to words
                    trans_out = self._idx_to_word(trans_out, self.trg_ivocab)

                except ValueError:
                    logger.info("Can NOT find a translation for line: {}".format(i + 1))
                    trans_out = '<UNK>'

                if j == 0:
                    # Write to subprocess and file if it exists
                    print(trans_out, file=mb_subprocess.stdin)
                    if self.val_out:
                        print(trans_out, file=output_file)

            if i != 0 and i % 100 == 0:
                logger.info("Translated {} lines of validation set...".format(i))

            mb_subprocess.stdin.flush()

        logger.info("Total cost of the validation: {}".format(total_cost))
        self.data_stream.reset()
        if self.val_out:
            output_file.close()

        # send end of file, read output.
        mb_subprocess.stdin.close()
        stdout = mb_subprocess.stdout.readline()
        logger.info(stdout)
        out_parse = re.match(r'BLEU = [-.0-9]+', stdout)
        logger.info("Validation Took: {} minutes".format(float(time.time() - val_start_time) / 60.))
        assert out_parse is not None

        # extract the score
        bleu_score = float(out_parse.group()[6:])
        logger.info(bleu_score)
        mb_subprocess.terminate()

        self.bleu_scores.append(bleu_score)
        if self.val_best_out and bleu_score == max(self.bleu_scores):
            shutil.copy(self.val_out, self.val_best_out)

        return bleu_score
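
The score extraction above keys on the first line that multi-bleu.perl
prints. A sketch of that parse; the output line below is fabricated but
follows the usual format:

import re

stdout = ('BLEU = 27.53, 60.1/33.2/20.5/13.1 '
          '(BP=0.998, ratio=0.994, hyp_len=50327, ref_len=50614)')
match = re.match(r'BLEU = [-.0-9]+', stdout)
assert match is not None
print(float(match.group()[6:]))  # 27.53; [6:] drops the 'BLEU =' prefix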