    def get_merge_activations(self):
        '''
        In the model, we will merge the VGG image representation with
        the word embeddings. We need to feed the data as a list, in which
        the order of the elements in the list is _crucial_.
        '''

        self.data_generator = VisualWordDataGenerator(self.args,
                                                      input_dataset=self.args.checkpoint_dataset,
                                                      hsn=self.args.hidden_size)
        self.data_generator.set_vocabulary(self.args.checkpoint)
        self.vocab_len = len(self.data_generator.index2word)

        if not self.use_sourcelang:
            hsn_size = 0
        else:
            hsn_size = self.data_generator.hsn_size  # ick

        m = models.OneLayerLSTM(self.args.hidden_size, self.vocab_len,
                                self.args.dropin,
                                self.args.optimiser, self.args.l2reg,
                                hsn_size=hsn_size,
                                weights=self.args.checkpoint,
                                gru=self.args.gru)

        self.model =\
            m.buildMergeActivations(use_image=self.use_image,
                                    use_sourcelang=self.use_sourcelang)

        self.generate_activations('val')
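
The docstring above stresses that the inputs must be fed as an ordered list. A minimal sketch of what that might look like for a three-input merge model (all shapes hypothetical; the real arrays come from VisualWordDataGenerator.get_data_by_split()):

import numpy as np

text = np.zeros((1, 10, 2000))    # (batch, timesteps, vocab): one-hot words
source = np.zeros((1, 10, 256))   # source-language hidden state (hsn_size)
img = np.zeros((1, 10, 4096))     # VGG image representation
val_input = [text, source, img]   # the order of this list is _crucial_
# feats = model.predict(val_input)  # model from buildMergeActivations()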
Example #2
    def prepare_datagenerator(self):
        self.data_gen = VisualWordDataGenerator(self.args, self.args.dataset)
        self.args.checkpoint = self.find_best_checkpoint()
        self.data_gen.set_vocabulary(self.args.checkpoint)
        self.vocab_len = len(self.data_gen.index2word)
        self.index2word = self.data_gen.index2word
        self.word2index = self.data_gen.word2index

    def get_hidden_activations(self):
        '''
        In the model, we will merge the VGG image representation with
        the word embeddings. We need to feed the data as a list, in which
        the order of the elements in the list is _crucial_.
        '''

        self.data_generator = VisualWordDataGenerator(self.args,
                                                      self.args.dataset)
        self.args.checkpoint = self.find_best_checkpoint()
        self.data_generator.set_vocabulary(self.args.checkpoint)
        self.vocab_len = len(self.data_generator.index2word)
        t = (self.args.generation_timesteps if self.args.use_predicted_tokens
             else self.data_generator.max_seq_len)

        m = models.NIC(self.args.embed_size, self.args.hidden_size,
                       self.vocab_len,
                       self.args.dropin,
                       self.args.optimiser, self.args.l2reg,
                       weights=self.args.checkpoint,
                       gru=self.args.gru,
                       t=t)

        self.fhs = m.buildHSNActivations(use_image=self.use_image)
        if self.args.use_predicted_tokens and not self.args.no_image:
            gen_m = models.NIC(self.args.embed_size, self.args.hidden_size,
                               self.vocab_len,
                               self.args.dropin,
                               self.args.optimiser, self.args.l2reg,
                               weights=self.args.checkpoint,
                               gru=self.args.gru,
                               t=self.args.generation_timesteps)
            self.full_model = gen_m.buildKerasModel(use_image=self.use_image)

        self.new_generate_activations('train')
        self.new_generate_activations('val')
Example #4
    def generationModel(self):
        '''
        In the model, we will merge the VGG image representation with
        the word embeddings. We need to feed the data as a list, in which
        the order of the elements in the list is _crucial_.
        '''

        self.data_gen = VisualWordDataGenerator(self.args,
                                                self.args.dataset)
        self.args.checkpoint = self.find_best_checkpoint()
        self.data_gen.set_vocabulary(self.args.checkpoint)
        self.vocab_len = len(self.data_gen.index2word)
        self.index2word = self.data_gen.index2word
        self.word2index = self.data_gen.word2index

        if self.use_sourcelang:
            # HACK FIXME unexpected problem with input_data
            self.hsn_size = 256
        else:
            self.hsn_size = 0

        m = models.OneLayerLSTM(self.args.hidden_size, self.vocab_len,
                                self.args.dropin,
                                self.args.optimiser, self.args.l2reg,
                                hsn_size=self.hsn_size,
                                weights=self.args.checkpoint,
                                gru=self.args.gru)

        self.model = m.buildKerasModel(use_sourcelang=self.use_sourcelang,
                                       use_image=self.use_image)

        self.generate_sentences(self.args.checkpoint, val=not self.args.test)
        self.bleu_score(self.args.checkpoint, val=not self.args.test)
        self.calculate_pplx(self.args.checkpoint, val=not self.args.test)
    def prepare_datagenerator(self):
        self.data_gen = VisualWordDataGenerator(self.args,
                                                self.args.dataset)
        self.args.checkpoint = self.find_best_checkpoint()
        self.data_gen.set_vocabulary(self.args.checkpoint)
        self.vocab_len = len(self.data_gen.index2word)
        self.index2word = self.data_gen.index2word
        self.word2index = self.data_gen.word2index
Example #6
    def prepare_datagenerator(self):
        '''
        Initialise the data generator and its datastructures, unless a valid
        data generator was already passed into the
        GroundedTranslation.__init__() function.
        '''

        # Initialise the data generator if it has not yet been initialised
        if self.data_generator is None:
            self.data_generator = VisualWordDataGenerator(
                self.args, self.args.dataset)

            # Extract the working vocabulary from the training dataset
            if self.args.existing_vocab != "":
                self.data_generator.set_vocabulary(self.args.existing_vocab)
            else:
                self.data_generator.extract_vocabulary()
        self.V = self.data_generator.get_vocab_size()
Example #7
    def prepare_datagenerator(self):
        '''
        Initialise the data generator and its datastructures, unless a valid
        data generator was already passed into the
        GroundedTranslation.__init__() function.
        '''

        # Initialise the data generator if it has not yet been initialised
        if self.data_generator is None:
            self.data_generator = VisualWordDataGenerator(self.args,
                                                          self.args.dataset)

            # Extract the working vocabulary from the training dataset
            if self.args.existing_vocab != "":
                self.data_generator.set_vocabulary(self.args.existing_vocab)
            else:
                self.data_generator.extract_vocabulary()
        self.V = self.data_generator.get_vocab_size()
Example #8
class Sweep(object):

    def __init__(self, args):
        '''
        Initialise the model and set Theano debugging mode if
        self.args.debug is true.
        '''

        self.args = args
        self.use_sourcelang = args.source_vectors is not None
        self.use_image = not args.no_image
        self.data_generator = None
        self.prepare_datagenerator()

        if self.args.debug:
            theano.config.optimizer = 'fast_compile'
            theano.config.exception_verbosity = 'high'

    def random_sweep(self):
        '''
        Start randomly sweeping through hyperparameter ranges.

        This currently only supports sweeping through the L2 regularisation
        strength, the learning rate, and the dropout probability.
        '''

        model = GroundedTranslation(self.args, datagen=self.data_generator)

        handle = open("../logs/sweeper-%s.log" % self.args.run_string, "w")
        handle.write("{:3} | {:10} | {:10} | {:10} | {:10} | {:10} \n".format("Run",
            "loss", "val_loss", "lr", "reg", "dropin"))
        handle.close()
        for sweep in xrange(self.args.num_sweeps):
            # randomly sample a learning rate and an L2 regularisation
            handle = open("../logs/sweeper-%s.log" % self.args.run_string, "a")
            if self.args.min_lr == ceil(self.args.min_lr):
                # you provided an exponent, we'll search in log-space
                lr = 10**uniform(self.args.min_lr, self.args.max_lr)
            else:
                # you provided a specific number
                lr = 10**uniform(log10(self.args.min_lr),
                                 log10(self.args.max_lr))

            if self.args.min_l2 == ceil(self.args.min_l2):
                # you provided an exponent, we'll search in log-space
                l2 = 10**uniform(self.args.min_l2, self.args.max_l2)
            else:
                # you provided a specific number
                l2 = 10**uniform(log10(self.args.min_l2),
                                 log10(self.args.max_l2))
            drop_in = uniform(self.args.min_dropin, self.args.max_dropin)

            # modify the arguments that will be used to create the graph
            model.args.lr = lr
            model.args.l2reg = l2
            model.args.dropin = drop_in

            logger.info("Setting learning rate to: %.5e", lr)
            logger.info("Setting l2reg to: %.5e", l2)
            logger.info("Setting dropout to: %f", drop_in)

            # initialise and compile a new model
            losses = model.train_model()
            handle.write("{:3d} | {:5.5f} | {:5.5f} | {:5e} | {:5e} | {:5.4f} \n".format(sweep,
                         losses.history['loss'][-1],
                         losses.history['val_loss'][-1], lr, l2, drop_in))
            handle.close()

    def prepare_datagenerator(self):
        '''
        Initialise the data generator and its datastructures, unless a valid
        data generator was already passed into the
        GroundedTranslation.__init__() function.
        '''

        # Initialise the data generator if it has not yet been initialised
        if self.data_generator is None:
            self.data_generator = VisualWordDataGenerator(self.args,
                                                          self.args.dataset)

        # Extract the working vocabulary from the training dataset
        if self.args.existing_vocab != "":
            self.data_generator.set_vocabulary(self.args.existing_vocab)
        else:
            self.data_generator.extract_vocabulary()
        self.V = self.data_generator.get_vocab_size()
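
A standalone sketch of the sampling rule used in random_sweep() above: integer bounds are treated as base-10 exponents, anything else as literal values whose log10 is taken first, so both branches sample in log-space.

from math import ceil, log10
from random import uniform

def sample_loguniform(lo, hi):
    if lo == ceil(lo):
        # an exponent was provided, e.g. lo=-5, hi=-2 -> 10**U(-5, -2)
        return 10 ** uniform(lo, hi)
    # a literal value was provided, e.g. lo=1e-5, hi=1e-2
    return 10 ** uniform(log10(lo), log10(hi))

print(sample_loguniform(-5, -2))      # exponent form
print(sample_loguniform(1e-5, 1e-2))  # literal form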
class ExtractMergeActivations:

    def __init__(self, args):
        self.args = args
        self.vocab = dict()
        self.unkdict = dict()
        self.counter = 0
        self.maxSeqLen = 0

        # consistent with models.py
        # maybe use_sourcelang isn't applicable here?
        self.use_sourcelang = args.source_vectors is not None
        self.use_image = not args.no_image

        if self.args.debug:
            theano.config.optimizer = 'None'
            theano.config.exception_verbosity = 'high'

    def get_merge_activations(self):
        '''
        In the model, we will merge the VGG image representation with
        the word embeddings. We need to feed the data as a list, in which
        the order of the elements in the list is _crucial_.
        '''

        self.data_generator = VisualWordDataGenerator(self.args,
                                                      input_dataset=self.args.checkpoint_dataset,
                                                      hsn=self.args.hidden_size)
        self.data_generator.set_vocabulary(self.args.checkpoint)
        self.vocab_len = len(self.data_generator.index2word)

        if not self.use_sourcelang:
            hsn_size = 0
        else:
            hsn_size = self.data_generator.hsn_size  # ick

        m = models.OneLayerLSTM(self.args.hidden_size, self.vocab_len,
                                self.args.dropin,
                                self.args.optimiser, self.args.l2reg,
                                hsn_size=hsn_size,
                                weights=self.args.checkpoint,
                                gru=self.args.gru)

        self.model =\
            m.buildMergeActivations(use_image=self.use_image,
                                    use_sourcelang=self.use_sourcelang)

        self.generate_activations('val')

    def generate_activations(self, split):
        '''
        Generate merge state activations for the given split and serialise
        them to CSV.
        '''
        logger.info("Generating merge state activations "
                    "from this model for %s\n", split)

        if split == 'train':
            """ WARNING: This collects the *entirety of the training data* in
            hidden_states, so should not be used on non-toy training data.
            """
            hidden_states = []
            batch_start = 0
            batch_end = 0
            for train_input, trainY, indicator in\
                self.data_generator.yield_training_batch(self.args.big_batch_size,
                                                         self.use_sourcelang,
                                                         self.use_image):
                feats = self.model.predict(train_input,
                                           batch_size=self.args.batch_size,
                                           verbose=1)
                for f in feats:
                    activations = f[0]  # we want the merge features
                    hidden_states.append(activations)
                    batch_end += 1
                # Note: serialisation happens over training batches too.
                # now serialise the hidden representations in the h5
                self.serialise_to_csv(split, hidden_states,
                                      batch_start, batch_end)

                batch_start = batch_end
                hidden_states = []

        elif split == 'val':
            val_input, valY = self.data_generator.get_data_by_split('val',
                self.use_sourcelang, self.use_image)
            logger.info("Generating merge activations from this model for val\n")

            hidden_states = []
            feats = self.model.predict(val_input,
                                       batch_size=self.args.batch_size,
                                       verbose=1)
            for f in feats:
                activations = f[0]  # we want the merge features
                hidden_states.append(activations)

            # now serialise the hidden representations in the h5
            self.serialise_to_csv(split, hidden_states)

    def serialise_to_csv(self, split, hidden_states,
                         batch_start=None, batch_end=None):
        """ Serialise the hidden representations from generate_activations
        into a CSV for t-SNE visualisation."""
        logger.info("Serialising merge state features from %s to csv",
                    split)
        fhf_str = "%s-initial_hidden_features" % self.args.run_string

        if self.args.source_vectors is not None:
            fhf_str = "%s-multilingual_initial_hidden_features" % self.args.run_string
        f = open(fhf_str, 'a')
        for h in hidden_states:
            np.savetxt(f, h, delimiter=',', newline=',')
            f.write("\n")
        f.close()
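
A minimal sketch (not part of the original project) of reading the file written by serialise_to_csv() above, e.g. before t-SNE: each row is one hidden state, comma-separated with a trailing comma, so the empty final field is stripped before parsing.

import numpy as np

def load_hidden_states(path):
    rows = []
    with open(path) as f:
        for line in f:
            vals = line.rstrip('\n').rstrip(',').split(',')
            rows.append([float(v) for v in vals])
    return np.array(rows)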
class GroundedTranslationGenerator:

    def __init__(self, args):
        self.args = args
        self.vocab = dict()
        self.unkdict = dict()
        self.counter = 0
        self.maxSeqLen = 0

        # consistent with models.py
        self.use_sourcelang = args.source_vectors is not None
        self.use_image = not args.no_image
        self.model = None
        self.prepare_datagenerator()

        # this results in two file handlers for dataset (here and
        # data_generator)
        if not self.args.dataset:
            logger.warn("No dataset given, using flickr8k")
            self.dataset = h5py.File("flickr8k/dataset.h5", "r")
        else:
            self.dataset = h5py.File("%s/dataset.h5" % self.args.dataset, "r")

        if self.args.debug:
            theano.config.optimizer = 'None'
            theano.config.exception_verbosity = 'high'

    def prepare_datagenerator(self):
        self.data_gen = VisualWordDataGenerator(self.args,
                                                self.args.dataset)
        self.args.checkpoint = self.find_best_checkpoint()
        self.data_gen.set_vocabulary(self.args.checkpoint)
        self.vocab_len = len(self.data_gen.index2word)
        self.index2word = self.data_gen.index2word
        self.word2index = self.data_gen.word2index

    def generate(self):
        '''
        Entry point for this module.
        Loads up a data generator to get the relevant image / source features.
        Builds the relevant model, given the command-line arguments.
        Generates sentences for the images in the val / test data.
        Calculates BLEU and PPLX, unless suppressed on the command line.
        '''

        if self.use_sourcelang:
            # HACK FIXME unexpected problem with input_data
            self.hsn_size = self.data_gen.hsn_size
        else:
            self.hsn_size = 0

        if self.model is None:
            self.build_model(generate=True)

        self.generate_sentences(self.args.checkpoint, val=not self.args.test)
        if not self.args.without_scores:
            score = self.bleu_score(self.args.checkpoint, val=not self.args.test)
            if self.args.multeval:
                score, _, _ = self.multeval_scores(self.args.checkpoint,
                                                    val=not self.args.test)
            if not self.args.no_pplx:
                self.build_model(generate=False)
                self.calculate_pplx(self.args.checkpoint, val=not self.args.test)
            return score

################################################################################
# Helper functions for generate_sentences()

    def get_keep_func(self):
        "Builds a keep function, given a JSON file with info on what to keep."
        with open(self.args.keep_file) as f:
            d = json.load(f)
        
        whole_word = set(d['WHOLEWORD'])
        prefixes = d['STARTSWITH']
        suffixes = d['ENDSWITH']
        
        def keep_func(word):
            "Function to determine which words to keep in the beam."
            if word in whole_word:
                return True
            for pref in prefixes:
                if word.startswith(pref):
                    return True
            for suf in suffixes:
                if word.endswith(suf):
                    return True
            return False
        # Return the function:
        return keep_func
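
A hypothetical --keep_file for get_keep_func() above, using the three keys the code reads (WHOLEWORD, STARTSWITH, ENDSWITH); the word lists are illustrative only. With this spec, keep_func("never") and keep_func("isn't") both return True:

import json

keep_spec = {
    "WHOLEWORD": ["no", "not", "never", "nobody"],  # exact matches
    "STARTSWITH": ["un"],                           # prefix matches
    "ENDSWITH": ["n't"],                            # suffix matches
}
with open("keep_words.json", "w") as f:
    json.dump(keep_spec, f)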

    def get_candidates(self, t, beams, structs, keep_func):
        """
        Get candidate beams containing the next word. If the next word is one that
        should be kept according to keep_func, the beams will be added to kept_candidates.
        """
        # Store the candidates produced at timestep t, will be
        # pruned at the end of the timestep
        candidates = []
        kept_candidates = []

        # we take a view of the datastructures, which means we're only
        # ever generating a prediction for the next word. This saves a
        # lot of cycles.
        preds = self.model.predict(structs, verbose=0)

        # The last indices in preds are the predicted words
        next_word_indices = preds['output'][:, t-1]
        sorted_indices = np.argsort(-next_word_indices, axis=1)

        # Each instance in structs is holding the history of a
        # beam, and so there is a direct connection between the
        # index of a beam in beams and the index of an instance in
        # structs.
        for beam_idx, b in enumerate(beams):
            # get the sorted predictions for the beam_idx'th beam
            beam_predictions = sorted_indices[beam_idx]
            for top_idx in range(self.args.beam_width):
                wordIndex = beam_predictions[top_idx]
                wordProb = next_word_indices[beam_idx][beam_predictions[top_idx]]
                # For the beam_idx'th beam, add the log probability
                # of the top_idx'th predicted word to the previous
                # log probability of the sequence, and append the
                # top_idx'th predicted word to the sequence of words
                current_word = self.index2word[wordIndex]
                updated_beam = [b[0] + math.log(wordProb), b[1] + [wordIndex]]
                candidates.append(updated_beam)
                if keep_func(current_word):
                    logger.info("WORD KEPT: " + current_word)
                    self.found_negation = True
                    kept_candidates.append(updated_beam)
        return candidates, kept_candidates
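
A toy check of the beam update above: a beam's score is the running sum of log word probabilities, so extending a beam with score -1.2 by a word of probability 0.5 gives -1.2 + log(0.5) = -1.893... (numbers invented for illustration).

import math

beam = [-1.2, [5, 17]]  # [log prob of sequence, word indices]
updated_beam = [beam[0] + math.log(0.5), beam[1] + [42]]
print(updated_beam)     # [-1.8931471805599454, [5, 17, 42]]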

    def prune(self, candidates, max_beam_width, category='regular beams'):
        """
        Prune the candidates, so that we are left with max_beam_width
        beams. Also return beams that are finished as a separate list.
        """
        beams = candidates[:max_beam_width] # prune the beams
        finished = []
        pruned = []
        for b in beams:
            # If a top candidate emitted an EOS token then
            # a) add it to the list of finished sequences
            # b) remove it from the beams and decrease the
            # maximum size of the beams.
            if b[1][-1] == self.word2index["<E>"]:
                finished.append(b)
                if max_beam_width >= 1:
                    max_beam_width -= 1
            else:
                pruned.append(b)
        
        beams = pruned[:max_beam_width]

        if self.args.verbose:
            logger.info("Pruned beams " + ''.join(['(', category, ')']))
            logger.info("---")
            for b in beams:
                logger.info(" ".join([self.index2word[x] for x in b[1]]) + "(%f)" % b[0])
        return beams, finished

    def get_structs(self, beams, data, max_beam_width):
        "Get structs for the next round."
        structs = self.make_duplicate_matrices(data, max_beam_width)
        # Rewrite the 1-hot word features with the
        # so-far-predicted tokens in a beam.
        for bidx, b in enumerate(beams):
            for idx, w in enumerate(b[1]):
                # This variable doesn't do anything.
                # next_word_index = w
                structs['text'][bidx, idx+1, w] = 1.
        return structs

    def log_finished(self, finished, category='regular beams'):
        "Log the Length-normalised samples."
        logger.info("Length-normalised samples" + ''.join(['(', category, ')']))
        logger.info("---")
        for f in finished:
            logger.info(" ".join([self.index2word[x] for x in f[1]]) + "(%f)" % f[0])

################################################################################


    def generate_sentences(self, filepath, val=True):
        """
        Generates descriptions of images for --generation_timesteps
        iterations through the LSTM. Each input description is clipped to
        the first <BOS> token, or, if --generate_from_N_words is set, to the
        first N following words (N words plus the BOS token).
        This process can be additionally conditioned
        on source language hidden representations, if provided by the
        --source_vectors parameter.
        The output is clipped to the first EOS generated, if it exists.

        TODO: duplicated method with generate.py
        """
        if self.args.beam_width <= 1:
            raise AssertionError('Beam size too small. Cannot use dual beam search.')
        
        neg_counter = 0
        ident_desc_dict = dict()
        keep_func = self.get_keep_func()
        
        prefix = "val" if val else "test"
        handle = codecs.open("%s/%sGenerated" % (filepath, prefix), "w",
                             'utf-8')
        logger.info("Generating %s descriptions", prefix)

        start_gen = self.args.generate_from_N_words  # Default 0
        start_gen = start_gen + 1  # include BOS

        generator = self.data_gen.generation_generator(prefix, batch_size=1)

        # we are going to beam search for the most probable sentence.
        # let's do this one sentence at a time to make the logging output
        # easier to understand
        for seen, data in enumerate(generator, start=1):
            text = data['text']
            # Append the first start_gen words to the complete_sentences list
            # for each instance in the batch.
            complete_sentences = [[] for _ in range(text.shape[0])]
            for t in range(start_gen):  # minimum 1
                for i in range(text.shape[0]):
                    w = np.argmax(text[i, t])
                    complete_sentences[i].append(self.index2word[w])
            del data['text']
            text = self.reset_text_arrays(text, start_gen)
            Y_target = data['output']
            data['text'] = text

            max_beam_width = self.args.beam_width
            neg_max_beam_width = self.args.beam_width
            structs = self.make_duplicate_matrices(data, max_beam_width)

            # A beam is a 2-tuple with the probability of the sequence and
            # the words in that sequence. Start with empty beams
            beams = [(0.0, [])]
            neg_beams = []
            # collects beams that are in the top candidates and
            # emitted a <E> token.
            finished = []
            neg_finished = []
            # Flag variable. Is set to True once the first negation is found.
            self.found_negation = False
            # Initialise neg_structs so it exists even if a negation is found
            # on the very first timestep (it is otherwise only assigned at
            # the end of each timestep below).
            neg_structs = self.make_duplicate_matrices(data, neg_max_beam_width)
            
            for t in range(start_gen, self.args.generation_timesteps):
                # Ensure that kept_candidates is there. (And that previous results are removed.)
                kept_candidates = []
                
                ################################################################
                # GET CANDIDATES
                
                if max_beam_width > 0:
                    candidates, kept_candidates = self.get_candidates(t, beams, structs, keep_func)
                    candidates.sort(reverse=True)
                
                if self.found_negation:
                    neg_c, neg_kc = self.get_candidates(t, neg_beams, neg_structs, keep_func)
                    # don't add neg_kc: don't add examples twice.
                    neg_candidates = kept_candidates + neg_c
                    neg_candidates.sort(reverse=True)
                
                ################################################################
                # LOG NEW CANDIDATES
                
                if self.args.verbose:
                    logger.info("Candidates in the beam")
                    logger.info("---")
                    if max_beam_width > 0:
                        logger.info("REGULAR BEAM:")
                        for c in candidates:
                            logger.info(" ".join([self.index2word[x] for x in c[1]]) + " (%f)" % c[0])
                    if self.found_negation:
                        logger.info("SEPARATE BEAM:")
                        for c in neg_candidates:
                            logger.info(" ".join([self.index2word[x] for x in c[1]]) + " (%f)" % c[0])

                ################################################################
                # PRUNE
                
                beams, finished_this_round = self.prune(candidates,
                                                        max_beam_width,
                                                        category='regular beams')
                finished.extend(finished_this_round)
                if self.found_negation:
                    neg_beams, finished_this_round = self.prune(neg_candidates,
                                                                neg_max_beam_width,
                                                                category='selected beams')
                    neg_finished.extend(finished_this_round)
                

                ################################################################
                # STOP DECISION

                if self.found_negation:
                    if neg_max_beam_width == 0:
                        # We have sampled neg_max_beam_width sequences with an <E>
                        # token so stop the beam search.
                        break
                elif max_beam_width == 0:
                    # We have sampled max_beam_width sequences with an <E>
                    # token so stop the beam search.
                    break

                ################################################################
                # UPDATE STRUCTS

                # Reproduce the structs for the beam search so we can keep
                # track of the state of each beam
                if max_beam_width > 0:
                    structs = self.get_structs(beams=beams,
                                               data=data,
                                               max_beam_width=max_beam_width)
                
                neg_structs = self.get_structs(beams=neg_beams,
                                               data=data,
                                               max_beam_width=neg_max_beam_width)

            ####################################################################
            # WRAPPING UP

            # If none of the sentences emitted an <E> token while
            # decoding, add the final beams into the final candidates
            if len(finished) == 0:
                for leftover in beams:
                    finished.append(leftover)
            
            # Do the same for the neg beams.
            if self.found_negation and len(neg_finished) == 0:
                for leftover in neg_beams:
                    neg_finished.append(leftover)

            # Normalise the probabilities by the length of the sequences
            # as suggested by Graves (2012) http://arxiv.org/abs/1211.3711
            for f in finished:
                f[0] = f[0] / len(f[1])
            finished.sort(reverse=True)

            for f in neg_finished:
                f[0] = f[0] / len(f[1])
            neg_finished.sort(reverse=True)

            ####################################################################
            # LOG FINISHED
            
            if self.args.verbose:
                self.log_finished(finished, category='regular beams')
                if self.found_negation:
                    self.log_finished(neg_finished, category='selected beams')

            # Emit the highest (log) probability sequence
            best_beam = finished[0] if not self.found_negation else neg_finished[0]
            complete_sentences[i] = [self.index2word[x] for x in best_beam[1]]
            generated_sentence = ' '.join([x for x
                                   in itertools.takewhile(
                                       lambda n: n != "<E>", complete_sentences[i])])
            
            # The description data in the JSON file is stored together with a flag
            # indicating whether or not there is a negation in the sentence.
            ident_desc_dict[data['ident']] = [generated_sentence, self.found_negation]
            
            handle.write(generated_sentence + "\n")
            if self.args.verbose:
                logger.info("%s (%f)", generated_sentence, best_beam[0])
            
            if self.found_negation:
                neg_counter += 1
            
            if seen == self.data_gen.split_sizes[prefix]:
                # Hacky way to break out of the generator
                break
        
        # Put together the filename for the JSON data, consisting of the following:
        json_path = ''.join([filepath,                  # folder
                             '/',                        # trailing slash
                             prefix,                    # 'val' or 'test'
                             '_dual_beam_search_',      # kind of generation
                             str(self.args.beam_width), # beam width used
                             '.json'])                  # filetype
        
        # Write the JSON data.
        with codecs.open(json_path, 'w', 'utf-8') as f:
            json.dump(ident_desc_dict, f)
        
        logger.info("Total number of kept sentences: " + str(neg_counter))
        handle.close()

    def calculate_pplx(self, path, val=True):
        """ Splits the input data into batches of self.args.batch_size to
        reduce the memory footprint of holding all of the data in RAM. """

        prefix = "val" if val else "test"
        logger.info("Calculating pplx over %s data", prefix)
        sum_logprobs = 0
        y_len = 0

        generator = self.data_gen.fixed_generator(prefix)
        seen = 0
        for data in generator:
            Y_target = deepcopy(data['output'])
            del data['output']

            preds = self.model.predict(data,
                                       verbose=0,
                                       batch_size=self.args.batch_size)

            for i in range(Y_target.shape[0]):
                for t in range(Y_target.shape[1]):
                    target_idx = np.argmax(Y_target[i, t])
                    target_tok = self.index2word[target_idx]
                    if target_tok != "<P>":
                        log_p = math.log(preds['output'][i, t, target_idx], 2)
                        sum_logprobs += -log_p
                        y_len += 1

            seen += data['text'].shape[0]
            if seen == self.data_gen.split_sizes[prefix]:
                # Hacky way to break out of the generator
                break

        norm_logprob = sum_logprobs / y_len
        pplx = math.pow(2, norm_logprob)
        logger.info("PPLX: %.4f", pplx)
        handle = open("%s/%sPPLX" % (path, prefix), "w")
        handle.write("%f\n" % pplx)
        handle.close()
        return pplx
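
A toy check of the perplexity computation above: per-token probabilities of [0.5, 0.25, 0.5] give negative log2 probabilities of [1, 2, 1] bits, so PPLX = 2**(4/3) = 2.5198...

import math

probs = [0.5, 0.25, 0.5]                        # invented token probabilities
sum_logprobs = sum(-math.log(p, 2) for p in probs)
pplx = math.pow(2, sum_logprobs / len(probs))
print("%.4f" % pplx)                            # 2.5198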


    def reset_text_arrays(self, text_arrays, fixed_words=1):
        """ Reset the values in the text data structure to zero so we cannot
        accidentally pass them into the model.

        Helper function for generate_sentences().
        """
        reset_arrays = deepcopy(text_arrays)
        reset_arrays[:, fixed_words:, :] = 0
        return reset_arrays
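
A quick check of reset_text_arrays() above (toy shapes): with fixed_words=2, only the first two timesteps of the one-hot text array survive, so gold tokens cannot leak into generation.

import numpy as np
from copy import deepcopy

text = np.zeros((1, 4, 5))                 # (batch, timesteps, vocab)
text[0, np.arange(4), [1, 2, 3, 4]] = 1.   # a fake gold one-hot sequence
reset = deepcopy(text)
reset[:, 2:, :] = 0                        # what the helper does for fixed_words=2
assert reset[0, :2].sum() == 2 and reset[0, 2:].sum() == 0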

    def make_duplicate_matrices(self, generator_data, k):
        '''
        Prepare K duplicates of the input data for a given instance yielded by
        the data generator.

        Helper function for the beam search decoder in generation_sentences().
        '''

        if self.use_sourcelang and self.use_image:
            # the data generator yielded a dictionary with the words, the
            # image features, and the source features
            dupes = [[],[],[]]
            words = generator_data['text']
            img = generator_data['img']
            source = generator_data['source']
            for x in range(k):
                # Make a deep copy of the word_feats structures
                # so the arrays will never be shared
                dupes[0].append(deepcopy(words[0,:,:]))
                dupes[1].append(source[0,:,:])
                dupes[2].append(img[0,:,:])

            # Turn the list of arrays into a numpy array
            dupes[0] = np.array(dupes[0])
            dupes[1] = np.array(dupes[1])
            dupes[2] = np.array(dupes[2])

            return {'text': dupes[0], 'img': dupes[2], 'source': dupes[1]}

        elif self.use_image:
            # the data generator yielded a dictionary with the words and the
            # image features
            dupes = [[],[]]
            words = generator_data['text']
            img = generator_data['img']
            for x in range(k):
                # Make a deep copy of the word_feats structures
                # so the arrays will never be shared
                dupes[0].append(deepcopy(words[0,:,:]))
                dupes[1].append(img[0,:,:])

            # Turn the list of arrays into a numpy array
            dupes[0] = np.array(dupes[0])
            dupes[1] = np.array(dupes[1])

            return {'text': dupes[0], 'img': dupes[1]}

        elif self.use_sourcelang:
            # the data generator yielded a dictionary with the words and the
            # source features
            dupes = [[],[]]
            words = generator_data['text']
            source = generator_data['source']
            for x in range(k):
                # Make a deep copy of the word_feats structures
                # so the arrays will never be shared
                dupes[0].append(deepcopy(words[0,:,:]))
                dupes[1].append(source[0,:,:])

            # Turn the list of arrays into a numpy array
            dupes[0] = np.array(dupes[0])
            dupes[1] = np.array(dupes[1])

            return {'text': dupes[0], 'source': dupes[1]}
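
A sketch of what make_duplicate_matrices() returns for the image-only case (shapes hypothetical): each of the k beams gets its own copy of the single instance's arrays, with only the text arrays deep-copied because they are mutated during the beam search.

import numpy as np
from copy import deepcopy

data = {'text': np.zeros((1, 10, 50)), 'img': np.ones((1, 10, 4096))}
k = 3
structs = {'text': np.array([deepcopy(data['text'][0]) for _ in range(k)]),
           'img': np.array([data['img'][0] for _ in range(k)])}
print(structs['text'].shape, structs['img'].shape)  # (3, 10, 50) (3, 10, 4096)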

    def find_best_checkpoint(self):
        '''
        Read the summary file from the directory and scrape out the run ID of
        the highest BLEU scoring checkpoint. Then do an ls-style listing of
        the directory and return the exact path to the best model.

        Assumes only one matching prefix in the model checkpoints directory.
        '''

        summary_data = open("%s/summary" % self.args.model_checkpoints).readlines()
        summary_data = [x.replace("\n", "") for x in summary_data]
        best_id = None
        target = "Best Metric" if self.args.best_pplx else "Best loss"
        for line in summary_data:
            if line.startswith(target):
                best_id = "%03d" % (int(line.split(":")[1].split("|")[0]))

        checkpoint = None
        if best_id is not None:
            checkpoints = os.listdir(self.args.model_checkpoints)
            for c in checkpoints:
                if c.startswith(best_id):
                    checkpoint = c
                    break
        logger.info("Best checkpoint: %s/%s" % (self.args.model_checkpoints, checkpoint))
        return "%s/%s" % (self.args.model_checkpoints, checkpoint)

    def bleu_score(self, directory, val=True):
        '''
        PPLX is only weakly correlated with improvements in BLEU,
        and thus improvements in human judgements. Let's also track
        BLEU score of a subset of generated sentences in the val split
        to decide on early stopping, etc.
        '''

        prefix = "val" if val else "test"
        self.extract_references(directory, val)

        subprocess.check_call(
            ['perl multi-bleu.perl %s/%s_reference.ref < %s/%sGenerated | tee %s/%sBLEU'
             % (directory, prefix, directory, prefix, directory, prefix)], shell=True)
        bleudata = open("%s/%sBLEU" % (directory, prefix)).readline()
        data = bleudata.split(",")[0]
        bleuscore = data.split("=")[1]
        bleu = float(bleuscore.lstrip())
        return bleu
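
The parsing in bleu_score() above assumes the usual multi-bleu.perl output line; a quick sanity check of that parse on a made-up score:

bleudata = "BLEU = 30.01, 65.1/38.2/24.9/16.8 (BP=0.993, ratio=0.993)"
data = bleudata.split(",")[0]              # "BLEU = 30.01"
bleuscore = data.split("=")[1]             # " 30.01"
print(float(bleuscore.lstrip()))           # 30.01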

    def multeval_scores(self, directory, val=True):
        '''
        Maybe you want to evaluate with Meteor, TER, and BLEU?
        '''
        prefix = "val" if val else "test"
        self.extract_references(directory, val)

        with cd(MULTEVAL_DIR):
            subprocess.check_call(
                ['./multeval.sh eval --refs ../%s/%s_reference.* \
                 --hyps-baseline ../%s/%sGenerated \
                 --meteor.language de \
                 --threads 4 \
                 2> multevaloutput 1> multevaloutput'
                 % (directory, prefix, directory, prefix)], shell=True)
            handle = open("multevaloutput")
            multdata = handle.readlines()
            handle.close()
            for line in multdata:
                if line.startswith("RESULT: baseline: BLEU: AVG:"):
                    mbleu = line.split(":")[4]
                    mbleu = mbleu.replace("\n", "")
                    mbleu = mbleu.strip()
                    lr = mbleu.split(".")
                    mbleu = float(lr[0] + "." + lr[1][0:2])
                if line.startswith("RESULT: baseline: METEOR: AVG:"):
                    mmeteor = line.split(":")[4]
                    mmeteor = mmeteor.replace("\n", "")
                    mmeteor = mmeteor.strip()
                    lr = mmeteor.split(".")
                    mmeteor = float(lr[0] + "." + lr[1][0:2])
                if line.startswith("RESULT: baseline: TER: AVG:"):
                    mter = line.split(":")[4]
                    mter = mter.replace("\n", "")
                    mter = mter.strip()
                    lr = mter.split(".")
                    mter = float(lr[0] + "." + lr[1][0:2])

            logger.info("Meteor = %.2f | BLEU = %.2f | TER = %.2f",
                        mmeteor, mbleu, mter)

            return mmeteor, mbleu, mter

    def extract_references(self, directory, val=True):
        """
        Get reference descriptions for the split we are generating outputs for.

        Helper function for bleu_score().
        """
        prefix = "val" if val else "test"
        references = self.data_gen.get_refs_by_split_as_list(prefix)

        for refid in xrange(len(references[0])):
            with codecs.open('%s/%s_reference.ref%d'
                             % (directory, prefix, refid), 'w', 'utf-8') as f:
                f.write('\n'.join([x[refid] for x in references]))

    def build_model(self, generate=False):
        '''
        Build a Keras model if one does not yet exist.

        Helper function for generate().
        '''

        if generate:
            t = self.args.generation_timesteps
        else:
            t = self.data_gen.max_seq_len
        if self.args.mrnn:
            m = models.MRNN(self.args.embed_size, self.args.hidden_size,
                            self.vocab_len,
                            self.args.dropin,
                            self.args.optimiser, self.args.l2reg,
                            hsn_size=self.hsn_size,
                            weights=self.args.checkpoint,
                            gru=self.args.gru,
                            clipnorm=self.args.clipnorm,
                            t=t)
        else:
            m = models.NIC(self.args.embed_size, self.args.hidden_size,
                           self.vocab_len,
                           self.args.dropin,
                           self.args.optimiser, self.args.l2reg,
                           hsn_size=self.hsn_size,
                           weights=self.args.checkpoint,
                           gru=self.args.gru,
                           clipnorm=self.args.clipnorm,
                           t=t)

        self.model = m.buildKerasModel(use_sourcelang=self.use_sourcelang,
                                       use_image=self.use_image)
Example #11
class Sweep(object):
    def __init__(self, args):
        '''
        Initialise the model and set Theano debugging mode if
        self.args.debug is true.
        '''

        self.args = args
        self.use_sourcelang = args.source_vectors is not None
        self.use_image = not args.no_image
        self.data_generator = None
        self.prepare_datagenerator()

        if self.args.debug:
            theano.config.optimizer = 'fast_compile'
            theano.config.exception_verbosity = 'high'

    def random_sweep(self):
        '''
        Start randomly sweeping through hyperparameter ranges.

        This currently only supports sweeping through the L2 regularisation
        strength, the learning rate, and the dropout probability.
        '''

        model = GroundedTranslation(self.args, datagen=self.data_generator)

        handle = open("../logs/sweeper-%s.log" % self.args.run_string, "w")
        handle.write("{:3} | {:10} | {:10} | {:10} | {:10} | {:10} \n".format(
            "Run", "loss", "val_loss", "lr", "reg", "dropin"))
        handle.close()
        for sweep in xrange(self.args.num_sweeps):
            # randomly sample a learning rate and an L2 regularisation
            handle = open("../logs/sweeper-%s.log" % self.args.run_string, "a")
            if self.args.min_lr == ceil(self.args.min_lr):
                # you provided an exponent, we'll search in log-space
                lr = 10**uniform(self.args.min_lr, self.args.max_lr)
            else:
                # you provided a specific number
                lr = 10**uniform(log10(self.args.min_lr),
                                 log10(self.args.max_lr))

            if self.args.min_l2 == ceil(self.args.min_l2):
                # you provided an exponent, we'll search in log-space
                l2 = 10**uniform(self.args.min_l2, self.args.max_l2)
            else:
                # you provided a specific number
                l2 = 10**uniform(log10(self.args.min_l2),
                                 log10(self.args.max_l2))
            drop_in = uniform(self.args.min_dropin, self.args.max_dropin)

            # modify the arguments that will be used to create the graph
            model.args.lr = lr
            model.args.l2reg = l2
            model.args.dropin = drop_in

            logger.info("Setting learning rate to: %.5e", lr)
            logger.info("Setting l2reg to: %.5e", l2)
            logger.info("Setting dropout to: %f", drop_in)

            # initialise and compile a new model
            losses = model.train_model()
            handle.write(
                "{:3d} | {:5.5f} | {:5.5f} | {:5e} | {:5e} | {:5.4f} \n".
                format(sweep, losses.history['loss'][-1],
                       losses.history['val_loss'][-1], lr, l2, drop_in))
            handle.close()

    def prepare_datagenerator(self):
        '''
        Initialise the data generator and its datastructures, unless a valid
        data generator was already passed into the
        GroundedTranslation.__init() function.
        '''

        # Initialise the data generator if it has not yet been initialised
        if self.data_generator is None:
            self.data_generator = VisualWordDataGenerator(
                self.args, self.args.dataset)

        # Extract the working vocabulary from the training dataset
        if self.args.existing_vocab != "":
            self.data_generator.set_vocabulary(self.args.existing_vocab)
        else:
            self.data_generator.extract_vocabulary()
        self.V = self.data_generator.get_vocab_size()
Example #12
class GroundedTranslationGenerator:

    def __init__(self, args):
        self.args = args
        self.vocab = dict()
        self.unkdict = dict()
        self.counter = 0
        self.maxSeqLen = 0

        # consistent with models.py
        self.use_sourcelang = args.source_vectors is not None
        self.use_image = not args.no_image

        # this results in two file handlers for dataset (here and
        # data_generator)
        if not self.args.dataset:
            logger.warn("No dataset given, using flickr8k")
            self.dataset = h5py.File("flickr8k/dataset.h5", "r")
        else:
            self.dataset = h5py.File("%s/dataset.h5" % self.args.dataset, "r")

        if self.args.debug:
            theano.config.optimizer = 'None'
            theano.config.exception_verbosity = 'high'

    def generationModel(self):
        '''
        In the model, we will merge the VGG image representation with
        the word embeddings. We need to feed the data as a list, in which
        the order of the elements in the list is _crucial_.
        '''

        self.data_gen = VisualWordDataGenerator(self.args,
                                                self.args.dataset)
        self.args.checkpoint = self.find_best_checkpoint()
        self.data_gen.set_vocabulary(self.args.checkpoint)
        self.vocab_len = len(self.data_gen.index2word)
        self.index2word = self.data_gen.index2word
        self.word2index = self.data_gen.word2index

        if self.use_sourcelang:
            # HACK FIXME unexpected problem with input_data
            self.hsn_size = 256
        else:
            self.hsn_size = 0

        m = models.OneLayerLSTM(self.args.hidden_size, self.vocab_len,
                                self.args.dropin,
                                self.args.optimiser, self.args.l2reg,
                                hsn_size=self.hsn_size,
                                weights=self.args.checkpoint,
                                gru=self.args.gru)

        self.model = m.buildKerasModel(use_sourcelang=self.use_sourcelang,
                                       use_image=self.use_image)

        self.generate_sentences(self.args.checkpoint, val=not self.args.test)
        self.bleu_score(self.args.checkpoint, val=not self.args.test)
        self.calculate_pplx(self.args.checkpoint, val=not self.args.test)

    def generate_sentences(self, filepath, val=True):
        """
        Generates descriptions of images for --generation_timesteps
        iterations through the LSTM. Each input description is clipped to
        the first <BOS> token, or, if --generate_from_N_words is set, to the
        first N following words (N words plus the BOS token).
        This process can be additionally conditioned
        on source language hidden representations, if provided by the
        --source_vectors parameter.
        The output is clipped to the first EOS generated, if it exists.

        TODO: beam search
        TODO: duplicated method with generate.py
        """
        prefix = "val" if val else "test"
        handle = codecs.open("%s/%sGenerated" % (filepath, prefix), "w",
                             'utf-8')
        logger.info("Generating %s descriptions", prefix)

        start_gen = self.args.generate_from_N_words  # Default 0
        start_gen = start_gen + 1  # include BOS

        # prepare the datastructures for generation (no batching over val)
        arrays = self.make_generation_arrays(prefix, start_gen,
                 generation=self.args.use_predicted_tokens)
        N_sents = arrays[0].shape[0]
        logger.debug("Input arrays %d", len(arrays))
        logger.debug("Instances %d", len(arrays[0]))

        # complete_sentences = [["<S>"] for _ in range(N_sents)]

        complete_sentences = [[] for _ in range(N_sents)]
        for t in range(start_gen):  # minimum 1
            for i in range(N_sents):
                w = np.argmax(arrays[0][i, t])
                complete_sentences[i].append(self.index2word[w])

        logger.debug(complete_sentences[3])
        logger.debug(self.index2word[np.argmax(arrays[0][0])])

        for t in range(start_gen, self.args.generation_timesteps):
            # we take a view of the datastructures, which means we're only
            # ever generating a prediction for the next word. This saves a
            # lot of cycles.
            preds = self.model.predict([arr[:, 0:t] for arr in arrays],
                                       verbose=0)

            # Look at the last indices for the words.
            next_word_indices = np.argmax(preds[:, -1], axis=1)
            # update array[0]/sentence-so-far with generated words.
            for i in range(N_sents):
                arrays[0][i, t, next_word_indices[i]] = 1.
            next_words = [self.index2word[x] for x in next_word_indices]
            for i in range(len(next_words)):
                complete_sentences[i].append(next_words[i])

        # save each sentence until it hits the first end-of-string token
        for s in complete_sentences:
            handle.write(' '.join([x for x
                                   in itertools.takewhile(
                                       lambda n: n != "<E>", s[1:])]) + "\n")

        handle.close()

    def find_best_checkpoint(self):
        '''
        Read the summary file from the directory and scrape out the run ID of
        the highest BLEU scoring checkpoint. Then do an ls-style listing of
        the directory and return the exact path to the best model.

        Assumes only one matching prefix in the model checkpoints directory.
        '''

        summary_data = open("%s/summary" % self.args.model_checkpoints).readlines()
        summary_data = [x.replace("\n", "") for x in summary_data]
        best_id = None
        target = "Best PPLX" if self.args.best_pplx else "Best BLEU"
        for line in summary_data:
            if line.startswith(target):
                best_id = "%03d" % (int(line.split(":")[1].split("|")[0]))

        checkpoint = None
        if best_id is not None:
            checkpoints = os.listdir(self.args.model_checkpoints)
            for c in checkpoints:
                if c.startswith(best_id):
                    checkpoint = c
                    break
        logger.info("Best checkpoint: %s/%s" % (self.args.model_checkpoints, checkpoint))
        return "%s/%s" % (self.args.model_checkpoints, checkpoint)

    def yield_chunks(self, len_split_indices, batch_size):
        '''
        self.args.batch_size is not always cleanly divisible by the number of
        items in the split, so we need to always yield the correct number of
        items.
        '''
        for i in xrange(0, len_split_indices, batch_size):
            # yield split_indices[i:i+batch_size]
            yield (i, i+batch_size-1)
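
A quick standalone check of yield_chunks() above (Python 2, hence xrange): for 10 items and batch_size 4 it yields the inclusive index pairs (0, 3), (4, 7), (8, 11); note the final pair overruns the split, so the caller must clip it.

def yield_chunks(len_split_indices, batch_size):
    for i in xrange(0, len_split_indices, batch_size):
        yield (i, i + batch_size - 1)

print(list(yield_chunks(10, 4)))  # [(0, 3), (4, 7), (8, 11)]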

    def make_generation_arrays(self, prefix, fixed_words, generation=False):
        """Create arrays that are used as input for generation. """

        # Y_target is unused
        #if generation:
        #    input_data, _ =\
        #        self.data_gen.get_generation_data_by_split(prefix,
        #                      self.use_sourcelang, self.use_image)
        #else:
        input_data, _ = self.data_gen.get_data_by_split(prefix,
                           self.use_sourcelang, self.use_image)

        # Replace input words (input_data[0]) with zeros for generation,
        # except for the first args.generate_from_N_words
        # NOTE: this will include padding and BOS steps (fixed_words has been
        # incremented accordingly already in generate_sentences().)
        logger.info("Initialising with the first %d gold words (incl BOS)",
                    fixed_words)
        gen_input_data = deepcopy(input_data)
        gen_input_data[0][:, fixed_words:, :] = 0

        return gen_input_data

    def calculate_pplx(self, directory, val=True):
        """ Without batching. Robust against multiple descriptions/image,
        since it uses data_generator.get_data_by_split input. """
        prefix = "val" if val else "test"
        logger.info("Calculating pplx over %s data", prefix)
        sum_logprobs = 0
        y_len = 0
        input_data, Y_target = self.data_gen.get_data_by_split(prefix,
                                  self.use_sourcelang, self.use_image)

        if self.args.debug:
            tic = time.time()

        preds = self.model.predict(input_data, verbose=0)

        if self.args.debug:
            logger.info("Forward pass took %f", time.time()-tic)

        for t in range(Y_target.shape[1]):
            for i in range(Y_target.shape[0]):
                target_idx = np.argmax(Y_target[i, t])
                if self.index2word[target_idx] != "<P>":
                    log_p = math.log(preds[i, t, target_idx], 2)
                    #logprobs.append(log_p)
                    sum_logprobs += -log_p
                    y_len += 1

        norm_logprob = sum_logprobs / y_len
        pplx = math.pow(2, norm_logprob)
        logger.info("PPLX: %.4f", pplx)
        handle = open("%s/%sPPLX" % (directory, prefix), "w")
        handle.write("%f\n" % pplx)
        handle.close()
        return pplx

    def extract_references(self, directory, val=True):
        """
        Get reference descriptions for the split we are generating outputs for.
        """
        prefix = "val" if val else "test"
        references = self.data_gen.get_refs_by_split_as_list(prefix)

        for refid in xrange(len(references[0])):
            with codecs.open('%s/%s_reference.ref%d'
                             % (directory, prefix, refid), 'w', 'utf-8') as f:
                f.write('\n'.join([x[refid] for x in references]))

    def bleu_score(self, directory, val=True):
        '''
        PPLX is only weakly correlated with improvements in BLEU,
        and thus improvements in human judgements. Let's also track
        BLEU score of a subset of generated sentences in the val split
        to decide on early stopping, etc.
        '''

        prefix = "val" if val else "test"
        self.extract_references(directory, val)

        subprocess.check_call(
            ['perl multi-bleu.perl %s/%s_reference.ref < %s/%sGenerated | tee %s/%sBLEU'
             % (directory, prefix, directory, prefix, directory, prefix)], shell=True)
Example #13
class GroundedTranslationGenerator:
    def __init__(self, args):
        self.args = args
        self.vocab = dict()
        self.unkdict = dict()
        self.counter = 0
        self.maxSeqLen = 0

        # consistent with models.py
        self.use_sourcelang = args.source_vectors is not None
        self.use_image = not args.no_image
        self.model = None
        self.prepare_datagenerator()

        # this results in two file handlers for dataset (here and
        # data_generator)
        if not self.args.dataset:
            logger.warn("No dataset given, using flickr8k")
            self.dataset = h5py.File("flickr8k/dataset.h5", "r")
        else:
            self.dataset = h5py.File("%s/dataset.h5" % self.args.dataset, "r")

        if self.args.debug:
            theano.config.optimizer = 'None'
            theano.config.exception_verbosity = 'high'

    def prepare_datagenerator(self):
        self.data_gen = VisualWordDataGenerator(self.args, self.args.dataset)
        self.args.checkpoint = self.find_best_checkpoint()
        self.data_gen.set_vocabulary(self.args.checkpoint)
        self.vocab_len = len(self.data_gen.index2word)
        self.index2word = self.data_gen.index2word
        self.word2index = self.data_gen.word2index

    def generate(self):
        '''
        Entry point for this module.
        Loads up a data generator to get the relevant image / source features.
        Builds the relevant model, given the command-line arguments.
        Generates sentences for the images in the val / test data.
        Calculates BLEU and PPLX scores, unless --without_scores was given.
        '''

        if self.use_sourcelang:
            # HACK FIXME unexpected problem with input_data
            self.hsn_size = self.data_gen.hsn_size
        else:
            self.hsn_size = 0

        if self.model is None:
            self.build_model(generate=True)

        self.generate_sentences(self.args.checkpoint, val=not self.args.test)
        if not self.args.without_scores:
            score = self.bleu_score(self.args.checkpoint,
                                    val=not self.args.test)
            if self.args.multeval:
                score, _, _ = self.multeval_scores(self.args.checkpoint,
                                                   val=not self.args.test)
            if not self.args.no_pplx:
                self.build_model(generate=False)
                self.calculate_pplx(self.args.checkpoint,
                                    val=not self.args.test)
            return score

    def generate_sentences(self, filepath, val=True):
        """
        Generates descriptions of images for --generation_timesteps
        iterations through the LSTM. Each input description is clipped to
        the first <BOS> token or, if --generate_from_N_words is set, to the
        first N words that follow it (N + 1 tokens including BOS).
        This process can be additionally conditioned
        on source language hidden representations, if provided by the
        --source_vectors parameter.
        The output is clipped to the first EOS generated, if it exists.

        TODO: duplicated method with generate.py
        """

        if self.args.beam_width > 1:
            prefix = "val" if val else "test"
            handle = codecs.open("%s/%sGenerated" % (filepath, prefix), "w",
                                 'utf-8')
            logger.info("Generating %s descriptions", prefix)

            start_gen = self.args.generate_from_N_words  # Default 0
            start_gen = start_gen + 1  # include BOS

            generator = self.data_gen.generation_generator(prefix,
                                                           batch_size=1)
            seen = 0
            # We are going to beam search for the most probable sentence.
            # Let's do this one sentence at a time to make the logging
            # output easier to understand.
            for data in generator:
                text = data[0]['text']
                # Append the first start_gen words to the complete_sentences list
                # for each instance in the batch.
                complete_sentences = [[] for _ in range(text.shape[0])]
                for t in range(start_gen):  # minimum 1
                    for i in range(text.shape[0]):
                        w = np.argmax(text[i, t])
                        complete_sentences[i].append(self.index2word[w])
                del data[0]['text']
                text = self.reset_text_arrays(text, start_gen)
                Y_target = data[1]['output']
                data[0]['text'] = text

                max_beam_width = self.args.beam_width
                structs = self.make_duplicate_matrices(data[0], max_beam_width)

                # A beam is a 2-tuple with the probability of the sequence and
                # the words in that sequence. Start with empty beams
                beams = [(0.0, [])]
                # collects beams that are in the top candidates and
                # emitted a <E> token.
                finished = []
                for t in range(start_gen, self.args.generation_timesteps):
                    # Store the candidates produced at timestep t, will be
                    # pruned at the end of the timestep
                    candidates = []

                    # we take a view of the datastructures, which means we're only
                    # ever generating a prediction for the next word. This saves a
                    # lot of cycles.
                    preds = self.model.predict(structs, verbose=0)

                    # preds[:, t - 1] holds the distribution over the next word
                    next_word_probs = preds[:, t - 1]
                    sorted_indices = np.argsort(-next_word_probs, axis=1)

                    # Each instance in structs is holding the history of a
                    # beam, and so there is a direct connection between the
                    # index of a beam in beams and the index of an instance in
                    # structs.
                    for beam_idx, b in enumerate(beams):
                        # get the sorted predictions for the beam_idx'th beam
                        beam_predictions = sorted_indices[beam_idx]
                        for top_idx in range(self.args.beam_width):
                            wordIndex = beam_predictions[top_idx]
                            wordProb = next_word_probs[beam_idx][
                                beam_predictions[top_idx]]
                            # For the beam_idxth beam, add the log probability
                            # of the top_idxth predicted word to the previous
                            # log probability of the sequence, and append the
                            # top_idxth predicted word to the sequence of words
                            candidates.append([
                                b[0] + math.log(wordProb), b[1] + [wordIndex]
                            ])

                    candidates.sort(reverse=True)
                    if self.args.verbose:
                        logger.info("Candidates in the beam")
                        logger.info("---")
                        for c in candidates:
                            logger.info(
                                " ".join([self.index2word[x]
                                          for x in c[1]]) + " (%f)" % c[0])

                    beams = candidates[:max_beam_width]  # prune the beams
                    pruned = []
                    for b in beams:
                        # If a top candidate emitted an EOS token then
                        # a) add it to the list of finished sequences
                        # b) remove it from the beams and decrease the
                        # maximum size of the beams.
                        if b[1][-1] == self.word2index["<E>"]:
                            finished.append(b)
                            if max_beam_width >= 1:
                                max_beam_width -= 1
                        else:
                            pruned.append(b)

                    beams = pruned[:max_beam_width]

                    if self.args.verbose:
                        logger.info("Pruned beams")
                        logger.info("---")
                        for b in beams:
                            logger.info(
                                " ".join([self.index2word[x]
                                          for x in b[1]]) + "(%f)" % b[0])

                    if max_beam_width == 0:
                        # We have sampled max_beam_width sequences with an <E>
                        # token so stop the beam search.
                        break

                    # Reproduce the structs for the beam search so we can keep
                    # track of the state of each beam
                    structs = self.make_duplicate_matrices(
                        data[0], max_beam_width)

                    # Rewrite the 1-hot word features with the
                    # so-far-predicted tokens in a beam.
                    for bidx, b in enumerate(beams):
                        for idx, w in enumerate(b[1]):
                            structs['text'][bidx, idx + 1, w] = 1.

                # If none of the sentences emitted an <E> token while
                # decoding, add the final beams into the final candidates
                if len(finished) == 0:
                    for leftover in beams:
                        finished.append(leftover)

                # Normalise the probabilities by the length of the sequences
                # as suggested by Graves (2012) http://arxiv.org/abs/1211.3711
                for f in finished:
                    f[0] = f[0] / len(f[1])
                finished.sort(reverse=True)

                if self.args.verbose:
                    logger.info("Length-normalised samples")
                    logger.info("---")
                    for f in finished:
                        logger.info(
                            " ".join([self.index2word[x]
                                      for x in f[1]]) + "(%f)" % f[0])

                # Emit the highest (log) probability sequence
                best_beam = finished[0]
                # batch_size=1 in this branch, so i == 0 here
                complete_sentences[i] = [
                    self.index2word[x] for x in best_beam[1]
                ]
                handle.write(' '.join([
                    x for x in itertools.takewhile(lambda n: n != "<E>",
                                                   complete_sentences[i])
                ]) + "\n")
                if self.args.verbose:
                    logger.info(
                        "%s (%f)", ' '.join([
                            x for x in itertools.takewhile(
                                lambda n: n != "<E>", complete_sentences[i])
                        ]), best_beam[0])

                seen += text.shape[0]
                if seen == self.data_gen.split_sizes[prefix]:
                    # Hacky way to break out of the generator
                    break
            handle.close()
        else:
            # We are going to arg max decode a sequence.
            prefix = "val" if val else "test"
            logger.info("Generating %s descriptions", prefix)
            start_gen = self.args.generate_from_N_words + 1  # include BOS
            handle = codecs.open("%s/%sGenerated" % (filepath, prefix), "w",
                                 'utf-8')

            generator = self.data_gen.generation_generator(prefix)
            seen = 0
            for data in generator:
                text = deepcopy(data[0]['text'])
                # Append the first start_gen words to the complete_sentences list
                # for each instance in the batch.
                complete_sentences = [[] for _ in range(text.shape[0])]
                for t in range(start_gen):  # minimum 1
                    for i in range(text.shape[0]):
                        w = np.argmax(text[i, t])
                        complete_sentences[i].append(self.index2word[w])
                del data[0]['text']
                text = self.reset_text_arrays(text, start_gen)
                Y_target = data[1]['output']
                data[0]['text'] = text

                for t in range(start_gen, self.args.generation_timesteps):
                    logger.debug("Input token: %s" %
                                 self.index2word[np.argmax(text[0, t - 1])])
                    preds = self.model.predict(data[0], verbose=0)

                    # Look at the last indices for the words.
                    next_word_indices = np.argmax(preds[:, t - 1], axis=1)
                    logger.debug("Predicted token: %s" %
                                 self.index2word[next_word_indices[0]])
                    # update array[0]/sentence-so-far with generated words.
                    for i in range(len(next_word_indices)):
                        data[0]['text'][i, t, next_word_indices[i]] = 1.
                    next_words = [
                        self.index2word[x] for x in next_word_indices
                    ]
                    for i in range(len(next_words)):
                        complete_sentences[i].append(next_words[i])

                sys.stdout.flush()
                # print/extract each sentence until it hits the first end-of-string token
                for s in complete_sentences:
                    if self.args.verbose:
                        logger.info(
                            "%s", ' '.join([
                                x for x in itertools.takewhile(
                                    lambda n: n != "<E>", s)
                            ]))
                    decoded_str = ' '.join([
                        x for x in itertools.takewhile(lambda n: n != "<E>",
                                                       s[1:])
                    ])
                    handle.write(decoded_str + "\n")

                seen += text.shape[0]
                if seen == self.data_gen.split_sizes[prefix]:
                    # Hacky way to break out of the generator
                    break
            handle.close()

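    # A compact sketch of the beam search loop above, stripped of the Keras
    # plumbing. `next_probs` stands in for the model: given a token-index
    # history, it returns a vocabulary-sized array of probabilities for the
    # next word. The sketch assumes log-space scores, a beam that shrinks
    # each time a candidate emits `eos_index`, and Graves-style length
    # normalisation, mirroring generate_sentences(); all names here are
    # illustrative, not part of the original API.
    @staticmethod
    def _beam_search_sketch(next_probs, eos_index, width, max_len):
        beams = [(0.0, [])]  # (log probability, token indices)
        finished = []
        for _ in range(max_len):
            candidates = []
            for logp, seq in beams:
                probs = next_probs(seq)
                for w in np.argsort(-probs)[:width]:
                    candidates.append((logp + math.log(probs[w]), seq + [w]))
            candidates.sort(reverse=True)
            beams = []
            for cand in candidates[:width]:
                if cand[1][-1] == eos_index:
                    finished.append(cand)  # retire beams that emitted EOS
                    width -= 1
                else:
                    beams.append(cand)
            if width == 0:
                break
        finished.extend(beams)  # keep leftovers that never emitted EOS
        # Normalise log probabilities by length, as in Graves (2012)
        finished = [(lp / len(seq), seq) for lp, seq in finished]
        return max(finished)
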
    def calculate_pplx(self, path, val=True):
        """ Splits the input data into batches of self.args.batch_size to
        reduce the memory footprint of holding all of the data in RAM. """

        prefix = "val" if val else "test"
        logger.info("Calculating pplx over %s data", prefix)
        sum_logprobs = 0
        y_len = 0

        generator = self.data_gen.generation_generator(prefix)
        seen = 0
        for data in generator:
            Y_target = deepcopy(data[1]['output'])
            del data[1]['output']

            preds = self.model.predict(data[0],
                                       verbose=0,
                                       batch_size=self.args.batch_size)

            for i in range(Y_target.shape[0]):
                for t in range(Y_target.shape[1]):
                    target_idx = np.argmax(Y_target[i, t])
                    target_tok = self.index2word[target_idx]
                    if target_tok != "<P>":
                        log_p = math.log(preds[i, t, target_idx], 2)
                        sum_logprobs += -log_p
                        y_len += 1

            seen += data[0]['text'].shape[0]
            if seen == self.data_gen.split_sizes[prefix]:
                # Hacky way to break out of the generator
                break

        norm_logprob = sum_logprobs / y_len
        pplx = math.pow(2, norm_logprob)
        logger.info("PPLX: %.4f", pplx)
        handle = open("%s/%sPPLX" % (path, prefix), "w")
        handle.write("%f\n" % pplx)
        handle.close()
        return pplx

    def reset_text_arrays(self, text_arrays, fixed_words=1):
        """ Reset the values in the text data structure to zero so we cannot
        accidentally pass them into the model.

        Helper function for generate_sentences().
         """
        reset_arrays = deepcopy(text_arrays)
        reset_arrays[:, fixed_words:, :] = 0
        return reset_arrays

    def make_duplicate_matrices(self, generator_data, k):
        '''
        Prepare K duplicates of the input data for a given instance yielded by
        the data generator.

        Helper function for the beam search decoder in generation_sentences().
        '''

        if self.use_sourcelang and self.use_image:
            # the data generator yielded a dictionary with the words, the
            # image features, and the source features
            dupes = [[], [], []]
            words = generator_data['text']
            img = generator_data['img']
            source = generator_data['src']
            for x in range(k):
                # Make a deep copy of the word_feats structures
                # so the arrays will never be shared
                dupes[0].append(deepcopy(words[0, :, :]))
                dupes[1].append(source[0, :, :])
                dupes[2].append(img[0, :, :])

            # Turn the list of arrays into a numpy array
            dupes[0] = np.array(dupes[0])
            dupes[1] = np.array(dupes[1])
            dupes[2] = np.array(dupes[2])

            return {'text': dupes[0], 'img': dupes[2], 'src': dupes[1]}

        elif self.use_image:
            # the data generator yielded a dictionary with the words and the
            # image features
            dupes = [[], []]
            words = generator_data['text']
            img = generator_data['img']
            for x in range(k):
                # Make a deep copy of the word_feats structures
                # so the arrays will never be shared
                dupes[0].append(deepcopy(words[0, :, :]))
                dupes[1].append(img[0, :, :])

            # Turn the list of arrays into a numpy array
            dupes[0] = np.array(dupes[0])
            dupes[1] = np.array(dupes[1])

            return {'text': dupes[0], 'img': dupes[1]}

        elif self.use_sourcelang:
            # the data generator yielded a dictionary with the words and the
            # source features
            dupes = [[], []]
            words = generator_data['text']
            source = generator_data['src']
            for x in range(k):
                # Make a deep copy of the word_feats structures
                # so the arrays will never be shared
                dupes[0].append(deepcopy(words[0, :, :]))
                dupes[1].append(source[0, :, :])

            # Turn the list of arrays into a numpy array
            dupes[0] = np.array(dupes[0])
            dupes[1] = np.array(dupes[1])

            return {'text': dupes[0], 'src': dupes[1]}

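    # The duplication above can also be expressed with numpy: take the first
    # instance of an input array and repeat it k times along the batch axis.
    # np.repeat copies its input, so the k beams never share state. A hedged
    # equivalent; the helper name is illustrative, not part of the original
    # class.
    @staticmethod
    def _duplicate_instance(array, k):
        return np.repeat(array[0:1], k, axis=0)

    # e.g. for the image-conditioned case:
    #     {'text': self._duplicate_instance(words, k),
    #      'img': self._duplicate_instance(img, k)}
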
    def find_best_checkpoint(self):
        '''
        Read the summary file from the directory and scrape out the run ID of
        the highest BLEU scoring checkpoint. Then do an ls-style listing of
        the directory and return the exact path to the best model.

        Assumes only one matching prefix in the model checkpoints directory.
        '''

        summary_data = open("%s/summary" %
                            self.args.model_checkpoints).readlines()
        summary_data = [x.replace("\n", "") for x in summary_data]
        best_id = None
        target = "Best loss" if self.args.best_pplx else "Best Metric"
        for line in summary_data:
            if line.startswith(target):
                best_id = "%03d" % (int(line.split(":")[1].split("|")[0]))

        checkpoint = None
        if best_id is not None:
            checkpoints = os.listdir(self.args.model_checkpoints)
            for c in checkpoints:
                if c.startswith(best_id):
                    checkpoint = c
                    break
        logger.info("Best checkpoint: %s/%s" %
                    (self.args.model_checkpoints, checkpoint))
        return "%s/%s" % (self.args.model_checkpoints, checkpoint)

    def bleu_score(self, directory, val=True):
        '''
        PPLX is only weakly correlated with improvements in BLEU,
        and thus improvements in human judgements. Let's also track
        BLEU score of a subset of generated sentences in the val split
        to decide on early stopping, etc.
        '''

        prefix = "val" if val else "test"
        self.extract_references(directory, val)

        subprocess.check_call([
            'perl multi-bleu.perl %s/%s_reference.ref < %s/%sGenerated | tee %s/%sBLEU'
            % (directory, prefix, directory, prefix, directory, prefix)
        ],
                              shell=True)
        bleudata = open("%s/%sBLEU" % (directory, prefix)).readline()
        data = bleudata.split(",")[0]
        bleuscore = data.split("=")[1]
        bleu = float(bleuscore.lstrip())
        return bleu
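    # The first line of the %sBLEU file written by multi-bleu.perl looks
    # roughly like:
    #     BLEU = 34.50, 67.2/41.3/27.4/18.6 (BP=1.000, ratio=0.993, ...)
    # so split(",")[0] isolates "BLEU = 34.50" and split("=")[1] the score.
    # The numbers above are illustrative.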

    def multeval_scores(self, directory, val=True):
        '''
        Maybe you want to evaluate with Meteor, TER, and BLEU?
        '''
        prefix = "val" if val else "test"
        self.extract_references(directory, val)

        with cd(MULTEVAL_DIR):
            subprocess.check_call([
                './multeval.sh eval --refs ../%s/%s_reference.* \
                 --hyps-baseline ../%s/%sGenerated \
                 --meteor.language %s \
                 --threads 4 \
                 2> multevaloutput 1> multevaloutput' %
                (directory, prefix, directory, prefix, self.args.meteor_lang)
            ],
                                  shell=True)
            handle = open("multevaloutput")
            multdata = handle.readlines()
            handle.close()
            for line in multdata:
                if line.startswith("RESULT: baseline: BLEU: AVG:"):
                    mbleu = line.split(":")[4]
                    mbleu = mbleu.replace("\n", "")
                    mbleu = mbleu.strip()
                    lr = mbleu.split(".")
                    mbleu = float(lr[0] + "." + lr[1][0:2])
                if line.startswith("RESULT: baseline: METEOR: AVG:"):
                    mmeteor = line.split(":")[4]
                    mmeteor = mmeteor.replace("\n", "")
                    mmeteor = mmeteor.strip()
                    lr = mmeteor.split(".")
                    mmeteor = float(lr[0] + "." + lr[1][0:2])
                if line.startswith("RESULT: baseline: TER: AVG:"):
                    mter = line.split(":")[4]
                    mter = mter.replace("\n", "")
                    mter = mter.strip()
                    lr = mter.split(".")
                    mter = float(lr[0] + "." + lr[1][0:2])

            logger.info("Meteor = %.2f | BLEU = %.2f | TER = %.2f", mmeteor,
                        mbleu, mter)

            return mmeteor, mbleu, mter

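    # Each RESULT line above is scraped the same way: take the fifth
    # ":"-separated field and truncate it to two decimal places without
    # rounding. A hedged helper capturing that repeated pattern; the name
    # and its use are illustrative, not part of the original class.
    @staticmethod
    def _scrape_multeval_avg(line):
        value = line.split(":")[4].strip()
        whole, frac = value.split(".")[0:2]
        return float(whole + "." + frac[0:2])
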
    def extract_references(self, directory, val=True):
        """
        Get reference descriptions for split we are generating outputs for.

        Helper function for bleu_score().
        """
        prefix = "val" if val else "test"
        references = self.data_gen.get_refs_by_split_as_list(prefix)

        for refid in xrange(len(references[0])):
            codecs.open('%s/%s_reference.ref%d' % (directory, prefix, refid),
                        'w', 'utf-8').write('\n'.join(
                            [x[refid] for x in references]))

    def build_model(self, generate=False):
        '''
        Build a Keras model if one does not yet exist.

        Helper function for generate().
        '''

        if generate:
            t = self.args.generation_timesteps
        else:
            t = self.data_gen.max_seq_len
        if self.args.mrnn:
            m = models.MRNN(self.args.embed_size,
                            self.args.hidden_size,
                            self.vocab_len,
                            self.args.dropin,
                            self.args.optimiser,
                            self.args.l2reg,
                            hsn_size=self.hsn_size,
                            weights=self.args.checkpoint,
                            gru=self.args.gru,
                            clipnorm=self.args.clipnorm,
                            t=t)
        else:
            m = models.NIC(self.args.embed_size,
                           self.args.hidden_size,
                           self.vocab_len,
                           self.args.dropin,
                           self.args.optimiser,
                           self.args.l2reg,
                           hsn_size=self.hsn_size,
                           weights=self.args.checkpoint,
                           gru=self.args.gru,
                           clipnorm=self.args.clipnorm,
                           t=t)

        self.model = m.buildKerasModel(use_sourcelang=self.use_sourcelang,
                                       use_image=self.use_image)

class ExtractFinalHiddenStateActivations:

    def __init__(self, args):
        self.args = args
        self.args.generate_from_N_words = 0  # Default 0
        self.vocab = dict()
        self.unkdict = dict()
        self.counter = 0
        self.maxSeqLen = 0
        self.MAX_HT = self.args.generation_timesteps - 1

        # consistent with models.py
        # maybe use_sourcelang isn't applicable here?
        self.use_sourcelang = args.source_vectors is not None
        self.use_image = not args.no_image

        if self.args.debug:
            theano.config.optimizer = 'None'
            theano.config.exception_verbosity = 'high'

        self.source_type = "predicted" if self.args.use_predicted_tokens else "gold"
        self.source_encoder = "mt_enc" if self.args.no_image else "vis_enc"
        self.source_dim = self.args.hidden_size

        self.h5_dataset_str = "%s-hidden_feats-%s-%d" % (self.source_type,
                                                         self.source_encoder,
                                                         self.source_dim)
        logger.info("Serialising into %s" % self.h5_dataset_str)

    def get_hidden_activations(self):
        '''
        In the model, we will merge the VGG image representation with
        the word embeddings. We need to feed the data as a list, in which
        the order of the elements in the list is _crucial_.
        '''

        self.data_generator = VisualWordDataGenerator(self.args,
                                                      self.args.dataset)
        self.args.checkpoint = self.find_best_checkpoint()
        self.data_generator.set_vocabulary(self.args.checkpoint)
        self.vocab_len = len(self.data_generator.index2word)
        t = self.args.generation_timesteps if self.args.use_predicted_tokens else self.data_generator.max_seq_len

        m = models.NIC(self.args.embed_size, self.args.hidden_size,
                       self.vocab_len,
                       self.args.dropin,
                       self.args.optimiser, self.args.l2reg,
                       weights=self.args.checkpoint,
                       gru=self.args.gru,
                       t=t)

        self.fhs = m.buildHSNActivations(use_image=self.use_image)
        if self.args.use_predicted_tokens and not self.args.no_image:
            gen_m = models.NIC(self.args.embed_size, self.args.hidden_size,
                               self.vocab_len,
                               self.args.dropin,
                               self.args.optimiser, self.args.l2reg,
                               weights=self.args.checkpoint,
                               gru=self.args.gru,
                               t=self.args.generation_timesteps)
            self.full_model = gen_m.buildKerasModel(use_image=self.use_image)

        self.new_generate_activations('train')
        self.new_generate_activations('val')
        self.new_generate_activations('test')

    def new_generate_activations(self, split):
        '''
        Generate and serialise final-timestep hidden state activations
        into --dataset.
        TODO: we should be able to serialise predicted final states instead of
        gold-standard final states for val and test data.
        '''
        logger.info("%s: extracting final hidden state activations from this model", split)

        # Prepare the data generator based on whether we're going to work with
        # the gold standard input tokens or the automatically predicted tokens
        if self.args.use_predicted_tokens:
            the_generator = self.data_generator.generation_generator(split=split)
        else:
            the_generator = self.data_generator.fixed_generator(split=split)

        counter = 0
        if split == 'train':
            hidden_states = []
            batch_start = 0
            batch_end = 0
            for data in the_generator:
                if self.args.use_predicted_tokens:
                    tokens = self.get_predicted_tokens(data)
                    data['text'] = self.set_text_arrays(tokens, data['text'])
                logger.debug(data['text'].shape)

                # We extract the FHS from either the oracle or the predicted
                # input tokens
                hsn = self.fhs.predict({'text': data['text'],
                                        'img': data['img']},
                                       batch_size=self.args.batch_size,
                                       verbose=1)

                for idx, h in enumerate(hsn['rnn']):
                    # get the final_hidden index on a sentence-by-sentence
                    # basis by searching for the first <E> in the targets
                    eos = False
                    for widx, warr in enumerate(data['output'][idx]):
                        w = np.argmax(warr)
                        if self.data_generator.index2word[w] == "<E>":
                            final_hidden = h[widx]
                            hidden_states.append(final_hidden)
                            eos = True
                            logger.debug(widx)
                            break
                    if not eos:
                        final_hidden = h[self.MAX_HT]
                        hidden_states.append(final_hidden)
                    batch_end += 1

                # Note: serialisation happens over training batches too.
                # now serialise the hidden representations in the h5
                self.to_h5_indices(split, data['indices'], hidden_states)

                batch_start = batch_end
                counter += len(hidden_states)
                hidden_states = []
                logger.info("Processed %d instances" % counter)
                if batch_end >= self.data_generator.split_sizes[split]:
                    break

        elif split == 'val' or split == "test":
            hidden_states = []
            batch_start = 0
            batch_end = 0
            for data in the_generator:
                if self.args.use_predicted_tokens:
                    tokens = self.get_predicted_tokens(data)
                    data['text'] = self.set_text_arrays(tokens, data['text'])

                # We extract the FHS from either the oracle or the predicted
                # input tokens
                hsn = self.fhs.predict({'text': data['text'],
                                        'img': data['img']},
                                       batch_size=self.args.batch_size,
                                       verbose=1)

                for idx, h in enumerate(hsn['rnn']):
                    # get the final_hidden index on a sentence-by-sentence
                    # basis by searching for the first <E> in the targets
                    eos = False
                    for widx, warr in enumerate(data['output'][idx]):
                        w = np.argmax(warr)
                        if self.data_generator.index2word[w] == "<E>":
                            final_hidden = h[widx]
                            hidden_states.append(final_hidden)
                            eos = True
                            break
                    if not eos:
                        final_hidden = h[self.MAX_HT]
                        hidden_states.append(final_hidden)
                    batch_end += 1

                # Note: serialisation happens over training batches too.
                # now serialise the hidden representations in the h5
                self.to_h5_indices(split, data['indices'], hidden_states)

                batch_start = batch_end
                counter += len(hidden_states)
                hidden_states = []
                logger.info("Processed %d instances" % counter)
                if batch_end >= self.data_generator.split_sizes[split]:
                    break

    def get_predicted_tokens(self, data):
        """
        We're not going to work with the gold standard input tokens.
        Instead we're going to automatically predict them and then extract
        the final hidden state from the inferred data.

        Helper function used by new_generate_activations().
        """
        # We are going to arg max decode a sequence.
        start_gen = self.args.generate_from_N_words + 1  # include BOS

        text = deepcopy(data['text'])
        # Append the first start_gen words to the complete_sentences list
        # for each instance in the batch.
        complete_sentences = [[] for _ in range(text.shape[0])]
        for t in range(start_gen):  # minimum 1
            for i in range(text.shape[0]):
                w = np.argmax(text[i, t])
                complete_sentences[i].append(self.data_generator.index2word[w])
        del data['text']
        text = self.reset_text_arrays(text, start_gen)
        Y_target = data['output']
        data['text'] = text

        for t in range(start_gen, self.args.generation_timesteps):
            logger.debug("Input token: %s" % self.data_generator.index2word[np.argmax(data['text'][0,t-1])])
            preds = self.full_model.predict(data, verbose=0)

            # Look at the last indices for the words.
            next_word_indices = np.argmax(preds['output'][:, t-1], axis=1)
            logger.debug("Predicted token: %s" % self.data_generator.index2word[next_word_indices[0]])
            # update array[0]/sentence-so-far with generated words.
            for i in range(len(next_word_indices)):
                data['text'][i, t, next_word_indices[i]] = 1.
            next_words = [self.data_generator.index2word[x] for x in next_word_indices]
            for i in range(len(next_words)):
                complete_sentences[i].append(next_words[i])

        # extract each sentence until it hits the first end-of-string token
        pruned_sentences = []
        for s in complete_sentences:
            pruned_sentences.append([x for x
                                    in itertools.takewhile(
                                        lambda n: n != "<E>", s)])
        return pruned_sentences

    def set_text_arrays(self, predicted_tokens, text_arrays):
        """ Set the values of the text tokens in the text arrays
        based on the tokens predicted by the model.

        Helper function used by new_generate_activations() """
        pidx = 0
        new_arrays = deepcopy(text_arrays)
        for pairs in zip(predicted_tokens, text_arrays):
            toks = pairs[0]
            struct = pairs[1]
            for tidx, t in enumerate(toks):
                struct[tidx, self.data_generator.word2index[t]] = 1
            new_arrays[pidx] = struct
            pidx += 1
        return new_arrays

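    # A minimal sketch of the one-hot writing done above, assuming
    # `word2index` maps tokens to vocabulary indices and the output should
    # have shape (timesteps, vocab_size). Hypothetical helper for
    # illustration only; not part of the original class.
    @staticmethod
    def _one_hot_sentence(tokens, word2index, timesteps, vocab_size):
        arr = np.zeros((timesteps, vocab_size))
        for t, tok in enumerate(tokens):
            arr[t, word2index[tok]] = 1.
        return arr
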
    def reset_text_arrays(self, text_arrays, fixed_words=1):
        """ Reset the values in the text data structure to zero so we cannot
        accidentally pass them into the model.

        Helper function for generate_sentences().
         """
        reset_arrays = deepcopy(text_arrays)
        reset_arrays[:, fixed_words:, :] = 0
        return reset_arrays

    def generate_activations(self, split):
        '''
        Generate and serialise final-timestep hidden state activations
        into --dataset.
        TODO: we should be able to serialise predicted final states instead of
        gold-standard final states for val and test data.
        '''
        logger.info("%s: extracting final hidden state activations from this model", split)

        if split == 'train':
            """ WARNING: This collects the *entirety of the training data* in
            hidden_states, so should not be used on non-toy training data.
            """
            hidden_states = []
            batch_start = 0
            batch_end = 0
            for train_input, trainY, indicator, keys in\
                self.data_generator.yield_training_batch(self.args.big_batch_size,
                                                         self.use_sourcelang,
                                                         self.use_image,
                                                         return_keys=True):

                if self.args.use_predicted_tokens and not self.args.no_image:
                    # Reset the word indices and then generate the
                    # descriptions of the images from scratch
                    fixed_words = self.args.generate_from_N_words + 1
                    train_input[0][:, fixed_words:, :] = 0
                    predicted_words = self.generate_sentences(split,
                                                              arrays=train_input)
                    self.sentences_to_h5_keys(split, keys, predicted_words)

                    # TODO: code duplication from make_generation_arrays
                    pred_inputs = deepcopy(train_input)
                    tokens = pred_inputs[0]
                    tokens[:, fixed_words:, :] = 0  # reset the inputs
                    for prediction, words in zip(predicted_words, tokens):
                        for idx, t in enumerate(prediction):
                            words[idx, self.data_generator.word2index[t]] = 1.
                    trainY = self.data_generator.get_target_descriptions(tokens)

                    hsn = self.fhs.predict(train_input,
                                           batch_size=self.args.batch_size,
                                           verbose=1)

                else:
                    # We extract the FHS from oracle training input tokens
                    hsn = self.fhs.predict(train_input,
                                           batch_size=self.args.batch_size,
                                           verbose=1)

                logger.info(len(hsn))
                for idx, h in enumerate(hsn):
                    # get final_hidden index on a sentence-by-sentence
                    # basis by searching for the first <E> in each trainY
                    eos = False
                    for widx, warr in enumerate(trainY[idx]):
                        w = np.argmax(warr)
                        if self.data_generator.index2word[w] == "<E>":
                            final_hidden = h[widx]
                            hidden_states.append(final_hidden)
                            eos = True
                            break
                    if not eos:
                        # no <E> found: fall back to the last timestep
                        final_hidden = h[self.MAX_HT]
                        hidden_states.append(final_hidden)
                    batch_end += 1
                logger.info(len(hidden_states))

                # Note: serialisation happens over training batches too.
                # now serialise the hidden representations in the h5
                #self.serialise_to_h5(split, len(hidden_states[0]), hidden_states,
                #                     batch_start, batch_end)
                # KEYS ARE OVER IMAGES NOT DESCRIPTIONS
                # THIS WILL BREAK IF THERE ARE MULTIPLE DESCRIPTIONS/IMAGE
                self.serialise_to_h5_keys(split, keys, hidden_states,
                                          batch_start, batch_end)

                batch_start = batch_end
                hidden_states = []

        elif split == 'val' or split == "test":
            # TODO: get keys and do serialise_to_h5 with keys.
            inputs, Ys = self.data_generator.get_data_by_split(split,
                                      self.use_sourcelang, self.use_image)
            hidden_states = []
            # We can extract the FGS from either oracle or predicted word
            # sequences for val  / test data .
            if self.args.use_predicted_tokens and not self.args.no_image:
                predicted_words = self.generate_sentences(split)
                self.sentences_to_h5(split, predicted_words)
                inputs, Ys = self.make_generation_arrays(split,
                                         self.args.generate_from_N_words,
                                         predicted_tokens=predicted_words)

            hsn = self.fhs.predict(inputs,
                                   batch_size=self.args.batch_size,
                                   verbose=1)

            for idx, h in enumerate(hsn):
                # get the final_hidden index on a sentence-by-sentence
                # basis by searching for the first <E> in the targets
                for widx, warr in enumerate(Ys[idx]):
                    w = np.argmax(warr)
                    if self.data_generator.index2word[w] == "<E>":
                        logger.debug("Sentence length %d", widx)
                        final_hidden = h[widx]
                        hidden_states.append(final_hidden)
                        break

            # now serialise the hidden representations in the h5
            self.serialise_to_h5(split, len(hidden_states[0]), hidden_states)

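    # The final-hidden lookup in both activation methods reduces to: find
    # the timestep of the first <E> in the one-hot targets, else fall back
    # to the last timestep. A hedged sketch of that search; the helper name
    # is illustrative, not part of the original class.
    def _final_hidden_for(self, h, targets):
        for widx, warr in enumerate(targets):
            if self.data_generator.index2word[np.argmax(warr)] == "<E>":
                return h[widx]
        return h[self.MAX_HT]
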
    def make_generation_arrays(self, prefix, fixed_words,
                               predicted_tokens=None):
        '''
        Create arrays that are used as input for generation / activation.
        '''

        if predicted_tokens is not None:
            input_data, targets = self.data_generator.get_data_by_split(prefix,
                                           self.use_sourcelang, self.use_image)
            logger.info("Initialising generation arrays with predicted tokens")
            gen_input_data = deepcopy(input_data)
            tokens = gen_input_data[0]
            tokens[:, fixed_words:, :] = 0  # reset the inputs
            for prediction, words in zip(predicted_tokens, tokens):
                for idx, t in enumerate(prediction):
                    words[idx, self.data_generator.word2index[t]] = 1.
            targets = self.data_generator.get_target_descriptions(tokens)
            return gen_input_data, targets

        else:
            # Replace input words (input_data[0]) with zeros for generation,
            # except for the first args.generate_from_N_words
            # NOTE: this will include padding and BOS steps (fixed_words has been
            # incremented accordingly already in generate_sentences().)
            input_data = self.data_generator.get_generation_data_by_split(prefix,
                                           self.use_sourcelang, self.use_image)
            logger.info("Initialising with the first %d gold words (incl BOS)",
                        fixed_words)
            gen_input_data = deepcopy(input_data)
            gen_input_data[0][:, fixed_words:, :] = 0
            return gen_input_data

    def generate_sentences(self, split, arrays=None):
        """
        Generates descriptions of images for --generation_timesteps
        iterations through the LSTM. Each input description is clipped to
        the first <BOS> token, or, if --generate_from_N_words is set, to the
        first N following words (N + 1 BOS token).
        This process can be additionally conditioned
        on source language hidden representations, if provided by the
        --source_vectors parameter.
        The output is clipped to the first EOS generated, if it exists.

        TODO: beam search
        TODO: duplicated method with generate.py and Callbacks.py
        """
        logger.info("%s: generating descriptions", split)

        start_gen = self.args.generate_from_N_words  # Default 0
        start_gen = start_gen + 1  # include BOS

        # prepare the datastructures for generation (no batching over val)
        if arrays is None:
            arrays = self.make_generation_arrays(split, start_gen)
        N_sents = arrays[0].shape[0]

        complete_sentences = [[] for _ in range(N_sents)]
        for t in range(start_gen):  # minimum 1
            for i in range(N_sents):
                w = np.argmax(arrays[0][i, t])
                complete_sentences[i].append(self.data_generator.index2word[w])

        for t in range(start_gen, self.args.generation_timesteps):
            # we take a view of the datastructures, which means we're only
            # ever generating a prediction for the next word. This saves a
            # lot of cycles.
            preds = self.full_model.predict([arr[:, 0:t] for arr in arrays],
                                            verbose=0)

            # Look at the last indices for the words.
            next_word_indices = np.argmax(preds[:, -1], axis=1)
            # update array[0]/sentence-so-far with generated words.
            for i in range(N_sents):
                arrays[0][i, t, next_word_indices[i]] = 1.
            next_words = [self.data_generator.index2word[x] for x in next_word_indices]
            for i in range(len(next_words)):
                complete_sentences[i].append(next_words[i])

        # extract each sentence until it hits the first end-of-string token
        pruned_sentences = []
        for s in complete_sentences:
            pruned_sentences.append([x for x
                                     in itertools.takewhile(
                                         lambda n: n != "<E>", s)])
        return pruned_sentences

    def to_h5_indices(self, split, indices, hidden_states):
        hsn_shape = len(hidden_states[0])
        fhf_str = "final_hidden_features"
        logger.info("Serialising final hidden state features from %s to H5",
                    split)
        for idx, data_key in enumerate(indices):
            ident = data_key[0]
            desc_idx = data_key[1]
            self.data_generator.set_source_features(split, ident,
                                                    self.h5_dataset_str,
                                                    hidden_states[idx],
                                                    hsn_shape,
                                                    desc_idx)

    def serialise_to_h5_keys(self, split, data_keys, hidden_states,
                             batch_start=None, batch_end=None):
        # batch_start and batch_end are accepted to match the call in
        # generate_activations() but are unused when keys are given
        hsn_shape = len(hidden_states[0])
        fhf_str = "final_hidden_features"
        logger.info("Serialising final hidden state features from %s to H5",
                    split)
        for idx, data_key in enumerate(data_keys):
            self.data_generator.set_source_features(split, data_key,
                                                    self.h5_dataset_str,
                                                    hidden_states[idx],
                                                    hsn_shape)
            #try:
            #    hsn_data = self.data_generator.dataset[split][data_key].create_dataset(
            #        fhf_str, (hsn_shape,), dtype='float32')
            #except RuntimeError:
            #    # the dataset already exists, retrieve it into RAM and then overwrite it
            #    del self.data_generator.dataset[split][data_key][fhf_str]
            #    hsn_data = self.data_generator.dataset[split][data_key].create_dataset(
            #        fhf_str, (hsn_shape,), dtype='float32')
            #try:
            #    hsn_data[:] = hidden_states[idx]
            #except IndexError:
            #    raise IndexError("data_key %s of %s; index idx %d, len hidden %d" % (
            #        data_key, len(data_keys), idx, len(hidden_states)))
            #    break

    def sentences_to_h5(self, split, sentences):
        '''
        Save the predicted sentences into the h5 dataset object.
        This is useful for subsequently (i.e. in a different program)
        extracting LM-only final hidden states from predicted sentences.
        Specifically, this can be compared to generating LM-only hidden
        states over gold-standard tokens.
        '''
        idx = 0
        logger.info("Serialising sentences from %s to H5", split)
        # Keys are synthesised as zero-padded indices; the small_val and
        # full cases are identical here
        data_keys = ["%06d" % x for x in range(len(sentences))]
        for data_key in data_keys:
            self.data_generator.set_predicted_description(split, data_key,
                                                          sentences[idx][1:])
            idx += 1

    def sentences_to_h5_keys(self, split, data_keys, sentences):
        logger.info("Serialising sentences from %s to H5",
                    split)
        for idx, data_key in enumerate(data_keys):
            self.data_generator.set_predicted_description(split, data_key,
                                                    sentences[idx])

    def serialise_to_h5(self, split, hsn_shape, hidden_states,
                        batch_start=None, batch_end=None):
        """ Serialise the hidden representations from generate_activations
        into the h5 dataset.
        This assumes one hidden_state per image key, which is maybe not
        appropriate if there are multiple descriptions/image.
        """
        idx = 0
        logger.info("Serialising final hidden state features from %s to H5",
                    split)
        if batch_start is not None:
            logger.info("Start at %d, end at %d", batch_start, batch_end)
            data_keys = ["%06d" % x for x in range(batch_start, batch_end)]
            assert len(hidden_states) == len(data_keys),\
                    "keys: %d hidden %d; start %d end %d" % (len(data_keys),
                                            len(hidden_states), batch_start,
                                            batch_end)
        else:
            data_keys = ["%06d" % x for x in range(len(hidden_states))]
        for data_key in data_keys:
            self.data_generator.set_source_features(split, data_key,
                                                    self.h5_dataset_str,
                                                    hidden_states[idx],
                                                    hsn_shape)
            #try:
            #    hsn_data = self.data_generator.dataset[split][data_key].create_dataset(
            #        fhf_str, (hsn_shape,), dtype='float32')
            #except RuntimeError:
            #    # the dataset already exists, retrieve it into RAM and then overwrite it
            #    del self.data_generator.dataset[split][data_key][fhf_str]
            #    hsn_data = self.data_generator.dataset[split][data_key].create_dataset(
            #        fhf_str, (hsn_shape,), dtype='float32')
            #try:
            #    hsn_data[:] = hidden_states[idx]
            #except IndexError:
            #    raise IndexError("data_key %s of %s; index idx %d, len hidden %d" % (
            #        data_key, len(data_keys),
            #                      idx, len(hidden_states)))
            #    break
            idx += 1

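    # Data keys are synthesised as zero-padded dataset indices, e.g.
    #     ["%06d" % x for x in range(3)]  ->  ['000000', '000001', '000002']
    # matching how instances are keyed in the h5 file.
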
    def find_best_checkpoint(self):
        '''
        Read the summary file from the directory and scrape out the run ID of
        the highest BLEU scoring checkpoint. Then do an ls-style listing of
        the directory and return the exact path to the best model.

        Assumes only one matching prefix in the model checkpoints directory.
        '''

        summary_data = open("%s/summary" % self.args.model_checkpoints).readlines()
        summary_data = [x.replace("\n", "") for x in summary_data]
        best_id = None
        target = "Best PPLX" if self.args.best_pplx else "Best BLEU"
        for line in summary_data:
            if line.startswith(target):
                best_id = "%03d" % (int(line.split(":")[1].split("|")[0]))

        checkpoint = None
        if best_id is not None:
            checkpoints = os.listdir(self.args.model_checkpoints)
            for c in checkpoints:
                if c.startswith(best_id):
                    checkpoint = c
                    break
        return "%s/%s" % (self.args.model_checkpoints, checkpoint)
Example #15
class GroundedTranslation(object):
    def __init__(self, args, datagen=None):
        '''
        Initialise the model and set Theano debugging model if
        self.args.debug is true. Prepare the data generator if necessary.
        '''

        self.args = args
        self.data_generator = datagen
        self.use_sourcelang = args.source_vectors is not None
        self.use_image = not args.no_image
        self.log_run_arguments()
        self.prepare_datagenerator()

        if self.args.debug:
            theano.config.optimizer = 'fast_compile'
            theano.config.exception_verbosity = 'high'

    def train_model(self):
        '''
        Initialise the data generator to process the data in a memory-friendly
        manner. Then build the Keras model, given the user-specified arguments
        (or the initial defaults). Train the model for self.args.max_epochs
        and return the training and validation losses.

        The losses object contains a history variable. The history variable is
        a dictionary with a list of training and validation losses:

        losses.history['loss']
        losses.history['val_loss']
        '''

        if not self.use_sourcelang:
            hsn_size = 0
        else:
            hsn_size = self.data_generator.hsn_size  # ick

        if self.args.mrnn:
            m = models.MRNN(self.args.embed_size,
                            self.args.hidden_size,
                            self.V,
                            self.args.dropin,
                            self.args.optimiser,
                            self.args.l2reg,
                            hsn_size=hsn_size,
                            weights=self.args.init_from_checkpoint,
                            gru=self.args.gru,
                            clipnorm=self.args.clipnorm,
                            t=self.data_generator.max_seq_len,
                            lr=self.args.lr)
        else:
            m = models.NIC(self.args.embed_size,
                           self.args.hidden_size,
                           self.V,
                           self.args.dropin,
                           self.args.optimiser,
                           self.args.l2reg,
                           hsn_size=hsn_size,
                           weights=self.args.init_from_checkpoint,
                           gru=self.args.gru,
                           clipnorm=self.args.clipnorm,
                           t=self.data_generator.max_seq_len,
                           lr=self.args.lr)

        model = m.buildKerasModel(use_sourcelang=self.use_sourcelang,
                                  use_image=self.use_image)

        callbacks = CompilationOfCallbacks(self.data_generator.word2index,
                                           self.data_generator.index2word,
                                           self.args,
                                           self.args.dataset,
                                           self.data_generator,
                                           use_sourcelang=self.use_sourcelang,
                                           use_image=self.use_image)

        train_generator = self.data_generator.random_generator('train')
        train_size = self.data_generator.split_sizes['train']
        val_generator = self.data_generator.fixed_generator('val')
        val_size = self.data_generator.split_sizes['val']

        losses = model.fit_generator(generator=train_generator,
                                     samples_per_epoch=train_size,
                                     nb_epoch=self.args.max_epochs,
                                     verbose=1,
                                     callbacks=[callbacks],
                                     nb_worker=1,
                                     validation_data=val_generator,
                                     nb_val_samples=val_size)

        return losses

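    # Typical use of the returned losses object (illustrative only):
    #
    #     gt = GroundedTranslation(args)
    #     losses = gt.train_model()
    #     print(losses.history['loss'][-1], losses.history['val_loss'][-1])
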
    def prepare_datagenerator(self):
        '''
        Initialise the data generator and its datastructures, unless a valid
        data generator was already passed into the
        GroundedTranslation.__init__() method.
        '''

        # Initialise the data generator if it has not yet been initialised
        if self.data_generator is None:
            self.data_generator = VisualWordDataGenerator(
                self.args, self.args.dataset)

            # Extract the working vocabulary from the training dataset
            if self.args.existing_vocab != "":
                self.data_generator.set_vocabulary(self.args.existing_vocab)
            else:
                self.data_generator.extract_vocabulary()
        self.V = self.data_generator.get_vocab_size()

    def log_run_arguments(self):
        '''
        Save the command-line arguments, along with the method defaults,
        used to parameterise this run.
        '''
        logger.info("Run arguments:")
        for arg, value in self.args.__dict__.iteritems():
            logger.info("%s: %s" % (arg, str(value)))
Example #16
class GroundedTranslation(object):

    def __init__(self, args, datagen=None):
        '''
        Initialise the model and enable Theano debugging mode if
        self.args.debug is true. Prepare the data generator if necessary.
        '''

        self.args = args
        self.data_generator = datagen
        self.use_sourcelang = args.source_vectors is not None
        self.use_image = not args.no_image
        self.log_run_arguments()
        self.prepare_datagenerator()

        if self.args.debug:
            theano.config.optimizer = 'fast_compile'
            theano.config.exception_verbosity = 'high'

    def train_model(self):
        '''
        Initialise the data generator to process the data in a memory-friendly
        manner. Then build the Keras model, given the user-specified arguments
        (or the initial defaults). Train the model for self.args.max_epochs
        and return the training and validation losses.

        The losses object has a history attribute: a dictionary holding
        the per-epoch training and validation losses:

        losses.history['loss']
        losses.history['val_loss']
        '''

        if not self.use_sourcelang:
            hsn_size = 0
        else:
            hsn_size = self.data_generator.hsn_size  # ick

        if self.args.mrnn:
            m = models.MRNN(self.args.embed_size, self.args.hidden_size,
                            self.V, self.args.dropin,
                            self.args.optimiser, self.args.l2reg,
                            hsn_size=hsn_size,
                            weights=self.args.init_from_checkpoint,
                            gru=self.args.gru,
                            clipnorm=self.args.clipnorm,
                            t=self.data_generator.max_seq_len,
                            lr=self.args.lr)
        else:
            m = models.NIC(self.args.embed_size, self.args.hidden_size,
                           self.V, self.args.dropin,
                           self.args.optimiser, self.args.l2reg,
                           hsn_size=hsn_size,
                           weights=self.args.init_from_checkpoint,
                           gru=self.args.gru,
                           clipnorm=self.args.clipnorm,
                           t=self.data_generator.max_seq_len,
                           lr=self.args.lr)

        model = m.buildKerasModel(use_sourcelang=self.use_sourcelang,
                                  use_image=self.use_image)

        callbacks = CompilationOfCallbacks(self.data_generator.word2index,
                                           self.data_generator.index2word,
                                           self.args,
                                           self.args.dataset,
                                           self.data_generator,
                                           use_sourcelang=self.use_sourcelang,
                                           use_image=self.use_image)

        train_generator = self.data_generator.random_generator('train')
        train_size = self.data_generator.split_sizes['train']
        val_generator = self.data_generator.fixed_generator('val')
        val_size = self.data_generator.split_sizes['val']

        losses = model.fit_generator(generator=train_generator,
                                     samples_per_epoch=train_size,
                                     nb_epoch=self.args.max_epochs,
                                     verbose=1,
                                     callbacks=[callbacks],
                                     nb_worker=1,
                                     validation_data=val_generator,
                                     nb_val_samples=val_size)

        return losses
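
    # A minimal usage sketch (illustrative, not part of the original code):
    # `args` stands for the parsed command-line namespace this class already
    # expects, with the same attribute names accessed above.
    #
    #     trainer = GroundedTranslation(args)
    #     losses = trainer.train_model()
    #     print(losses.history['loss'][-1], losses.history['val_loss'][-1])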

    def prepare_datagenerator(self):
        '''
        Initialise the data generator and its data structures, unless a
        valid data generator was already passed into
        GroundedTranslation.__init__().
        '''

        # Initialise the data generator if it has not yet been initialised
        if self.data_generator is None:
            self.data_generator = VisualWordDataGenerator(self.args,
                                                          self.args.dataset)

            # Extract the working vocabulary from the training dataset
            if self.args.existing_vocab != "":
                self.data_generator.set_vocabulary(self.args.existing_vocab)
            else:
                self.data_generator.extract_vocabulary()
        self.V = self.data_generator.get_vocab_size()


    def log_run_arguments(self):
        '''
        Save the command-line arguments, along with the method defaults,
        used to parameterise this run.
        '''
        logger.info("Run arguments:")
        for arg, value in self.args.__dict__.iteritems():
            logger.info("%s: %s" % (arg, str(value)))
class ExtractFinalHiddenStateActivations:
    def __init__(self, args):
        self.args = args
        self.args.generate_from_N_words = 0  # Default 0
        self.vocab = dict()
        self.unkdict = dict()
        self.counter = 0
        self.maxSeqLen = 0
        self.MAX_HT = self.args.generation_timesteps - 1

        # consistent with models.py
        # maybe use_sourcelang isn't applicable here?
        self.use_sourcelang = args.source_vectors is not None
        self.use_image = not args.no_image

        if self.args.debug:
            theano.config.optimizer = 'None'
            theano.config.exception_verbosity = 'high'

        self.source_type = "predicted" if self.args.use_predicted_tokens else "gold"
        self.source_encoder = "mt_enc" if self.args.no_image else "vis_enc"
        self.source_dim = self.args.hidden_size

        self.h5_dataset_str = "%s-hidden_feats-%s-%d" % (
            self.source_type, self.source_encoder, self.source_dim)
        logger.info("Serialising into %s" % self.h5_dataset_str)

    def get_hidden_activations(self):
        '''
        In the model, we will merge the VGG image representation with
        the word embeddings. We need to feed the data as a list, in which
        the order of the elements in the list is _crucial_.
        '''

        self.data_generator = VisualWordDataGenerator(self.args,
                                                      self.args.dataset)
        self.args.checkpoint = self.find_best_checkpoint()
        self.data_generator.set_vocabulary(self.args.checkpoint)
        self.vocab_len = len(self.data_generator.index2word)
        if self.args.use_predicted_tokens:
            t = self.args.generation_timesteps
        else:
            t = self.data_generator.max_seq_len

        m = models.NIC(self.args.embed_size,
                       self.args.hidden_size,
                       self.vocab_len,
                       self.args.dropin,
                       self.args.optimiser,
                       self.args.l2reg,
                       weights=self.args.checkpoint,
                       gru=self.args.gru,
                       t=t)

        self.fhs = m.buildHSNActivations(use_image=self.use_image)
        if self.args.use_predicted_tokens and not self.args.no_image:
            gen_m = models.NIC(self.args.embed_size,
                               self.args.hidden_size,
                               self.vocab_len,
                               self.args.dropin,
                               self.args.optimiser,
                               self.args.l2reg,
                               weights=self.args.checkpoint,
                               gru=self.args.gru,
                               t=self.args.generation_timesteps)
            self.full_model = gen_m.buildKerasModel(use_image=self.use_image)

        self.new_generate_activations('train')
        self.new_generate_activations('val')
        self.new_generate_activations('test')

    def new_generate_activations(self, split):
        '''
        Generate and serialise final-timestep hidden state activations
        into --dataset.
        TODO: we should be able to serialise predicted final states instead of
        gold-standard final states for val and test data.
        '''
        logger.info(
            "%s: extracting final hidden state activations from this model",
            split)

        # Prepare the data generator based on whether we're going to work with
        # the gold standard input tokens or the automatically predicted tokens
        if self.args.use_predicted_tokens:
            the_generator = self.data_generator.generation_generator(
                split=split)
        else:
            the_generator = self.data_generator.fixed_generator(split=split)

        counter = 0
        hidden_states = []
        batch_start = 0
        batch_end = 0
        for data in the_generator:
            if self.args.use_predicted_tokens:
                tokens = self.get_predicted_tokens(data)
                data[0]['text'] = self.set_text_arrays(tokens, data[0]['text'])

            # Extract the FHS from either the oracle or the predicted tokens
            hsn = self.fhs.predict(
                {
                    'text': data[0]['text'],
                    'img': data[0]['img']
                },
                batch_size=self.args.batch_size,
                verbose=1)

            for idx, h in enumerate(hsn):
                # get final_hidden index on a sentence-by-sentence
                # basis by searching for the first <E> in each trainY
                eos = False
                for widx, warr in enumerate(data[1]['output'][idx]):
                    w = np.argmax(warr)
                    if self.data_generator.index2word[w] == "<E>":
                        final_hidden = h[widx]
                        hidden_states.append(final_hidden)
                        eos = True
                        logger.debug(widx)
                        break
                if not eos:
                    final_hidden = h[self.MAX_HT]
                    hidden_states.append(final_hidden)
                batch_end += 1

            # Note: serialisation happens over training batches too.
            # now serialise the hidden representations in the h5
            self.to_h5_indices(split, data[0]['indices'], hidden_states)

            batch_start = batch_end
            counter += len(hidden_states)
            hidden_states = []
            logger.info("Processed %d instances" % counter)
            if batch_end >= self.data_generator.split_sizes[split]:
                break
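
        # Illustrative note (not in the original code): the per-sentence <E>
        # search above can also be vectorised, assuming eos_idx holds the
        # vocabulary index of "<E>":
        #
        #     word_ids = data[1]['output'][idx].argmax(axis=1)
        #     hits = np.where(word_ids == eos_idx)[0]
        #     final_hidden = h[hits[0]] if len(hits) else h[self.MAX_HT]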


    def get_predicted_tokens(self, data):
        """
        We're not going to work with the gold standard input tokens.
        Instead we're going to automatically predict them and then extract
        the final hidden state from the inferred data.

        Helper function used by new_generate_activations().
        """
        # We are going to arg max decode a sequence.
        start_gen = self.args.generate_from_N_words + 1  # include BOS

        text = deepcopy(data['text'])
        # Append the first start_gen words to the complete_sentences list
        # for each instance in the batch.
        complete_sentences = [[] for _ in range(text.shape[0])]
        for t in range(start_gen):  # minimum 1
            for i in range(text.shape[0]):
                w = np.argmax(text[i, t])
                complete_sentences[i].append(self.data_generator.index2word[w])
        del data['text']
        text = self.reset_text_arrays(text, start_gen)
        Y_target = data['output']
        data['text'] = text

        for t in range(start_gen, self.args.generation_timesteps):
            logger.debug("Input token: %s" %
                         self.data_generator.index2word[np.argmax(
                             data['text'][0, t - 1])])
            preds = self.full_model.predict(data, verbose=0)

            # Look at the last indices for the words.
            next_word_indices = np.argmax(preds['output'][:, t - 1], axis=1)
            logger.debug("Predicted token: %s" %
                         self.data_generator.index2word[next_word_indices[0]])
            # update array[0]/sentence-so-far with generated words.
            for i in range(len(next_word_indices)):
                data['text'][i, t, next_word_indices[i]] = 1.
            next_words = [
                self.data_generator.index2word[x] for x in next_word_indices
            ]
            for i in range(len(next_words)):
                complete_sentences[i].append(next_words[i])

        # extract each sentence until it hits the first end-of-string token
        pruned_sentences = []
        for s in complete_sentences:
            pruned_sentences.append(
                [x for x in itertools.takewhile(lambda n: n != "<E>", s)])
        return pruned_sentences
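
    # For example (illustrative): the takewhile() pruning above turns
    # ["<S>", "a", "dog", "<E>", "<P>"] into ["<S>", "a", "dog"].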

    def set_text_arrays(self, predicted_tokens, text_arrays):
        """ Set the values of the text tokens in the text arrays
        based on the tokens predicted by the model.

        Helper function used by new_generate_activations() """
        new_arrays = deepcopy(text_arrays)
        for pidx, (toks, struct) in enumerate(zip(predicted_tokens,
                                                  text_arrays)):
            for tidx, t in enumerate(toks):
                struct[tidx, self.data_generator.word2index[t]] = 1
            new_arrays[pidx] = struct
        return new_arrays

    def reset_text_arrays(self, text_arrays, fixed_words=1):
        """ Reset the values in the text data structure to zero so we cannot
        accidentally pass them into the model.

        Helper function for generate_sentences().
         """
        reset_arrays = deepcopy(text_arrays)
        reset_arrays[:, fixed_words:, :] = 0
        return reset_arrays
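
    # Worked sketch (illustrative, not in the original; V is a hypothetical
    # vocabulary size): with fixed_words=1 only the BOS timestep survives,
    # so generation cannot peek at gold tokens.
    #
    #     text = np.zeros((2, 5, V)); text[:, :, 3] = 1.
    #     reset = self.reset_text_arrays(text, fixed_words=1)
    #     assert reset[:, 1:, :].sum() == 0 and reset[:, 0, 3].all()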


    def to_h5_indices(self, split, indices, hidden_states):
        hsn_shape = len(hidden_states[0])
        logger.info("Serialising final hidden state features from %s to H5",
                    split)
        for idx, data_key in enumerate(indices):
            ident = data_key[0]
            desc_idx = data_key[1]
            self.data_generator.set_source_features(split, ident,
                                                    self.h5_dataset_str,
                                                    hidden_states[idx],
                                                    hsn_shape, desc_idx)



    def find_best_checkpoint(self):
        '''
        Read the summary file from the directory and scrape out the run ID of
        the highest BLEU-scoring checkpoint. Then do an ls-style listing of
        the directory and return the exact path to the best model.

        Assumes only one matching prefix in the model checkpoints directory.
        '''

        summary_data = open("%s/summary" %
                            self.args.model_checkpoints).readlines()
        summary_data = [x.replace("\n", "") for x in summary_data]
        best_id = None
        target = "Best loss" if self.args.best_pplx else "Best Metric"
        for line in summary_data:
            if line.startswith(target):
                best_id = "%03d" % (int(line.split(":")[1].split("|")[0]))

        checkpoint = None
        if best_id is not None:
            checkpoints = os.listdir(self.args.model_checkpoints)
            for c in checkpoints:
                if c.startswith(best_id):
                    checkpoint = c
                    break
        return "%s/%s" % (self.args.model_checkpoints, checkpoint)
Example #18
    def train_model(self):
        '''
        In the model, we will merge
        the word embeddings with
        the VGG image representation (if used)
        and the source-language multimodal vectors (if used).
        We need to feed the data as a list, in which the order of the elements
        in the list is _crucial_.
        '''

        self.log_run_arguments()

        self.data_generator = VisualWordDataGenerator(
            self.args, self.args.dataset)
        self.data_generator.extract_vocabulary()

        self.V = self.data_generator.get_vocab_size()

        # Keras doesn't do batching of val set, so
        # assume val data is small enough to get all at once.
        # val_input is the list passed to model.fit()
        # val_input can contain image, source features as well (or not)
        if not self.args.enable_val_pplx:
            val_input, valY = self.data_generator.get_data_by_split('val',
                                  self.use_sourcelang, self.use_image)

        if not self.use_sourcelang:
            hsn_size = 0
        else:
            hsn_size = self.data_generator.hsn_size  # ick

        m = models.OneLayerLSTM(self.args.hidden_size, self.V,
                                self.args.dropin,
                                self.args.optimiser, self.args.l2reg,
                                hsn_size=hsn_size,
                                weights=self.args.init_from_checkpoint,
                                gru=self.args.gru)

        model = m.buildKerasModel(use_sourcelang=self.use_sourcelang,
                                  use_image=self.use_image)

        callbacks = CompilationOfCallbacks(self.data_generator.word2index,
                                           self.data_generator.index2word,
                                           self.args,
                                           self.args.dataset,
                                           self.data_generator,
                                           use_sourcelang=self.use_sourcelang,
                                           use_image=self.use_image)

        big_batch_size = self.args.big_batch_size
        if big_batch_size > 0:
            if self.args.small:
                batches = ceil(float(SMALL_NUM_DESCRIPTIONS) /
                               self.args.big_batch_size)
            else:
                batches = ceil(float(self.data_generator.split_sizes['train']) /
                               self.args.big_batch_size)
            batches = int(batches)
        else:  # if big_batch_size == 0, reset to training set size.
            big_batch_size = self.data_generator.split_sizes['train']
            batches = 1
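
        # Worked example (illustrative): 30,000 training instances with
        # --big_batch_size 10000 gives batches = ceil(30000 / 10000) = 3.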

        # for epoch in range(self.args.epochs):
        epoch = 0
        while True:
            # the program will exit with sys.exit(0) in
            # Callbacks.early_stop_decision(). Do not put any clean-up
            # after this loop. It will NEVER be executed!
            batch = 1
            for train_input, trainY, indicator in\
                self.data_generator.yield_training_batch(big_batch_size,
                                                         self.use_sourcelang,
                                                         self.use_image):

                if self.args.predefined_epochs:
                    logger.info("Epoch %d/%d, big-batch %d/%d", epoch+1,
                                self.args.max_epochs, batch, batches)
                else:
                    logger.info("Epoch %d, big-batch %d/%d", epoch+1,
                                batch, batches)

                if indicator:
                    # let's test on the val after training on these batches
                    model.fit(train_input,
                              trainY,
                              validation_data=None if
                                  self.args.enable_val_pplx
                                  else (val_input, valY),
                              callbacks=[callbacks],
                              nb_epoch=1,
                              verbose=1,
                              batch_size=self.args.batch_size,
                              shuffle=True)
                else:
                    model.fit(train_input,
                              trainY,
                              nb_epoch=1,
                              verbose=1,
                              batch_size=self.args.batch_size,
                              shuffle=True)
                batch += 1
            epoch += 1
            if self.args.predefined_epochs and epoch >= self.args.max_epochs:
                # stop training because we've exceeded self.args.max_epochs
                break
Example #19
class GroundedTranslationGenerator:

    def __init__(self, args):
        self.args = args
        self.vocab = dict()
        self.unkdict = dict()
        self.counter = 0
        self.maxSeqLen = 0

        # consistent with models.py
        self.use_sourcelang = args.source_vectors is not None
        self.use_image = not args.no_image
        self.model = None
        self.prepare_datagenerator()

        # this results in two file handles on the dataset (here and in
        # data_generator)
        if not self.args.dataset:
            logger.warn("No dataset given, using flickr8k")
            self.dataset = h5py.File("flickr8k/dataset.h5", "r")
        else:
            self.dataset = h5py.File("%s/dataset.h5" % self.args.dataset, "r")

        if self.args.debug:
            theano.config.optimizer = 'None'
            theano.config.exception_verbosity = 'high'

    def prepare_datagenerator(self):
        self.data_gen = VisualWordDataGenerator(self.args,
                                                self.args.dataset)
        self.args.checkpoint = self.find_best_checkpoint()
        self.data_gen.set_vocabulary(self.args.checkpoint)
        self.vocab_len = len(self.data_gen.index2word)
        self.index2word = self.data_gen.index2word
        self.word2index = self.data_gen.word2index

    def generate(self):
        '''
        Entry point for this module.
        Loads up a data generator to get the relevant image / source features.
        Builds the relevant model, given the command-line arguments.
        Generates sentences for the images in the val / test data.
        Calculates BLEU and PPLX, unless disabled on the command line.
        '''

        if self.use_sourcelang:
            # HACK FIXME unexpected problem with input_data
            self.hsn_size = self.data_gen.hsn_size
        else:
            self.hsn_size = 0

        if self.model is None:
            self.build_model(generate=True)

        self.generate_sentences(self.args.checkpoint, val=not self.args.test)
        if not self.args.without_scores:
            score = self.bleu_score(self.args.checkpoint, val=not self.args.test)
            if self.args.multeval:
                score, _, _ = self.multeval_scores(self.args.checkpoint,
                                                    val=not self.args.test)
            if not self.args.no_pplx:
                self.build_model(generate=False)
                self.calculate_pplx(self.args.checkpoint, val=not self.args.test)
            return score

    def generate_sentences(self, filepath, val=True):
        """
        Generates descriptions of images for --generation_timesteps
        iterations through the LSTM. Each input description is clipped to
        the first <BOS> token, or, if --generate_from_N_words is set, to the
        first N following words (N + 1 BOS token).
        This process can be additionally conditioned
        on source language hidden representations, if provided by the
        --source_vectors parameter.
        The output is clipped to the first EOS generated, if it exists.

        TODO: duplicated method with generate.py
        """

        if self.args.beam_width > 1:
            prefix = "val" if val else "test"
            handle = codecs.open("%s/%sGenerated" % (filepath, prefix), "w",
                                 'utf-8')
            logger.info("Generating %s descriptions", prefix)

            start_gen = self.args.generate_from_N_words  # Default 0
            start_gen = start_gen + 1  # include BOS

            generator = self.data_gen.generation_generator(prefix, batch_size=1)
            seen = 0
            # we are going to beam search for the most probable sentence.
            # let's do this one sentence at a time to make the logging output
            # easier to understand
            for data in generator:
                text = data[0]['text']
                # Append the first start_gen words to the complete_sentences list
                # for each instance in the batch.
                complete_sentences = [[] for _ in range(text.shape[0])]
                for t in range(start_gen):  # minimum 1
                    for i in range(text.shape[0]):
                        w = np.argmax(text[i, t])
                        complete_sentences[i].append(self.index2word[w])
                del data[0]['text']
                text = self.reset_text_arrays(text, start_gen)
                Y_target = data[1]['output']
                data[0]['text'] = text

                max_beam_width = self.args.beam_width
                structs = self.make_duplicate_matrices(data[0], max_beam_width)

                # A beam is a 2-tuple with the probability of the sequence and
                # the words in that sequence. Start with empty beams
                beams = [(0.0, [])]
                # collects beams that are in the top candidates and
                # emitted an <E> token.
                finished = []
                for t in range(start_gen, self.args.generation_timesteps):
                    # Store the candidates produced at timestep t, will be
                    # pruned at the end of the timestep
                    candidates = []

                    # Predict the next-word distribution for every beam in a
                    # single batched call.
                    preds = self.model.predict(structs, verbose=0)

                    # preds[:, t-1] is the next-word distribution for each
                    # beam; sort each row in descending probability order
                    next_word_distribution = preds[:, t-1]
                    sorted_indices = np.argsort(-next_word_distribution, axis=1)

                    # Each instance in structs is holding the history of a
                    # beam, and so there is a direct connection between the
                    # index of a beam in beams and the index of an instance in
                    # structs.
                    for beam_idx, b in enumerate(beams):
                        # get the sorted predictions for the beam_idx'th beam
                        beam_predictions = sorted_indices[beam_idx]
                        for top_idx in range(self.args.beam_width):
                            wordIndex = beam_predictions[top_idx]
                            wordProb = next_word_distribution[beam_idx][beam_predictions[top_idx]]
                            # For the beam_idxth beam, add the log probability
                            # of the top_idxth predicted word to the previous
                            # log probability of the sequence, and append the
                            # top_idxth predicted word to the sequence of words
                            candidates.append([b[0] + math.log(wordProb), b[1] + [wordIndex]])

                    candidates.sort(reverse=True)
                    if self.args.verbose:
                        logger.info("Candidates in the beam")
                        logger.info("---")
                        for c in candidates:
                            logger.info(" ".join([self.index2word[x] for x in c[1]]) + " (%f)" % c[0])

                    beams = candidates[:max_beam_width] # prune the beams
                    pruned = []
                    for b in beams:
                        # If a top candidate emitted an EOS token then 
                        # a) add it to the list of finished sequences
                        # b) remove it from the beams and decrease the 
                        # maximum size of the beams.
                        if b[1][-1] == self.word2index["<E>"]:
                            finished.append(b)
                            if max_beam_width >= 1:
                                max_beam_width -= 1
                        else:
                            pruned.append(b)
                    
                    beams = pruned[:max_beam_width]

                    if self.args.verbose:
                        logger.info("Pruned beams")
                        logger.info("---")
                        for b in beams:
                            logger.info(" ".join([self.index2word[x] for x in b[1]]) + "(%f)" % b[0])

                    if max_beam_width == 0:
                        # We have sampled max_beam_width sequences with an <E>
                        # token so stop the beam search.
                        break

                    # Reproduce the structs for the beam search so we can keep
                    # track of the state of each beam
                    structs = self.make_duplicate_matrices(data[0], max_beam_width)

                    # Rewrite the 1-hot word features with the
                    # so-far-predicted tokens in a beam.
                    for bidx, b in enumerate(beams):
                        for idx, w in enumerate(b[1]):
                            structs['text'][bidx, idx+1, w] = 1.

                # If none of the sentences emitted an <E> token while
                # decoding, add the final beams into the final candidates
                if len(finished) == 0:
                    for leftover in beams:
                        finished.append(leftover)

                # Normalise the probabilities by the length of the sequences
                # as suggested by Graves (2012) http://arxiv.org/abs/1211.3711
                for f in finished:
                    f[0] = f[0] / len(f[1])
                finished.sort(reverse=True)

                if self.args.verbose:
                    logger.info("Length-normalised samples")
                    logger.info("---")
                    for f in finished:
                        logger.info(" ".join([self.index2word[x] for x in f[1]]) + "(%f)" % f[0])

                # Emit the highest (log) probability sequence; the beam
                # search decodes one sentence at a time, so index 0 is the
                # only instance in the batch.
                best_beam = finished[0]
                complete_sentences[0] = [self.index2word[x] for x in best_beam[1]]
                handle.write(' '.join([x for x
                                       in itertools.takewhile(
                                           lambda n: n != "<E>", complete_sentences[0])]) + "\n")
                if self.args.verbose:
                    logger.info("%s (%f)", ' '.join([x for x
                                          in itertools.takewhile(
                                              lambda n: n != "<E>",
                                              complete_sentences[0])]),
                                          best_beam[0])

                seen += text.shape[0]
                if seen == self.data_gen.split_sizes[prefix]:
                    # Hacky way to break out of the generator
                    break
            handle.close()
        else:
            # We are going to arg max decode a sequence.
            prefix = "val" if val else "test"
            logger.info("Generating %s descriptions", prefix)
            start_gen = self.args.generate_from_N_words + 1  # include BOS
            handle = codecs.open("%s/%sGenerated" % (filepath, prefix), 
                                 "w", 'utf-8')

            generator = self.data_gen.generation_generator(prefix)
            seen = 0
            for data in generator:
                text = deepcopy(data[0]['text'])
                # Append the first start_gen words to the complete_sentences list
                # for each instance in the batch.
                complete_sentences = [[] for _ in range(text.shape[0])]
                for t in range(start_gen):  # minimum 1
                    for i in range(text.shape[0]):
                        w = np.argmax(text[i, t])
                        complete_sentences[i].append(self.index2word[w])
                del data[0]['text']
                text = self.reset_text_arrays(text, start_gen)
                Y_target = data[1]['output']
                data[0]['text'] = text

                for t in range(start_gen, self.args.generation_timesteps):
                    logger.debug("Input token: %s" % self.index2word[np.argmax(text[0,t-1])])
                    preds = self.model.predict(data[0],
                                               verbose=0)

                    # Look at the last indices for the words.
                    next_word_indices = np.argmax(preds[:, t-1], axis=1)
                    logger.debug("Predicted token: %s" % self.index2word[next_word_indices[0]])
                    # update array[0]/sentence-so-far with generated words.
                    for i in range(len(next_word_indices)):
                        data[0]['text'][i, t, next_word_indices[i]] = 1.
                    next_words = [self.index2word[x] for x in next_word_indices]
                    for i in range(len(next_words)):
                        complete_sentences[i].append(next_words[i])

                sys.stdout.flush()
                # print/extract each sentence until it hits the first end-of-string token
                for s in complete_sentences:
                    if self.args.verbose:
                        logger.info("%s",' '.join([x for x
                                              in itertools.takewhile(
                                                  lambda n: n != "<E>",
                                                  complete_sentences[i])]))
                    decoded_str = ' '.join([x for x
                                            in itertools.takewhile(
                                                lambda n: n != "<E>", s[1:])])
                    handle.write(decoded_str + "\n")

                seen += text.shape[0]
                if seen == self.data_gen.split_sizes[prefix]:
                    # Hacky way to break out of the generator
                    break
            handle.close()
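
    def _beam_search_sketch(self, log_probs_fn, start, beam_width, max_len):
        '''
        Illustrative only (not part of the original code): a minimal
        length-normalised beam search over `log_probs_fn`, a hypothetical
        callable that maps a partial sequence to (token_index, log_prob)
        pairs. It mirrors the expand/prune/finish loop used above.
        '''
        import heapq  # local import keeps the sketch self-contained
        beams = [(0.0, [start])]
        finished = []
        for _ in range(max_len):
            candidates = []
            for score, seq in beams:
                for tok, lp in log_probs_fn(seq):
                    candidates.append((score + lp, seq + [tok]))
            # keep the beam_width highest-scoring candidates
            beams = heapq.nlargest(beam_width, candidates)
            still_open = []
            for score, seq in beams:
                if seq[-1] == self.word2index["<E>"]:
                    # a finished hypothesis shrinks the live beam, as above
                    finished.append((score, seq))
                    beam_width -= 1
                else:
                    still_open.append((score, seq))
            beams = still_open[:beam_width]
            if beam_width == 0:
                break
        finished.extend(beams)
        # normalise by length (Graves, 2012), as in generate_sentences()
        finished = [(score / len(seq), seq) for score, seq in finished]
        return max(finished)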

    def calculate_pplx(self, path, val=True):
        """ Splits the input data into batches of self.args.batch_size to
        reduce the memory footprint of holding all of the data in RAM. """

        prefix = "val" if val else "test"
        logger.info("Calculating pplx over %s data", prefix)
        sum_logprobs = 0
        y_len = 0

        generator = self.data_gen.generation_generator(prefix)
        seen = 0
        for data in generator:
            Y_target = deepcopy(data[1]['output'])
            del data[1]['output']

            preds = self.model.predict(data[0],
                                       verbose=0,
                                       batch_size=self.args.batch_size)

            for i in range(Y_target.shape[0]):
                for t in range(Y_target.shape[1]):
                    target_idx = np.argmax(Y_target[i, t])
                    target_tok = self.index2word[target_idx]
                    if target_tok != "<P>":
                        log_p = math.log(preds[i, t, target_idx], 2)
                        sum_logprobs += -log_p
                        y_len += 1

            seen += data[0]['text'].shape[0]
            if seen == self.data_gen.split_sizes[prefix]:
                # Hacky way to break out of the generator
                break

        norm_logprob = sum_logprobs / y_len
        pplx = math.pow(2, norm_logprob)
        logger.info("PPLX: %.4f", pplx)
        handle = open("%s/%sPPLX" % (path, prefix), "w")
        handle.write("%f\n" % pplx)
        handle.close()
        return pplx
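
    # Worked example (illustrative): if the model assigned probability 0.25
    # to every non-<P> target token, norm_logprob would be -log2(0.25) = 2
    # and the perplexity 2 ** 2 = 4.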


    def reset_text_arrays(self, text_arrays, fixed_words=1):
        """ Reset the values in the text data structure to zero so we cannot
        accidentally pass them into the model.

        Helper function for generate_sentences().
         """
        reset_arrays = deepcopy(text_arrays)
        reset_arrays[:, fixed_words:, :] = 0
        return reset_arrays

    def make_duplicate_matrices(self, generator_data, k):
        '''
        Prepare K duplicates of the input data for a given instance yielded by
        the data generator.

        Helper function for the beam search decoder in generation_sentences().
        '''

        if self.use_sourcelang and self.use_image:
            # the data generator yielded a dictionary with the words, the
            # image features, and the source features
            dupes = [[],[],[]]
            words = generator_data['text']
            img = generator_data['img']
            source = generator_data['src']
            for x in range(k):
                # Make a deep copy of the word_feats structures 
                # so the arrays will never be shared
                dupes[0].append(deepcopy(words[0,:,:]))
                dupes[1].append(source[0,:,:])
                dupes[2].append(img[0,:,:])

            # Turn the list of arrays into a numpy array
            dupes[0] = np.array(dupes[0])
            dupes[1] = np.array(dupes[1])
            dupes[2] = np.array(dupes[2])

            return {'text': dupes[0], 'img': dupes[2], 'src': dupes[1]}

        elif self.use_image:
            # the data generator yielded a dictionary with the words and the
            # image features
            dupes = [[],[]]
            words = generator_data['text']
            img = generator_data['img']
            for x in range(k):
                # Make a deep copy of the word_feats structures 
                # so the arrays will never be shared
                dupes[0].append(deepcopy(words[0,:,:]))
                dupes[1].append(img[0,:,:])

            # Turn the list of arrays into a numpy array
            dupes[0] = np.array(dupes[0])
            dupes[1] = np.array(dupes[1])

            return {'text': dupes[0], 'img': dupes[1]}

        elif self.use_sourcelang:
            # the data generator yielded a dictionary with the words and the
            # source features
            dupes = [[],[]]
            words = generator_data['text']
            source = generator_data['src']
            for x in range(k):
                # Make a deep copy of the word_feats structures 
                # so the arrays will never be shared
                dupes[0].append(deepcopy(words[0,:,:]))
                dupes[1].append(source[0,:,:])

            # Turn the list of arrays into a numpy array
            dupes[0] = np.array(dupes[0])
            dupes[1] = np.array(dupes[1])

            return {'text': dupes[0], 'src': dupes[1]}
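
    # Illustrative note (not in the original code): for a single instance
    # the same duplication can be written with numpy alone, e.g.
    #
    #     dupes_text = np.repeat(words[0:1, :, :], k, axis=0)
    #
    # np.repeat returns a fresh array, so, like the deepcopy above, the
    # beams never share the underlying 'text' memory.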

    def find_best_checkpoint(self):
        '''
        Read the summary file from the directory and scrape out the run ID of
        the highest BLEU-scoring checkpoint. Then do an ls-style listing of
        the directory and return the exact path to the best model.

        Assumes only one matching prefix in the model checkpoints directory.
        '''

        summary_data = open("%s/summary" % self.args.model_checkpoints).readlines()
        summary_data = [x.replace("\n", "") for x in summary_data]
        best_id = None
        target = "Best loss" if self.args.best_pplx else "Best Metric"
        for line in summary_data:
            if line.startswith(target):
                best_id = "%03d" % (int(line.split(":")[1].split("|")[0]))

        checkpoint = None
        if best_id is not None:
            checkpoints = os.listdir(self.args.model_checkpoints)
            for c in checkpoints:
                if c.startswith(best_id):
                    checkpoint = c
                    break
        logger.info("Best checkpoint: %s/%s" % (self.args.model_checkpoints, checkpoint))
        return "%s/%s" % (self.args.model_checkpoints, checkpoint)

    def bleu_score(self, directory, val=True):
        '''
        PPLX is only weakly correlated with improvements in BLEU,
        and thus improvements in human judgements. Let's also track
        BLEU score of a subset of generated sentences in the val split
        to decide on early stopping, etc.
        '''

        prefix = "val" if val else "test"
        self.extract_references(directory, val)

        subprocess.check_call(
            ['perl multi-bleu.perl %s/%s_reference.ref < %s/%sGenerated | tee %s/%sBLEU'
             % (directory, prefix, directory, prefix, directory, prefix)], shell=True)
        bleudata = open("%s/%sBLEU" % (directory, prefix)).readline()
        data = bleudata.split(",")[0]
        bleuscore = data.split("=")[1]
        bleu = float(bleuscore.lstrip())
        return bleu
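
    # Illustrative note (an assumption about the script output):
    # multi-bleu.perl prints a first line such as
    #
    #     BLEU = 24.51, 61.2/31.4/17.8/10.3 (BP=1.000, ratio=1.001, ...)
    #
    # so splitting on "," and then on "=" above recovers 24.51 as a float.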

    def multeval_scores(self, directory, val=True):
        '''
        Maybe you want to evaluate with Meteor, TER, and BLEU?
        '''
        prefix = "val" if val else "test"
        self.extract_references(directory, val)

        with cd(MULTEVAL_DIR):
            subprocess.check_call(
                ['./multeval.sh eval --refs ../%s/%s_reference.* \
                 --hyps-baseline ../%s/%sGenerated \
                 --meteor.language %s \
                 --threads 4 \
                 2> multevaloutput 1> multevaloutput'
                 % (directory, prefix, directory, prefix, self.args.meteor_lang)],
                shell=True)
            handle = open("multevaloutput")
            multdata = handle.readlines()
            handle.close()

            def parse_avg(line):
                # Pull the AVG column out of a multeval RESULT line and
                # truncate the score to two decimal places.
                value = line.split(":")[4].replace("\n", "").strip()
                integer, fraction = value.split(".")
                return float(integer + "." + fraction[0:2])

            for line in multdata:
                if line.startswith("RESULT: baseline: BLEU: AVG:"):
                    mbleu = parse_avg(line)
                if line.startswith("RESULT: baseline: METEOR: AVG:"):
                    mmeteor = parse_avg(line)
                if line.startswith("RESULT: baseline: TER: AVG:"):
                    mter = parse_avg(line)

            logger.info("Meteor = %.2f | BLEU = %.2f | TER = %.2f",
                        mmeteor, mbleu, mter)

            return mmeteor, mbleu, mter

    def extract_references(self, directory, val=True):
        """
        Get reference descriptions for split we are generating outputs for.

        Helper function for bleu_score().
        """
        prefix = "val" if val else "test"
        references = self.data_gen.get_refs_by_split_as_list(prefix)

        for refid in xrange(len(references[0])):
            handle = codecs.open('%s/%s_reference.ref%d'
                                 % (directory, prefix, refid), 'w', 'utf-8')
            handle.write('\n'.join([x[refid] for x in references]))
            handle.close()

    def build_model(self, generate=False):
        '''
        Build a Keras model if one does not yet exist.

        Helper function for generate().
        '''

        if generate:
            t = self.args.generation_timesteps
        else:
            t = self.data_gen.max_seq_len
        if self.args.mrnn:
            m = models.MRNN(self.args.embed_size, self.args.hidden_size,
                            self.vocab_len,
                            self.args.dropin,
                            self.args.optimiser, self.args.l2reg,
                            hsn_size=self.hsn_size,
                            weights=self.args.checkpoint,
                            gru=self.args.gru,
                            clipnorm=self.args.clipnorm,
                            t=t)
        else:
            m = models.NIC(self.args.embed_size, self.args.hidden_size,
                           self.vocab_len,
                           self.args.dropin,
                           self.args.optimiser, self.args.l2reg,
                           hsn_size=self.hsn_size,
                           weights=self.args.checkpoint,
                           gru=self.args.gru,
                           clipnorm=self.args.clipnorm,
                           t=t)

        self.model = m.buildKerasModel(use_sourcelang=self.use_sourcelang,
                                       use_image=self.use_image)