Example #1
def read_data_test(source_path, _beam_buckets):

    order = []
    data_set = [[] for _ in _beam_buckets]
    with tf.gfile.GFile(source_path, mode="r") as source_file:
        source = source_file.readline()
        counter = 0
        while source:
            if counter % 100000 == 0:
                print("  reading data line %d" % counter)
                sys.stdout.flush()
            source_ids = [int(x) for x in source.split()][::-1]  # feed the source to the encoder reversed
            success = False
            for bucket_id, source_size in enumerate(_beam_buckets):
                if len(source_ids) <= source_size:

                    order.append(
                        (bucket_id, len(data_set[bucket_id]), counter))
                    data_set[bucket_id].append(source_ids)
                    success = True
                    break

            if not success:
                mylog("failed source length {}".format(len(source_ids)))
            source = source_file.readline()
            counter += 1
    return data_set, order, counter
Example #2
    def init_parameters_from_scratch(self, sess):
        mylog("Created model with fresh parameters.")
        sess.run(self.var_init_op)
        sess.run(self.broadcast_ops)

        # verify that each model has the same parameters
        self.check_output_bias(sess)
Example #3
    def beam_with_buckets(self, sources, sources_raw, inputs, source_buckets, encoder_cell, decoder_cell, dtype, devices = None, attention = False):

        self.topk_values = []
        self.eos_values = []
        self.topk_indexes = []

        with variable_scope.variable_scope(variable_scope.get_variable_scope(), reuse=None):
            # seq2seq
            if not self.with_attention:
                _ht, _ = self.beam_basic_seq2seq(encoder_cell, decoder_cell, sources, inputs, dtype, devices)
            else:
                _ht, _ = self.beam_attention_seq2seq(encoder_cell, decoder_cell, sources, sources_raw, inputs, dtype, devices)

            # flat _ht
            _ht = tf.reshape(_ht, [-1, self.size]) # [batch_size, size]
            if self.normalize_ht_radius != 0.0:
                mylog("Normalize_ht: radius: {}".format(self.normalize_ht_radius))
                _ht = tf.nn.l2_normalize(_ht, 1) * self.normalize_ht_radius

            # logits
            _softmax = tf.nn.softmax(tf.add(tf.matmul(_ht, self.output_embedding, transpose_b = True), self.output_bias))

            if self.with_fsa:
                _softmax = self.mask_target(_softmax, self.fsa_target_mask, self.mask_values)
            
            # topk
            value, index = tf.nn.top_k(_softmax, self.topk_n, sorted = True)
            eos_v = tf.slice(_softmax, [0,self.EOS_ID],[-1,1])

            self.topk_value = value
            self.topk_index = index
            self.eos_value = eos_v
Example #4
    def load_parameters(self, sess, path):
        mylog("Reading model parameters from %s" % path)
        self.saver.restore(sess, path)
        sess.run(self.broadcast_ops)

        # verify that each model has the same parameters
        self.check_output_bias(sess)
Example #5
    def model_with_buckets(self, sources, sources_raw, inputs, targets, weights,
                           buckets, encoder_cell, decoder_cell, dtype, softmax_loss_function,
                           per_example_loss=False, name=None, devices = None, attention = False, rare_weights = None):
                                                                              
        seq2seq_f = None

        if attention:
            seq2seq_f = self.attention_seq2seq
        else:
            seq2seq_f = self.basic_seq2seq

        with variable_scope.variable_scope(variable_scope.get_variable_scope()):

            _hts, decoder_state = seq2seq_f(encoder_cell, decoder_cell, sources, sources_raw, inputs, dtype, devices)
            
            # flat _hts targets weights
            _hts = tf.reshape(_hts, [-1, self.size]) #[batch_size * time_steps , size]
            # normalize the ht;
            if self.normalize_ht_radius != 0.0:
                mylog("Normalize_ht: radius: {}".format(self.normalize_ht_radius))
                _hts = tf.nn.l2_normalize(_hts, 1) * self.normalize_ht_radius
            
            targets = tf.reshape(targets, [-1])
            weights = tf.reshape(weights, [-1])
            
            # logits / loss / topk_values + topk_indexes
            with tf.device(devices[-1]):
                if self.with_sampled_softmax:
                    logits = _hts  # sampled softmax consumes the hidden states directly
                else:
                    logits = tf.add(tf.matmul(_hts, self.output_embedding, transpose_b = True), self.output_bias)
                  
                crossent = softmax_loss_function(logits, targets)
                cost = math_ops.reduce_sum(crossent * weights)
                cost = cost / math_ops.cast(self.global_batch_size, cost.dtype)
                if self.rare_weight:
                    rare_weights = tf.reshape(rare_weights, [-1])
                    rare_cost = math_ops.reduce_sum(crossent * rare_weights)
                    rare_cost = rare_cost / math_ops.cast(self.global_batch_size, cost.dtype)

                if self.mrt:
                    crossent_batch_length = tf.reshape(crossent * weights, [self.batch_size,-1])
                    # alpha_log_p = a * log(p(sentence)): shape:[batch_size]
                    alpha_log_p = - math_ops.reduce_sum(crossent_batch_length, axis = 1) * self.mrt_alpha
                    alpha_log_p = tf.reshape(alpha_log_p,[self.num_sentences_per_batch_in_mrt,-1])
                    q = tf.nn.softmax(alpha_log_p)
                    negative_bleu_scores = - tf.reshape(self.bleu_scores, [self.num_sentences_per_batch_in_mrt,-1])
                    mrt_loss = math_ops.reduce_sum(q * negative_bleu_scores) / math_ops.cast(self.num_sentences_per_batch_in_mrt, q.dtype)
                    #mrt_loss = tf.Print(mrt_loss,[crossent_batch_length, weights, alpha_log_p, q, negative_bleu_scores, mrt_loss], summarize = 1000)
                    #mrt_loss = tf.Print(mrt_loss, [q,negative_bleu_scores], summarize = 100)
                    self.mrt_loss = mrt_loss

        self.logits = logits
        self.losses_by_words = crossent # 1 dimension: [#batch_size * #words]
        if self.rare_weight:
            self.losses = rare_cost
            self.normal_losses = cost
        else:
            self.losses = cost
        self.hts = _hts
Example #6
    def print_current_beam(self, j, bc, finished=False):
        if self.with_fsa:
            s = "Beam:{} Father:{} word:{} state:{} score:{}".format(
                j, bc.beam_index, bc.word_index, bc.fsa_state, bc.score)
        else:
            s = "Beam:{} Father:{} word:{} score:{}".format(
                j, bc.beam_index, bc.word_index, bc.score)
        if finished:
            s = "*" + s
        mylog(s)
Example #7
    def report_statics(self):
        mylog_section("FSA")
        mylog_subsection("FSA Info")
        mylog("Number of States: {}".format(len(self.states)))
        mylog("Number of Links: {}".format(self.num_links))
        mylog("Start state: {}".format(self.start_state.name))
        mylog("End state: {}".format(self.end_state.name))
Example #8
def read_data_test_parallel(source_path, target_path, _buckets):

    order = []
    data_set = [[] for _ in _buckets]
    with tf.gfile.GFile(source_path, mode="r") as source_file:
        with tf.gfile.GFile(target_path, mode="r") as target_file:

            source = source_file.readline()
            target = target_file.readline()
            counter = 0
            while source:
                if counter % 100000 == 0:
                    print("  reading data line %d" % counter)
                    sys.stdout.flush()

                if source.strip() == '':
                    source_ids = []
                else:
                    source_ids = np.fromstring(source, dtype=int,
                                               sep=' ').tolist()[::-1]
                if target.strip() == '':
                    target_ids = []
                else:
                    target_ids = np.fromstring(target, dtype=int,
                                               sep=' ').tolist()

                target_ids.append(data_utils.EOS_ID)

                success = False
                for bucket_id, (source_size,
                                target_size) in enumerate(_buckets):
                    if len(source_ids) <= source_size and len(
                            target_ids) <= target_size:
                        order.append(
                            (bucket_id, len(data_set[bucket_id]), counter))
                        data_set[bucket_id].append([source_ids, target_ids])
                        success = True
                        break

                if not success:
                    mylog("failed source length {}".format(len(source_ids)))

                source, target = source_file.readline(), target_file.readline()
                counter += 1

    return data_set, order, counter
Example #9
    def decode(self):
        for i in xrange(self.max_target_length):
            # rnn_step
            if self.check_attention:
                top_value, top_index, eos_value, attention_score = self.rnn_step(
                    i)
            else:
                top_value, top_index, eos_value = self.rnn_step(i)
                attention_score = None

            # top_beam_cells = [BeamCell]
            top_beam_cells = self.get_top_beam_cells(i, top_value, top_index,
                                                     eos_value)
            # grow sentence
            self.grow_sentence(i,
                               top_beam_cells,
                               attention_score=attention_score)

            if self.valid_beam_size_last_step <= 0:
                break

        # add the length penalty
        for i in xrange(len(self.results)):
            self.results[i].get_normalized_score(self.length_alpha,
                                                 self.coverage_beta)

        # return the top one sentence and scores
        self.results = sorted(self.results, key=lambda x: -x.normalized_score)

        if len(self.results) > 0:
            print(self.results[0])
            best_sentence = self.results[0].finished_sentence
            best_score = self.results[0].normalized_score
            attention_history = self.results[0].attention_history
        else:
            best_sentence = []
            best_score = 0.0
            attention_history = None
            mylog("No decoding results.")

        return best_sentence, best_score, attention_history
Example #10
    def step(self,
             session,
             sources_per_model,
             inputs_per_model,
             targets_per_model,
             target_weights_per_model,
             bucket_id,
             forward_only=False):
        # just ignore the bucket_id

        if forward_only:
            # If forward only (usually evaluation on the dev set), use model0's
            # step function; sources_per_model must have the shape expected by
            # models[0].step.
            return self.models[0].step(session,
                                       sources_per_model,
                                       inputs_per_model,
                                       targets_per_model,
                                       target_weights_per_model,
                                       bucket_id,
                                       forward_only=forward_only)

        # sources: [] * n_models

        input_feed = {}

        for m, sources in enumerate(sources_per_model):
            input_feed[self.models[m].sources.name] = sources

        for m in xrange(len(sources_per_model)):
            inputs = inputs_per_model[m]
            targets = targets_per_model[m]
            target_weights = target_weights_per_model[m]

            input_feed[self.models[m].inputs.name] = inputs
            input_feed[self.models[m].targets.name] = targets
            input_feed[self.models[m].target_weights.name] = target_weights

        dump_logits_when_error = True

        # output_feed
        output_feed = []
        output_feed.append(self.losses)
        if not forward_only:
            output_feed += [self.updates, self.gradient_norms]
            if dump_logits_when_error:
                output_feed += [self.logits]

        outputs = session.run(output_feed,
                              input_feed,
                              options=self.run_options,
                              run_metadata=self.run_metadata)

        if dump_logits_when_error and (not forward_only) and (
                np.isnan(outputs[0]) or np.isinf(outputs[0])
                or np.isnan(outputs[2][0]) or np.isinf(outputs[2][0])):
            mylog("L/norm is Nan/Inf! {} {}".format(outputs[0], outputs[2][0]))
            for i in xrange(len(targets_per_model)):
                target = targets_per_model[i]
                source = sources_per_model[i]
                logits = outputs[3][i]
                np.savetxt("targets{}.npz".format(i), target)
                np.savetxt("source_inputs{}.npz".format(i), source)
                np.savetxt("logits{}.npz".format(i), logits)
            return  # returning None will cause an exception in the caller

        if forward_only:
            return outputs[0]
        else:
            return outputs[0], outputs[2][
                0]  # only return losses and norm of first model
Example #11
    def __init__(self,
                 buckets,
                 size,
                 from_vocab_size,
                 target_vocab_size,
                 num_layers,
                 max_gradient_norm,
                 batch_size,
                 learning_rate,
                 learning_rate_decay_factor,
                 optimizer = "adam",
                 forward_only=False,
                 dropoutRate = 1.0,
                 devices = "",
                 run_options = None,
                 run_metadata = None,
                 topk_n = 30,
                 dtype=tf.float32,
                 with_attention = False,
                 beam_search = False,
                 beam_buckets = None,
                 n_samples = 500,
                 with_sampled_softmax = False,
                 attention_style = "additive",
                 attention_scale = True,
                 standalone = True,
                 swap_memory = True,
                 n_distributed_models = 1,
                 with_fsa = False,
                 dump_lstm = False,
                 check_attention = False,
                 tie_input_output_embedding = False,
                 variational_dropout = False,
                 mrt = False,
                 num_sentences_per_batch_in_mrt = 1,
                 mrt_alpha = 0.005,
                 normalize_ht_radius = 0.0,
                 layer_normalization = False,
                 rare_weight = False,
                 null_attention = False
                 ):
        """Create the model.
        """
        
        mylog("Init SeqModel with dynamic_rnn")

        self.buckets = buckets
        self.PAD_ID = 0
        self.GO_ID = 1
        self.EOS_ID = 2
        self.UNK_ID = 3
        self.batch_size = batch_size
        self.devices = devices
        self.run_options = run_options
        self.run_metadata = run_metadata
        self.topk_n = min(topk_n,target_vocab_size)
        self.dtype = dtype
        self.from_vocab_size = from_vocab_size
        self.target_vocab_size = target_vocab_size
        self.num_layers = num_layers
        self.size = size
        self.with_attention = with_attention
        self.beam_search = beam_search
        self.with_sampled_softmax = with_sampled_softmax
        self.n_samples = n_samples
        self.attention_style = attention_style
        self.attention_scale = attention_scale
        self.max_gradient_norm = max_gradient_norm
        self.swap_memory = swap_memory
        self.with_fsa = with_fsa
        self.dump_lstm = dump_lstm
        self.check_attention = check_attention
        self.forward_only = forward_only
        self.tie_input_output_embedding = tie_input_output_embedding
        self.variational_dropout = variational_dropout
        self.mrt = mrt # minimum risk training
        self.num_sentences_per_batch_in_mrt = num_sentences_per_batch_in_mrt
        self.mrt_alpha = mrt_alpha
        self.normalize_ht_radius = normalize_ht_radius
        self.layer_normalization = layer_normalization
        self.rare_weight = rare_weight
        self.null_attention = null_attention
        
        self.global_batch_size = batch_size
        if not standalone:
            self.global_batch_size = batch_size * n_distributed_models
            
        self.first_batch = True
        
        # some parameters
        with tf.device(devices[0]):
            self.dropoutRate = tf.get_variable('dropoutRate',shape = (), initializer = tf.constant_initializer(float(dropoutRate),dtype = dtype), trainable=False, dtype=dtype)


            self.dropoutAssign_op = self.dropoutRate.assign(dropoutRate)
            self.dropout10_op = self.dropoutRate.assign(1.0)
            self.learning_rate = tf.get_variable("learning_rate", shape = (), initializer = tf.constant_initializer(float(learning_rate), dtype = dtype), trainable=False, dtype=dtype)
            self.learning_rate_decay_op = self.learning_rate.assign(
                self.learning_rate * learning_rate_decay_factor)
            self.global_step = tf.get_variable("global_step", initializer = 0, trainable=False, dtype = tf.int32)
            
        
        # Input Layer
        with tf.device(devices[0]):
            # for encoder
            self.source_input_embedding = tf.get_variable("source_input_embedding",[from_vocab_size, size], dtype = dtype)
            
            source_input_plhd = tf.placeholder(tf.int32, shape = [self.batch_size, None], name = "source")
            source_input_embed = tf.nn.embedding_lookup(self.source_input_embedding, source_input_plhd)
            self.sources = source_input_plhd
            self.sources_embed = source_input_embed
            
            
            # for decoder
            self.inputs = []
            self.inputs_embed = []
            
            self.input_embedding = tf.get_variable("input_embedding",[target_vocab_size, size], dtype = dtype)

            input_plhd = tf.placeholder(tf.int32, shape = [self.batch_size, None], name = "input")
            input_embed = tf.nn.embedding_lookup(self.input_embedding, input_plhd)
            self.inputs = input_plhd
            self.inputs_embed = input_embed

            if self.mrt:
                # only for sampling
                self.dummy_inputs = tf.cast(tf.reshape(input_plhd, [self.batch_size,-1,1]),self.dtype)
                # for mrt training
                self.bleu_scores = tf.placeholder(self.dtype, shape = [self.batch_size], name = "bleu_scores")
            
            
        def lstm_cell(device,input_keep_prob = 1.0, output_keep_prob = 1.0, state_keep_prob=1.0, variational_recurrent=False, input_size = None, seed = None):
            if not self.layer_normalization:
                cell = tf.contrib.rnn.LSTMCell(size, state_is_tuple=True)
            else:
                cell = tf.contrib.rnn.LayerNormBasicLSTMCell(size)

            cell = tf.contrib.rnn.DropoutWrapper(cell, input_keep_prob = input_keep_prob, output_keep_prob = output_keep_prob, state_keep_prob = state_keep_prob, variational_recurrent = variational_recurrent, dtype=self.dtype, input_size = input_size, seed = seed)
            cell = DeviceCellWrapper(cell, device)
            return cell
          
          
        # LSTM
        encoder_cells = []
        decoder_cells = []
        for i in xrange(num_layers):
            input_keep_prob = self.dropoutRate
            output_keep_prob = 1.0
            state_keep_prob = 1.0
            input_size = size
            seed = None
            if self.variational_dropout:
                seed = random.randint(1,10000)
                state_keep_prob = self.dropoutRate
            
            if i == num_layers - 1:
                output_keep_prob = self.dropoutRate
            device = devices[i+1]
            encoder_cells.append(lstm_cell(device,input_keep_prob, 1.0, state_keep_prob = state_keep_prob, variational_recurrent = self.variational_dropout, input_size = input_size, seed = seed)) # encoder's top layer doesn't need output dropout
            decoder_cells.append(lstm_cell(device,input_keep_prob, output_keep_prob, state_keep_prob = state_keep_prob, variational_recurrent = self.variational_dropout, input_size = input_size, seed = seed)) # decoder's top layer keeps output dropout
            
        self.encoder_cell = tf.contrib.rnn.MultiRNNCell(encoder_cells, state_is_tuple=True)
        self.decoder_cell = tf.contrib.rnn.MultiRNNCell(decoder_cells, state_is_tuple=True)
        
        # Output Layer
        with tf.device(devices[-1]):
            if self.tie_input_output_embedding:
                self.output_embedding = self.input_embedding
            else:
                self.output_embedding = tf.get_variable("output_embedding",[target_vocab_size, size], dtype = dtype)
            self.output_bias = tf.get_variable("output_bias",[target_vocab_size], dtype = dtype)

            # target: 1  2  3  4 
            # inputs: go 1  2  3
            # weights:1  1  1  1


            self.targets = tf.placeholder(tf.int32, shape=[self.batch_size, None ], name = "target")
            self.target_weights = tf.placeholder(dtype, shape = [self.batch_size, None ], name="target_weight")
            if self.rare_weight:
                self.rare_weights = tf.placeholder(dtype, shape = [self.batch_size, None ], name="rare_weight")

        # Attention
        if self.with_attention:
            self.attention = Attention(self)


        # softmax + cross_entropy_loss
        if not self.with_sampled_softmax:
            self.softmax_loss_function = lambda x,y: tf.nn.sparse_softmax_cross_entropy_with_logits(logits=x, labels= y)
        else:
            def sampled_loss(labels, logits):
                labels = tf.reshape(labels, [-1, 1])
                # We need to compute the sampled_softmax_loss using 32bit floats to
                # avoid numerical instabilities.
                return tf.cast(
                    tf.nn.sampled_softmax_loss(
                        weights=self.output_embedding,
                        biases=self.output_bias,
                        labels=labels,
                        inputs=logits,
                        num_sampled=self.n_samples,
                        num_classes=target_vocab_size),
                    dtype)
            
            # model_with_buckets calls softmax_loss_function(logits, targets),
            # so flip the arguments into sampled_loss's (labels, logits) order
            self.softmax_loss_function = lambda y,x: sampled_loss(x,y)

        if not beam_search:
            # Model with buckets
            self.model_with_buckets(self.sources_embed, self.sources, self.inputs_embed, self.targets, self.target_weights, self.buckets, self.encoder_cell, self.decoder_cell, dtype, self.softmax_loss_function, devices = devices, attention = with_attention, rare_weights = self.rare_weights if self.rare_weight else None)

            # for minimum risk training, draw the sample decoder
            if self.mrt:
                self.sample_network(self.sources_embed, self.sources, self.dummy_inputs, self.encoder_cell, self.decoder_cell, dtype, devices = devices, attention = with_attention)

            # train
            if not forward_only:

                params = tf.contrib.framework.get_trainable_variables(scope=variable_scope.get_variable_scope())
                self.params = params

                # unclipped gradients

                if not self.mrt:
                    self.gradients = tf.gradients(self.losses, params, colocate_gradients_with_ops=True)
                else:
                    self.gradients = tf.gradients(self.mrt_loss, params, colocate_gradients_with_ops=True)
                

                # optimizer
                if optimizer == "adagrad":
                    opt = tf.train.AdagradOptimizer(self.learning_rate)
                elif optimizer == 'adam':
                    opt = tf.train.AdamOptimizer(self.learning_rate)
                else:
                    opt = tf.train.GradientDescentOptimizer(learning_rate = self.learning_rate)
                self.opt = opt

                # updates
                if standalone:
                    clipped_gradients, norm = tf.clip_by_global_norm(self.gradients, max_gradient_norm)
                    self.gradient_norms = norm
                    self.updates = opt.apply_gradients(zip(clipped_gradients, params), global_step=self.global_step)

        else: # for beam search;
            self.init_beam_decoder(beam_buckets)

        if standalone: 
            all_vars = tf.global_variables()
            self.train_vars = []
            self.beam_search_vars = []
            for var in all_vars:
                if not var.name.startswith("v0/beam_search"):
                    self.train_vars.append(var)
                else:
                    self.beam_search_vars.append(var)

            self.saver = tf.train.Saver(self.train_vars)
            self.best_saver = tf.train.Saver(self.train_vars)
Example #12
    def grow_sentence(self, index, top_beam_cells, attention_score=None):
        if self.print_beam:
            mylog("--------- Step {} --------".format(index))
        # the variables for next step
        target_inputs = []
        beam_parent = []
        scores = []
        sentences = []
        if self.check_attention:
            attention_scores = []
            attention_history = []

        if self.with_fsa:
            fsa_states = []

        # process the top beam_size cells

        for j, bc in enumerate(top_beam_cells):

            if bc.word_index == data_utils.EOS_ID:  # finished one sentence
                if len(self.sentences[bc.beam_index]) + 1 < self.min_target_length:
                    continue

                finished_sentence = self.sentences[bc.beam_index] + [
                    bc.word_index
                ]
                finished_score = bc.score

                coverage_score = 0.0
                finished_attention_history = None
                if self.check_attention:
                    coverage_score = self.attention_scores[
                        bc.beam_index] + attention_score[bc.beam_index]
                    #print(finished_sentence, finished_score)
                    #print(coverage_score)
                    coverage_score = np.sum(
                        np.log(np.minimum(coverage_score, 1.0)))

                    # for attention_history
                    finished_attention_history = self.attention_history[
                        bc.beam_index] + [attention_score[bc.beam_index]]

                f = FinishedEntry(finished_sentence,
                                  finished_score,
                                  coverage_score=coverage_score,
                                  attention_history=finished_attention_history)

                self.results.append(f)

                if self.print_beam:
                    self.print_current_beam(j, bc, finished=True)

                continue

            if self.print_beam:
                self.print_current_beam(j, bc)

            beam_parent.append(bc.beam_index)
            target_inputs.append(bc.word_index)
            scores.append(bc.score)
            sentences.append(self.sentences[bc.beam_index] + [bc.word_index])

            if self.check_attention:
                attention_scores.append(self.attention_scores[bc.beam_index] +
                                        attention_score[bc.beam_index])
                attention_history.append(
                    self.attention_history[bc.beam_index] +
                    [attention_score[bc.beam_index]])

            if self.with_fsa:
                fsa_states.append(bc.fsa_state)

            if len(scores) >= self.beam_size:
                break

        # cannot fill beam_size; just repeat the last one

        self.valid_beam_size_last_step = len(scores)

        while (len(scores) > 0 and len(scores) < self.beam_size
               and index < self.max_target_length - 1):
            beam_parent.append(beam_parent[-1])
            target_inputs.append(target_inputs[-1])
            scores.append(scores[-1])
            sentences.append(sentences[-1])
            if self.with_fsa:
                fsa_states.append(fsa_states[-1])
            if self.check_attention:
                attention_scores.append(self.attention_scores[-1] +
                                        attention_score[-1])
                attention_history.append(self.attention_history[-1] +
                                         [attention_score[-1]])

        # update for next step
        self.beam_parent = beam_parent
        self.target_inputs = target_inputs
        self.scores = scores
        self.sentences = sentences
        if self.with_fsa:
            self.fsa_states = fsa_states
            self.prepare_fsa_target_mask()
        if self.check_attention:
            self.attention_scores = attention_scores
            self.attention_history = attention_history
Example #13
    def __init__(self,
                 buckets,
                 size,
                 from_vocab_size,
                 target_vocab_size,
                 num_layers,
                 max_gradient_norm,
                 batch_size,
                 learning_rate,
                 learning_rate_decay_factor,
                 optimizer="adam",
                 forward_only=False,
                 dropoutRate=1.0,
                 run_options=None,
                 run_metadata=None,
                 devices_per_model=None,
                 topk_n=30,
                 dtype=tf.float32,
                 with_attention=False,
                 beam_search=False,
                 beam_buckets=None,
                 n_samples=500,
                 with_sampled_softmax=False,
                 attention_style="additive",
                 attention_scale=True,
                 num_models=4,
                 tie_input_output_embedding=False):
        '''
        LocalReplica: Model1[GPU0,GPU1] Model2[GPU3,GPU4], ... Each model has
        its own variables; after each step, the gradients are summed across
        the models' GPUs and applied locally on each model's own GPU.

        devices_per_model: [["/gpu:0", ...], ...] where devices_per_model[m][l]
        is the device of model m, layer l.
        '''

        self.models = []
        self.devices_per_model = devices_per_model
        self.variable_mgr = VariableMgrLocalReplicated()
        self.num_models = num_models
        self.buckets = buckets
        self.run_options = run_options
        self.run_metadata = run_metadata

        # Generate models
        for d, devices_each_model in enumerate(self.devices_per_model):
            with tf.device(devices_each_model[0]):
                with self.variable_mgr.create_outer_variable_scope(
                        d), tf.name_scope("tower_{}".format(d)) as name_scope:
                    mylog("creating model #{} at devices: {}".format(
                        d, devices_each_model))
                    seqModel = SeqModel(
                        buckets,
                        size,
                        from_vocab_size,
                        target_vocab_size,
                        num_layers,
                        max_gradient_norm,
                        batch_size,
                        learning_rate,
                        learning_rate_decay_factor,
                        optimizer=optimizer,
                        forward_only=forward_only,
                        dropoutRate=dropoutRate,
                        devices=devices_each_model,
                        run_options=run_options,
                        run_metadata=run_metadata,
                        topk_n=topk_n,
                        dtype=dtype,
                        with_attention=with_attention,
                        beam_search=beam_search,
                        beam_buckets=beam_buckets,
                        n_samples=n_samples,
                        with_sampled_softmax=with_sampled_softmax,
                        attention_style=attention_style,
                        attention_scale=attention_scale,
                        standalone=False,  # ! do not init the optimizer now
                        n_distributed_models=self.num_models,
                        tie_input_output_embedding=tie_input_output_embedding)

                    self.models.append(seqModel)

        # collect the learning_rate_decay_op
        self.learning_rate_dacay_ops = []
        self.dropout10_ops = []
        self.dropoutAssign_ops = []
        for model in self.models:
            self.learning_rate_dacay_ops.append(model.learning_rate_decay_op)
            self.dropout10_ops.append(model.dropout10_op)
            self.dropoutAssign_ops.append(model.dropoutAssign_op)

        # Aggregate the gradients

        section = "Aggregate Gradients "
        mylog_section(section)

        agg_grads = []

        for b in xrange(len(buckets)):

            mylog_subsection("Bucket {}".format(b))

            # for each buckets
            gradients = []  # [[grad * n_variable] * n_model]
            params = []  # [[param * n_variable] * n_model]
            for model in self.models:
                gradients.append(model.gradients[b])
                params.append(model.params)

            agg_grad_per_gpu = {}  # record how many gradient aggregations happen on each GPU

            agg_grads_per_bucket = []

            for param_id in xrange(len(params[0])):

                grads_per_model = []
                params_per_model = []

                for model_id in xrange(len(params)):
                    params_per_model.append(params[model_id][param_id])
                    grads_per_model.append(gradients[model_id][param_id])

                # choose one device to do aggregation
                device_for_agg = None

                min_n_agg = 1000000

                for param in params_per_model:
                    dev = param.device
                    if not dev in agg_grad_per_gpu:
                        agg_grad_per_gpu[dev] = []
                    n_agg = len(agg_grad_per_gpu[dev])
                    if min_n_agg > n_agg:
                        min_n_agg = n_agg
                        device_for_agg = dev

                agg_grad_per_gpu[device_for_agg].append(params[0][param_id])

                with tf.device(device_for_agg):
                    if type(grads_per_model[0]) == tf.IndexedSlices:
                        values = tf.concat([x.values for x in grads_per_model],
                                           0)
                        indices = tf.concat(
                            [x.indices for x in grads_per_model], 0)
                        agg_grad = tf.IndexedSlices(values, indices)
                    else:
                        agg_grad = tf.add_n(grads_per_model)

                agg_grads_per_bucket.append(agg_grad)

            # show aggregation device placement
            for device in agg_grad_per_gpu:
                mylog("Aggregated On {}:".format(device))
                for param in agg_grad_per_gpu[device]:
                    mylog("\t" + param.name)
            agg_grads.append(agg_grads_per_bucket)

        # send the aggregated grads to each model on different gpus
        for d, devices_each_model in enumerate(self.devices_per_model):
            self.models[d].init_agg_updates(agg_grads)

        # combine losses, updates and gradients norm
        self.losses = []  # per bucket
        self.updates = []
        self.gradient_norms = []

        for b in xrange(len(buckets)):
            losses = []
            updates = []
            gradient_norms = []
            for i, model in enumerate(self.models):
                losses.append(model.losses[b])
                updates.append(model.updates[b])
                gradient_norms.append(model.gradient_norms[b])

            loss = tf.add_n(losses)
            self.losses.append(loss)
            self.updates.append(updates)
            self.gradient_norms.append(gradient_norms)

        # get init ops group
        self.var_init_op = tf.global_variables_initializer()
        self.broadcast_ops = self.variable_mgr.get_post_init_ops()

        # for saver
        all_vars = tf.global_variables()
        self.train_vars = []
        for var in all_vars:
            if var.name.startswith("v0"):
                self.train_vars.append(var)

        self.saver = tf.train.Saver(self.train_vars)
        self.best_saver = tf.train.Saver(self.train_vars)
Example #14
def train():

    # Read Data
    mylog_section("READ DATA")

    from_train, to_train, from_dev, to_dev, _, _ = data_utils.prepare_data(
        FLAGS.data_cache_dir,
        FLAGS.train_path_from,
        FLAGS.train_path_to,
        FLAGS.dev_path_from,
        FLAGS.dev_path_to,
        FLAGS.from_vocab_size,
        FLAGS.to_vocab_size,
        preprocess_data = FLAGS.preprocess_data
    )


    train_data_bucket = read_data(from_train,to_train,_buckets)
    dev_data_bucket = read_data(from_dev,to_dev, _buckets)
    _,_,real_vocab_size_from,real_vocab_size_to = data_utils.get_vocab_info(FLAGS.data_cache_dir)
    
    FLAGS._buckets = _buckets
    FLAGS.real_vocab_size_from = real_vocab_size_from
    FLAGS.real_vocab_size_to = real_vocab_size_to

    train_n_targets = np.sum([np.sum([len(items[1]) for items in x]) for x in train_data_bucket])
    train_n_tokens = np.sum([np.sum([len(items[1])+len(items[0]) for items in x]) for x in train_data_bucket])
    
    train_bucket_sizes = [len(train_data_bucket[b]) for b in xrange(len(_buckets))]
    train_total_size = float(sum(train_bucket_sizes))
    train_buckets_scale = [sum(train_bucket_sizes[:i + 1]) / train_total_size for i in xrange(len(train_bucket_sizes))]
    dev_bucket_sizes = [len(dev_data_bucket[b]) for b in xrange(len(_buckets))]
    dev_total_size = int(sum(dev_bucket_sizes))

    mylog_section("REPORT")
    # steps
    batch_size = FLAGS.batch_size
    n_epoch = FLAGS.n_epoch
    steps_per_epoch = int(train_total_size / batch_size / FLAGS.num_models)
    steps_per_dev = int(dev_total_size / batch_size)
    if FLAGS.checkpoint_steps == 0:
        steps_per_checkpoint = int(steps_per_epoch / FLAGS.checkpoint_frequency)
    else:
        steps_per_checkpoint = FLAGS.checkpoint_steps
        
    total_steps = steps_per_epoch * n_epoch

    # reports
    mylog("from_vocab_size: {}".format(FLAGS.real_vocab_size_from))
    mylog("to_vocab_size: {}".format(FLAGS.real_vocab_size_to))
    mylog("_buckets: {}".format(FLAGS._buckets))
    mylog("Train:")
    mylog("total: {}".format(train_total_size))
    mylog("bucket sizes: {}".format(train_bucket_sizes))
    mylog("Dev:")
    mylog("total: {}".format(dev_total_size))
    mylog("bucket sizes: {}".format(dev_bucket_sizes))
    mylog("Steps_per_epoch: {}".format(steps_per_epoch))
    mylog("Total_steps:{}".format(total_steps))
    mylog("Steps_per_checkpoint: {}".format(steps_per_checkpoint))


    mylog_section("IN TENSORFLOW")
    
    config = tf.ConfigProto(allow_soft_placement=True, log_device_placement = False)
    config.gpu_options.allow_growth = FLAGS.allow_growth

    with tf.Session(config=config) as sess:
        
        # runtime profile
        if FLAGS.profile:
            run_options = tf.RunOptions(trace_level=tf.RunOptions.FULL_TRACE)
            run_metadata = tf.RunMetadata()
        else:
            run_options = None
            run_metadata = None

        mylog_section("MODEL/SUMMARY/WRITER")

        mylog("Creating Model.. (this can take a few minutes)")
        model = create_model(sess, run_options, run_metadata)

        if FLAGS.with_summary:
            mylog("Creating ModelSummary")
            modelSummary = ModelSummary()

            mylog("Creating tf.summary.FileWriter")
            summaryWriter = tf.summary.FileWriter(os.path.join(FLAGS.summary_dir , "train.summary"), sess.graph)

        mylog_section("All Variables")
        show_all_variables()

        # Data Iterators
        mylog_section("Data Iterators")

        dite = DataIterator(model, train_data_bucket, len(train_buckets_scale), batch_size, train_buckets_scale)
        
        iteType = 0
        if iteType == 0:
            mylog("Itetype: withRandom")
            ite = dite.next_random()
        elif iteType == 1:
            mylog("Itetype: withSequence")
            ite = dite.next_sequence()
        
        # statistics during training
        step_time, loss = 0.0, 0.0
        get_batch_time = 0.0
        current_step = 0
        previous_losses = []
        low_ppx = float("inf")
        low_ppx_step = 0
        steps_per_report = 30
        n_targets_report = 0
        n_sources_report = 0
        report_time = 0
        n_valid_sents = 0
        n_valid_words = 0
        patience = FLAGS.patience
        
        mylog_section("TRAIN")

        
        while current_step < total_steps:
            
            # start
            start_time = time.time()
            
            # data and train
            source_inputs, target_inputs, target_outputs, target_weights, bucket_id = ite.next()

            get_batch_time += (time.time() - start_time) / steps_per_checkpoint
            
            L, norm = model.step(sess, source_inputs, target_inputs, target_outputs, target_weights, bucket_id)

            
            # loss and time
            step_time += (time.time() - start_time) / steps_per_checkpoint
            
            loss += L
            current_step += 1
            n_valid_sents += np.sum(np.sign(target_weights[0]))

            # double sum because each model's target_weights has a different shape
            n_valid_words += np.sum(np.sum(target_weights))
            
            # for report
            report_time += (time.time() - start_time)
            
            n_targets_report += np.sum(np.sum(target_weights))
            n_sources_report += np.sum(np.sum(np.sign(source_inputs)))
    
            if current_step % steps_per_report == 1:
                sect_name = "STEP {}".format(current_step)
                msg = "StepTime: {:.4f} sec Speed: {:.4f} words/s Total_words: {} get_batch_time_ratio: {:.4f}".format(report_time/steps_per_report, (n_sources_report+n_targets_report)*1.0 / report_time, train_n_tokens, get_batch_time / step_time)
                mylog_line(sect_name,msg)

                report_time = 0
                n_targets_report = 0
                n_sources_report = 0

                # Create the Timeline object, and write it to a json
                if FLAGS.profile:
                    tl = timeline.Timeline(run_metadata.step_stats)
                    ctf = tl.generate_chrome_trace_format()
                    with open('timeline.json', 'w') as f:
                        f.write(ctf)
                    exit()
                    
            
            if current_step % steps_per_checkpoint == 1:

                i_checkpoint = int(current_step / steps_per_checkpoint)
                
                # train_ppx
                loss = loss * FLAGS.batch_size * FLAGS.num_models 
                loss = loss / n_valid_words
                train_ppx = math.exp(float(loss)) if loss < 300 else float("inf")
                learning_rate = model.get_learning_rate(sess)
                
                                
                # dev_ppx
                dev_loss, dev_ppx = evaluate(sess, model, dev_data_bucket)

                # report
                sect_name = "CHECKPOINT {} STEP {}".format(i_checkpoint, current_step)
                msg = "Learning_rate: {:.4f} Dev_ppx: {:.4f} Train_ppx: {:.4f} Norm: {:.4f}".format(learning_rate, dev_ppx, train_ppx, norm)
                mylog_line(sect_name, msg)

                if FLAGS.with_summary:
                    # save summary
                    _summaries = modelSummary.step_record(sess, train_ppx, dev_ppx)
                    for _summary in _summaries:
                        summaryWriter.add_summary(_summary, i_checkpoint)
                
                # save model per checkpoint
                if FLAGS.saveCheckpoint:
                    checkpoint_path = os.path.join(FLAGS.saved_model_dir, "model")
                    s = time.time()
                    model.saver.save(sess, checkpoint_path, global_step=i_checkpoint, write_meta_graph = False)
                    msg = "Model saved using {:.4f} sec at {}".format(time.time()-s, checkpoint_path)
                    mylog_line(sect_name, msg)
                    
                # save best model
                if dev_ppx < low_ppx:
                    patience = FLAGS.patience
                    low_ppx = dev_ppx
                    low_ppx_step = current_step
                    checkpoint_path = os.path.join(FLAGS.saved_model_dir, "best")
                    s = time.time()
                    model.best_saver.save(sess, checkpoint_path, global_step=0, write_meta_graph = False)
                    msg = "Model saved using {:.4f} sec at {}".format(time.time()-s, checkpoint_path)
                    mylog_line(sect_name, msg)
                else:
                    patience -= 1

                    # decay the learning rate
                    if FLAGS.decay_learning_rate:
                        sess.run(model.learning_rate_dacay_ops)
                        msg = "New learning_rate: {:.4f} Dev_ppx: {:.4f} Lowest_dev_ppx: {:.4f}".format(model.get_learning_rate(sess), dev_ppx, low_ppx)
                        mylog_line(sect_name, msg)

                    

                if patience <= 0:
                    mylog("Training finished. Running out of patience.")
                    break

                # zero the timer and loss for the next checkpoint
                step_time, loss, n_valid_sents, n_valid_words = 0.0, 0.0, 0, 0
                get_batch_time = 0
Example #15
    def check_output_bias(self, sess):
        # to verify, we look at the first entry of each model's output_bias
        for m, model in enumerate(self.models):
            mylog("Model{} output_bias: {}".format(
                m,
                sess.run(model.output_bias)[0]))
Example #16
def log_flags(_FLAGS):
    members = _FLAGS.__dict__['__flags'].keys()
    mylog_section("FLAGS")
    for attr in members:
        mylog("{}={}".format(attr, getattr(_FLAGS, attr)))
Example #17
def show_all_tensors():
    for tensor in tf.get_default_graph().as_graph_def().node:
        mylog(tensor.name)
Example #18
def show_all_variables():
    all_vars = tf.global_variables()
    for var in all_vars:
        mylog(var.name + " @ " + var.device)