Example #1
def create_model(session, run_options, run_metadata):
    devices = get_device_address(FLAGS.N)
    dtype = tf.float32
    model = SeqModel(FLAGS._buckets,
                     FLAGS.size,
                     FLAGS.real_vocab_size,
                     FLAGS.num_layers,
                     FLAGS.max_gradient_norm,
                     FLAGS.batch_size,
                     FLAGS.learning_rate,
                     FLAGS.learning_rate_decay_factor,
                     withAdagrad=FLAGS.withAdagrad,
                     dropoutRate=FLAGS.keep_prob,
                     dtype=dtype,
                     devices=devices,
                     topk_n=FLAGS.topk,
                     run_options=run_options,
                     run_metadata=run_metadata)
    ckpt = tf.train.get_checkpoint_state(FLAGS.saved_model_dir)
    # Restore only when a checkpoint actually exists; the parentheses keep the
    # decode modes from dereferencing a missing checkpoint.
    if (FLAGS.mode in ("DUMP_LSTM", "BEAM_DECODE", "FORCE_DECODE")
            or not FLAGS.fromScratch) and ckpt:
        mylog("Reading model parameters from %s" % ckpt.model_checkpoint_path)
        model.saver.restore(session, ckpt.model_checkpoint_path)
    else:
        mylog("Created model with fresh parameters.")
        session.run(tf.global_variables_initializer())
    return model
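
A minimal driver sketch for the helper above (not from the original source), assuming FLAGS has already been parsed and that SeqModel, get_device_address, and mylog come from the surrounding module; the tracing objects are optional and may both be None:

import tensorflow as tf

def main(_):
    # Optional profiling hooks; pass None for both to disable tracing.
    run_options = tf.RunOptions(trace_level=tf.RunOptions.FULL_TRACE)
    run_metadata = tf.RunMetadata()
    with tf.Session() as sess:
        model = create_model(sess, run_options, run_metadata)
        # model is now either restored from FLAGS.saved_model_dir or
        # freshly initialized, and ready for training or decoding steps.

if __name__ == "__main__":
    tf.app.run()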
Example #2
def create_model(session, embAttr, START_ID, run_options, run_metadata):
    devices = get_device_address(FLAGS.N)
    dtype = tf.float32
    model = SeqModel(_buckets,
                     FLAGS.size,
                     FLAGS.num_layers,
                     FLAGS.max_gradient_norm,
                     FLAGS.batch_size,
                     FLAGS.learning_rate,
                     FLAGS.learning_rate_decay_factor,
                     embAttr,
                     withAdagrad=FLAGS.withAdagrad,
                     num_samples=FLAGS.n_sampled,
                     dropoutRate=FLAGS.keep_prob,
                     START_ID=START_ID,
                     loss=FLAGS.loss,
                     dtype=dtype,
                     devices=devices,
                     use_concat=FLAGS.use_concat,
                     no_user_id=FLAGS.no_user_id,
                     output_feat=FLAGS.output_feat,
                     no_input_item_feature=FLAGS.no_input_item_feature,
                     topk_n=FLAGS.topk,
                     run_options=run_options,
                     run_metadata=run_metadata)

    ckpt = tf.train.get_checkpoint_state(FLAGS.train_dir)
    # if FLAGS.recommend or (not FLAGS.fromScratch) and ckpt and tf.gfile.Exists(ckpt.model_checkpoint_path):
    # restore only when a checkpoint actually exists
    if (FLAGS.recommend or not FLAGS.fromScratch) and ckpt:
        mylog("Reading model parameters from %s" % ckpt.model_checkpoint_path)
        model.saver.restore(session, ckpt.model_checkpoint_path)
    else:
        mylog("Created model with fresh parameters.")
        session.run(tf.global_variables_initializer())
    return model
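
Note that tf.train.get_checkpoint_state returns None when the directory contains no checkpoint file, which is why the restore branch must also test ckpt before dereferencing ckpt.model_checkpoint_path. A standalone illustration (the directory path is hypothetical):

import tensorflow as tf

ckpt = tf.train.get_checkpoint_state("/tmp/train_dir")  # hypothetical path
if ckpt and ckpt.model_checkpoint_path:
    print("would restore from %s" % ckpt.model_checkpoint_path)
else:
    print("no checkpoint found; variables would be freshly initialized")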
Example #3
    def __init__(self,
                 buckets,
                 size,
                 from_vocab_size,
                 target_vocab_size,
                 num_layers,
                 max_gradient_norm,
                 batch_size,
                 learning_rate,
                 learning_rate_decay_factor,
                 optimizer="adam",
                 forward_only=False,
                 dropoutRate=1.0,
                 run_options=None,
                 run_metadata=None,
                 devices_per_model=None,
                 topk_n=30,
                 dtype=tf.float32,
                 with_attention=False,
                 beam_search=False,
                 beam_buckets=None,
                 n_samples=500,
                 with_sampled_softmax=False,
                 attention_style="additive",
                 attention_scale=True,
                 num_models=4,
                 tie_input_output_embedding=False):
        '''
        LocalReplica: Model1[GPU0,GPU1] Model2[GPU3,GPU4], ... Each model keeps
        its own copy of the variables; after each step the gradients are summed
        across the models and every model applies the update locally on its own
        GPU.

        devices_per_model: [["/gpu:0", ...], ...] where devices_per_model[m][l]
        is the device for layer l of model m.
        '''
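        # Illustration (not from the original source): for the two models in
        # the docstring above, devices_per_model would be
        # [["/gpu:0", "/gpu:1"], ["/gpu:3", "/gpu:4"]].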

        self.models = []
        self.devices_per_model = devices_per_model
        self.variable_mgr = VariableMgrLocalReplicated()
        self.num_models = num_models
        self.buckets = buckets
        self.run_options = run_options
        self.run_metadata = run_metadata

        # Generate models
        for d, devices_each_model in enumerate(self.devices_per_model):
            with tf.device(devices_each_model[0]):
                with self.variable_mgr.create_outer_variable_scope(
                        d), tf.name_scope("tower_{}".format(d)) as name_scope:
                    mylog("creating model #{} at devices: {}".format(
                        d, devices_each_model))
                    seqModel = SeqModel(
                        buckets,
                        size,
                        from_vocab_size,
                        target_vocab_size,
                        num_layers,
                        max_gradient_norm,
                        batch_size,
                        learning_rate,
                        learning_rate_decay_factor,
                        optimizer=optimizer,
                        forward_only=forward_only,
                        dropoutRate=dropoutRate,
                        devices=devices_each_model,
                        run_options=run_options,
                        run_metadata=run_metadata,
                        topk_n=topk_n,
                        dtype=dtype,
                        with_attention=with_attention,
                        beam_search=beam_search,
                        beam_buckets=beam_buckets,
                        n_samples=n_samples,
                        with_sampled_softmax=with_sampled_softmax,
                        attention_style=attention_style,
                        attention_scale=attention_scale,
                        standalone=False,  # ! do not init the optimizer now
                        n_distributed_models=self.num_models,
                        tie_input_output_embedding=tie_input_output_embedding)

                    self.models.append(seqModel)

        # collect the learning_rate_decay_op
        self.learning_rate_decay_ops = []
        self.dropout10_ops = []
        self.dropoutAssign_ops = []
        for model in self.models:
            self.learning_rate_decay_ops.append(model.learning_rate_decay_op)
            self.dropout10_ops.append(model.dropout10_op)
            self.dropoutAssign_ops.append(model.dropoutAssign_op)

        # Aggregate the gradients

        section = "Aggregate Gradients "
        mylog_section(section)

        agg_grads = []

        for b in xrange(len(buckets)):

            mylog_subsection("Bucket {}".format(b))

            # for each buckets
            gradients = []  # [[grad * n_variable] * n_model]
            params = []  # [[param * n_variable] * n_model]
            for model in self.models:
                gradients.append(model.gradients[b])
                params.append(model.params)

            # record how many gradient aggregations are placed on each GPU
            agg_grad_per_gpu = {}

            agg_grads_per_bucket = []

            for param_id in xrange(len(params[0])):

                grads_per_model = []
                params_per_model = []

                for model_id in xrange(len(params)):
                    params_per_model.append(params[model_id][param_id])
                    grads_per_model.append(gradients[model_id][param_id])

                # choose one device to do aggregation
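                # greedily pick the parameter copy's device with the fewest
                # aggregations so far, spreading the add_n work across GPUs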
                device_for_agg = None

                min_n_agg = 1000000

                for param in params_per_model:
                    dev = param.device
                    if dev not in agg_grad_per_gpu:
                        agg_grad_per_gpu[dev] = []
                    n_agg = len(agg_grad_per_gpu[dev])
                    if min_n_agg > n_agg:
                        min_n_agg = n_agg
                        device_for_agg = dev

                agg_grad_per_gpu[device_for_agg].append(params[0][param_id])

                with tf.device(device_for_agg):
                    if isinstance(grads_per_model[0], tf.IndexedSlices):
                        values = tf.concat([x.values for x in grads_per_model],
                                           0)
                        indices = tf.concat(
                            [x.indices for x in grads_per_model], 0)
                        agg_grad = tf.IndexedSlices(values, indices)
                    else:
                        agg_grad = tf.add_n(grads_per_model)

                agg_grads_per_bucket.append(agg_grad)

            # show aggregation device placement
            for device in agg_grad_per_gpu:
                mylog("Aggregated On {}:".format(device))
                for param in agg_grad_per_gpu[device]:
                    mylog("\t" + param.name)
            agg_grads.append(agg_grads_per_bucket)

        # send the aggregated grads to each model on different gpus
        for d, devices_each_model in enumerate(self.devices_per_model):
            self.models[d].init_agg_updates(agg_grads)

        # combine losses, updates and gradients norm
        self.losses = []  # per bucket
        self.updates = []
        self.gradient_norms = []

        for b in xrange(len(buckets)):
            losses = []
            updates = []
            gradient_norms = []
            for i, model in enumerate(self.models):
                losses.append(model.losses[b])
                updates.append(model.updates[b])
                gradient_norms.append(model.gradient_norms[b])

            loss = tf.add_n(losses)
            self.losses.append(loss)
            self.updates.append(updates)
            self.gradient_norms.append(gradient_norms)

        # get init ops group
        self.var_init_op = tf.global_variables_initializer()
        self.broadcast_ops = self.variable_mgr.get_post_init_ops()

        # Saver: only save the first replica's variables (name prefix "v0");
        # the other replicas hold copies of the same values.
        all_vars = tf.global_variables()
        self.train_vars = []
        for var in all_vars:
            if var.name.startswith("v0"):
                self.train_vars.append(var)

        self.saver = tf.train.Saver(self.train_vars)
        self.best_saver = tf.train.Saver(self.train_vars)
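
For reference, the dense/sparse branch in the aggregation loop above can be exercised on its own; this is a sketch with illustrative names, not part of the original class:

import tensorflow as tf

def aggregate_across_models(grads_per_model):
    # Sum one parameter's gradients across the model replicas. Sparse
    # gradients (tf.IndexedSlices) are concatenated rather than added,
    # mirroring the branch in the constructor above.
    if isinstance(grads_per_model[0], tf.IndexedSlices):
        values = tf.concat([g.values for g in grads_per_model], 0)
        indices = tf.concat([g.indices for g in grads_per_model], 0)
        return tf.IndexedSlices(values, indices)
    return tf.add_n(grads_per_model)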