Example #1
File: trainer.py Project: rzcwade/nabu
    def _device(self, cluster):
        '''
        get the device

        args:
            cluster: a tf cluster

        returns:
            - the device specification
            - the chief parameter server device
        '''

        if 'local' in cluster.as_dict():
            device = tf.DeviceSpec(job='local')
            chief_ps = None
        else:
            #distributed training
            num_servers = len(cluster.as_dict()['ps'])
            ps_strategy = tf.contrib.training.GreedyLoadBalancingStrategy(
                num_tasks=num_servers,
                load_fn=tf.contrib.training.byte_size_load_fn)
            device = tf.train.replica_device_setter(ps_tasks=num_servers,
                                                    ps_strategy=ps_strategy)
            chief_ps = tf.DeviceSpec(job='ps', task=0)

        return device, chief_ps
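A minimal sketch of how the returned pair is typically consumed (TF 1.x graph mode assumed; the model-building ops and names below are illustrative, not part of the project):

import tensorflow as tf

# Stand-ins for what _device() returns in the 'local' branch.
device = tf.DeviceSpec(job='local')
chief_ps = None

with tf.device(device):
    inputs = tf.placeholder(tf.float32, [None, 10])
    logits = tf.layers.dense(inputs, 2)      # model ops and variables follow `device`

if chief_ps is not None:
    with tf.device(chief_ps):
        # In the distributed branch, shared bookkeeping such as the global step
        # would be pinned to the chief parameter server.
        global_step = tf.train.get_or_create_global_step()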
Example #2
  def _model_fn(features, labels, mode, params):
    model_fn = MODELS[FLAGS.model].model

    global_step = tf.train.get_or_create_global_step()

    if FLAGS.num_gpus > 0 and mode == learn.ModeKeys.TRAIN:
      split_features = {k: tf.split(v, FLAGS.num_gpus)
                        for k, v in features.iteritems()}
      split_labels = {k: tf.split(v, FLAGS.num_gpus)
                      for k, v in labels.iteritems()}
      grads = []
      predictions = collections.defaultdict(list)
      losses = []

      opt = ops.create_optimizer(
        params.optimizer, params.learning_rate, params.decay_steps)

      for i in range(FLAGS.num_gpus):
        with tf.device(tf.DeviceSpec(device_type="GPU", device_index=i)):
          with tf.name_scope("tower_%d" % i):
            with tf.variable_scope(tf.get_variable_scope(), reuse=i > 0):
              device_features = {k: v[i] for k, v in split_features.iteritems()}
              device_labels = {k: v[i] for k, v in split_labels.iteritems()}

              device_predictions, device_loss = model_fn(
                device_features, device_labels, mode, params)

              for k, v in device_predictions.iteritems():
                predictions[k].append(v)

              if device_loss is not None:
                losses.append(device_loss)

              device_grads = opt.compute_gradients(device_loss)
              grads.append(device_grads)

      grads = ops.average_gradients(grads)
      train_op = opt.apply_gradients(grads, global_step=global_step)

      for k, v in predictions.iteritems():
        predictions[k] = tf.concat(v, axis=0)

      loss = tf.add_n(losses) if losses else None
    else:
      with tf.device(tf.DeviceSpec(device_type="GPU", device_index=0)):
        predictions, loss = model_fn(features, labels, mode, params)

        train_op = None
        if mode == learn.ModeKeys.TRAIN:
          opt = ops.create_optimizer(
            params.optimizer, params.learning_rate, params.decay_steps)
          train_op = opt.minimize(loss, global_step=global_step)

    tf.summary.scalar("loss/loss", loss)

    return tf.contrib.learn.ModelFnOps(
      mode=mode,
      predictions=predictions,
      loss=loss,
      train_op=train_op)
Example #3
    def stop_chief(self, server, sess=None):
        # num_ps = cluster_spec.num_tasks(JobType.ps)
        # num_workers = cluster_spec.num_tasks(JobType.worker)
        num_ps = len(self.clusterspec_dict[JobType.ps])
        num_workers = len(self.clusterspec_dict[JobType.worker])
        enq_ops = []

        ps_devtasklist = [
            tf.DeviceSpec(job=JobType.ps, task=ii) for ii in range(num_ps)
        ]
        wrk_devtasklist = [
            tf.DeviceSpec(job=JobType.worker, task=ii)
            for ii in range(1, num_workers)
        ]
        devtasklist = ps_devtasklist + wrk_devtasklist
        for q in create_done_queues_chief(devtasklist):
            qop = q.enqueue(1)
            enq_ops.append(qop)

        if sess is None:
            # config = server.server_def.default_session_config
            # with tf.Session(server.target, config=config) as sess:
            with self.get_session(server) as sess:
                for op in enq_ops:
                    sess.run(op)
        else:
            for op in enq_ops:
                sess.run(op)
Example #4
  def _set_train_or_infer(self, hparams, res, loss):
    """Set up training and inference."""
    # Training
    if self.mode == tf.estimator.ModeKeys.TRAIN:
      trainable_vars = tf.trainable_variables()
      total_loss = loss[0]
      # Print trainable variables
      utils.print_out("# Trainable variables")
      utils.print_out("Format: <name>, <shape>, <(soft) device placement>")
      for param in trainable_vars:
        utils.print_out("  {}, {}, {}".format(param.name,
                                              str(param.get_shape()),
                                              param.op.device))

      # [K by N]. K: num_gpu, N:num_variables per gpu
      list_vars = [list(filter(lambda x: "tower_{:d}".format(gpu_idx) in x.name, trainable_vars)) for gpu_idx in range(self.num_gpu)]
      
      with tf.variable_scope("optimization"):
        # Calculate gradient per device
        list_grads = []
        with tf.name_scope("compute_gradients"):
          for gpu_idx in range(self.num_gpu):
            with tf.device(tf.DeviceSpec(device_type="GPU", device_index=gpu_idx)), tf.name_scope("tower_{}".format(gpu_idx)):
              loss = total_loss[gpu_idx]
              
              list_grads.append(
                  tf.gradients(loss,
                               list_vars[gpu_idx],
                               colocate_gradients_with_ops=hparams.colocate_gradients_with_ops))

          # Apply NCCL all reduce w/ average on the list_grads
          with tf.name_scope("all_reduce"):
            list_grads = model_helper.allreduce_tensors(list_grads, average=True)

          # Gradient clipping (Not clipped if max_gradient_norm=None)
          with tf.name_scope("clipping"):
            list_grads, list_norms = model_helper.gradient_clip(list_grads, max_gradient_norm=hparams.max_gradient_norm)
            self.grad_norm = list_norms[0]
        
        # Apply gradient per device
        opts = []
        update_ops = []
        with tf.variable_scope("optimizer"):
          for gpu_idx in range(self.num_gpu):
            with tf.device(tf.DeviceSpec(device_type="GPU", device_index=gpu_idx)), tf.variable_scope("tower_{}".format(gpu_idx)):
              if hparams.optimizer == "sgd":
                optimizer = tf.train.GradientDescentOptimizer(self.learning_rate)
              elif hparams.optimizer == "adam":
                optimizer = tf.train.AdamOptimizer(self.learning_rate)
              else:
                raise ValueError("Unknown optimizer type {}".format(hparams.optimizer))
            opts.append(optimizer)
            update_ops.append(optimizer.apply_gradients(zip(list_grads[gpu_idx], list_vars[gpu_idx])))
        
        add_global_step = tf.assign_add(self.global_step, 1)
        with tf.control_dependencies([add_global_step]):
          self.update = tf.group(*update_ops, name='update_op')
      
      self.train_summary = self._get_train_summary()
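model_helper.gradient_clip and model_helper.allreduce_tensors above are project-specific helpers that are not shown in this excerpt. A rough sketch of the clipping step, under the assumption that it simply wraps tf.clip_by_global_norm per tower and also returns the per-tower norms:

import tensorflow as tf

def gradient_clip(list_grads, max_gradient_norm=None):
    """Illustrative stand-in: clip each tower's gradient list by global norm."""
    clipped_grads, norms = [], []
    for grads in list_grads:
        if max_gradient_norm is None:
            clipped_grads.append(grads)
            norms.append(tf.global_norm(grads))
        else:
            clipped, norm = tf.clip_by_global_norm(grads, max_gradient_norm)
            clipped_grads.append(clipped)
            norms.append(norm)
    return clipped_grads, norms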
Example #5
    def __init__(self, create_fn, embeddings, labels, **kwargs):
        super(ClassifyParallelModel, self).__init__()
        # We need to remove these because we may be calling back to our caller, and we need
        # the condition of calling to be non-parallel
        gpus = kwargs.pop('gpus', -1)
        # If the gpu ID is set to -1, use CUDA_VISIBLE_DEVICES to figure it out
        if gpus == -1:
            gpus = len(os.getenv('CUDA_VISIBLE_DEVICES', os.getenv('NV_GPU', '0')).split(','))
        print('Num GPUs', gpus)

        self.labels = labels
        nc = len(labels)

        self.saver = None
        self.replicas = []

        self.mxlen = int(kwargs.get('mxlen', 100))
        self.mxwlen = int(kwargs.get('mxwlen', 40))

        # This only exists to make exporting easier
        self.pdrop_value = kwargs.get('dropout', 0.5)
        # This only exists to make exporting easier
        self.x = kwargs.get('x', tf.placeholder(tf.int32, [None, self.mxlen], name="x_parallel"))
        self.y = kwargs.get('y', tf.placeholder(tf.int32, [None, nc], name="y_parallel"))
        self.lengths = kwargs.get('lengths', tf.placeholder(tf.int32, [None], name="lengths_parallel"))
        self.pkeep = kwargs.get('pkeep', tf.placeholder_with_default(1.0, shape=(), name="pkeep"))
        self.pdrop_value = kwargs.get('dropout', 0.5)

        x_splits = tf.split(self.x, gpus)
        y_splits = tf.split(self.y, gpus)
        lengths_splits = tf.split(self.lengths, gpus)
        xch_splits = None
        c2v = embeddings.get('char')
        if c2v is not None:
            self.xch = kwargs.get('xch', tf.placeholder(tf.int32, [None, self.mxlen, self.mxwlen], name='xch_parallel'))
            xch_splits = tf.split(self.xch, gpus)

        losses = []
        self.labels = labels

        with tf.Session(config=tf.ConfigProto(allow_soft_placement=True, log_device_placement=False)) as sess:
            with tf.device(tf.DeviceSpec(device_type="CPU")):
                self.inference = create_fn(embeddings, labels, sess=sess, **kwargs)
            for i in range(gpus):
                with tf.device(tf.DeviceSpec(device_type='GPU', device_index=i)):
                    replica = create_fn(embeddings, labels, sess=sess, x=x_splits[i], y=y_splits[i],
                                        xch=xch_splits[i] if xch_splits is not None else None,
                                        lengths=lengths_splits[i],
                                        pkeep=self.pkeep, **kwargs)
                    self.replicas.append(replica)
                    loss_op = replica.create_loss()
                    losses.append(loss_op)

            self.loss = tf.reduce_mean(tf.stack(losses))

        self.sess = sess
        self.best = self.inference.best
Example #6
  def _compute_loss(self, hparams, res):
    """Compute loss."""
    # Regression Loss
    with tf.name_scope("regression_loss"):
      loss_type = hparams.loss
      with tf.name_scope("target_placeholder"):
        target_phs = list_ops.list_placeholder(self.num_gpu, (None, self.target_length, self.target_dims), tf.float32)
      for ph in target_phs:
        tf.add_to_collection('placeholder', ph)
      
      with tf.name_scope("{:s}_loss".format(loss_type)):
        if loss_type == "l2":
          loss = list_ops.list_l2(target_phs, res)
        elif loss_type == "weighted_smooth_l1":
          loss = list_ops.list_weighted_smooth_l1(target_phs, res)
        else:
          raise ValueError("Unknown loss type {:s}".format(loss_type))

      with tf.name_scope("reduce_sum"):
        loss = list_ops.list_reduce_sum(loss)
      with tf.name_scope("cast"):
        batch_size_float = list_ops.list_cast(self.batch_size, tf.float32)
      with tf.name_scope("division"):
        list_regression_loss = list_ops.list_divide(loss, batch_size_float)
      with tf.device(tf.DeviceSpec(device_type="CPU", device_index=0)), tf.name_scope("reduce_mean"):
        self.regression_loss = tf.reduce_mean(list_regression_loss)

    # Weight Decay Loss
    with tf.name_scope("weight_decay_loss"):
      all_decay_losses = tf.losses.get_regularization_losses()
      if len(all_decay_losses):
        list_regs = [list(filter(lambda x: "tower_{:d}".format(gpu_idx) in x.name, all_decay_losses)) for gpu_idx in range(self.num_gpu)]
        with tf.name_scope("add_n"):
          list_decay_loss = list_ops.list_add_n(list_regs)

      else:
        list_decay_loss = list_ops.list_zeros_like([np.float32(0.0) for _ in range(self.num_gpu)])
      self.decay_loss = list_decay_loss[0]

    # Total Loss
    with tf.name_scope("total_loss"):
      list_total_loss = [*zip(list_regression_loss, list_decay_loss)]
      with tf.name_scope("add_n"):
        list_total_loss = list_ops.list_add_n(list_total_loss)
      with tf.device(tf.DeviceSpec(device_type="CPU", device_index=0)), tf.name_scope("reduce_mean"):
        self.total_loss = tf.reduce_mean(list_total_loss)

    # Define TB loss summaries
    tf.summary.scalar("all_tower_mean", self.total_loss, family='total_loss')
    [tf.summary.scalar("tower_{:d}".format(gpu_idx), list_total_loss[gpu_idx], family='total_loss') for gpu_idx in range(self.num_gpu)]
    tf.summary.scalar("all_tower_mean", self.regression_loss, family='regression_loss')
    [tf.summary.scalar("tower_{:d}".format(gpu_idx), list_regression_loss[gpu_idx], family='regression_loss') for gpu_idx in range(self.num_gpu)]
    tf.summary.scalar("weight_decay_loss", self.decay_loss, family='regularization_loss')

    return list_total_loss, list_regression_loss, list_decay_loss
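list_ops is a project-specific module whose helpers map an op over per-tower lists (compare list_tile in Example #29 below). A rough sketch of the convention, assuming each helper applies its op element-wise on the corresponding GPU:

import tensorflow as tf

def list_add_n(list_inputs):
    """Per-tower tf.add_n: entry i sums tower i's tensors (illustrative stand-in)."""
    outputs = []
    for gpu_idx, tensors in enumerate(list_inputs):
        with tf.device(tf.DeviceSpec(device_type="GPU", device_index=gpu_idx)):
            outputs.append(tf.add_n(list(tensors)))
    return outputs

def list_divide(list_numer, list_denom):
    """Per-tower element-wise division, e.g. loss / batch_size."""
    return [x / y for x, y in zip(list_numer, list_denom)]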
Example #7
    def stop_chief(self, server, sess=None, stop_workers=True):
        num_workers = self.num_workers
        chief_devtask = tf.DeviceSpec(job=JobType.worker, task=0)
        queue_from_workers = [
            create_done_queue_task(
                chief_devtask, shared_name='done_queue_worker_{}'.format(ii))
            for ii in range(1, num_workers)
        ]

        sess = self.get_session(server) if sess is None else sess
        # MAKE SURE ALL THE WORKERS ARE DONE BEFORE STOPPING
        # for iw, qfw in enumerate(queue_from_workers):
        for qfw in queue_from_workers:
            # RECEIVE SIGNAL FROM WORKERS.
            # if sess is None:
            #     with self.get_session(server) as sess:
            #         sess.run(qfw.dequeue())
            # else:
            sess.run(qfw.dequeue())

            # print("CHIEF {} RECEIVED DONE FROM WORKER {}. QUITTING"
            #       .format(qfw, iw), file=sys.stderr)

        # SEND SIGNALS TO EVERYONE ELSE TO QUIT
        num_ps = self.num_ps
        num_workers = self.num_workers
        enq_ops = []

        ps_devtasklist = [
            tf.DeviceSpec(job=JobType.ps, task=ii) for ii in range(num_ps)
        ]
        wrk_devtasklist = [
            tf.DeviceSpec(job=JobType.worker, task=ii)
            for ii in range(1, num_workers)
        ]
        # STOP WORKERS FIRST BEFORE PS
        if stop_workers:
            devtasklist = wrk_devtasklist + ps_devtasklist
        else:
            devtasklist = ps_devtasklist

        for q in create_done_queues_chief(devtasklist):
            qop = q.enqueue(1)
            enq_ops.append(qop)

        if sess is None:
            # config = server.server_def.default_session_config
            # with tf.Session(server.target, config=config) as sess:
            with self.get_session(server) as sess:
                for op in enq_ops:
                    sess.run(op)
        else:
            for op in enq_ops:
                sess.run(op)
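create_done_queue_task and create_done_queues_chief are not included in the excerpt. The usual way to implement this shutdown signalling is a shared FIFOQueue pinned to the device of the task being signalled; a minimal sketch under that assumption:

import tensorflow as tf

def create_done_queue_task(devtask, shared_name=None):
    """One queue living on `devtask`; the owner dequeues, the signaller enqueues (illustrative)."""
    if shared_name is None:
        shared_name = 'done_queue_{}_{}'.format(devtask.job, devtask.task)
    with tf.device(devtask):
        return tf.FIFOQueue(1, tf.int32, shared_name=shared_name)

def create_done_queues_chief(devtasklist):
    """Queues owned by every ps/worker task the chief wants to stop."""
    return [create_done_queue_task(d) for d in devtasklist]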
Example #8
def get_device_spec(device, next_=True):
    global current_index
    if device in ('cpu', 'CPU'):
        device_spec = tf.DeviceSpec(device_type='CPU', device_index=0)
    else:
        device_spec = tf.DeviceSpec(device_type=device['name'], device_index=current_index)
        if next_:
            current_index = current_index + 1
            current_index = current_index % device['count']
    LOGGER.debug(device_spec.to_string())
    return device_spec
Example #9
    def get_device_spec(device):
        global GPU_INDEX
        if device in ('cpu', 'CPU'):
            device_spec = tf.DeviceSpec(device_type='CPU', device_index=0)
        else:
            device_spec = tf.DeviceSpec(
                device_type=device['name'], device_index=GPU_INDEX)
            GPU_INDEX += 1
            GPU_INDEX %= device['count']

        LOGGER.debug(device_spec.to_string())
        return device_spec
Example #10
def maybe_device_gpu(device_index=0):
    if USE_DEVICE == defines.DEVICE_GPU:
        if not tf.test.is_built_with_cuda():
            print('WARNING: TensorFlow was not built with CUDA; '
                  'falling back to CPU mode.')
            return defines.DEVICE_CPU
        if not tf.test.is_gpu_available():
            print('WARNING: No GPU is available; falling back to CPU mode.')
            return defines.DEVICE_CPU
        return tf.device(
            tf.DeviceSpec(device_type=defines.DEVICE_GPU,
                          device_index=device_index))
    return tf.device(
        tf.DeviceSpec(device_type=defines.DEVICE_CPU, device_index=0))
Example #11
def make_parallel(model_fn, features, labels, mode, params, num_gpus):
    with tf.device(tf.DeviceSpec(device_type="CPU", device_index=0)):
        split_features = {
            k: tf.split(v, num_gpus) for k, v in features.items()
        }
        split_labels = {k: tf.split(v, num_gpus) for k, v in labels.items()}

    predictions = collections.defaultdict(list)
    losses = []
    tower_grads_and_vars = []

    for i in range(num_gpus):
        with tf.device(tf.DeviceSpec(device_type="GPU", device_index=i)):
            with tf.name_scope("tower_%d" % i) as name_scope:
                with tf.variable_scope(tf.get_variable_scope(), reuse=i > 0):
                    device_features = {
                        k: v[i] for k, v in split_features.items()
                    }
                    device_labels = {k: v[i] for k, v in split_labels.items()}

                    device_predictions, device_loss, device_metrics = model_fn(
                        device_features, device_labels, mode, params)

                    if i == 0:
                        eval_metrics = device_metrics
                        update_ops = tf.get_collection(tf.GraphKeys.UPDATE_OPS,
                                                       name_scope)

                        reg_losses = tf.get_collection(
                            tf.GraphKeys.REGULARIZATION_LOSSES, name_scope)

                    for k, v in device_predictions.items():
                        predictions[k].append(v)

                    if device_loss is not None:
                        losses.append(device_loss)

                        device_all_vars = tf.trainable_variables()
                        device_grads = tf.gradients(
                            device_loss, device_all_vars)
                        device_grads_and_vars = list(
                            zip(device_grads, device_all_vars))

                        tower_grads_and_vars.append(device_grads_and_vars)

    for k, v in predictions.items():
        predictions[k] = tf.concat(v, axis=0)

    return predictions, losses, reg_losses, update_ops, eval_metrics, tower_grads_and_vars
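The returned tower_grads_and_vars is usually averaged before a single opt.apply_gradients call. A standard sketch (assuming every tower produces a gradient for every trainable variable; the helper name is made up here):

import tensorflow as tf

def average_tower_grads(tower_grads_and_vars):
    """Average per-tower [(grad, var), ...] lists into one [(grad, var), ...] list."""
    averaged = []
    for grad_and_vars in zip(*tower_grads_and_vars):
        grads = [tf.expand_dims(g, 0) for g, _ in grad_and_vars]
        mean_grad = tf.reduce_mean(tf.concat(grads, axis=0), axis=0)
        averaged.append((mean_grad, grad_and_vars[0][1]))  # variables are shared across towers
    return averaged

# e.g. train_op = opt.apply_gradients(average_tower_grads(tower_grads_and_vars), global_step=step)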
Example #12
def parallelize(fn, num_gpus, **kwargs):
    """Parallelizes a tensorflow function

    Args:
        fn (callable): A function taking keyword arguments
        num_gpus (int): The number of GPUs to parallelize on.
        kwargs: Keyword arguments for fn. The values should be tensors, because
            they have to be split into num_gpus parts.

    Returns:
        list: A list of tensors containing the concatenated results of the
            parallel function calls.

    """
    parts = {}
    for k, v in iteritems(kwargs):
        parts[k] = tf.split(v, num_gpus)

    output = []
    for g in range(num_gpus):
        with tf.device(tf.DeviceSpec(device_type='GPU', device_index=g)):
            # all gpus use vars of gpu 0
            with tf.variable_scope(tf.get_variable_scope(), reuse=g > 0):
                output.append(fn(**{k: v[g] for k, v in iteritems(parts)}))
    output = [outp for outp in zip(*output)]
    concat_output = []
    for outp in output:
        # can't concat scalars, so use stack instead
        if isinstance(outp[0], list):
            concat_output.append(outp)
        elif outp[0].get_shape().ndims == 0:
            concat_output.append(tf.stack(outp))
        else:
            concat_output.append(tf.concat(outp, axis=0))
    return concat_output
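A toy call of parallelize, assuming two visible GPUs; fn, x, and y below are made up for the illustration. Per-example outputs come back concatenated over the batch, while scalar outputs come back stacked per GPU:

import tensorflow as tf

def fn(x, y):
    logits = tf.layers.dense(x, 1)
    per_example_loss = tf.squeeze(tf.square(logits - y), axis=-1)                  # rank 1 -> concatenated
    accuracy = tf.reduce_mean(tf.cast(tf.equal(tf.round(logits), y), tf.float32))  # rank 0 -> stacked
    return per_example_loss, accuracy

x = tf.placeholder(tf.float32, [None, 8])
y = tf.placeholder(tf.float32, [None, 1])
loss, per_gpu_accuracy = parallelize(fn, 2, x=x, y=y)   # loss: [batch], per_gpu_accuracy: [2]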
Example #13
    def make_parallel(self, fn, num_gpus, **kwargs):
        """Parallelize given model on multiple gpu devices.
        adapted from: https://github.com/vahidk/EffectiveTensorflow#make_parallel
        """

        in_splits = {}
        for k, v in kwargs.items():
            if k in ('num_classes', 'is_training'):
                in_splits[k] = [v] * num_gpus
            elif type(v) is tf.SparseTensor:
                in_splits[k] = tf.sparse_split(sp_input=v,
                                               num_split=num_gpus,
                                               axis=0)
            else:
                in_splits[k] = tf.split(v, num_gpus)

        out_split = []
        for i in range(num_gpus):
            with tf.device(tf.DeviceSpec(device_type="GPU", device_index=i)):
                with tf.variable_scope(tf.get_variable_scope(),
                                       reuse=tf.AUTO_REUSE):
                    outputs = fn(**{k: v[i] for k, v in in_splits.items()})
                    for o in range(len(outputs)):
                        if o >= len(out_split):
                            out_split.append([])
                        out_split[o].append(outputs[o])

        return [tf.stack(o, axis=0) for o in out_split]
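An illustrative call with a hypothetical tower function (the names below are not from the project, and `model` stands for an instance of the class above); note that 'num_classes' and 'is_training' are broadcast to every GPU instead of being split, and each output comes back with a new leading per-GPU axis:

import tensorflow as tf

def tower_fn(inputs, num_classes, is_training):
    hidden = tf.layers.dropout(inputs, rate=0.3, training=is_training)
    logits = tf.layers.dense(hidden, num_classes)
    return logits, tf.nn.softmax(logits)

inputs = tf.placeholder(tf.float32, [None, 16])
logits, probs = model.make_parallel(tower_fn, num_gpus=2,
                                    inputs=inputs, num_classes=10, is_training=True)
# logits, probs: shape (2, batch/2, 10) -- stacked along the per-GPU axis.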
Example #14
    def get_allps_devlist(self):
        num_ps = self.num_ps

        ps_devtasklist = [
            tf.DeviceSpec(job=JobType.ps, task=ii) for ii in range(num_ps)
        ]
        return ps_devtasklist
Example #15
    def __init__(self,
                 output_actions_size,
                 thread_id=0,
                 device='cpu',
                 device_index=0,
                 learning_rate=0.0001,
                 beta=0.01):
        self.width, self.height, self.depth = 84, 84, 4
        self.thread_id = thread_id
        self.device_spec = tf.DeviceSpec(device_type=device,
                                         device_index=device_index)

        self.scope = 'net_' + str(thread_id)
        self.learning_rate = learning_rate
        self.beta = beta

        self.output_actions_size = output_actions_size

        with tf.device(self.device_spec), tf.variable_scope(
                self.scope) as scope:
            self.input_state = tf.placeholder(
                "float", [None, self.height, self.width, self.depth])
            self.advantage = tf.placeholder("float", [None])
            self.targets = tf.placeholder("float", [None])
            self.actions = tf.placeholder("float",
                                          [None, self.output_actions_size])

        self._build_graph()
Example #16
  def build_graph(self, hparams, scope=None):
    with tf.variable_scope("Model"):
      utils.print_out("# Creating {} graph ...".format(self.mode))
      
      is_training = (self.mode == tf.estimator.ModeKeys.TRAIN)
      
      # Encoder
      list_encoder_output, list_encoder_state = self._build_encoder(hparams, is_training)

      # Stop discriminator
      list_stop_score, list_classifier_result = self._build_stop_discriminator(hparams, list_encoder_output, is_training)

      # Decoder
      list_regression, _ = self._build_decoder(hparams, list_encoder_output, list_encoder_state, is_training)

      with tf.device(tf.DeviceSpec(device_type="CPU", device_index=0)), tf.name_scope("output"):
        # Concatenate final outputs from all devices
        self.regression = tf.concat(list_regression, axis=0)
        self.stop = tf.concat(list_classifier_result, axis=0)
     
      list_losses = None
      if self.mode != tf.estimator.ModeKeys.PREDICT:
        # Calculate loss in train and eval phase
        with tf.name_scope("loss"):
          list_losses = self._compute_loss(hparams, list_regression, list_stop_score)

    return (list_regression, list_classifier_result), list_losses
Example #17
    def __create_validate(self, depth_multiplier, is_reuse=False):
        # create network graph for validation
        logger.info(
            'creating a mobilenet graph for validation... is_reuse=%d' %
            (is_reuse))
        with tf.device(
                tf.DeviceSpec(device_type="GPU",
                              device_index=0)), tf.variable_scope('tower0'):
            self.output_valid, _ = self.__create_network_for_imagenet(
                self.ph_valid_image,
                is_training=self.is_training,
                is_reuse=is_reuse,
                depth_multiplier=depth_multiplier)

            # loss
            self.loss_valid = tf.nn.sparse_softmax_cross_entropy_with_logits(
                labels=self.ph_valid_label, logits=self.output_valid)
        self.acc_valid_top1 = tf.cast(tf.nn.in_top_k(self.output_valid,
                                                     self.ph_valid_label,
                                                     k=1),
                                      dtype=tf.float32)
        self.acc_valid_top5 = tf.cast(tf.nn.in_top_k(self.output_valid,
                                                     self.ph_valid_label,
                                                     k=5),
                                      dtype=tf.float32)
Example #18
    def call(self, features, training=False):
        device_spec = tf.DeviceSpec(device_type="CPU", device_index=0)

        with tf.device(device_spec):
            # shape: (B, T, E)
            vgids_emb, sequence_len = self.vgids_layer(features)
            vsids_emb, _ = self.vsids_layer(features)
            vcids_emb, _ = self.vcids_layer(features)
            vgprices_emb, _ = self.vgprices_layer(features)

            if training:
                add_mba_reg(self, features, vgids_emb,
                            'user.visited_goods_ids')
                add_mba_reg(self, features, vsids_emb, 'user.visited_shop_ids')
                add_mba_reg(self, features, vcids_emb, 'user.visited_cate_ids')
                add_mba_reg(self, features, vgprices_emb,
                            'user.visited_goods_prices')

            vgoods_shape = tf.shape(vgids_emb)
            query_emb = self.text_emb(features, self.query_layer,
                                      self.query_conv_layer, vgoods_shape[1])
            # shape: (B, T, E)
            user_behavior_rep = tf.concat(
                [vgids_emb, vsids_emb, vcids_emb, vgprices_emb, query_emb],
                axis=-1)
            # shape: (B, T, 64)
            user_behavior_rep = self.mlp(user_behavior_rep, training=training)
            return [user_behavior_rep, sequence_len]
Example #19
    def call(self, features, training=False):
        device_spec = tf.DeviceSpec(device_type="CPU", device_index=0)
        with tf.device(device_spec):
            # shape: (B, T, E)
            gids_emb, sequence_len = self.gids_layer(features)
            sids_emb, _ = self.sids_layer(features)
            cids_emb, _ = self.cids_layer(features)
            gprices_emb, _ = self.gprices_layer(features)
            rankpos_emb, _ = self.rankpos_layer(features)
            showpos_emb, _ = self.showpos_layer(features)
            if training:
                add_mba_reg(self, features, gids_emb, 'item.goods_ids')
                add_mba_reg(self, features, sids_emb, 'item.shop_ids')
                add_mba_reg(self, features, cids_emb, 'item.cate_ids')
                add_mba_reg(self, features, gprices_emb, 'item.goods_prices')

            title_emb = self.text_emb(features, self.title_layer,
                                      self.title_conv_layer, self.title_len,
                                      self.twe_dim)
            content_emb = self.text_emb(features, self.content_layer,
                                        self.content_conv_layer,
                                        self.content_len, self.cwe_dim)
            # shape: (B, T, E)
            items_rep = tf.concat([
                gids_emb, sids_emb, cids_emb, gprices_emb, title_emb,
                content_emb
            ],
                                  axis=-1)
            # modeling rank pos
            items_rep = rankpos_emb + items_rep
            # shape: (B, T, 64)
            items_rep = self.mlp(items_rep, training=training)
            return [items_rep, sequence_len, showpos_emb]
Example #20
    def Multigpu_train(model_fn, num_gpus, rgb_input, flow_input):

        in_splits = {}
        in_splits['rgb'] = tf.split(
            rgb_input, num_gpus) if rgb_input is not None else None
        in_splits['flow'] = tf.split(
            flow_input, num_gpus) if flow_input is not None else None

        out_split = []
        for i in range(num_gpus):
            if tf.test.is_built_with_cuda():
                device_type = 'GPU'
            else:
                device_type = 'CPU'
            with tf.device(
                    tf.DeviceSpec(device_type=device_type, device_index=i)):
                with tf.variable_scope(tf.get_variable_scope(),
                                       reuse=tf.AUTO_REUSE):
                    if in_splits['flow'] is None:
                        out_split.append(model_fn(in_splits['rgb'][i], None))
                    elif in_splits['rgb'] is None:
                        out_split.append(model_fn(None, in_splits['flow'][i]))
                    else:
                        out_split.append(
                            model_fn(in_splits['rgb'][i],
                                     in_splits['flow'][i]))
        out = tf.concat(out_split, axis=0)
        return out
Example #21
def make_parallel(
    num_gpus,
    images,
    questions,
    answers,
    phase_train,
):

    with tf.device('/cpu:0'):
        image = tf.split(images, num_gpus)
        answer = tf.split(answers, num_gpus)

        # question = tf.split(questions, num_gpus)

        question = tf.split(tf.reverse(questions, [-1]), num_gpus)
    loss_split = []
    mi_loss_split = []
    accuracy_split = []
    with tf.variable_scope(tf.get_variable_scope(), reuse=tf.AUTO_REUSE):
        for i in range(num_gpus):
            with tf.name_scope('Tower_%d' % i):
                with tf.device(tf.DeviceSpec(device_type='GPU',
                                             device_index=i)):
                    (cross_entropy, mi_loss, correct_prediction) = \
                        get_model(image[i], question[i], answer[i],
                                  phase_train)
                loss_split.append(cross_entropy)
                mi_loss_split.append(mi_loss)
                accuracy_split.append(correct_prediction)
    with tf.device('/cpu:0'):
        mean_loss = tf.reduce_mean(loss_split)
        mean_mi_loss = tf.reduce_mean(mi_loss_split)
        mean_accuracy = tf.reduce_mean(accuracy_split)
    return (mean_loss, mean_mi_loss, mean_accuracy)
Example #22
    def join(self, server, sess=None, exit_flag=True):
        # server.join()
        task_id = self.mytask_id
        jobtype = self.myjobtype

        if jobtype == JobType.worker:
            self._signal_chief(server, sess)

        mydevtask = tf.DeviceSpec(job=jobtype, task=task_id)
        queue = create_done_queue_task(mydevtask)

        # RECEIVE SIGNAL FROM CHIEF.
        if sess is None:
            # config = server.server_def.default_session_config
            # with tf.Session(server.target, config=config) as sess:
            with self.get_session(server) as sess:
                sess.run(queue.dequeue())
        else:
            sess.run(queue.dequeue())

        print("{} {} RECEIVED DONE. QUITTING".format(jobtype, task_id),
              file=sys.stderr)

        if exit_flag:
            sys.exit(0)
Example #23
    def __init__(self, feature_config, rate=0.3):
        super(UserBehaviorEmbedding, self).__init__()

        feature_configs = feature_config.get_feature_configs()
        self.query_len = feature_configs['user.query_word_ids']['query_len']
        device_spec = tf.DeviceSpec(device_type="CPU", device_index=0)
        with tf.device(device_spec):
            feature_columns = feature_config.get_feature_columns()
            self.vgids_layer = SequenceFeatures(
                [feature_columns.get('user.visited_goods_ids')])
            self.vsids_layer = SequenceFeatures(
                [feature_columns.get('user.visited_shop_ids')])
            self.vcids_layer = SequenceFeatures(
                [feature_columns.get('user.visited_cate_ids')])
            self.vgprices_layer = SequenceFeatures(
                [feature_columns.get('user.visited_goods_prices')])
            self.query_layer = SequenceFeatures(
                [feature_columns.get('user.query_word_ids')])

        # item text convolution layer
        self.query_conv_layer = QueryTextConv(FLAGS.qtxt_filters,
                                              FLAGS.qtxt_kernel_sizes,
                                              self.query_len)
        # multi-layer projection
        self.mlp_bn1 = tf.keras.layers.BatchNormalization(epsilon=1e-6)
        self.mlp_drop1 = tf.keras.layers.Dropout(rate=rate)
        self.mlp_dense1 = tf.keras.layers.Dense(FLAGS.be_filter_size,
                                                activation='relu')
        self.mlp_bn2 = tf.keras.layers.BatchNormalization(epsilon=1e-6)
        self.mlp_drop2 = tf.keras.layers.Dropout(rate=rate)
        self.mlp_dense2 = tf.keras.layers.Dense(FLAGS.hidden_size,
                                                activation='relu')
Example #24
def make_parallel(fn, num_gpus, **kwargs):

    in_splits = {}  # create empty dictionary

    # for each of the tensors in kwargs, create a split and add it to the dictionary
    for k, v in kwargs.items():
        in_splits[k] = tf.split(v, num_gpus)

    loss_split = []  # create empty list
    correct_split = []
    pred_split = []

    for i in range(num_gpus):
        with tf.device(tf.DeviceSpec(device_type="GPU", device_index=i)):
            # allow for variable reuse on GPUs beyond index 0
            with tf.variable_scope(tf.get_variable_scope(), reuse=i > 0):
                #pass the splits into the function and append results
                loss, correct_prediction, pred = fn(
                    **{k: v[i]
                       for k, v in in_splits.items()})
                loss_split.append(loss)
                correct_split.append(correct_prediction)
                pred_split.append(pred)

    return tf.concat(loss_split,
                     axis=0), tf.concat(correct_split,
                                        axis=0), tf.concat(pred_split, axis=0)
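An illustrative way to drive this variant, with a hypothetical tower function: the concatenated per-example losses are reduced once, and colocate_gradients_with_ops keeps each tower's backward ops on the GPU that ran its forward pass:

import tensorflow as tf

def tower_fn(images, labels):
    # Hypothetical tower: tiny classifier returning per-example loss, correctness, predictions.
    logits = tf.layers.dense(tf.layers.flatten(images), 10)
    loss = tf.nn.sparse_softmax_cross_entropy_with_logits(labels=labels, logits=logits)
    pred = tf.argmax(logits, axis=-1, output_type=tf.int32)
    return loss, tf.equal(pred, labels), pred

images = tf.placeholder(tf.float32, [None, 28, 28, 1])
labels = tf.placeholder(tf.int32, [None])
losses, correct, preds = make_parallel(tower_fn, 2, images=images, labels=labels)

total_loss = tf.reduce_mean(losses)
accuracy = tf.reduce_mean(tf.cast(correct, tf.float32))
train_op = tf.train.AdamOptimizer(1e-3).minimize(total_loss,
                                                 colocate_gradients_with_ops=True)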
Example #25
def model(input, targets, training, alpha, dropout=0.3, gpu_num=0):

    target15, target14, target13, target12, target11, target10 = targets

    print('input:', input.shape)

    input = tf.split(input, gpu_num)
    target15 = tf.split(target15, gpu_num)
    target14 = tf.split(target14, gpu_num)
    target13 = tf.split(target13, gpu_num)
    target12 = tf.split(target12, gpu_num)
    target11 = tf.split(target11, gpu_num)
    target10 = tf.split(target10, gpu_num)

    losses = []
    Decoded_all = []

    for gpu_id in range(int(gpu_num)):
        reuse = gpu_id > 0
        with tf.device(tf.DeviceSpec(device_type="GPU", device_index=gpu_id)):

            Decoded = en_decode(input[gpu_id], training, dropout, reuse)
            out15, out14, out13, out12, out11, out10 = Decoded

            loss = 0
            loss += abs_loss(out15, target15[gpu_id]) * pdims(out15) * 10
            loss += abs_loss(out14, target14[gpu_id]) * pdims(out14) * 2
            loss += abs_loss(out13, target13[gpu_id]) * pdims(out13)
            loss += abs_loss(out12, target12[gpu_id]) * pdims(out12)
            loss += abs_loss(out11, target11[gpu_id]) * pdims(out11)
            loss += abs_loss(out10, target10[gpu_id]) * pdims(out10)
            loss /= 100

            losses.append(loss)
            Decoded_all.append(Decoded)

    L2_loss = tf.losses.get_regularization_loss() * 1e-4

    loss = tf.reduce_mean(tf.stack(losses, axis=0))
    loss += L2_loss

    trainables = tf.trainable_variables()

    train_vgg = tf.train.MomentumOptimizer(tf.maximum(
        alpha / 2, 1e-7), 0.9).minimize(
            loss, var_list=[var for var in trainables if 'vgg' in var.name])
    train_others = tf.train.MomentumOptimizer(alpha, 0.9).minimize(
        loss, var_list=[var for var in trainables if 'vgg' not in var.name])
    train = tf.group(train_vgg, train_others)

    D = []
    for i in range(len(Decoded_all[0])):
        # for j in range(len(Decoded_all))
        outs = [Decoded_all[j][i] for j in range(len(Decoded_all))]
        outs = tf.concat(outs, axis=0)
        D.append(tf.nn.relu(outs))

    m = L2_loss
    return train, loss, D, m
Example #26
def make_parallel(model, num_gpus, imgs):
    out_split = []
    for i in range(num_gpus):
        with tf.device(tf.DeviceSpec(device_type="GPU", device_index=i)):
            with tf.variable_scope(tf.get_variable_scope(), reuse=i > 0):
                out_split.append(model(images=imgs[i]))
    # return tf.concat(out_split, axis=0)
    return out_split
Example #27
def distribution_gpus(num_gpus):
    if num_gpus == 1:
        return tf.contrib.distribute.OneDeviceStrategy(
            device=tf.DeviceSpec(device_type="GPU", device_index=0))
    elif num_gpus > 1:
        return tf.contrib.distribute.MirroredStrategy(num_gpus=num_gpus)
    else:
        return None
Example #28
    def _set_params_initializer(self, hparams, mode, scope):
        """Set various params for self and initialize."""
        self.mode = mode
        self.num_gpu = hparams.num_gpu
        with tf.device(tf.DeviceSpec(device_type="CPU", device_index=0)):
            with tf.variable_scope("training_parameters"):
                # Batch size placeholder
                self.batch_size = list(
                    tf.placeholder(dtype=tf.int32,
                                   shape=[],
                                   name="tower_{}_batch_size".format(gpu_idx))
                    for gpu_idx in range(self.num_gpu))

                # Learning rate
                self.learning_rate = tf.get_variable(
                    name="learning_rate",
                    initializer=hparams.learning_rate,
                    dtype=tf.float32,
                    trainable=False)
                with tf.name_scope("learning_rate_decay"):
                    self.decay_ratio = tf.placeholder(dtype=tf.float32,
                                                      shape=[],
                                                      name="lr_dacay_ratio")

                # Global step
                self.global_step = tf.get_variable(name="global_step",
                                                   initializer=np.array(
                                                       0, np.int64),
                                                   dtype=tf.int64,
                                                   trainable=False)

                with tf.device(tf.DeviceSpec(
                        device_type="CPU", device_index=0)), tf.variable_scope(
                            "batch_norm_decay"):
                    bn_momentum = tf.train.exponential_decay(
                        hparams.bn_init_decay,
                        self.global_step * hparams.batch_size,
                        hparams.bn_decay_step,
                        hparams.bn_decay_rate,
                        staircase=True)
                    self.bn_decay = tf.minimum(hparams.bn_decay_clip,
                                               1 - bn_momentum)

        # Initializer
        self.random_seed = hparams.random_seed
Example #29
def list_tile(list_input, multiples, new_scope=True):
  assert type(list_input) == list
  list_output = []

  for gpu_idx, inputs in enumerate(list_input):
    with tf.device(tf.DeviceSpec(device_type="GPU", device_index=gpu_idx)), tf.name_scope("tower_{:d}".format(gpu_idx) if new_scope else tf.get_default_graph().get_name_scope() + "/tower_{:d}/".format(gpu_idx)):
      list_output.append(tf.tile(inputs, multiples))
  
  return list_output
Example #30
  def _get_histogram_summary(self):
    with tf.device(tf.DeviceSpec(device_type="CPU", device_index=0)), tf.name_scope("activation_histogram"):
      rnn_state_stack = tf.stack(self.list_encoder_output, axis=0)
      feature_stack = tf.stack(self.list_global_feature, axis=0)
      merged_state_stack = tf.stack(self.list_merged_state, axis=0)

      return [tf.summary.merge([tf.summary.histogram("rnn_state", rnn_state_stack),
                                tf.summary.histogram("global_feature", feature_stack),
                                tf.summary.histogram("merged_feature", merged_state_stack)])]