Example #1
    def check_result(self, fn, place, dtype):
        shape = [9, 10]

        x_data = np.random.random(size=shape).astype(dtype)
        y_data = np.random.random(size=shape).astype(dtype)
        python_out = fn(x_data, y_data)

        x_var = layers.create_global_var(name='x',
                                         shape=shape,
                                         value=0.0,
                                         dtype=dtype,
                                         persistable=True)
        y_var = layers.create_global_var(name='y',
                                         shape=shape,
                                         value=0.0,
                                         dtype=dtype,
                                         persistable=True)
        out = fn(x_var, y_var)

        exe = fluid.Executor(place)

        exe.run(fluid.default_startup_program())
        fluid_out = exe.run(fluid.default_main_program(),
                            feed={
                                'x': x_data,
                                'y': y_data
                            },
                            fetch_list=[out])

        np.testing.assert_array_equal(python_out, fluid_out[0])
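A minimal driver for the helper above, sketched under the assumption that check_result lives on a unittest.TestCase subclass; the test name and the elementwise function are illustrative, not from the source. Since `fn` is applied to both numpy arrays and fluid Variables, a plain operator expression serves both paths.

    def test_elementwise_add(self):
        # Hypothetical test: '+' is overloaded for numpy arrays and for fluid
        # Variables alike, so the same callable drives both code paths above.
        self.check_result(lambda a, b: a + b, fluid.CPUPlace(), 'float32')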
Example #2
def _get_gm_cond_var(main_program, k_steps):
    main_block = main_program.global_block()
    # Add const var
    k_step_var = layers.create_global_var(name="gradient_merge_k",
                                          shape=[1],
                                          value=int(k_steps),
                                          dtype='int32',
                                          persistable=True,
                                          force_cpu=True)

    zero_var = layers.create_global_var(name="gradient_merge_zero",
                                        shape=[1],
                                        value=int(0),
                                        dtype='int32',
                                        persistable=True,
                                        force_cpu=True)

    # Add step var & cond var
    step_var = layers.create_global_var(name="gradient_merge_step",
                                        shape=[1],
                                        value=int(0),
                                        dtype='int32',
                                        persistable=True,
                                        force_cpu=True)

    cond_var = layers.create_global_var(name="gradient_merge_cond",
                                        shape=[1],
                                        value=bool(0),
                                        dtype='bool',
                                        persistable=False,
                                        force_cpu=True)

    with device_guard("cpu"):
        # step_var = (step_var + 1) % k_step
        layers.increment(x=step_var, value=1.0, in_place=True)
        main_block.append_op(type='elementwise_mod',
                             inputs={
                                 'X': step_var,
                                 'Y': k_step_var
                             },
                             outputs={'Out': step_var},
                             attrs={
                                 'axis': -1,
                                 'use_mkldnn': False
                             })

        # cond_var = (step_var == 0)
        main_block.append_op(type='equal',
                             inputs={
                                 'X': step_var,
                                 'Y': zero_var
                             },
                             outputs={'Out': cond_var})

    return cond_var
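A hedged sketch (not from the source) of how the returned cond_var is typically consumed in gradient merging, assuming the usual `import paddle.fluid as fluid` alongside the imports the example already relies on: the real parameter update only runs on steps where step % k_steps == 0.

cond = _get_gm_cond_var(fluid.default_main_program(), k_steps=4)

def _apply_merged_gradients():
    # Apply the accumulated (merged) gradients and reset the accumulators.
    # The body is left empty because the source does not show it.
    pass

layers.cond(cond, _apply_merged_gradients)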
Example #3
    def check_switch(self, value):
        x = layers.fill_constant(shape=[1], dtype='float32', value=value)

        zero_var = layers.fill_constant(shape=[1], dtype='float32', value=0.0)
        one_var = layers.fill_constant(shape=[1], dtype='float32', value=1.0)
        two_var = layers.fill_constant(shape=[1], dtype='float32', value=2.0)
        three_var = layers.fill_constant(shape=[1], dtype='float32', value=3.0)

        result = layers.create_global_var(
            shape=[1], value=-1.0, dtype='float32', persistable=True)

        with layers.Switch() as switch:
            with switch.case(layers.less_than(x, zero_var)):
                layers.assign(zero_var, result)
            with switch.case(layers.less_than(x, one_var)):
                layers.assign(one_var, result)
            with switch.case(layers.less_than(x, two_var)):
                layers.assign(two_var, result)
            with switch.default():
                layers.assign(three_var, result)

        cpu = core.CPUPlace()
        exe = Executor(cpu)
        exe.run(default_startup_program())

        out = exe.run(feed={}, fetch_list=[result])[0][0]
        return out
Example #4
def linear_warmup_and_cosine_decay(learning_rate, end_lr, warmup_steps,
                                   max_training_steps):
    """Applies linear warmup and cosine decay to the learning rate."""
    dtype = "float32"

    with fluid.default_main_program()._lr_schedule_guard():
        lr = layers.create_global_var(shape=[1],
                                      value=0.0,
                                      dtype=dtype,
                                      persistable=True,
                                      name="learning_rate")

        global_step = _decay_step_counter(1)

        with layers.control_flow.Switch() as switch:
            with switch.case(global_step < warmup_steps):
                warmup_lr = learning_rate * (global_step / warmup_steps)
                layers.assign(warmup_lr, lr)
            with switch.case(global_step < max_training_steps):
                frac = 0.5 * (ops.cos((global_step - warmup_steps) * math.pi /
                                      (max_training_steps - warmup_steps)) + 1)
                decayed_lr = end_lr + (learning_rate - end_lr) * frac
                layers.assign(decayed_lr, lr)
            with switch.default():
                learning_rate = layers.fill_constant(shape=[1],
                                                     dtype=dtype,
                                                     value=end_lr)
                layers.assign(learning_rate, lr)
        return lr
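A hedged usage sketch: the returned lr is an ordinary persistable Variable, so it can be handed straight to a fluid optimizer. The tiny network, the variable names, and the hyper-parameters below are illustrative, not from the source, and the example's own module-level imports (fluid, layers, ops, math, _decay_step_counter) are assumed.

main_prog, startup_prog = fluid.Program(), fluid.Program()
with fluid.program_guard(main_prog, startup_prog):
    x = fluid.data(name="x", shape=[None, 8], dtype="float32")
    loss = layers.reduce_mean(layers.fc(input=x, size=1))
    scheduled_lr = linear_warmup_and_cosine_decay(
        learning_rate=1e-4, end_lr=1e-6,
        warmup_steps=4000, max_training_steps=100000)
    fluid.optimizer.Adam(learning_rate=scheduled_lr).minimize(loss)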
Example #5
    def test_error(self):
        main_program = framework.Program()
        startup_program = framework.Program()
        with framework.program_guard(main_program, startup_program):
            cond = layers.fill_constant(shape=[1], dtype='float32', value=0.0)
            zero_var = layers.fill_constant(shape=[1],
                                            dtype='float32',
                                            value=0.0)

            result = layers.create_global_var(shape=[1],
                                              value=-1.0,
                                              dtype='float32',
                                              persistable=True)

            # 1. The type of 'condition' in case must be Variable.
            def test_condition_type():
                with layers.Switch() as switch:
                    with switch.case(1):
                        layers.assign(zero_var, result)

            self.assertRaises(TypeError, test_condition_type)

            # 2. The dtype of 'condition' in case must be 'bool'.
            def test_condition_dtype():
                with layers.Switch() as switch:
                    with switch.case(cond):
                        layers.assign(zero_var, result)

            self.assertRaises(TypeError, test_condition_dtype)
Example #6
    def _create_scale_from_constant(self, value):
        name = unique_name.generate('global_scale')
        return layers.create_global_var(name=name,
                                        shape=[1],
                                        dtype='float32',
                                        value=float(value),
                                        persistable=True)
Example #7
    def check_switch(self, value):
        x = layers.fill_constant(shape=[1], dtype='float32', value=value)
        zero_var = layers.fill_constant(shape=[1], dtype='float32', value=0.0)
        one_var = layers.fill_constant(shape=[1], dtype='float32', value=1.0)
        two_var = layers.fill_constant(shape=[1], dtype='float32', value=2.0)
        three_var = layers.fill_constant(shape=[1], dtype='float32', value=3.0)

        result = layers.create_global_var(shape=[1],
                                          value=-1.0,
                                          dtype='float32',
                                          persistable=True)

        with layers.Switch() as switch:
            with switch.case(layers.less_than(x, zero_var)):
                layers.assign(zero_var, result)
            with switch.case(layers.less_than(x, one_var)):
                layers.assign(one_var, result)
            with switch.case(layers.less_than(x, two_var)):
                layers.assign(two_var, result)
            with switch.default():
                layers.assign(three_var, result)

        cpu = core.CPUPlace()
        exe = Executor(cpu)
        exe.run(default_startup_program())

        out = exe.run(feed={}, fetch_list=[result])[0][0]
        return out
Example #8
    def _append_optimize_op(self, block, param_and_grad):
        one_var = paddle.ones(shape=[1], dtype='int32', name='lookahead_ones')
        zero_var = paddle.zeros(shape=[1],
                                dtype='int32',
                                name='lookahead_zeros')
        k_var = layers.create_global_var(
            name=unique_name.generate("lookahead_k"),
            shape=[1],
            value=self.k,
            dtype='int32',
            persistable=True)

        mod = paddle.remainder(self._global_step_var, k_var)

        cond_1 = paddle.equal(self._global_step_var, one_var)
        cond_1 = paddle.cast(cond_1, dtype='float32')

        cond_2 = paddle.equal(mod, zero_var)
        cond_2 = paddle.cast(cond_2, dtype='float32')

        slow_var = self._get_accumulator(self._slow_str, param_and_grad[0])

        tmp_var = cond_1 * param_and_grad[0] + (1 - cond_1) * slow_var
        paddle.assign(tmp_var, slow_var)

        tmp_var = self.alpha * param_and_grad[0] + (1.0 -
                                                    self.alpha) * slow_var
        tmp_var_1 = cond_2 * tmp_var + (1 - cond_2) * param_and_grad[0]
        paddle.assign(tmp_var_1, param_and_grad[0])

        tmp_var_1 = cond_2 * tmp_var + (1 - cond_2) * slow_var
        paddle.assign(tmp_var_1, slow_var)
Example #9
    def optimize(self, metrics):
        """
        Optimize the model by metrics (mainly `metrics["loss"]`).
        """
        # TODO: support dygraph
        if self.warmup_steps > 0:
            scheduled_lr = layers.learning_rate_scheduler.noam_decay(
                1 / (self.warmup_steps * (self.learning_rate**2)),
                self.warmup_steps)
        else:
            scheduled_lr = layers.create_global_var(
                name=fluid.unique_name.generate("learning_rate"),
                shape=[1],
                value=self.learning_rate,
                dtype="float32",
                persistable=True)
        grad_clip = fluid.clip.GradientClipByGlobalNorm(self.max_grad_norm)

        self.optimizer = AdamW(learning_rate=scheduled_lr,
                               grad_clip=grad_clip,
                               weight_decay=self.weight_decay)

        if self.is_distributed:
            self.optimizer = fleet.distributed_optimizer(
                self.optimizer, strategy=self.dist_strategy)

        self.optimizer.minimize(metrics["loss"])
        return scheduled_lr
Example #10
    def __init__(self, init_loss_scale=1.):
        super(StaticLossScale, self).__init__()
        self.scale = layers.create_global_var(
            name=unique_name.generate("loss_scale"),
            shape=[1],
            value=init_loss_scale,
            dtype='float32',
            persistable=True)
Example #11
    def __init__(self, init_loss_scale=2**15, increment_every=2000, factor=2.):
        super(DynamicLossScale, self).__init__()
        self.scale = layers.create_global_var(
            name=unique_name.generate("loss_scale"),
            shape=[1],
            value=init_loss_scale,
            dtype='float32',
            persistable=True)
        self.good_steps = layers.create_global_var(
            name=unique_name.generate("good_steps"),
            shape=[1],
            value=0,
            dtype='int32',
            persistable=True)
        self.increment_every = layers.fill_constant(
            shape=[1], dtype='int32', value=increment_every)
        self.factor = factor
Example #12
    def _create_ema_vars(self, param):
        param_ema = layers.create_global_var(
            name=unique_name.generate(self._name + param.name + '_ema'),
            shape=param.shape,
            value=0.0,
            dtype=param.dtype,
            persistable=True)

        return param_ema
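A hedged sketch (not from the source) of how such an EMA variable is usually refreshed each step; the helper name and the decay value are illustrative.

    def _append_ema_op(self, param, param_ema, decay=0.999):
        # Hypothetical helper: exponential moving average update written with
        # the scalar/Variable operators that fluid patches onto Variable.
        ema_t = decay * param_ema + (1.0 - decay) * param
        layers.assign(ema_t, param_ema)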
Example #13
    def _init_amp_var(self):
        # Ensure the data type of learning rate vars is float32 (same as the
        # master parameter dtype)
        if isinstance(self._optimizer._learning_rate, float):
            self._optimizer._learning_rate_map[default_main_program()] = \
                layers.create_global_var(
                    name=unique_name.generate("learning_rate"),
                    shape=[1],
                    value=float(self._optimizer._learning_rate),
                    dtype='float32',
                    persistable=True)
Example #14
    def test_eq(self):
        """
        Test queue_generator op, enqueue op and dequeue op.
        """

        main_program = fluid.Program()
        startup_program = fluid.Program()
        value = np.random.rand(1)
        with fluid.program_guard(main_program, startup_program):
            data_in = layers.create_global_var(shape=[2, 3],
                                               value=value,
                                               dtype="float32",
                                               persistable=True,
                                               name='var_in')
            data_out = layers.create_global_var(shape=[2, 3],
                                                value=value - 1.0,
                                                dtype="float32",
                                                persistable=True,
                                                name='var_out')
        startup_block = startup_program.block(0)
        queue_name = 'blocking_queue'
        startup_block.create_var(name=queue_name,
                                 persistable=True,
                                 type=core.VarDesc.VarType.RAW)
        startup_block.append_op(type="queue_generator",
                                attrs={'names': [queue_name]})
        block = main_program.block(0)
        block.append_op(type='enqueue',
                        inputs={'X': data_in},
                        attrs={'queue_name': queue_name})
        block.append_op(type='dequeue',
                        outputs={'Out': [data_out]},
                        attrs={'queue_name': queue_name})

        place = fluid.CUDAPlace(
            0) if core.is_compiled_with_cuda() else fluid.CPUPlace()
        exe = fluid.Executor(place)
        exe.run(startup_program)
        ret = exe.run(main_program, fetch_list=[data_out.name])
        self.assertTrue(
            np.allclose(np.asarray(ret), np.full((2, 3), value, np.float32)))
Example #15
    def optimize(self, metrics):
        """Optimize the model by loss.

        Args:
            metrics: A dict mapping metric names to corresponding metrics, which must include loss.
        """
        # TODO: support dygraph
        # lr scheduler
        if self.lr_scheduler == "noam" and self.warmup_steps <= 0:
            print(
                "[WARN] Using constant learning rate because of `warmup_steps` is not positive while using NoamScheduler."
            )
        if self.lr_scheduler == "noam" and self.warmup_steps > 0:
            scheduled_lr = layers.learning_rate_scheduler.noam_decay(
                1 / (self.warmup_steps * (self.learning_rate**2)),
                self.warmup_steps)
        elif self.lr_scheduler == "linear":
            scheduled_lr = lr_scheduler.linear_warmup_and_linear_decay(
                self.learning_rate, self.min_learning_rate, self.warmup_steps,
                self.max_training_steps)
        elif self.lr_scheduler == "cosine":
            scheduled_lr = lr_scheduler.linear_warmup_and_cosine_decay(
                self.learning_rate, self.min_learning_rate, self.warmup_steps,
                self.max_training_steps)
        else:  # constant
            scheduled_lr = layers.create_global_var(
                name=fluid.unique_name.generate("learning_rate"),
                shape=[1],
                value=self.learning_rate,
                dtype="float32",
                persistable=True)
        # grad norm
        if self.max_grad_norm > 0:
            grad_clip = fluid.clip.GradientClipByGlobalNorm(self.max_grad_norm)
        else:
            grad_clip = None

        # optimizer
        optimizer_cls = getattr(knover.optim, self.optimizer)
        optimizer = optimizer_cls(learning_rate=scheduled_lr,
                                  grad_clip=grad_clip,
                                  weight_decay=self.weight_decay,
                                  beta1=self.beta1,
                                  beta2=self.beta2)

        # distributed optimizer
        if self.is_distributed:
            optimizer = fleet.distributed_optimizer(
                optimizer, strategy=self.dist_strategy)

        optimizer.minimize(metrics["loss"])
        return scheduled_lr
Example #16
    def check_result(self, fn, place, dtype):
        shape = [9, 10]

        x_data = np.random.random(size=shape).astype(dtype)
        y_data = np.random.random(size=shape).astype(dtype)
        python_out = fn(x_data, y_data)

        x_var = layers.create_global_var(
            name='x', shape=shape, value=0.0, dtype=dtype, persistable=True)
        y_var = layers.create_global_var(
            name='y', shape=shape, value=0.0, dtype=dtype, persistable=True)
        out = fn(x_var, y_var)

        exe = fluid.Executor(place)

        exe.run(fluid.default_startup_program())
        fluid_out = exe.run(fluid.default_main_program(),
                            feed={'x': x_data,
                                  'y': y_data},
                            fetch_list=[out])

        np.testing.assert_array_equal(python_out, fluid_out[0])
Example #17
    def _increment_global_var(self):
        if self._global_step_var is None:
            self._global_step_var = layers.create_global_var(
                name=unique_name.generate("lookahead_step"),
                shape=[1],
                value=0,
                dtype='int32',
                persistable=True)

        self.helper.append_op(type='increment',
                              inputs={'X': [self._global_step_var]},
                              outputs={'Out': [self._global_step_var]},
                              attrs={'step': 1.0})
Example #18
def create_coalesce_program(grad_dict):
    coalesce_program = fluid.Program()
    in_vars = []
    out_vars = []
    with fluid.program_guard(coalesce_program):
        grad_out_dict = {}
        for name in grad_dict:
            grad = grad_dict[name]
            grad_in = layers.fill_constant(shape=grad.shape,
                                           dtype='float32',
                                           value=1)
            grad_out = layers.create_global_var(name='output_' + grad.name,
                                                shape=grad.shape,
                                                value=0,
                                                dtype='float32',
                                                persistable=True)
            in_vars.append(grad_in)
            out_vars.append(grad_out)
            grad_out_dict[name] = grad_out
        grad_fused = layers.create_global_var(name='fused_output',
                                              shape=[1],
                                              value=0,
                                              dtype='float32',
                                              persistable=True)
        coalesce_program.global_block().append_op(type='coalesce_tensor',
                                                  inputs={'Input': in_vars},
                                                  outputs={
                                                      'Output': out_vars,
                                                      'FusedOutput': grad_fused
                                                  },
                                                  attrs={
                                                      'copy_data':
                                                      False,
                                                      'dtype':
                                                      core.VarDesc.VarType.FP32
                                                  })
        fused_shape = layers.shape(grad_fused)
    return coalesce_program, grad_out_dict, grad_fused, fused_shape
Example #19
    def __init__(self,
                 d_model,
                 warmup_steps,
                 learning_rate=0.001,
                 current_steps=0,
                 name="learning_rate"):
        self.current_steps = current_steps
        self.warmup_steps = warmup_steps
        self.d_model = d_model
        self.static_lr = learning_rate
        self.learning_rate = layers.create_global_var(
            name=name,
            shape=[1],
            value=float(learning_rate),
            dtype="float32",
            persistable=True)
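A hypothetical companion method, not shown in the source, that writes a classic Noam/Transformer schedule into the persistable learning_rate variable created above; it assumes `import numpy as np` and `import paddle.fluid as fluid`.

    def update(self):
        # Hypothetical: compute the Noam learning rate on the Python side and
        # copy it into the persistable variable before the next executor step.
        self.current_steps += 1
        lr_value = (self.d_model ** -0.5) * min(
            self.current_steps ** -0.5,
            self.current_steps * self.warmup_steps ** -1.5) * self.static_lr
        tensor = fluid.global_scope().find_var(
            self.learning_rate.name).get_tensor()
        tensor.set(np.array([lr_value], dtype="float32"), fluid.CPUPlace())
        return lr_value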
Example #20
    def _create_master_weight(self, param):
        assert isinstance(self.helper, LayerHelper)

        var_name = param.name + "_fp32_master"
        var_name = unique_name.generate(var_name)
        var = layers.create_global_var(name=var_name,
                                       shape=param.shape,
                                       value=0,
                                       dtype='float32',
                                       persistable=True)
        block = self.helper.startup_program.global_block()
        block.append_op(type="cast",
                        inputs={"X": [param]},
                        outputs={"Out": [var]},
                        attrs={
                            "in_dtype": param.dtype,
                            "out_dtype": core.VarDesc.VarType.FP32
                        })
        self._master_weights[param.name] = var
        return var
Example #21
def linear_warmup_and_invsqrt_decay(learning_rate, warmup_steps, decay_steps):
    """Applies linear warmup and invsqrt decay to the learning rate."""
    dtype = "float32"

    with fluid.default_main_program()._lr_schedule_guard():
        lr = layers.create_global_var(shape=[1],
                                      value=0.0,
                                      dtype=dtype,
                                      persistable=True,
                                      name="learning_rate")

        global_step = _decay_step_counter(1)

        with layers.control_flow.Switch() as switch:
            with switch.case(global_step < warmup_steps):
                warmup_lr = learning_rate * (global_step / warmup_steps)
                layers.assign(warmup_lr, lr)
            with switch.default():
                decayed_lr = lr * ops.sqrt(
                    decay_steps / (global_step - warmup_steps + decay_steps))
                layers.assign(decayed_lr, lr)
        return lr
Example #22
    def minimize_impl(self,
                      loss,
                      startup_program=None,
                      parameter_list=None,
                      no_grad_set=None):
        minimized = self.inner_opt.minimize(loss,
                                            startup_program=startup_program)

        k_steps_value = self.user_defined_strategy.localsgd_configs['k_steps']
        begin_step_value = self.user_defined_strategy.localsgd_configs[
            'begin_step']

        if startup_program is None:
            startup_program = default_startup_program()
        main_block = loss.block

        self.nrings = 2
        collective_helper = CollectiveHelper(self.role_maker, self.nrings)
        collective_helper.update_startup_program(startup_program)
        p2s = self.create_snapshot_vars(startup_program)
        self.init_snapshot_vars(startup_program, p2s)

        p2s = self.create_snapshot_vars(main_block.program)
        with program_guard(main_block.program, startup_program):
            step = layers.autoincreased_step_counter(begin=1)
            k_steps = layers.create_global_var(name="k_steps",
                                               shape=[1],
                                               value=k_steps_value,
                                               dtype='int64',
                                               persistable=True)

            begin_step = layers.create_global_var(name="begin_step",
                                                  shape=[1],
                                                  value=begin_step_value,
                                                  dtype='int64',
                                                  persistable=True)

            last_step = layers.create_global_var(name="last_step",
                                                 shape=[1],
                                                 value=begin_step_value,
                                                 dtype='int64',
                                                 persistable=True)

            def communicate():
                sub_block = default_main_program().current_block()
                ring_id = -1
                for param, snapshot in p2s:
                    sub_block.append_op(type='elementwise_sub',
                                        inputs={
                                            'X': [snapshot],
                                            'Y': [param]
                                        },
                                        outputs={'Out': [param]},
                                        attrs={OP_ROLE_KEY: OpRole.Optimize})
                    sub_block.append_op(type='c_sync_calc_stream',
                                        inputs={'X': param},
                                        outputs={'Out': param},
                                        attrs={OP_ROLE_KEY: OpRole.Optimize})
                    ring_id = (ring_id + 1) % self.nrings
                    sub_block.append_op(type='c_allreduce_sum',
                                        inputs={'X': [param]},
                                        outputs={'Out': [param]},
                                        attrs={
                                            'ring_id': ring_id,
                                            OP_ROLE_KEY: OpRole.Optimize
                                        })

                for ring_id in range(self.nrings):
                    sub_block.append_op(type='c_sync_comm_stream',
                                        inputs={'X': param},
                                        outputs={'Out': param},
                                        attrs={
                                            'ring_id': ring_id,
                                            OP_ROLE_KEY: OpRole.Optimize
                                        })

                for param, snapshot in p2s:
                    sub_block.append_op(type='scale',
                                        inputs={'X': [param]},
                                        outputs={'Out': [param]},
                                        attrs={
                                            'scale':
                                            1.0 /
                                            self.role_maker._worker_num(),
                                            OP_ROLE_KEY:
                                            OpRole.Optimize
                                        })
                    sub_block.append_op(type='elementwise_sub',
                                        inputs={
                                            'X': [snapshot],
                                            'Y': [param]
                                        },
                                        outputs={'Out': [param]},
                                        attrs={OP_ROLE_KEY: OpRole.Optimize})
                    sub_block.append_op(type='assign',
                                        inputs={'X': [param]},
                                        outputs={'Out': [snapshot]},
                                        attrs={OP_ROLE_KEY: OpRole.Optimize})
                layers.assign(step, last_step)

            def begin_localsgd():
                layers.cond(step - last_step == k_steps, communicate)

            layers.cond(step > begin_step, begin_localsgd, communicate)
        return minimized
Example #23
    def minimize_impl(self,
                      loss,
                      startup_program=None,
                      parameter_list=None,
                      no_grad_set=None):
        minimized = self.inner_opt.minimize(loss,
                                            startup_program=startup_program)

        init_k_steps = self.user_defined_strategy.adaptive_localsgd_configs[
            'init_k_steps']
        begin_step_value = self.user_defined_strategy.adaptive_localsgd_configs[
            'begin_step']

        if startup_program is None:
            startup_program = default_startup_program()
        main_block = loss.block

        self.nrings = 2
        collective_helper = CollectiveHelper(self.role_maker, self.nrings)
        collective_helper.update_startup_program(startup_program)
        p2s = self.create_snapshot_vars(startup_program)
        self.init_snapshot_vars(startup_program, p2s)

        p2s = self.create_snapshot_vars(main_block.program)
        with program_guard(main_block.program, startup_program):
            step = layers.autoincreased_step_counter(begin=1)

            k_steps = layers.create_global_var(name="k_steps",
                                               shape=[1],
                                               value=int(init_k_steps),
                                               dtype='int64',
                                               persistable=True)

            begin_step = layers.create_global_var(name="begin_step",
                                                  shape=[1],
                                                  value=int(begin_step_value),
                                                  dtype='int64',
                                                  persistable=True)

            last_step = layers.create_global_var(name="last_step",
                                                 shape=[1],
                                                 value=int(0),
                                                 dtype='int64',
                                                 persistable=True)

            avg_loss = layers.create_global_var(name="avg_loss",
                                                shape=[1],
                                                value=float(0),
                                                dtype=loss.dtype,
                                                persistable=True)

            lr_0 = layers.create_global_var(name="lr_0",
                                            shape=[1],
                                            value=float(0),
                                            dtype='float32',
                                            persistable=True)

            loss_0 = layers.create_global_var(name="loss_0",
                                              shape=[1],
                                              value=float(0),
                                              dtype='float32',
                                              persistable=True)

            global_lr = self.inner_opt._global_learning_rate()

            def initialize():
                self._generate_avg_loss(main_block, loss, avg_loss)
                layers.assign(avg_loss, loss_0)
                layers.assign(global_lr, lr_0)

            layers.cond(step == 1, initialize)

            def communicate():
                sub_block = default_main_program().current_block()
                ring_id = -1
                for param, snapshot in p2s:
                    sub_block.append_op(type='elementwise_sub',
                                        inputs={
                                            'X': [snapshot],
                                            'Y': [param]
                                        },
                                        outputs={'Out': [param]},
                                        attrs={OP_ROLE_KEY: OpRole.Optimize})
                    sub_block.append_op(type='c_sync_calc_stream',
                                        inputs={'X': param},
                                        outputs={'Out': param},
                                        attrs={OP_ROLE_KEY: OpRole.Optimize})
                    ring_id = (ring_id + 1) % self.nrings
                    sub_block.append_op(type='c_allreduce_sum',
                                        inputs={'X': [param]},
                                        outputs={'Out': [param]},
                                        attrs={
                                            'ring_id': ring_id,
                                            OP_ROLE_KEY: OpRole.Optimize
                                        })

                for ring_id in range(self.nrings):
                    sub_block.append_op(type='c_sync_comm_stream',
                                        inputs={'X': param},
                                        outputs={'Out': param},
                                        attrs={
                                            'ring_id': ring_id,
                                            OP_ROLE_KEY: OpRole.Optimize
                                        })

                for param, snapshot in p2s:
                    sub_block.append_op(type='scale',
                                        inputs={'X': [param]},
                                        outputs={'Out': [param]},
                                        attrs={
                                            'scale':
                                            1.0 /
                                            self.role_maker._worker_num(),
                                            OP_ROLE_KEY:
                                            OpRole.Optimize
                                        })
                    sub_block.append_op(type='elementwise_sub',
                                        inputs={
                                            'X': [snapshot],
                                            'Y': [param]
                                        },
                                        outputs={'Out': [param]},
                                        attrs={OP_ROLE_KEY: OpRole.Optimize})
                    sub_block.append_op(type='assign',
                                        inputs={'X': [param]},
                                        outputs={'Out': [snapshot]},
                                        attrs={OP_ROLE_KEY: OpRole.Optimize})
                layers.assign(step, last_step)

            def communicate_avg_loss():
                communicate()
                self._generate_avg_loss(main_block, loss, avg_loss)
                next_local_steps = layers.cast(layers.ceil(
                    layers.sqrt(lr_0 * avg_loss / (global_lr * loss_0) *
                                float(init_k_steps))),
                                               dtype='int64')
                max_local_steps = layers.fill_constant(shape=[1],
                                                       dtype='int64',
                                                       value=16)
                min_local_steps = layers.fill_constant(shape=[1],
                                                       dtype='int64',
                                                       value=1)
                next_local_steps = layers.elementwise_min(
                    next_local_steps, max_local_steps)
                next_local_steps = layers.elementwise_max(
                    next_local_steps, min_local_steps)
                layers.assign(next_local_steps, k_steps)

            def begin_localsgd():
                layers.cond(step - last_step == k_steps, communicate_avg_loss)

            layers.cond(step > begin_step, begin_localsgd, communicate)

        return minimized
Example #24
def network(batch_size, items_num, hidden_size, step, rate):
    stdv = 1.0 / math.sqrt(hidden_size)

    items = layers.data(
        name="items",
        shape=[batch_size, -1, 1],
        dtype="int64",
        append_batch_size=False)  #[bs, uniq_max, 1]
    seq_index = layers.data(
        name="seq_index",
        shape=[batch_size, -1],
        dtype="int64",
        append_batch_size=False)  #[-1(seq_max)*batch_size, 1]
    last_index = layers.data(
        name="last_index",
        shape=[batch_size],
        dtype="int64",
        append_batch_size=False)  #[batch_size, 1]
    adj_in = layers.data(
        name="adj_in",
        shape=[batch_size, -1, -1],
        dtype="float32",
        append_batch_size=False)
    adj_out = layers.data(
        name="adj_out",
        shape=[batch_size, -1, -1],
        dtype="float32",
        append_batch_size=False)
    mask = layers.data(
        name="mask",
        shape=[batch_size, -1, 1],
        dtype="float32",
        append_batch_size=False)
    label = layers.data(
        name="label",
        shape=[batch_size, 1],
        dtype="int64",
        append_batch_size=False)

    items_emb = layers.embedding(
        input=items,
        is_sparse=True,
        param_attr=fluid.ParamAttr(
            name="emb",
            learning_rate=rate,
            initializer=fluid.initializer.Uniform(
                low=-stdv, high=stdv)),
        size=[items_num, hidden_size])  #[batch_size, uniq_max, h]
    data_feed = [items, seq_index, last_index, adj_in, adj_out, mask, label]

    pre_state = items_emb
    for i in range(step):
        pre_state = layers.reshape(
            x=pre_state, shape=[batch_size, -1, hidden_size])
        state_in = layers.fc(
            input=pre_state,
            name="state_in",
            size=hidden_size,
            act=None,
            num_flatten_dims=2,
            param_attr=fluid.ParamAttr(initializer=fluid.initializer.Uniform(
                low=-stdv, high=stdv)),
            bias_attr=fluid.ParamAttr(initializer=fluid.initializer.Uniform(
                low=-stdv, high=stdv)))  #[batch_size, uniq_max, h]
        state_out = layers.fc(
            input=pre_state,
            name="state_out",
            size=hidden_size,
            act=None,
            num_flatten_dims=2,
            param_attr=fluid.ParamAttr(initializer=fluid.initializer.Uniform(
                low=-stdv, high=stdv)),
            bias_attr=fluid.ParamAttr(initializer=fluid.initializer.Uniform(
                low=-stdv, high=stdv)))  #[batch_size, uniq_max, h]

        state_adj_in = layers.matmul(adj_in,
                                     state_in)  #[batch_size, uniq_max, h]
        state_adj_out = layers.matmul(adj_out,
                                      state_out)  #[batch_size, uniq_max, h]

        gru_input = layers.concat([state_adj_in, state_adj_out], axis=2)

        gru_input = layers.reshape(x=gru_input, shape=[-1, hidden_size * 2])
        gru_fc = layers.fc(input=gru_input,
                           name="gru_fc",
                           size=3 * hidden_size,
                           bias_attr=False)
        pre_state, _, _ = fluid.layers.gru_unit(
            input=gru_fc,
            hidden=layers.reshape(
                x=pre_state, shape=[-1, hidden_size]),
            size=3 * hidden_size)

    final_state = pre_state
    seq_index = layers.reshape(seq_index, shape=[-1])
    seq = layers.gather(final_state, seq_index)  #[batch_size*-1(seq_max), h]
    last = layers.gather(final_state, last_index)  #[batch_size, h]

    seq = layers.reshape(
        seq, shape=[batch_size, -1, hidden_size])  #[batch_size, -1(seq_max), h]
    last = layers.reshape(
        last, shape=[batch_size, hidden_size])  #[batch_size, h]

    seq_fc = layers.fc(
        input=seq,
        name="seq_fc",
        size=hidden_size,
        bias_attr=False,
        act=None,
        num_flatten_dims=2,
        param_attr=fluid.ParamAttr(initializer=fluid.initializer.Uniform(
            low=-stdv, high=stdv)))  #[batch_size, -1(seq_max), h]
    last_fc = layers.fc(input=last,
                        name="last_fc",
                        size=hidden_size,
                        bias_attr=False,
                        act=None,
                        num_flatten_dims=1,
                        param_attr=fluid.ParamAttr(
                            initializer=fluid.initializer.Uniform(
                                low=-stdv, high=stdv)))  #[batch_size, h]

    seq_fc_t = layers.transpose(
        seq_fc, perm=[1, 0, 2])  #[-1(seq_max), batch_size, h]
    add = layers.elementwise_add(seq_fc_t,
                                 last_fc)  #[-1(seq_max), batch_size, h]
    b = layers.create_parameter(
        shape=[hidden_size],
        dtype='float32',
        default_initializer=fluid.initializer.Constant(value=0.0))  #[h]
    add = layers.elementwise_add(add, b)  #[-1(seq_max), batch_size, h]

    add_sigmoid = layers.sigmoid(add)  #[-1(seq_max), batch_size, h] 
    add_sigmoid = layers.transpose(
        add_sigmoid, perm=[1, 0, 2])  #[batch_size, -1(seq_max), h]

    weight = layers.fc(input=add_sigmoid,
                       name="weight_fc",
                       size=1,
                       act=None,
                       num_flatten_dims=2,
                       bias_attr=False,
                       param_attr=fluid.ParamAttr(
                           initializer=fluid.initializer.Uniform(
                               low=-stdv, high=stdv)))  #[batch_size, -1, 1]
    weight *= mask
    weight_mask = layers.elementwise_mul(seq, weight, axis=0)
    global_attention = layers.reduce_sum(weight_mask, dim=1)

    final_attention = layers.concat(
        [global_attention, last], axis=1)  #[batch_size, 2*h]
    final_attention_fc = layers.fc(
        input=final_attention,
        name="fina_attention_fc",
        size=hidden_size,
        bias_attr=False,
        act=None,
        param_attr=fluid.ParamAttr(initializer=fluid.initializer.Uniform(
            low=-stdv, high=stdv)))  #[batch_size, h]

    all_vocab = layers.create_global_var(
        shape=[items_num - 1, 1],
        value=0,
        dtype="int64",
        persistable=True,
        name="all_vocab")

    all_emb = layers.embedding(
        input=all_vocab,
        is_sparse=True,
        param_attr=fluid.ParamAttr(
            name="emb",
            learning_rate=rate,
            initializer=fluid.initializer.Uniform(
                low=-stdv, high=stdv)),
        size=[items_num, hidden_size])  #[all_vocab, h]

    logits = layers.matmul(
        x=final_attention_fc, y=all_emb,
        transpose_y=True)  #[batch_size, all_vocab]
    softmax = layers.softmax_with_cross_entropy(
        logits=logits, label=label)  #[batch_size, 1]
    loss = layers.reduce_mean(softmax)  # [1]
    #fluid.layers.Print(loss)
    acc = layers.accuracy(input=logits, label=label, k=20)
    return loss, acc, data_feed, [items_emb, all_emb]
Example #25
def _get_gm_cond_var(main_program, k_steps, dist_context):
    main_block = main_program.global_block()
    # Add const var
    k_step_var = layers.create_global_var(name="gradient_merge_k",
                                          shape=[1],
                                          value=int(k_steps),
                                          dtype='int32',
                                          persistable=True,
                                          force_cpu=True)
    set_var_dist_attr(dist_context, k_step_var, [-1],
                      world_process_group.ranks)

    zero_var = layers.create_global_var(name="gradient_merge_zero",
                                        shape=[1],
                                        value=int(0),
                                        dtype='int32',
                                        persistable=True,
                                        force_cpu=True)
    set_var_dist_attr(dist_context, zero_var, [-1], world_process_group.ranks)

    # Add step var & cond var
    step_var = layers.create_global_var(name="gradient_merge_step",
                                        shape=[1],
                                        value=int(0),
                                        dtype='int32',
                                        persistable=True,
                                        force_cpu=True)
    set_var_dist_attr(dist_context, step_var, [-1], world_process_group.ranks)

    cond_var = main_block.create_var(name="gradient_merge_cond",
                                     shape=[1],
                                     dtype='bool')
    set_var_dist_attr(dist_context, cond_var, [-1], world_process_group.ranks)

    with device_guard("cpu"):
        # step_var = (step_var + 1) % k_step
        layers.increment(x=step_var, value=1.0, in_place=True)
        elementwise_mod_op = main_block.append_op(type='elementwise_mod',
                                                  inputs={
                                                      'X': step_var,
                                                      'Y': k_step_var
                                                  },
                                                  outputs={'Out': step_var},
                                                  attrs={
                                                      'axis': -1,
                                                      'use_mkldnn': False
                                                  })
        naive_set_dist_op_attr_for_program_by_mesh_and_mapping(
            elementwise_mod_op, world_process_group.ranks, [-1], dist_context)

        # cond_var = (step_var == 0)
        equal_op = main_block.append_op(type='equal',
                                        inputs={
                                            'X': step_var,
                                            'Y': zero_var
                                        },
                                        outputs={'Out': cond_var})
        naive_set_dist_op_attr_for_program_by_mesh_and_mapping(
            equal_op, world_process_group.ranks, [-1], dist_context)

    return cond_var
Example #26
def optimization(loss,
                 warmup_steps,
                 num_train_steps,
                 learning_rate,
                 train_program,
                 startup_prog,
                 weight_decay,
                 scheduler='linear_warmup_decay',
                 use_fp16=False,
                 init_loss_scaling=128,
                 incr_every_n_steps=1000,
                 decr_every_n_nan_or_inf=2,
                 incr_ratio=2.0,
                 decr_ratio=0.8):
    """do backword for static"""
    def exclude_from_weight_decay(param):
        # strip the AMP master-weight suffix, if present
        name = param.name
        if name.endswith('.master'):
            name = name[:-len('.master')]
        if name.find("layer_norm") > -1:
            return True
        bias_suffix = ["_bias", "_b", ".b_0"]
        for suffix in bias_suffix:
            if name.endswith(suffix):
                return True
        return False

    if warmup_steps > 0:
        if scheduler == 'noam_decay':
            scheduled_lr = L.learning_rate_scheduler\
             .noam_decay(1/(warmup_steps *(learning_rate ** 2)),
                         warmup_steps)
        elif scheduler == 'linear_warmup_decay':
            scheduled_lr = linear_warmup_decay(learning_rate, warmup_steps,
                                               num_train_steps)
        else:
            raise ValueError("Unkown learning rate scheduler, should be "
                             "'noam_decay' or 'linear_warmup_decay'")
        log.debug('using Adam')
        optimizer = F.optimizer.Adam(learning_rate=scheduled_lr)
    else:
        scheduled_lr = L.create_global_var(
            name=F.unique_name.generate("learning_rate"),
            shape=[1],
            value=learning_rate,
            dtype='float32',
            persistable=True)
        log.debug('using Adam')

        optimizer = F.optimizer.Adam(learning_rate=scheduled_lr)
        optimizer._learning_rate_map[F.default_main_program()] = scheduled_lr

    if use_fp16:
        log.info('AMP activated')
        optimizer = F.contrib.mixed_precision.decorate(
            optimizer,
            amp_lists=F.contrib.mixed_precision.AutoMixedPrecisionLists(
                custom_black_varnames={"loss"},
                custom_black_list={'layer_norm', 'arg_max', 'argmax'}),
            init_loss_scaling=init_loss_scaling,
            use_dynamic_loss_scaling=True,
        )
        loss_scaling = optimizer.get_loss_scaling()
    else:
        loss_scaling = None

    F.clip.set_gradient_clip(clip=F.clip.GradientClipByGlobalNorm(
        clip_norm=1.0))

    param_list = {}

    for param in train_program.global_block().all_parameters():
        param_list[param.name] = param * 1.0
        param_list[param.name].stop_gradient = True

    _, param_grads = optimizer.minimize(loss)

    if weight_decay > 0:
        for param, grad in param_grads:
            if exclude_from_weight_decay(param):
                continue
            with param.block.program._optimized_guard(
                [param, grad]), F.framework.name_scope("weight_decay"):
                updated_param = param - param_list[
                    param.name] * weight_decay * scheduled_lr
                L.assign(output=param, input=updated_param)

    return scheduled_lr, loss_scaling
Example #27
def network(items_num, hidden_size, step, bs):
    stdv = 1.0 / math.sqrt(hidden_size)

    items = fluid.data(name="items", shape=[bs, -1],
                       dtype="int64")  #[batch_size, uniq_max]
    seq_index = fluid.data(name="seq_index", shape=[bs, -1, 2],
                           dtype="int32")  #[batch_size, seq_max, 2]
    last_index = fluid.data(name="last_index", shape=[bs, 2],
                            dtype="int32")  #[batch_size, 2]
    adj_in = fluid.data(name="adj_in", shape=[bs, -1, -1],
                        dtype="float32")  #[batch_size, seq_max, seq_max]
    adj_out = fluid.data(name="adj_out", shape=[bs, -1, -1],
                         dtype="float32")  #[batch_size, seq_max, seq_max]
    mask = fluid.data(name="mask", shape=[bs, -1, 1],
                      dtype="float32")  #[batch_size, seq_max, 1]
    label = fluid.data(name="label", shape=[bs, 1],
                       dtype="int64")  #[batch_size, 1]

    datas = [items, seq_index, last_index, adj_in, adj_out, mask, label]
    py_reader = fluid.io.DataLoader.from_generator(capacity=256,
                                                   feed_list=datas,
                                                   iterable=False)
    feed_datas = datas

    items_emb = fluid.embedding(
        input=items,
        param_attr=fluid.ParamAttr(name="emb",
                                   initializer=fluid.initializer.Uniform(
                                       low=-stdv, high=stdv)),
        size=[items_num, hidden_size])  #[batch_size, uniq_max, h]

    pre_state = items_emb
    for i in range(step):
        pre_state = layers.reshape(x=pre_state, shape=[bs, -1, hidden_size])
        state_in = layers.fc(
            input=pre_state,
            name="state_in",
            size=hidden_size,
            act=None,
            num_flatten_dims=2,
            param_attr=fluid.ParamAttr(
                initializer=fluid.initializer.Uniform(low=-stdv, high=stdv)),
            bias_attr=fluid.ParamAttr(initializer=fluid.initializer.Uniform(
                low=-stdv, high=stdv)))  #[batch_size, uniq_max, h]
        state_out = layers.fc(
            input=pre_state,
            name="state_out",
            size=hidden_size,
            act=None,
            num_flatten_dims=2,
            param_attr=fluid.ParamAttr(
                initializer=fluid.initializer.Uniform(low=-stdv, high=stdv)),
            bias_attr=fluid.ParamAttr(initializer=fluid.initializer.Uniform(
                low=-stdv, high=stdv)))  #[batch_size, uniq_max, h]

        state_adj_in = layers.matmul(adj_in,
                                     state_in)  #[batch_size, uniq_max, h]
        state_adj_out = layers.matmul(adj_out,
                                      state_out)  #[batch_size, uniq_max, h]

        gru_input = layers.concat([state_adj_in, state_adj_out], axis=2)

        gru_input = layers.reshape(x=gru_input, shape=[-1, hidden_size * 2])
        gru_fc = layers.fc(input=gru_input,
                           name="gru_fc",
                           size=3 * hidden_size,
                           bias_attr=False)
        pre_state, _, _ = fluid.layers.gru_unit(input=gru_fc,
                                                hidden=layers.reshape(
                                                    x=pre_state,
                                                    shape=[-1, hidden_size]),
                                                size=3 * hidden_size)

    final_state = layers.reshape(pre_state, shape=[bs, -1, hidden_size])
    seq = layers.gather_nd(final_state, seq_index)
    last = layers.gather_nd(final_state, last_index)

    seq_fc = layers.fc(
        input=seq,
        name="seq_fc",
        size=hidden_size,
        bias_attr=False,
        act=None,
        num_flatten_dims=2,
        param_attr=fluid.ParamAttr(initializer=fluid.initializer.Uniform(
            low=-stdv, high=stdv)))  #[batch_size, seq_max, h]
    last_fc = layers.fc(
        input=last,
        name="last_fc",
        size=hidden_size,
        bias_attr=False,
        act=None,
        num_flatten_dims=1,
        param_attr=fluid.ParamAttr(initializer=fluid.initializer.Uniform(
            low=-stdv, high=stdv)))  #[batch_size, h]

    seq_fc_t = layers.transpose(seq_fc, perm=[1, 0,
                                              2])  #[seq_max, batch_size, h]
    add = layers.elementwise_add(seq_fc_t, last_fc)  #[seq_max, batch_size, h]
    b = layers.create_parameter(
        shape=[hidden_size],
        dtype='float32',
        default_initializer=fluid.initializer.Constant(value=0.0))  #[h]
    add = layers.elementwise_add(add, b)  #[seq_max, batch_size, h]

    add_sigmoid = layers.sigmoid(add)  #[seq_max, batch_size, h]
    add_sigmoid = layers.transpose(add_sigmoid,
                                   perm=[1, 0, 2])  #[batch_size, seq_max, h]

    weight = layers.fc(
        input=add_sigmoid,
        name="weight_fc",
        size=1,
        act=None,
        num_flatten_dims=2,
        bias_attr=False,
        param_attr=fluid.ParamAttr(initializer=fluid.initializer.Uniform(
            low=-stdv, high=stdv)))  #[batch_size, seq_max, 1]
    weight *= mask
    weight_mask = layers.elementwise_mul(seq, weight,
                                         axis=0)  #[batch_size, seq_max, h]
    global_attention = layers.reduce_sum(weight_mask, dim=1)  #[batch_size, h]

    final_attention = layers.concat([global_attention, last],
                                    axis=1)  #[batch_size, 2*h]
    final_attention_fc = layers.fc(
        input=final_attention,
        name="final_attention_fc",
        size=hidden_size,
        bias_attr=False,
        act=None,
        param_attr=fluid.ParamAttr(initializer=fluid.initializer.Uniform(
            low=-stdv, high=stdv)))  #[batch_size, h]

    all_vocab = layers.create_global_var(shape=[items_num - 1],
                                         value=0,
                                         dtype="int64",
                                         persistable=True,
                                         name="all_vocab")

    all_emb = fluid.embedding(input=all_vocab,
                              param_attr=fluid.ParamAttr(
                                  name="emb",
                                  initializer=fluid.initializer.Uniform(
                                      low=-stdv, high=stdv)),
                              size=[items_num, hidden_size])  #[all_vocab, h]

    logits = layers.matmul(x=final_attention_fc, y=all_emb,
                           transpose_y=True)  #[batch_size, all_vocab]
    softmax = layers.softmax_with_cross_entropy(logits=logits,
                                                label=label)  #[batch_size, 1]
    loss = layers.reduce_mean(softmax)  # [1]
    acc = layers.accuracy(input=logits, label=label, k=50)
    return loss, acc, py_reader, feed_datas, logits
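
The `all_vocab` variable above is created with value 0 and is meant to be
filled with every item id at runtime, so that the final matmul scores the
session representation against the whole item embedding table. A minimal
sketch of that initialization, assuming items are indexed 1..items_num-1 and
the startup program has already run (the helper name is illustrative):

    import numpy as np
    import paddle.fluid as fluid

    def init_all_vocab(place, items_num):
        # Fill the persistable "all_vocab" tensor with all item ids in [1, items_num).
        all_vocab_tensor = fluid.global_scope().var("all_vocab").get_tensor()
        all_vocab_tensor.set(np.arange(1, items_num).astype("int64"), place)
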
Example #28
0
def constant(name, value, dtype, hide_batch_size=True):
    """Create constant variable with given data.

    This function helps to create constants variable with
    given numpy.ndarray data.

    Args:
        name: variable name

        value: numpy.ndarray the value of constant

        dtype: the type of constant

        hide_batch_size: If set the first dimenstion as unknown, the explicit
                         batch size may cause some error in paddle. For example,
                         when the value has a shape of (batch_size, dim1, dim2),
                         it will return a variable with shape (-1, dim1, dim2).

    Return:
        A tuple contain the constant variable and the constant
        variable initialize function.

    Examples:
        .. code-block:: python

            import paddle.fluid as fluid
            place = fluid.CPUPlace()
            exe = fluid.Executor(place)
            constant_var, constant_var_init = constant(name="constant",
                              value=np.array([5.0],
                              dtype="float32"))
            exe.run(fluid.default_startup_program())
            # Run After default startup
            constant_var_init(place)

    """
    if not isinstance(value, np.ndarray):
        raise TypeError("value should be a numpy.ndarray.")

    value = value.astype(dtype)
    data = L.create_global_var(shape=value.shape,
                               value=0,
                               dtype=value.dtype,
                               name=name,
                               persistable=True)
    data.stop_gradient = True

    if hide_batch_size:
        shape = list(value.shape)
        shape[0] = -1
        data.desc.set_shape(shape)

    def initializer(place):
        if isinstance(place, fluid.CUDAPlace):
            pass
        elif isinstance(place, fluid.CUDAPinnedPlace):
            pass
        elif isinstance(place, fluid.CPUPlace):
            pass
        else:
            raise TypeError(
                "The input of initializer is not in"
                " [fluid.CUDAPlace, fluid.CPUPlace, fluid.CUDAPinnedPlace]")
        var = fluid.global_scope().var(data.name).get_tensor()
        var.set(value, place)

    return data, initializer
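
A hypothetical usage sketch for the helper above (the name `adj` and the
(3, 4) shape are made up for illustration): the declared shape hides the batch
dimension, while the initializer copies the real data into the global scope.

    import numpy as np
    import paddle.fluid as fluid

    place = fluid.CPUPlace()
    exe = fluid.Executor(place)

    adj_var, adj_init = constant(name="adj",
                                 value=np.ones((3, 4)),
                                 dtype="float32")
    print(adj_var.shape)  # (-1, 4) because hide_batch_size defaults to True

    exe.run(fluid.default_startup_program())
    adj_init(place)  # copies the numpy data into the scope tensor
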
Example #29
0
    def _create_gm_cond(self, main_block):
        # Add const var
        acc_step_var = layers.create_global_var(
            name="gradient_merge_acc_step",
            shape=[1],
            value=int(self._gradient_merge_acc_step),
            dtype='int32',
            persistable=True,
            force_cpu=True)

        zero_var = layers.create_global_var(name="gradient_merge_zero",
                                            shape=[1],
                                            value=int(0),
                                            dtype='int32',
                                            persistable=True,
                                            force_cpu=True)

        # Add step var & cond var
        current_step_var = layers.create_global_var(
            name="gradient_merge_current_step",
            shape=[1],
            value=int(0),
            dtype='int32',
            persistable=True,
            force_cpu=True)

        cond_var = layers.create_global_var(name="gradient_merge_cond",
                                            shape=[1],
                                            value=bool(0),
                                            dtype='bool',
                                            persistable=False,
                                            force_cpu=True)

        with device_guard("cpu"):
            # current_step_var = (current_step_var + 1) % acc_step_var
            main_block.append_op(type='increment',
                                 inputs={'X': [current_step_var]},
                                 outputs={'Out': [current_step_var]},
                                 attrs={
                                     'step': float(1),
                                     OP_ROLE_KEY: OpRole.Optimize
                                 })

            main_block.append_op(type='elementwise_mod',
                                 inputs={
                                     'X': current_step_var,
                                     'Y': acc_step_var
                                 },
                                 outputs={'Out': current_step_var},
                                 attrs={
                                     'axis': -1,
                                     OP_ROLE_KEY: OpRole.Optimize,
                                     'use_mkldnn': False
                                 })

            # cond_var = (current_step_var == 0)
            main_block.append_op(type='equal',
                                 inputs={
                                     'X': current_step_var,
                                     'Y': zero_var
                                 },
                                 outputs={'Out': cond_var},
                                 attrs={OP_ROLE_KEY: OpRole.Optimize})
        # paddle.static.Print(current_step_var, message="in FWBW last conditional")
        return cond_var
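
The same counter pattern can be written with the public layers API instead of
raw append_op calls. A minimal standalone sketch, assuming k = 4 accumulation
steps (the variable names here are illustrative, not the ones the optimizer
registers):

    import paddle.fluid as fluid
    from paddle.fluid import layers

    k_var = layers.create_global_var(shape=[1], value=4, dtype='int32',
                                     persistable=True, force_cpu=True)
    step_var = layers.create_global_var(shape=[1], value=0, dtype='int32',
                                        persistable=True, force_cpu=True)

    # step = (step + 1) % k; cond becomes True on every k-th call
    layers.increment(x=step_var, value=1.0, in_place=True)
    layers.assign(layers.elementwise_mod(step_var, k_var), step_var)
    cond_var = layers.equal(step_var, layers.zeros(shape=[1], dtype='int32'))

    exe = fluid.Executor(fluid.CPUPlace())
    exe.run(fluid.default_startup_program())
    for i in range(8):
        cond, = exe.run(fetch_list=[cond_var])
        print(i + 1, bool(cond))  # True at steps 4 and 8
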
Example #30
0
    op_maker = core.op_proto_and_checker_maker
    op_role_key = op_maker.kOpRoleAttrName()  # "op_role"
    op_role_var_key = op_maker.kOpRoleVarAttrName()  # "op_role_var"
    avgw_list = []
    for idx, op in list(enumerate(block.ops)):
        if _is_backward_op(op,
                           op_role_key) and op_role_var_key in op.attr_names:
            op_role_var = op.all_attrs()[op_role_var_key]
            if len(op_role_var) == 0:
                continue
            assert len(op_role_var) % 2 == 0
            for i in range(0, len(op_role_var), 2):
                param = block.vars[op_role_var[i]]
                avg_var = layers.create_global_var(name=param.name + "@avg",
                                                   shape=param.shape,
                                                   value=1.0,
                                                   dtype='float32',
                                                   persistable=True)
                avgw_list.append(avg_var)

                tmp0 = layers.elementwise_mul(avg_var, decay_var)
                tmp1 = layers.elementwise_mul(param, rev_decay_var)
                block.append_op(type='elementwise_add',
                                inputs={
                                    'X': tmp0,
                                    'Y': tmp1
                                },
                                outputs={'Out': avg_var},
                                stop_gradient=True)

# Executor declaration
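
Each parameter touched by a backward op above gets an "@avg" companion updated
as avg = decay * avg + (1 - decay) * param. A minimal standalone sketch of that
update rule for a single parameter, assuming decay = 0.99 (the decay_var and
rev_decay_var tensors in the loop presumably hold decay and 1 - decay):

    import paddle.fluid as fluid
    from paddle.fluid import layers

    param = layers.create_parameter(shape=[8], dtype='float32', name='w')
    avg_var = layers.create_global_var(name='w@avg', shape=[8], value=1.0,
                                       dtype='float32', persistable=True)
    decay_var = layers.fill_constant(shape=[1], dtype='float32', value=0.99)
    rev_decay_var = layers.fill_constant(shape=[1], dtype='float32', value=0.01)

    # avg_var <- decay * avg_var + (1 - decay) * param
    tmp0 = layers.elementwise_mul(avg_var, decay_var)
    tmp1 = layers.elementwise_mul(param, rev_decay_var)
    layers.assign(layers.elementwise_add(tmp0, tmp1), avg_var)
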
Example #31
0
    def init_fp16_params(self, loss_type, fp16_user_dict):
        # set default value for fp16_params_dict
        fp16_params_dict = dict()
        fp16_params_dict['init_loss_scaling'] = 1.0
        fp16_params_dict['incr_every_n_steps'] = 1000
        fp16_params_dict['decr_every_n_nan_or_inf'] = 2
        fp16_params_dict['incr_ratio'] = 2.0
        fp16_params_dict['decr_ratio'] = 0.5
        fp16_params_dict['use_dynamic_loss_scaling'] = True
        fp16_params_dict['amp_lists'] = None
        if fp16_user_dict is not None:
            # update fp16_params_dict
            for key in fp16_user_dict:
                if key in fp16_params_dict:
                    fp16_params_dict[key] = fp16_user_dict[key]
                else:
                    logging.warning(
                        "Can't find name '%s' in our fp16_params_dict. "
                        "Please check your dict key. You can set fp16 params only "
                        "in [init_loss_scaling, incr_every_n_steps, "
                        "decr_every_n_nan_or_inf, incr_ratio, decr_ratio, "
                        "use_dynamic_loss_scaling, amp_lists]" % (key))

        self._amp_lists = fp16_params_dict['amp_lists']
        if self._amp_lists is None:
            self._amp_lists = AutoMixedPrecisionLists()

        self._loss_type = loss_type
        self._loss_scaling = layers.create_global_var(
            name=unique_name.generate("loss_scaling"),
            shape=[1],
            value=fp16_params_dict['init_loss_scaling'],
            dtype='float32',
            persistable=True)
        self._use_dynamic_loss_scaling = fp16_params_dict[
            'use_dynamic_loss_scaling']
        if self._use_dynamic_loss_scaling:
            self._incr_every_n_steps = layers.fill_constant(
                shape=[1],
                dtype='int32',
                value=fp16_params_dict['incr_every_n_steps'])
            self._decr_every_n_nan_or_inf = layers.fill_constant(
                shape=[1],
                dtype='int32',
                value=fp16_params_dict['decr_every_n_nan_or_inf'])
            self._incr_ratio = fp16_params_dict['incr_ratio']
            self._decr_ratio = fp16_params_dict['decr_ratio']
            self._num_good_steps = layers.create_global_var(
                name=unique_name.generate("num_good_steps"),
                shape=[1],
                value=0,
                dtype='int32',
                persistable=True)
            self._num_bad_steps = layers.create_global_var(
                name=unique_name.generate("num_bad_steps"),
                shape=[1],
                value=0,
                dtype='int32',
                persistable=True)

        # Ensure the data type of learning rate vars is float32 (same as the
        # master parameter dtype)
        if isinstance(self._optimizer._learning_rate, float):
            self._optimizer._learning_rate_map[fluid.default_main_program()] = \
                        layers.create_global_var(
                        name=unique_name.generate("learning_rate"),
                        shape=[1],
                        value=float(self._optimizer._learning_rate),
                        dtype='float32',
                        persistable=True)
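
The dictionary above exposes the same knobs as fluid's built-in mixed precision
decorator, so an equivalent setup without a custom wrapper would look roughly
like this sketch (assuming the fluid.contrib.mixed_precision API of the 1.x
releases; the Adam optimizer is just a placeholder):

    import paddle.fluid as fluid
    from paddle.fluid.contrib.mixed_precision import decorate, AutoMixedPrecisionLists

    optimizer = fluid.optimizer.Adam(learning_rate=0.001)
    optimizer = decorate(optimizer,
                         amp_lists=AutoMixedPrecisionLists(),
                         init_loss_scaling=1.0,
                         incr_every_n_steps=1000,
                         decr_every_n_nan_or_inf=2,
                         incr_ratio=2.0,
                         decr_ratio=0.5,
                         use_dynamic_loss_scaling=True)
    # optimizer.minimize(loss) would then insert the fp16 casts and the
    # dynamic loss-scaling update ops into the program.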