def train(X, contents_Y, styles_Y, ctx, lr, num_epochs, lr_decay_epoch):
    X, styles_Y_gram, trainer = get_inits(X, ctx, lr, styles_Y)
    animator = d2l.Animator(xlabel='epoch', ylabel='loss',
                            xlim=[1, num_epochs],
                            legend=['content', 'style', 'TV'],
                            ncols=2, figsize=(7, 2.5))
    for epoch in range(1, num_epochs + 1):
        with autograd.record():
            contents_Y_hat, styles_Y_hat = extract_features(
                X, content_layers, style_layers)
            contents_l, styles_l, tv_l, l = compute_loss(
                X, contents_Y_hat, styles_Y_hat, contents_Y, styles_Y_gram)
        l.backward()
        trainer.step(1)
        nd.waitall()
        if epoch % lr_decay_epoch == 0:
            trainer.set_learning_rate(trainer.learning_rate * 0.1)
        if epoch % 10 == 0:
            animator.add(epoch, [nd.add_n(*contents_l).asscalar(),
                                 nd.add_n(*styles_l).asscalar(),
                                 tv_l.asscalar()])
        if epoch % 100 == 0:
            d2l.plt.imsave('neural-style' + str(epoch) + '.png',
                           postprocess(X).asnumpy())
    return X
def compute_loss(X, contents_Y_hat, styles_Y_hat, contents_Y, styles_Y_gram):
    # Calculate the content, style, and total variation losses respectively
    contents_l = [content_loss(Y_hat, Y) * content_weight for Y_hat, Y in zip(
        contents_Y_hat, contents_Y)]
    styles_l = [style_loss(Y_hat, Y) * style_weight for Y_hat, Y in zip(
        styles_Y_hat, styles_Y_gram)]
    tv_l = tv_loss(X) * tv_weight
    # Add up all the losses
    l = nd.add_n(*styles_l) + nd.add_n(*contents_l) + tv_l
    return contents_l, styles_l, tv_l, l
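
The helpers used above (content_loss, style_loss, tv_loss, and a gram matrix for the style targets) are not included in this snippet; the following is a minimal sketch in the spirit of the d2l style-transfer chapter, assuming 4-D NCHW feature maps and that content_weight, style_weight, and tv_weight are module-level constants.

from mxnet import nd

def content_loss(Y_hat, Y):
    # Mean squared error between composite and content feature maps
    return (Y_hat - Y).square().mean()

def gram(X):
    # Channel-by-channel Gram matrix, normalized by channels * spatial size
    num_channels, n = X.shape[1], X.size // X.shape[1]
    X = X.reshape((num_channels, n))
    return nd.dot(X, X.T) / (num_channels * n)

def style_loss(Y_hat, gram_Y):
    # Mean squared error between Gram matrices of composite and style features
    return (gram(Y_hat) - gram_Y).square().mean()

def tv_loss(Y_hat):
    # Total variation loss encourages neighboring pixels to have similar values
    return 0.5 * ((Y_hat[:, :, 1:, :] - Y_hat[:, :, :-1, :]).abs().mean() +
                  (Y_hat[:, :, :, 1:] - Y_hat[:, :, :, :-1]).abs().mean())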
Example #3
def compute_loss(X, contents_Y_hat, styles_Y_hat, contents_Y, styles_Y_gram):
    # Compute the content, style, and total variation losses separately
    contents_l = [content_loss(Y_hat, Y) * content_weight for Y_hat, Y in zip(
        contents_Y_hat, contents_Y)]
    styles_l = [style_loss(Y_hat, Y) * style_weight for Y_hat, Y in zip(
        styles_Y_hat, styles_Y_gram)]
    tv_l = tv_loss(X) * tv_weight
    # Sum up all of the losses
    l = nd.add_n(*styles_l) + nd.add_n(*contents_l) + tv_l
    return contents_l, styles_l, tv_l, l
def compute_loss(X, contents_Y_hat, styles_Y_hat, contents_Y, styles_Y_gram):
    contents_l = [
        content_loss(Y_hat, Y) * content_weight
        for Y_hat, Y in zip(contents_Y_hat, contents_Y)
    ]
    styles_l = [
        style_loss(Y_hat, Y) * style_weight
        for Y_hat, Y in zip(styles_Y_hat, styles_Y_gram)
    ]
    tv_l = tv_loss(X) * tv_weight
    l = nd.add_n(*styles_l) + nd.add_n(*contents_l) + tv_l
    return contents_l, styles_l, tv_l, l
Example #5
def corr2d_multi_in(X, K):
    """
    First traverse along the 0th (channel) dimension of X and K, then use * to
    unpack the resulting list into positional arguments of add_n to sum them.
    """
    # zip pairs the corresponding channels of X and K so both can be iterated together
    return nd.add_n(*[tool.corr2d(x, k) for x, k in zip(X, K)])
Example #6
    def forward(self, x_list):
        '''
        Parameters
        ----------
        x_list: list[mx.ndarray],
                shape is (batch_size, num_of_vertices,
                          num_of_features, num_of_timesteps)

        Returns
        ----------
        Y_hat: mx.ndarray,
               shape is (batch_size, num_of_vertices, num_for_prediction)

        '''
        if len(x_list) != len(self.submodules):
            raise ValueError("num of submodule not equals to "
                             "length of the input list")

        num_of_vertices_set = {i.shape[1] for i in x_list}
        if len(num_of_vertices_set) != 1:
            raise ValueError("Different num_of_vertices detected! "
                             "Check if your input data have same "
                             "size on axis 1.")

        batch_size_set = {i.shape[0] for i in x_list}
        if len(batch_size_set) != 1:
            raise ValueError("Input values must have same batch size!")

        submodule_outputs = [
            self.submodules[idx](x_list[idx]) for idx in range(len(x_list))
        ]

        return nd.add_n(*submodule_outputs)
def compute_loss(res_img, weights, contents_features_h, styles_features_h,
                 contents_features, styles_features_gram):
    content_weight, style_weight, tv_weight = weights
    contents_l = [
        content_loss(c_f_h, c_f) * content_weight
        for c_f_h, c_f in zip(contents_features_h, contents_features)
    ]
    contents_l = nd.add_n(*contents_l).asscalar()
    styles_l = [
        style_loss(s_f_h, s_f_gram) * style_weight
        for s_f_h, s_f_gram in zip(styles_features_h, styles_features_gram)
    ]
    styles_l = nd.add_n(*styles_l).asscalar()
    tv_l = (tv_loss(res_img) * tv_weight).asscalar()

    total_l = contents_l + styles_l + tv_l

    return total_l, contents_l, styles_l, tv_l
Example #8
def corr1d_multi_in(X, K):
    """
        多输入通道一维卷积
    :param X:
    :param K:
    :return:
    """
    # 首先沿着 X 和 Y 的第 0 维(通道维)遍历,使用 * 将结果列表变成 add_n 函数的位置参数来进行相加
    return nd.add_n(*[corr1d(x, k) for x, k in zip(X, K)])
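
corr1d itself is not shown here; below is a minimal single-channel 1-D cross-correlation consistent with the call above, plus a quick check on the classic d2l example data.

from mxnet import nd

def corr1d(X, K):
    # Single-channel 1-D cross-correlation
    w = K.shape[0]
    Y = nd.zeros(X.shape[0] - w + 1)
    for i in range(Y.shape[0]):
        Y[i] = (X[i:i + w] * K).sum()
    return Y

X = nd.array([[0, 1, 2, 3, 4, 5, 6],
              [1, 2, 3, 4, 5, 6, 7],
              [2, 3, 4, 5, 6, 7, 8]])
K = nd.array([[1, 2], [3, 4], [-1, -3]])
print(corr1d_multi_in(X, K))  # [2. 8. 14. 20. 26. 32.]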
Example #9
def train(X, contents_Y, styles_Y, ctx, lr, max_epochs, lr_decay_epoch):
    X, styles_Y_gram, trainer = get_inits(X, ctx, lr, styles_Y)
    for i in range(max_epochs):
        start = time.time()
        with autograd.record():
            contents_Y_hat, styles_Y_hat = extract_features(
                X, content_layers, style_layers)
            contents_l, styles_l, tv_l, l = compute_loss(
                X, contents_Y_hat, styles_Y_hat, contents_Y, styles_Y_gram)
        l.backward()
        trainer.step(1)
        nd.waitall()
        if i % 50 == 0 and i != 0:
            print('epoch %3d, content loss %.2f, style loss %.2f, '
                  'TV loss %.2f, %.2f sec' %
                  (i, nd.add_n(*contents_l).asscalar(),
                   nd.add_n(*styles_l).asscalar(), tv_l.asscalar(),
                   time.time() - start))
        if i % lr_decay_epoch == 0 and i != 0:
            trainer.set_learning_rate(trainer.learning_rate * 0.1)
            print('change lr to %.1e' % trainer.learning_rate)
    return X
Example #10
def test_ActivationRegularizationLoss(alpha: float):
    ar = ActivationRegularizationLoss(alpha, batch_axis=0)
    inputs = [
        nd.arange(1000).reshape(10, 10, 10),
        nd.arange(1000).reshape(10, 10, 10),
        nd.arange(1000).reshape(10, 10, 10),
    ]
    ar_result = ar(*inputs)
    outputs = [
        alpha * nd.mean((array * array), axis=0, exclude=True)
        for array in inputs
    ]
    assert np.isclose(nd.add_n(*outputs).asnumpy(), ar_result.asnumpy()).all()
Example #11
def test_TemporalActivationRegularizationLoss(beta: float):
    tar = TemporalActivationRegularizationLoss(beta, time_axis=1, batch_axis=0)
    inputs = [
        nd.arange(1000).reshape(10, 10, 10),
        nd.arange(1000).reshape(10, 10, 10),
        nd.arange(1000).reshape(10, 10, 10),
    ]
    tar_result = tar(*inputs)
    outputs = [
        beta * nd.mean(
            (array[:, 1:, :] - array[:, :-1, :]).__pow__(2),
            axis=0,
            exclude=True,
        ) for array in inputs
    ]
    assert np.isclose(nd.add_n(*outputs).asnumpy(), tar_result.asnumpy()).all()
Example #12
def corr2d_multi_in(X, K):
    # First traverse along the 0th (channel) dimension of X and K, then use * to
    # unpack the resulting list into positional arguments of add_n to sum them.
    #
    # Per-channel results of [d2l.corr2d(x, k) for x, k in zip(X, K)]:
    # [0]
    # [[19. 25.]
    #  [37. 43.]]
    # [1]
    # [[37. 47.]
    #  [67. 77.]]
    # [0] + [1] =
    # [[ 56.  72.]
    #  [104. 120.]]
    return nd.add_n(*[d2l.corr2d(x, k) for x, k in zip(X, K)])
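

# A quick runnable check of the values quoted above (assumes d2l.corr2d is the
# standard single-channel 2-D cross-correlation from the d2l package):
X = nd.array([[[0, 1, 2], [3, 4, 5], [6, 7, 8]],
              [[1, 2, 3], [4, 5, 6], [7, 8, 9]]])
K = nd.array([[[0, 1], [2, 3]], [[1, 2], [3, 4]]])
print(corr2d_multi_in(X, K))  # [[ 56.  72.] [104. 120.]]
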
def global_norm(
    arrays: Union[Generator[NDArray, NDArray, NDArray], List[NDArray],
                  Tuple[NDArray]]
) -> NDArray:
    """
    Calculate global norm on list or tuple of NDArrays using this formula:
        `global_norm = sqrt(sum([l2norm(p)**2 for p in parameters]))`

    :param arrays: list or tuple of parameters to calculate global norm on
    :return: single-value NDArray
    """
    def _norm(array):
        if array.stype == 'default':
            x = array.reshape((-1, ))
            return nd.dot(x, x)
        return array.norm().square()

    total_norm = nd.add_n(*[_norm(arr) for arr in arrays])
    total_norm = nd.sqrt(total_norm)
    return total_norm
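
A small usage check of global_norm (hypothetical values, just to illustrate the formula in the docstring):

from mxnet import nd

params = [nd.array([3.0, 4.0]), nd.array([12.0])]
print(global_norm(params))  # sqrt(3^2 + 4^2 + 12^2) = [13.]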
Example #14
def grad_global_norm(parameters, max_norm):
    """Calculate the 2-norm of gradients of parameters, and how much they should be scaled down
    such that their 2-norm does not exceed `max_norm`.

    If gradients exist for more than one context for a parameter, user needs to explicitly call
    ``trainer.allreduce_grads`` so that the gradients are summed first before calculating
    the 2-norm.

    .. note::

        This function is only for use when `update_on_kvstore` is set to False in trainer.

    Example::

        trainer = Trainer(net.collect_params(), update_on_kvstore=False, ...)
        for x, y in mx.gluon.utils.split_and_load(X, [mx.gpu(0), mx.gpu(1)]):
            with mx.autograd.record():
                y = net(x)
                loss = loss_fn(y, label)
            loss.backward()
        trainer.allreduce_grads()
        norm, ratio = grad_global_norm(net.collect_params().values(), max_norm)
        trainer.update(batch_size * ratio)
        ...

    Parameters
    ----------
    parameters : list of Parameters
    max_norm : float
        The maximum allowed 2-norm of the gradients.

    Returns
    -------
    NDArray
      Total norm. Shape is (1,)
    NDArray
      Ratio for rescaling gradients based on max_norm s.t. grad = grad / ratio.
      If total norm is NaN, ratio will be NaN, too. Shape is (1,)
    NDArray
      Whether the total norm is finite. Shape is (1,)
    """
    # collect gradient arrays
    arrays = []
    idx = 0
    for p in parameters:
        if p.grad_req != 'null':
            p_grads = p.list_grad()
            arrays.append(p_grads[idx % len(p_grads)])
            idx += 1
    assert len(arrays) > 0, 'No parameter found available for gradient norm.'

    # compute gradient norms
    def _norm(array):
        # TODO(haibin) norm operator does not support fp16 safe reduction.
        # Issue is tracked at: https://github.com/apache/incubator-mxnet/issues/14126
        x = array.reshape((-1, )).astype('float32', copy=False)
        return nd.dot(x, x)

    norm_arrays = [_norm(arr) for arr in arrays]

    # group norm arrays by ctx
    def group_by_ctx(arr_list):
        groups = collections.defaultdict(list)
        for arr in arr_list:
            ctx = arr.context
            groups[ctx].append(arr)
        return groups

    norm_groups = group_by_ctx(norm_arrays)

    # reduce
    ctx, dtype = arrays[0].context, 'float32'
    norms = [nd.add_n(*g).as_in_context(ctx) for g in norm_groups.values()]
    total_norm = nd.add_n(*norms).sqrt()
    scale = total_norm / max_norm
    # is_finite = 0 if NaN or Inf, 1 otherwise.
    is_finite = nd.contrib.isfinite(scale)
    # if scale is finite, nd.maximum selects the max between scale and 1. That is,
    # 1 is returned if total_norm does not exceed max_norm.
    # if scale = NaN or Inf, the result of nd.maximum is undefined. Therefore, we use
    # choices.take to return NaN or Inf.
    scale_or_one = nd.maximum(nd.ones((1, ), dtype=dtype, ctx=ctx), scale)
    choices = nd.concat(scale, scale_or_one, dim=0)
    chosen_scale = choices.take(is_finite)
    return total_norm, chosen_scale, is_finite
Example #15
def corr2d_multi_in(X, K):
    return nd.add_n(*[corr2d(x, k) for x, k in zip(X, K)])
Example #16
    def accumulate_gradients(self,
                             inputs: Dict[str, np.ndarray],
                             targets: List[np.ndarray],
                             additional_fetches: List[Tuple[int, str]] = None,
                             importance_weights: np.ndarray = None,
                             no_accumulation: bool = False) -> Tuple[float, List[float], float, list]:
        """
        Runs a forward & backward pass, clips gradients if needed and accumulates them into the accumulation
        :param inputs: environment states (observation, etc.) as well extra inputs required by loss. Shape of ndarray
            is (batch_size, observation_space_size) or (batch_size, observation_space_size, stack_size)
        :param targets: targets required by  loss (e.g. sum of discounted rewards)
        :param additional_fetches: additional fetches to calculate and return. Each fetch is specified as (int, str)
            tuple of head-type-index and fetch-name. The tuple is obtained from each head.
        :param importance_weights: ndarray of shape (batch_size,) to multiply with batch loss.
        :param no_accumulation: if True, set gradient values to the new gradients, otherwise sum with previously
            calculated gradients
        :return: tuple of total_loss, losses, norm_unclipped_grads, fetched_tensors
            total_loss (float): sum of all head losses
            losses (list of float): list of all losses. The order is list of target losses followed by list of
                regularization losses. The specifics of losses is dependant on the network parameters
                (number of heads, etc.)
            norm_unclippsed_grads (float): global norm of all gradients before any gradient clipping is applied
            fetched_tensors: all values for additional_fetches
        """
        if self.accumulated_gradients is None:
            self.reset_accumulated_gradients()

        embedders = [emb.embedder_name for emb in self.model.nets[0].input_embedders]
        nd_inputs = tuple(nd.array(inputs[emb]) for emb in embedders)

        assert self.middleware.__class__.__name__ != 'LSTMMiddleware', "LSTM middleware not supported"

        targets = force_list(targets)
        with autograd.record():
            out_per_head = utils.split_outputs_per_head(self.model(*nd_inputs), self.model.output_heads)
            tgt_per_loss = utils.split_targets_per_loss(targets, self.losses)

            losses = list()
            regularizations = list()
            additional_fetches = [(k, None) for k in additional_fetches]
            for h, h_loss, h_out, l_tgt in zip(self.model.output_heads, self.losses, out_per_head, tgt_per_loss):
                l_in = utils.get_loss_agent_inputs(inputs, head_type_idx=h.head_type_idx, loss=h_loss)
                # Align arguments with loss.loss_forward and convert to NDArray
                l_args = utils.to_mx_ndarray(utils.align_loss_args(h_out, l_in, l_tgt, h_loss))
                # Calculate loss and all auxiliary outputs
                loss_outputs = utils.loss_output_dict(utils.to_list(h_loss(*l_args)), h_loss.output_schema)
                if LOSS_OUT_TYPE_LOSS in loss_outputs:
                    losses.extend(loss_outputs[LOSS_OUT_TYPE_LOSS])
                if LOSS_OUT_TYPE_REGULARIZATION in loss_outputs:
                    regularizations.extend(loss_outputs[LOSS_OUT_TYPE_REGULARIZATION])
                # Set additional fetches
                for i, fetch in enumerate(additional_fetches):
                    head_type_idx, fetch_name = fetch[0]  # fetch key is a tuple of (head_type_index, fetch_name)
                    if head_type_idx == h.head_type_idx:
                        assert fetch[1] is None  # sanity check that fetch is None
                        additional_fetches[i] = (fetch[0], loss_outputs[fetch_name])

            # Total loss is losses and regularization (NOTE: order is important)
            total_loss_list = losses + regularizations
            total_loss = nd.add_n(*total_loss_list)

        # Calculate gradients
        total_loss.backward()

        assert self.optimizer_type != 'LBFGS', 'LBFGS not supported'

        # allreduce gradients from all contexts
        self.trainer.allreduce_grads()

        # Calculate global norm of gradients
        # FIXME global norm is returned even when not used for clipping! Is this necessary?
        # FIXME global norm might be calculated twice if clipping method is global norm
        norm_unclipped_grads = utils.global_norm(self._model_grads)

        # Clip gradients
        if self.network_parameters.clip_gradients:
            utils.clip_grad(
                self._model_grads,
                clip_method=self.network_parameters.gradients_clipping_method,
                clip_val=self.network_parameters.clip_gradients,
                inplace=True)

        # Update self.accumulated_gradients depending on no_accumulation flag
        if no_accumulation:
            for acc_grad, model_grad in zip(self.accumulated_gradients, self._model_grads):
                acc_grad[:] = model_grad
        else:
            for acc_grad, model_grad in zip(self.accumulated_gradients, self._model_grads):
                acc_grad += model_grad

        # result of additional fetches
        fetched_tensors = [fetch[1] for fetch in additional_fetches]

        # convert everything to numpy or scalar before returning
        result = utils.asnumpy_or_asscalar((total_loss, total_loss_list, norm_unclipped_grads, fetched_tensors))
        return result
Example #17
def test_add_n():
    x = [nd.ones(LARGE_X) for j in range(SMALL_Y)]
    y = nd.add_n(*x)
    assert y[0] == SMALL_Y
    assert y[-1] == SMALL_Y
def test_add_n():
    x = [nd.ones(LARGE_X)]
    y = nd.add_n(*x)
    assert y[0] == 1
    assert y[-1] == 1
Example #19
def corr2d_multi_in(X, K):
    # First traverse along the channel dimension of X and K
    return nd.add_n(*[d2l.corr2d(x, k) for x, k in zip(X, K)])
Example #20
def corr2d_multi_in(X, K):
    #    for x, k in zip(X, K):
    #        print(d2l.corr2d(x, k))
    return nd.add_n(*[d2l.corr2d(x, k) for x, k in zip(X, K)])
Example #21
def sum_loss(loss, pred, truths, weights):
    return nd.add_n(
        *[w * loss(yhat, y) for w, yhat, y in zip(weights, pred, truths)])
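
A small numeric check of sum_loss (hypothetical loss function and data, just to show how the per-output weights enter):

from mxnet import nd
from mxnet.gluon import loss as gloss

l2 = gloss.L2Loss()  # 0.5 * (pred - label)^2, averaged per sample
preds = [nd.array([[1.0]]), nd.array([[2.0]])]
truths = [nd.array([[0.0]]), nd.array([[0.0]])]
weights = [1.0, 10.0]
print(sum_loss(l2, preds, truths, weights))  # 1*0.5 + 10*2.0 = [20.5]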
Example #22
def sum_loss(loss, preds, truths, weights):
    # loss: a function. e.g. content_loss.
    return nd.add_n(
        *[w * loss(yhat, y) for w, yhat, y in zip(weights, preds, truths)])
Example #23
def clip_grad_global_norm(parameters, max_norm, check_isfinite=True):
    """Rescales gradients of parameters so that the sum of their 2-norm is smaller than `max_norm`.
    If gradients exist for more than one context for a parameter, user needs to explicitly call
    ``trainer.allreduce_grads`` so that the gradients are summed first before calculating
    the 2-norm.

    .. note::

        This function is only for use when `update_on_kvstore` is set to False in trainer.
        In cases where training happens on multiple contexts, this method should be used in
        conjunction with ``trainer.allreduce_grads()`` and ``trainer.update()``.
        (**not** ``trainer.step()``)

    Example::

        trainer = Trainer(net.collect_params(), update_on_kvstore=False, ...)
        for x, y in mx.gluon.utils.split_and_load(X, [mx.gpu(0), mx.gpu(1)]):
            with mx.autograd.record():
                y = net(x)
                loss = loss_fn(y, label)
            loss.backward()
        trainer.allreduce_grads()
        nlp.utils.clip_grad_global_norm(net.collect_params().values(), max_norm)
        trainer.update(batch_size)
        ...

    Parameters
    ----------
    parameters : list of Parameters
    max_norm : float
    check_isfinite : bool, default True
         If True, check that the total_norm is finite (not nan or inf). This
         requires a blocking .asscalar() call.

    Returns
    -------
    NDArray or float
      Total norm. Return type is NDArray of shape (1,) if check_isfinite is
      False. Otherwise a float is returned.

    """
    def _norm(array):
        if array.stype == 'default':
            x = array.reshape((-1))
            return nd.dot(x, x)
        return array.norm().square()

    arrays = []
    i = 0
    for p in parameters:
        if p.grad_req != 'null':
            grad_list = p.list_grad()
            arrays.append(grad_list[i % len(grad_list)])
            i += 1
    assert len(arrays) > 0, \
        'No parameter found available for gradient norm clipping.'
    ctx, dtype = arrays[0].context, arrays[0].dtype
    total_norm = nd.add_n(*[_norm(arr).as_in_context(ctx) for arr in arrays])
    total_norm = nd.sqrt(total_norm)
    if check_isfinite:
        total_norm = total_norm.asscalar()
        if not np.isfinite(total_norm):
            warnings.warn(UserWarning('nan or inf is detected. '
                                      'Clipping results will be undefined.'),
                          stacklevel=2)
    scale = max_norm / (total_norm + 1e-8)
    if check_isfinite:
        scale = nd.array([scale], dtype=dtype, ctx=ctx)
    scale = nd.min(
        nd.concat(scale, nd.ones((1, ), dtype=dtype, ctx=ctx), dim=0))
    for p in parameters:
        if p.grad_req != 'null':
            for arr in p.list_grad():
                arr *= scale.as_in_context(arr.context)
    return total_norm
Example #24
def content_loss(content_y_hat, content_y, weights):
    loss = []
    for y, y_hat, w in zip(content_y, content_y_hat, weights):
        loss.append(w * nd.mean(nd.abs(y - y_hat), axis=0, exclude=True))
    if len(loss) == 0: return 0
    return nd.add_n(*loss)
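
A small check of this weighted L1 content loss (hypothetical feature maps):

from mxnet import nd

y = [nd.ones((2, 3))]        # one layer of "target" features, batch size 2
y_hat = [nd.zeros((2, 3))]   # composite features
print(content_loss(y_hat, y, weights=[2.0]))  # mean |1 - 0| per sample, weighted: [2. 2.]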
Example #25
File: demo.py  Project: z01nl1o02/tests
def sum_loss(loss, preds, truths, weights):
    return nd.add_n(*[w*loss(yhat, y) for w, yhat, y in zip(
        weights, preds, truths)])
Example #26
File: channel.py  Project: zqw2/mxnet-gpu
def corr2d_multi_in(X, K):
    # First traverse along the 0th (channel) dimension of X and K, then use * to
    # unpack the resulting list into positional arguments of add_n to sum them
    return nd.add_n(*[d2l.corr2d(x, k) for x, k in zip(X, K)])
Example #27
from mxnet import nd


# conv_2d is not defined in this snippet; assumed here to be the standard
# single-channel 2-D cross-correlation (same as d2l.corr2d).
def conv_2d(X, K):
    h, w = K.shape
    Y = nd.zeros((X.shape[0] - h + 1, X.shape[1] - w + 1))
    for i in range(Y.shape[0]):
        for j in range(Y.shape[1]):
            Y[i, j] = (X[i:i + h, j:j + w] * K).sum()
    return Y


def conv_2d_multi_in(X, K):
    return nd.add_n(*[conv_2d(x, k) for x, k in zip(X, K)])


# 2 * 3 * 3
X = nd.array([[[0, 1, 2], [3, 4, 5], [6, 7, 8]],
              [[1, 2, 3], [4, 5, 6], [7, 8, 9]]])
# 2 * 2 * 2
K = nd.array([[[0, 1], [2, 3]], [[1, 2], [3, 4]]])
items = []
for x, k in zip(X, K):
    items.append(conv_2d(x, k))
print(nd.add_n(X, X))
print(nd.add_n(*[X, X]))
print(nd.add_n(*items))
print(conv_2d_multi_in(X, K))


def conv_2d_multi_in_out(X, K):
    return nd.stack(*[conv_2d_multi_in(X, k) for k in K])


print(K)
print(nd.stack(K, K + 1, K + 2).shape)
print(conv_2d_multi_in_out(X, [K, K + 1, K + 2]))


def conv2d_multi_in_out_1x1(X, K):
    # Assumed implementation (not part of the original snippet): a 1x1
    # convolution is equivalent to a matrix multiplication over channels.
    c_i, h, w = X.shape
    c_o = K.shape[0]
    X = X.reshape((c_i, h * w))
    K = K.reshape((c_o, c_i))
    Y = nd.dot(K, X)
    return Y.reshape((c_o, h, w))
Example #28
def corr2d_multi_in(X, K):
    # We first traverse along the 0th (channel) dimension of X and K, then use * to
    # turn the resulting list into positional arguments of add_n to sum them.
    return nd.add_n(*[corr2d(x, k) for x, k in zip(X, K)])
Example #29
def corr2d_multi_in(X, K):
    # First traverse along the 0th (channel) dimension of X and K,
    # then use * to unpack the resulting list into positional arguments of add_n to sum them
    return nd.add_n(*[corr2d(x, k) for x, k in zip(X, K)])
Example #30
def corr1d_multi_in(X, K):
    # We first traverse along the 0th (channel) dimension of X and K, then use * to
    # turn the resulting list into positional arguments of add_n to sum them.
    return nd.add_n(*[corr1d(x, k) for x, k in zip(X, K)])
Example #31
def corr2d_multi_in(X, K):
    # First, traverse along the 0th dimension (channel dimension) of X and K.
    # Then, add them together by using * to turn the result list into a
    # positional argument of the add_n function
    return nd.add_n(*[d2l.corr2d(x, k) for x, k in zip(X, K)])