def __init__(self, word_to_idx, input_dim=512, wordvec_dim=128,
             hidden_dim=128, cell_type='rnn', dtype=np.float32):
    """
    Construct a new CaptioningRNN instance.

    Inputs:
    - word_to_idx: A dictionary giving the vocabulary. It contains V entries,
      and maps each string to a unique integer in the range [0, V).
    - input_dim: Dimension D of input image feature vectors.
    - wordvec_dim: Dimension W of word vectors.
    - hidden_dim: Dimension H for the hidden state of the RNN.
    - cell_type: What type of RNN to use; either 'rnn' or 'lstm'.
    - dtype: numpy datatype to use; use float32 for training and float64 for
      numeric gradient checking.
    """
    if cell_type not in {'rnn', 'lstm'}:
        raise ValueError('Invalid cell_type "%s"' % cell_type)

    self.cell_type = cell_type
    self.dtype = dtype
    self.word_to_idx = word_to_idx
    # dict.iteritems() is Python 2 only; items() works in both 2 and 3.
    self.idx_to_word = {i: w for w, i in word_to_idx.items()}
    self.params = {}

    vocab_size = len(word_to_idx)

    self._null = word_to_idx['<NULL>']
    self._start = word_to_idx.get('<START>', None)
    self._end = word_to_idx.get('<END>', None)

    # Initialize word vectors
    self.params['W_embed'] = np.random.randn(vocab_size, wordvec_dim)
    self.params['W_embed'] /= 100

    # Initialize CNN -> hidden state projection parameters
    self.params['W_proj'] = np.random.randn(input_dim, hidden_dim)
    self.params['W_proj'] /= np.sqrt(input_dim)
    self.params['b_proj'] = np.zeros(hidden_dim)

    # Initialize parameters for the RNN
    dim_mul = {'lstm': 4, 'rnn': 1}[cell_type]
    self.params['Wx'] = np.random.randn(wordvec_dim, dim_mul * hidden_dim)
    self.params['Wx'] /= np.sqrt(wordvec_dim)
    self.params['Wh'] = np.random.randn(hidden_dim, dim_mul * hidden_dim)
    self.params['Wh'] /= np.sqrt(hidden_dim)
    self.params['b'] = np.zeros(dim_mul * hidden_dim)

    # Initialize output to vocab weights
    self.params['W_vocab'] = np.random.randn(hidden_dim, vocab_size)
    self.params['W_vocab'] /= np.sqrt(hidden_dim)
    self.params['b_vocab'] = np.zeros(vocab_size)

    # Cast all parameters to the requested dtype; otherwise the dtype
    # argument documented above would have no effect.
    for k, v in self.params.items():
        self.params[k] = v.astype(self.dtype)
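# --- Hedged usage sketch (not part of the original file): assumes numpy is
# imported as np and that this __init__ belongs to a CaptioningRNN class.
# The toy vocabulary below is hypothetical; any dict mapping tokens to
# integers in [0, V) works.
word_to_idx = {'<NULL>': 0, '<START>': 1, '<END>': 2, 'a': 3, 'cat': 4}
model = CaptioningRNN(word_to_idx, input_dim=512, wordvec_dim=128,
                      hidden_dim=128, cell_type='lstm', dtype=np.float32)
# For the LSTM cell the i, f, o, g gate parameters are stacked, hence the
# factor of 4 from dim_mul in the shapes below.
assert model.params['Wx'].shape == (128, 4 * 128)
assert model.params['W_vocab'].shape == (128, len(word_to_idx))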
def rmsprop(x, dx, config=None):
    """
    Uses the RMSProp update rule, which uses a moving average of squared
    gradient values to set adaptive per-parameter learning rates.

    config format:
    - learning_rate: Scalar learning rate.
    - decay_rate: Scalar between 0 and 1 giving the decay rate for the
      squared gradient cache.
    - epsilon: Small scalar used for smoothing to avoid dividing by zero.
    - cache: Moving average of second moments of gradients.
    """
    if config is None:
        config = {}
    config.setdefault('learning_rate', 1e-2)
    config.setdefault('decay_rate', 0.99)
    config.setdefault('epsilon', 1e-8)
    config.setdefault('cache', np.zeros_like(x))
    cache = config['cache']

    # Exponential moving average of squared gradients, then a step scaled by
    # the inverse root of that average.
    cache = cache * config['decay_rate'] + dx**2 * (1 - config['decay_rate'])
    next_x = x - config['learning_rate'] * dx / (np.sqrt(cache) + config['epsilon'])

    config['cache'] = cache
    return next_x, config
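# --- Hedged usage sketch (not in the original): ten RMSProp steps on the toy
# quadratic loss L(x) = 0.5 * ||x||^2, whose gradient is simply x. Variable
# names here are illustrative.
x = np.ones(5)
config = None
for _ in range(10):
    dx = x                          # gradient of 0.5 * ||x||^2 w.r.t. x
    x, config = rmsprop(x, dx, config)
# The squared-gradient cache persists across steps inside config.
assert config['cache'].shape == x.shape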
def adam(x, dx, config=None):
    """
    Uses the Adam update rule, which incorporates moving averages of both the
    gradient and its square and a bias correction term.

    config format:
    - learning_rate: Scalar learning rate.
    - beta1: Decay rate for moving average of first moment of gradient.
    - beta2: Decay rate for moving average of second moment of gradient.
    - epsilon: Small scalar used for smoothing to avoid dividing by zero.
    - m: Moving average of gradient.
    - v: Moving average of squared gradient.
    - t: Iteration number.
    """
    if config is None:
        config = {}
    config.setdefault('learning_rate', 1e-3)
    config.setdefault('beta1', 0.9)
    config.setdefault('beta2', 0.999)
    config.setdefault('epsilon', 1e-8)
    config.setdefault('m', np.zeros_like(x))
    config.setdefault('v', np.zeros_like(x))
    config.setdefault('t', 0)

    v = config['v']
    m = config['m']
    t = config['t']

    # Moving averages of the first and second moments, with bias correction.
    t = t + 1
    m = config['beta1'] * m + (1 - config['beta1']) * dx
    v = config['beta2'] * v + (1 - config['beta2']) * (dx**2)
    m_ = m / (1 - config['beta1']**t)
    v_ = v / (1 - config['beta2']**t)
    next_x = x - config['learning_rate'] * m_ / (np.sqrt(v_) + config['epsilon'])

    config['v'] = v
    config['m'] = m
    config['t'] = t
    return next_x, config
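# --- Hedged usage sketch (not in the original): the same toy quadratic as
# above, now with Adam. The timestep t is incremented inside the update so
# the bias corrections 1 - beta**t are well defined from the first step.
x = np.ones(5)
config = None
for _ in range(10):
    x, config = adam(x, x, config)   # dx = x for L(x) = 0.5 * ||x||^2
print(config['t'])                   # 10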
def forward(self, inputs, params):
    weight = params[self._weight]
    bias = params[self._bias]
    gain = params[self._gain]

    # Weight normalization: rescale each column of the weight matrix to unit
    # L2 norm, then multiply by a learned per-column gain.
    weight_norm = np.sqrt(np.sum(weight**2, axis=0))
    weight_norm = weight_norm.reshape((1, weight.shape[1]))
    # Use out-of-place ops: the original /= and *= mutated the stored
    # parameter, so repeated forward calls would corrupt the weights.
    weight = weight / weight_norm
    weight = weight * gain

    outputs = np.dot(inputs, weight)
    if not self._no_bias:
        outputs += bias
    return outputs
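# --- Hedged illustration (not in the original) of the weight-normalization
# identity used above: after dividing each column by its L2 norm and scaling
# by the gain g, every column of the effective weight has norm g.
W = np.random.randn(4, 3)
g = 2.0 * np.ones((1, 3))
W_hat = g * W / np.sqrt(np.sum(W**2, axis=0, keepdims=True))
print(np.linalg.norm(W_hat, axis=0))   # approximately [2. 2. 2.]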
def adam(x, dx, config=None):
    """
    Uses the Adam update rule, which incorporates moving averages of both the
    gradient and its square and a bias correction term.

    config format:
    - learning_rate: Scalar learning rate.
    - beta1: Decay rate for moving average of first moment of gradient.
    - beta2: Decay rate for moving average of second moment of gradient.
    - epsilon: Small scalar used for smoothing to avoid dividing by zero.
    - m: Moving average of gradient.
    - v: Moving average of squared gradient.
    - t: Iteration number.
    """
    if config is None:
        config = {}
    config.setdefault('learning_rate', 1e-3)
    config.setdefault('beta1', 0.9)
    config.setdefault('beta2', 0.999)
    config.setdefault('epsilon', 1e-8)
    config.setdefault('m', np.zeros_like(x))
    config.setdefault('v', np.zeros_like(x))
    config.setdefault('t', 0)

    beta1, beta2, eps = config['beta1'], config['beta2'], config['epsilon']
    t, m, v = config['t'], config['m'], config['v']

    m = beta1 * m + (1 - beta1) * dx
    v = beta2 * v + (1 - beta2) * (dx * dx)
    t += 1
    # Fold both bias corrections into the step size; algebraically equivalent
    # to dividing m and v by their (1 - beta**t) factors separately.
    alpha = config['learning_rate'] * np.sqrt(1 - beta2 ** t) / (1 - beta1 ** t)
    # Note: unlike the purely functional variant above, this updates x in place.
    x -= alpha * (m / (np.sqrt(v) + eps))

    config['t'] = t
    config['m'] = m
    config['v'] = v
    next_x = x
    return next_x, config
def blob_normalization(X, settings, gamma, beta, mode='train', epsilon=1e-5,
                       momentum=0.9, running_mean=None, running_variance=None):
    N, D = map(int, X.shape)
    size = N * D
    if running_mean is None:
        running_mean = np.zeros(1)
    if running_variance is None:
        running_variance = np.zeros(1)

    if mode == 'train':
        # Mean is either shared across the whole blob or computed per feature.
        if 'shared_mean' in settings:
            mean = np.sum(X) / size
        else:
            mean = np.sum(X, axis=0) / N
            mean = np.reshape(mean, (1, D))
        centered_X = X - mean

        # Likewise for the variance.
        if 'shared_deviation' in settings:
            variance = np.sum(centered_X**2) / size
        else:
            variance = np.sum(centered_X**2, axis=0) / N
            variance = np.reshape(variance, (1, D))

        # Add epsilon before the square root, as in the test branch, to avoid
        # dividing by zero for constant features.
        deviation = np.sqrt(variance + epsilon)
        rescaled_X = centered_X / deviation
        out = gamma * rescaled_X + beta

        running_mean = momentum * running_mean + (1.0 - momentum) * mean
        running_variance = momentum * running_variance + (1.0 - momentum) * variance
    elif mode == 'test':
        X_hat = (X - running_mean) / np.sqrt(running_variance + epsilon)
        out = gamma * X_hat + beta
    else:
        raise ValueError('Invalid blob normalization mode "%s"' % mode)

    return out, running_mean, running_variance
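# --- Hedged usage sketch (not in the original): 'settings' is treated as a
# container of flags. An empty set gives standard per-feature batchnorm
# statistics; the shared flags collapse mean/variance to single scalars over
# the whole (N, D) blob.
X = np.random.randn(8, 4)
gamma, beta = np.ones(4), np.zeros(4)
out, rm, rv = blob_normalization(X, set(), gamma, beta, mode='train')
out2, rm2, rv2 = blob_normalization(X, {'shared_mean', 'shared_deviation'},
                                    gamma, beta, mode='train')
print(rv.shape, rv2.shape)   # (1, 4) vs (1,)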
def update_parameter(x, dx, epoch, learning_rate, m, v, beta1, beta2, eps):
    """
    Updates the parameter x using its gradient dx with the Adam rule.

    :param x: The parameter to be updated.
    :param dx: The gradient of the parameter to be updated.
    :param epoch: The current training epoch, used as Adam's timestep t
        (must be >= 1 so the bias corrections are well defined).
    :param learning_rate: The learning rate of the network. Indicates the
        size of learning steps.
    :param m: The current first-moment (momentum) estimate.
    :param v: The current second-moment (velocity) estimate.
    :param beta1: Hyperparameter for the Adam parameter update. Recommended
        to be .9.
    :param beta2: Hyperparameter for the Adam parameter update. Recommended
        to be .999.
    :param eps: Hyperparameter for the Adam parameter update. Recommended
        to be 1e-8.
    :return: The updated parameter (same type as x), plus the new m and v.
    """
    m = beta1 * m + (1 - beta1) * dx
    mt = m / (1 - beta1**epoch)            # bias-corrected first moment
    v = beta2 * v + (1 - beta2) * np.square(dx)
    vt = v / (1 - beta2**epoch)            # bias-corrected second moment
    update = -learning_rate * mt / (np.sqrt(vt) + eps)
    x += update                            # in-place update; also returned
    return x, m, v
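# --- Hedged usage sketch (not in the original): the caller threads m and v
# through successive calls, both starting at zero; epoch must start at 1.
x = np.ones(3)
m, v = np.zeros_like(x), np.zeros_like(x)
for epoch in range(1, 6):
    dx = 2 * x                 # gradient of ||x||^2
    x, m, v = update_parameter(x, dx, epoch, 1e-2, m, v, 0.9, 0.999, 1e-8)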
import math

def garchLLH(y, par):
    """Gaussian log-likelihood of a GARCH model.

    Relies on garchSim (defined elsewhere) to produce the conditional
    variance series h from the squared returns and the parameter vector par.
    """
    h = garchSim(np.square(y), par)
    T = y.shape[0]
    # log L = -(T - 1)/2 * log(2*pi) - 1/2 * sum_t [log(h_t) + y_t^2 / h_t]
    llh = -0.5 * (T - 1) * math.log(2 * math.pi) \
        - 0.5 * np.sum(np.log(h) + (y / np.sqrt(h))**2)
    # llh is a scalar (or size-1 array, depending on garchSim's output shape);
    # float() handles both, where the original llh[0] fails on a scalar.
    return float(llh)
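# --- Hedged sketch (not in the original): garchLLH depends on garchSim, which
# is defined elsewhere. The stand-in below assumes a plain GARCH(1,1)
# recursion h_t = omega + alpha * y_{t-1}^2 + beta * h_{t-1}; the real
# garchSim may differ.
def garchSim(y2, par):
    omega, alpha, beta = par
    h = np.empty_like(y2, dtype=float)
    h[0] = np.mean(y2)                       # seed with the sample variance
    for t in range(1, len(y2)):
        h[t] = omega + alpha * y2[t - 1] + beta * h[t - 1]
    return h

y = 0.01 * np.random.randn(500)
print(garchLLH(y, (1e-6, 0.1, 0.85)))        # a finite log-likelihood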
def batchnorm_forward(x, gamma, beta, bn_param):
    """
    Forward pass for batch normalization.

    During training the sample mean and (uncorrected) sample variance are
    computed from minibatch statistics and used to normalize the incoming
    data. During training we also keep an exponentially decaying running mean
    of the mean and variance of each feature, and these averages are used to
    normalize data at test-time.

    At each timestep we update the running averages for mean and variance
    using an exponential decay based on the momentum parameter:

    running_mean = momentum * running_mean + (1 - momentum) * sample_mean
    running_var = momentum * running_var + (1 - momentum) * sample_var

    Note that the batch normalization paper suggests a different test-time
    behavior: they compute sample mean and variance for each feature using a
    large number of training images rather than using a running average. For
    this implementation we have chosen to use running averages instead since
    they do not require an additional estimation step; the torch7
    implementation of batch normalization also uses running averages.

    Input:
    - x: Data of shape (N, D)
    - gamma: Scale parameter of shape (D,)
    - beta: Shift parameter of shape (D,)
    - bn_param: Dictionary with the following keys:
      - mode: 'train' or 'test'; required
      - eps: Constant for numeric stability
      - momentum: Constant for running mean / variance.
      - running_mean: Array of shape (D,) giving running mean of features
      - running_var: Array of shape (D,) giving running variance of features

    Returns:
    - out: of shape (N, D)
    """
    mode = bn_param['mode']
    eps = bn_param.get('eps', 1e-5)
    momentum = bn_param.get('momentum', 0.9)

    N, D = x.shape
    running_mean = bn_param.get('running_mean', np.zeros(D))
    running_var = bn_param.get('running_var', np.zeros(D))

    out = None
    if mode == 'train':
        mean = np.sum(x, axis=0) / float(N)
        x_mean = x - mean
        sqr_x_mean = x_mean**2
        var = np.sum(sqr_x_mean, axis=0) / float(N)
        sqrt_var = np.sqrt(var + eps)
        inv_sqrt_var = 1.0 / sqrt_var
        x_hat = x_mean * inv_sqrt_var
        out = gamma * x_hat + beta

        running_mean = momentum * running_mean + (1.0 - momentum) * mean
        running_var = momentum * running_var + (1.0 - momentum) * var
    elif mode == 'test':
        x_hat = (x - running_mean) / np.sqrt(running_var + eps)
        out = gamma * x_hat + beta
    else:
        raise ValueError('Invalid forward batchnorm mode "%s"' % mode)

    # Store the updated running means back into bn_param
    bn_param['running_mean'] = running_mean
    bn_param['running_var'] = running_var

    return out
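# --- Hedged usage sketch (not in the original): bn_param carries the running
# statistics across calls, so the same dict must be reused between training
# steps and then switched to test mode.
bn_param = {'mode': 'train'}
gamma, beta = np.ones(4), np.zeros(4)
for _ in range(50):
    x = 2.0 * np.random.randn(32, 4) + 5.0
    batchnorm_forward(x, gamma, beta, bn_param)
bn_param['mode'] = 'test'    # normalize with the accumulated running stats
out = batchnorm_forward(2.0 * np.random.randn(8, 4) + 5.0, gamma, beta, bn_param)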
def batchnorm(x, gamma, beta, mode='train', eps=1e-5, momentum=0.9,
              running_mean=None, running_var=None):
    """
    Forward pass for batch normalization.

    During training the sample mean and (uncorrected) sample variance are
    computed from minibatch statistics and used to normalize the incoming
    data. During training we also keep an exponentially decaying running mean
    of the mean and variance of each feature, and these averages are used to
    normalize data at test-time.

    At each timestep we update the running averages for mean and variance
    using an exponential decay based on the momentum parameter:

    running_mean = momentum * running_mean + (1 - momentum) * sample_mean
    running_var = momentum * running_var + (1 - momentum) * sample_var

    Note that the batch normalization paper suggests a different test-time
    behavior: they compute sample mean and variance for each feature using a
    large number of training images rather than using a running average. For
    this implementation we have chosen to use running averages instead since
    they do not require an additional estimation step; the torch7
    implementation of batch normalization also uses running averages.

    Input:
    - x: Data of shape (N, D)
    - gamma: Scale parameter of shape (D,)
    - beta: Shift parameter of shape (D,)
    - mode: 'train' or 'test'
    - eps: Constant for numeric stability
    - momentum: Constant for running mean / variance.
    - running_mean: Array of shape (D,) giving running mean of features
    - running_var: Array of shape (D,) giving running variance of features

    Returns a tuple of:
    - out: of shape (N, D)
    - running_mean: updated running_mean
    - running_var: updated running_var
    """
    # TODO: fix NDArray type system
    N, D = x.shape
    N, D = int(N), int(D)
    if running_mean is None:
        running_mean = np.zeros(D)
    if running_var is None:
        running_var = np.zeros(D)

    out = None
    if mode == 'train':
        mean = np.sum(x, axis=0) / N
        x_mean = x - np.expand_dims(mean, axis=0)
        sqr_x_mean = x_mean**2
        var = np.sum(sqr_x_mean, axis=0) / N
        sqrt_var = np.sqrt(var + eps)
        inv_sqrt_var = 1.0 / sqrt_var
        x_hat = x_mean * np.expand_dims(inv_sqrt_var, axis=0)
        out = gamma * x_hat + beta

        running_mean = momentum * running_mean + (1.0 - momentum) * mean
        running_var = momentum * running_var + (1.0 - momentum) * var
    elif mode == 'test':
        x_hat = (x - running_mean) / np.sqrt(running_var + eps)
        out = gamma * x_hat + beta
    else:
        raise ValueError('Invalid forward batchnorm mode "%s"' % mode)

    # return the updated running means
    return out, running_mean, running_var
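# --- Hedged usage sketch (not in the original): in this functional variant
# the running statistics are returned to the caller, who threads them back in
# explicitly instead of mutating a shared dict.
gamma, beta = np.ones(4), np.zeros(4)
running_mean, running_var = None, None
for _ in range(50):
    x = np.random.randn(32, 4)
    out, running_mean, running_var = batchnorm(
        x, gamma, beta, mode='train',
        running_mean=running_mean, running_var=running_var)
out, _, _ = batchnorm(np.random.randn(8, 4), gamma, beta, mode='test',
                      running_mean=running_mean, running_var=running_var)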
def test_ufunc():
    x = np.array([-1.2, 1.2])
    np.absolute(x)
    np.absolute(1.2 + 1j)
    x = np.linspace(start=-10, stop=10, num=101)
    np.add(1.0, 4.0)
    x1 = np.arange(9.0).reshape((3, 3))
    x2 = np.arange(3.0)
    np.add(x1, x2)
    np.arccos([1, -1])
    x = np.linspace(-1, 1, num=100)
    np.arccosh([np.e, 10.0])
    np.arccosh(1)
    np.arcsin(0)
    np.arcsinh(np.array([np.e, 10.0]))
    np.arctan([0, 1])
    np.pi / 4
    x = np.linspace(-10, 10)
    x = np.array([-1, +1, +1, -1])
    y = np.array([-1, -1, +1, +1])
    np.arctan2(y, x) * 180 / np.pi
    np.arctan2([1., -1.], [0., 0.])
    np.arctan2([0., 0., np.inf], [+0., -0., np.inf])
    np.arctanh([0, -0.5])
    np.bitwise_and(13, 17)
    np.bitwise_and(14, 13)
    # np.binary_repr(12) returns str
    np.bitwise_and([14, 3], 13)
    np.bitwise_and([11, 7], [4, 25])
    np.bitwise_and(np.array([2, 5, 255]), np.array([3, 14, 16]))
    np.bitwise_and([True, True], [False, True])
    np.bitwise_or(13, 16)
    # np.binary_repr(29)
    np.bitwise_or(32, 2)
    np.bitwise_or([33, 4], 1)
    np.bitwise_or([33, 4], [1, 2])
    np.bitwise_or(np.array([2, 5, 255]), np.array([4, 4, 4]))
    # np.array([2, 5, 255]) | np.array([4, 4, 4])
    np.bitwise_or(np.array([2, 5, 255, 2147483647], dtype=np.int32),
                  np.array([4, 4, 4, 2147483647], dtype=np.int32))
    np.bitwise_or([True, True], [False, True])
    np.bitwise_xor(13, 17)
    # np.binary_repr(28)
    np.bitwise_xor(31, 5)
    np.bitwise_xor([31, 3], 5)
    np.bitwise_xor([31, 3], [5, 6])
    np.bitwise_xor([True, True], [False, True])
    a = np.array([-1.7, -1.5, -0.2, 0.2, 1.5, 1.7, 2.0])
    np.ceil(a)
    a = np.array([-1.7, -1.5, -0.2, 0.2, 1.5, 1.7, 2.0])
    np.trunc(a)
    np.cos(np.array([0, np.pi / 2, np.pi]))
    np.cosh(0)
    x = np.linspace(-4, 4, 1000)
    rad = np.arange(12.) * np.pi / 6
    np.degrees(rad)
    out = np.zeros(rad.shape)
    r = np.degrees(rad, out)
    # np.all(r == out) returns bool
    np.rad2deg(np.pi / 2)
    np.divide(2.0, 4.0)
    x1 = np.arange(9.0).reshape((3, 3))
    x2 = np.arange(3.0)
    np.divide(2, 4)
    np.divide(2, 4.)
    np.equal([0, 1, 3], np.arange(3))
    np.equal(1, np.ones(1))
    x = np.linspace(-2 * np.pi, 2 * np.pi, 100)
    np.exp2([2, 3])
    np.expm1(1e-10)
    np.exp(1e-10) - 1
    np.fabs(-1)
    np.fabs([-1.2, 1.2])
    a = np.array([-1.7, -1.5, -0.2, 0.2, 1.5, 1.7, 2.0])
    np.floor(a)
    np.floor_divide(7, 3)
    np.floor_divide([1., 2., 3., 4.], 2.5)
    np.fmod([-3, -2, -1, 1, 2, 3], 2)
    np.remainder([-3, -2, -1, 1, 2, 3], 2)
    np.fmod([5, 3], [2, 2.])
    a = np.arange(-3, 3).reshape(3, 2)
    np.fmod(a, [2, 2])
    np.greater([4, 2], [2, 2])
    a = np.array([4, 2])
    b = np.array([2, 2])
    a > b
    np.greater_equal([4, 2, 1], [2, 2, 2])
    np.hypot(3 * np.ones((3, 3)), 4 * np.ones((3, 3)))
    np.hypot(3 * np.ones((3, 3)), [4])
    np.bitwise_not is np.invert
    np.invert(np.array([13], dtype=np.uint8))
    # np.binary_repr(242, width=8)
    np.invert(np.array([13], dtype=np.uint16))
    np.invert(np.array([13], dtype=np.int8))
    # np.binary_repr(-14, width=8)
    np.invert(np.array([True, False]))
    # np.isfinite(1)
    # np.isfinite(0)
    # np.isfinite(np.nan)
    # np.isfinite(np.inf)
    # np.isfinite(np.NINF)
    x = np.array([-np.inf, 0., np.inf])
    y = np.array([2, 2, 2])
    np.isfinite(x, y)
    # np.isinf(np.inf)
    # np.isinf(np.nan)
    # np.isinf(np.NINF)
    # np.isinf([np.inf, -np.inf, 1.0, np.nan])
    x = np.array([-np.inf, 0., np.inf])
    y = np.array([2, 2, 2])
    # np.isinf(x, y)
    # np.isnan(np.nan)
    # np.isnan(np.inf)
    # np.binary_repr(5)
    np.left_shift(5, 2)
    # np.binary_repr(20)
    np.left_shift(5, [1, 2, 3])
    np.less([1, 2], [2, 2])
    np.less_equal([4, 2, 1], [2, 2, 2])
    x = np.array([0, 1, 2, 2**4])
    xi = np.array([0 + 1.j, 1, 2 + 0.j, 4.j])
    np.log2(xi)
    prob1 = np.log(1e-50)
    prob2 = np.log(2.5e-50)
    prob12 = np.logaddexp(prob1, prob2)
    prob12
    np.exp(prob12)
    prob1 = np.log2(1e-50)
    prob2 = np.log2(2.5e-50)
    prob12 = np.logaddexp2(prob1, prob2)
    prob1, prob2, prob12
    2**prob12
    np.log1p(1e-99)
    np.log(1 + 1e-99)
    # np.logical_and(True, False)
    # np.logical_and([True, False], [False, False])
    x = np.arange(5)
    # np.logical_and(x > 1, x < 4)
    # np.logical_not(3)
    # np.logical_not([True, False, 0, 1])
    x = np.arange(5)
    # np.logical_not(x < 3)
    # np.logical_or(True, False)
    # np.logical_or([True, False], [False, False])
    x = np.arange(5)
    # np.logical_or(x < 1, x > 3)
    # np.logical_xor(True, False)
    # np.logical_xor([True, True, False, False], [True, False, True, False])
    x = np.arange(5)
    # np.logical_xor(x < 1, x > 3)
    # np.logical_xor(0, np.eye(2))
    np.maximum([2, 3, 4], [1, 5, 2])
    # np.maximum([np.nan, 0, np.nan], [0, np.nan, np.nan])
    # np.maximum(np.Inf, 1)
    np.minimum([2, 3, 4], [1, 5, 2])
    # np.minimum([np.nan, 0, np.nan], [0, np.nan, np.nan])
    # np.minimum(-np.Inf, 1)
    np.fmax([2, 3, 4], [1, 5, 2])
    np.fmax(np.eye(2), [0.5, 2])
    # np.fmax([np.nan, 0, np.nan], [0, np.nan, np.nan])
    np.fmin([2, 3, 4], [1, 5, 2])
    np.fmin(np.eye(2), [0.5, 2])
    # np.fmin([np.nan, 0, np.nan], [0, np.nan, np.nan])
    np.modf([0, 3.5])
    np.modf(-0.5)
    np.multiply(2.0, 4.0)
    x1 = np.arange(9.0).reshape((3, 3))
    x2 = np.arange(3.0)
    np.multiply(x1, x2)
    np.negative([1., -1.])
    np.not_equal([1., 2.], [1., 3.])
    np.not_equal([1, 2], [[1, 3], [1, 4]])
    x1 = range(6)
    np.power(x1, 3)
    x2 = [1.0, 2.0, 3.0, 3.0, 2.0, 1.0]
    np.power(x1, x2)
    x2 = np.array([[1, 2, 3, 3, 2, 1], [1, 2, 3, 3, 2, 1]])
    np.power(x1, x2)
    deg = np.arange(12.) * 30.
    np.radians(deg)
    out = np.zeros(deg.shape)
    ret = np.radians(deg, out)
    ret is out
    np.deg2rad(180)
    np.reciprocal(2.)
    np.reciprocal([1, 2., 3.33])
    np.remainder([4, 7], [2, 3])
    np.remainder(np.arange(7), 5)
    # np.binary_repr(10)
    np.right_shift(10, 1)
    # np.binary_repr(5)
    np.right_shift(10, [1, 2, 3])
    a = np.array([-1.7, -1.5, -0.2, 0.2, 1.5, 1.7, 2.0])
    np.rint(a)
    np.sign([-5., 4.5])
    np.sign(0)
    # np.sign(5-2j)
    # np.signbit(-1.2)
    np.signbit(np.array([1, -2.3, 2.1]))
    np.copysign(1.3, -1)
    np.copysign([-1, 0, 1], -1.1)
    np.copysign([-1, 0, 1], np.arange(3) - 1)
    np.sin(np.pi / 2.)
    np.sin(np.array((0., 30., 45., 60., 90.)) * np.pi / 180.)
    x = np.linspace(-np.pi, np.pi, 201)
    np.sinh(0)
    # np.sinh(np.pi*1j/2)
    np.sqrt([1, 4, 9])
    np.sqrt([4, -1, -3 + 4J])
    np.cbrt([1, 8, 27])
    np.square([-1j, 1])
    np.subtract(1.0, 4.0)
    x1 = np.arange(9.0).reshape((3, 3))
    x2 = np.arange(3.0)
    np.subtract(x1, x2)
    np.tan(np.array([-np.pi, np.pi / 2, np.pi]))  # use np.pi, not bare pi
    np.tanh((0, np.pi * 1j, np.pi * 1j / 2))
    x = np.arange(5)
    np.true_divide(x, 4)
    x = np.arange(9)
    y1, y2 = np.frexp(x)
    y1 * 2**y2
    np.ldexp(5, np.arange(4))
    x = np.arange(6)
    np.ldexp(*np.frexp(x))