def hybrid_forward(
    self,
    F,
    feat_static_cat: Tensor,  # (batch_size, 1)
    past_time_feat: Tensor,  # (batch_size, history_length, num_features)
    past_target: Tensor,  # (batch_size, history_length)
) -> Tensor:
    """
    Parameters
    ----------
    F
        Function space
    feat_static_cat
        Shape: (batch_size, 1)
    past_time_feat
        Shape: (batch_size, history_length, num_features)
    past_target
        Shape: (batch_size, history_length)

    Returns
    -------
    Tensor
        A batch of negative log likelihoods.
    """
    fixed_effect, random_effect = self.compute_global_local(
        F, feat_static_cat, past_time_feat
    )

    loss = self.negative_normal_likelihood(
        F, past_target.expand_dims(axis=2), fixed_effect, random_effect
    )
    return loss
def _expand_param(p: Tensor, num_samples: Optional[int] = None) -> Tensor:
    """
    Expand parameters by `num_samples` along the first dimension.
    """
    if num_samples is None:
        return p
    return p.expand_dims(axis=0).repeat(axis=0, repeats=num_samples)
def s(mu: Tensor, D: Tensor, W: Tensor) -> Tensor:
    F = getF(mu)

    samples_D = F.sample_normal(
        mu=F.zeros_like(mu), sigma=F.ones_like(mu), dtype=dtype
    )
    cov_D = D.sqrt() * samples_D

    # Dummy tensor, used only to get the shape (..., rank, 1)
    dummy_tensor = F.linalg_gemm2(
        W, mu.expand_dims(axis=-1), transpose_a=True
    ).squeeze(axis=-1)

    samples_W = F.sample_normal(
        mu=F.zeros_like(dummy_tensor),
        sigma=F.ones_like(dummy_tensor),
        dtype=dtype,
    )

    cov_W = F.linalg_gemm2(W, samples_W.expand_dims(axis=-1)).squeeze(axis=-1)

    samples = mu + cov_D + cov_W

    return samples
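# A minimal NumPy sketch (illustrative only, not part of the original module)
# of why `s` above draws from N(mu, D + W W^T): adding D^{1/2} * eps_D +
# W * eps_W to mu, with eps_D and eps_W standard normal, gives exactly that
# covariance. All names below are made up for the example.
import numpy as np

rng = np.random.default_rng(0)
dim, rank, n = 3, 2, 200_000
mu = np.zeros(dim)
D = np.array([0.5, 1.0, 2.0])         # diagonal part of the covariance
W = rng.normal(size=(dim, rank))      # low-rank factor

eps_D = rng.normal(size=(n, dim))
eps_W = rng.normal(size=(n, rank))
samples = mu + np.sqrt(D) * eps_D + eps_W @ W.T

emp_cov = np.cov(samples, rowvar=False)
assert np.allclose(emp_cov, np.diag(D) + W @ W.T, atol=1e-1)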
def capacitance_tril(F, rank: Tensor, W: Tensor, D: Tensor) -> Tensor:
    r"""
    Parameters
    ----------
    F
    rank
    W : (..., dim, rank)
    D : (..., dim)

    Returns
    -------
        the lower Cholesky factor of the capacitance matrix
        :math:`I + W^T D^{-1} W`
    """
    # (..., dim, rank)
    Wt_D_inv_t = F.broadcast_div(W, D.expand_dims(axis=-1))

    # (..., rank, rank)
    K = F.linalg_gemm2(Wt_D_inv_t, W, transpose_a=True)

    # (..., rank, rank)
    Id = F.broadcast_mul(F.ones_like(K), F.eye(rank))

    # (..., rank, rank)
    return F.linalg.potrf(K + Id)
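# Hedged usage sketch for `capacitance_tril`, assuming the function above is
# in scope and using the NDArray API as `F`; shapes and tolerances are
# illustrative. It checks that L L^T equals the capacitance matrix
# I + W^T D^{-1} W computed directly in NumPy.
import mxnet as mx
import numpy as np

mx.random.seed(0)
dim, rank = 4, 2
W = mx.nd.random.normal(shape=(dim, rank))
D = mx.nd.abs(mx.nd.random.normal(shape=(dim,))) + 0.1

L = capacitance_tril(mx.nd, rank, W, D)

Wn, Dn, Ln = W.asnumpy(), D.asnumpy(), L.asnumpy()
C = np.eye(rank) + Wn.T @ np.diag(1.0 / Dn) @ Wn
assert np.allclose(Ln @ Ln.T, C, atol=1e-3)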
def quantile_losses(
    self, obs: Tensor, quantiles: Tensor, levels: Tensor
) -> Tensor:
    """
    Computes quantile losses for all the quantiles specified.

    Parameters
    ----------
    obs
        Ground truth observation. Shape: `(batch_size, seq_len, *event_shape)`
    quantiles
        Quantile values. Shape:
        `(batch_size, seq_len, *event_shape, num_quantiles)`
    levels
        Quantile levels. Shape:
        `(batch_size, seq_len, *event_shape, num_quantiles)`

    Returns
    -------
    Tensor
        Quantile losses of shape:
        `(batch_size, seq_len, *event_shape, num_quantiles)`
    """
    obs = obs.expand_dims(axis=-1)
    assert obs.shape[:-1] == quantiles.shape[:-1]
    assert obs.shape[:-1] == levels.shape[:-1]
    assert obs.shape[-1] == 1

    return self.F.where(
        obs >= quantiles,
        levels * (obs - quantiles),
        (1 - levels) * (quantiles - obs),
    )
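# Worked example of the pinball (quantile) loss that `quantile_losses`
# evaluates, written as a standalone NumPy sketch: for level tau the loss is
# tau * (y - q) when y >= q and (1 - tau) * (q - y) otherwise, so
# under-prediction at the 0.9 level is penalised nine times more heavily
# than over-prediction.
import numpy as np

y = 10.0
quantiles = np.array([8.0, 12.0])
levels = np.array([0.9, 0.9])
loss = np.where(
    y >= quantiles,
    levels * (y - quantiles),
    (1 - levels) * (quantiles - y),
)
assert np.allclose(loss, [1.8, 0.2])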
def exact_inference(
    self, x_train: Tensor, y_train: Tensor, x_test: Tensor
) -> Tuple[Tensor, Tensor, Tensor]:
    """
    Parameters
    ----------
    x_train
        Training set of features of shape
        (batch_size, context_length, num_features).
    y_train
        Training labels of shape (batch_size, context_length).
    x_test
        Test set of features of shape
        (batch_size, prediction_length, num_features).

    Returns
    -------
    Tuple
        Tensor
            Predictive GP samples of shape
            (batch_size, prediction_length, num_samples).
        Tensor
            Predictive mean of the GP of shape
            (batch_size, prediction_length).
        Tensor
            Predictive standard deviation of the GP of shape
            (batch_size, prediction_length).
    """
    assert (
        self.context_length is not None
    ), "The value of `context_length` must be set."
    assert (
        self.prediction_length is not None
    ), "The value of `prediction_length` must be set."

    # Compute Cholesky factorization of training kernel matrix
    l_train = self._compute_cholesky_gp(
        self.kernel.kernel_matrix(x_train, x_train), self.context_length
    )

    lower_tri_solve = self.F.linalg.trsm(
        l_train, self.kernel.kernel_matrix(x_train, x_test)
    )
    predictive_mean = self.F.linalg.gemm2(
        lower_tri_solve,
        self.F.linalg.trsm(l_train, y_train.expand_dims(axis=-1)),
        transpose_a=True,
    ).squeeze(axis=-1)

    # Can rewrite second term as
    # :math:`||L^{-1} K(x_train, x_test)||_2^2`
    # and only solve 1 equation
    predictive_covariance = self.kernel.kernel_matrix(
        x_test, x_test
    ) - self.F.linalg.gemm2(
        lower_tri_solve, lower_tri_solve, transpose_a=True
    )

    # Extract diagonal entries of covariance matrix
    predictive_std = batch_diagonal(
        self.F,
        predictive_covariance,
        self.prediction_length,
        self.float_type,
    )
    # If self.sample_noise = True, predictive covariance has sigma^2 on
    # the diagonal
    if self.sample_noise:
        predictive_std = self.F.broadcast_add(predictive_std, self.sigma**2)
    predictive_std = self.F.sqrt(predictive_std).squeeze(axis=-1)

    # Compute sample from GP predictive distribution
    return (
        self.sample(predictive_mean, predictive_covariance),
        predictive_mean,
        predictive_std,
    )
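# Plain NumPy sketch (illustrative, not the module's own test) of the exact
# GP posterior that `exact_inference` implements: with L = chol(K_train),
# the predictive mean is K_*^T K_train^{-1} y and the predictive covariance
# is K_** - K_*^T K_train^{-1} K_*, both obtained via triangular solves.
import numpy as np


def rbf(a, b):
    return np.exp(-0.5 * (a - b.T) ** 2)


rng = np.random.default_rng(0)
x_train, x_test = rng.normal(size=(5, 1)), rng.normal(size=(3, 1))
y = rng.normal(size=5)

K = rbf(x_train, x_train) + 1e-6 * np.eye(5)   # jitter for stability
K_star, K_ss = rbf(x_train, x_test), rbf(x_test, x_test)

L = np.linalg.cholesky(K)
v = np.linalg.solve(L, K_star)                 # L^{-1} K_*
mean = v.T @ np.linalg.solve(L, y)             # K_*^T K^{-1} y
cov = K_ss - v.T @ v                           # K_** - K_*^T K^{-1} K_*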
def _assemble_covariates(
    feat_dynamic_real: Tensor,
    feat_dynamic_cat: Tensor,
    feat_static_real: Tensor,
    feat_static_cat: Tensor,
    is_past: bool,
) -> Tensor:
    covariates = []
    if feat_dynamic_real.shape[-1] > 0:
        covariates.append(feat_dynamic_real)
    if feat_static_real.shape[-1] > 0:
        covariates.append(
            feat_static_real.expand_dims(axis=1).repeat(
                axis=1,
                repeats=self.context_length
                if is_past
                else self.prediction_length,
            )
        )
    if len(covariates) > 0:
        covariates = F.concat(*covariates, dim=-1)
        covariates = self.covar_proj(covariates)
    else:
        covariates = None

    categories = []
    if feat_dynamic_cat.shape[-1] > 0:
        categories.append(feat_dynamic_cat)
    if feat_static_cat.shape[-1] > 0:
        categories.append(
            feat_static_cat.expand_dims(axis=1).repeat(
                axis=1,
                repeats=self.context_length
                if is_past
                else self.prediction_length,
            )
        )
    if len(categories) > 0:
        categories = F.concat(*categories, dim=-1)
        embeddings = self.embedder(categories)
        embeddings = F.reshape(
            embeddings, shape=(0, 0, -4, self.d_hidden, -1)
        ).sum(axis=-1)
        if covariates is not None:
            covariates = covariates + embeddings
        else:
            covariates = embeddings
    else:
        pass

    return covariates
def quantile_internal(self, x: Tensor, axis: Optional[int] = None) -> Tensor:
    r"""
    Evaluates the quantile function at the quantile levels contained in `x`.

    Parameters
    ----------
    x
        Tensor of shape ``*gamma.shape`` if axis=None, or containing an
        additional axis on the specified position, otherwise.
    axis
        Index of the axis containing the different quantile levels which
        are to be computed.

    Returns
    -------
    Tensor
        Quantiles tensor, of the same shape as x.
    """
    F = self.F

    # shapes of self
    # self.gamma: (*batch_shape)
    # self.knot_positions, self.b: (*batch_shape, num_pieces)

    # axis=None - passed at inference when num_samples is None
    # The shape of x is (*batch_shape).
    # The shapes of the parameters should be:
    # gamma: (*batch_shape), knot_positions, b: (*batch_shape, num_pieces)
    # They match the self. counterparts so no reshaping is needed

    # axis=0 - passed at inference when num_samples is not None
    # The shape of x is (num_samples, *batch_shape).
    # The shapes of the parameters should be:
    # gamma: (num_samples, *batch_shape),
    # knot_positions, b: (num_samples, *batch_shape, num_pieces)
    # They do not match the self. counterparts and we need to expand the
    # axis=0 to all of them.

    # axis=-2 - passed at training when we evaluate quantiles at
    # knot_positions in order to compute a_tilde
    # The shape of x is shape(x) = shape(knot_positions) =
    # (*batch_shape, num_pieces).
    # The shape of the parameters should be:
    # gamma: (*batch_shape, 1),
    # knot_positions: (*batch_shape, 1, num_pieces),
    # b: (*batch_shape, 1, num_pieces)
    # They do not match the self. counterparts and we need to expand
    # axis=-1 for gamma and axis=-2 for the rest.

    if axis is not None:
        gamma = self.gamma.expand_dims(axis=axis if axis == 0 else -1)
        knot_positions = self.knot_positions.expand_dims(axis=axis)
        b = self.b.expand_dims(axis=axis)
    else:
        gamma, knot_positions, b = self.gamma, self.knot_positions, self.b

    x_minus_knots = F.broadcast_minus(x.expand_dims(axis=-1), knot_positions)

    quantile = F.broadcast_add(
        gamma, F.sum(F.broadcast_mul(b, F.relu(x_minus_knots)), axis=-1)
    )

    return quantile
def mahalanobis_distance(
    F, W: Tensor, D: Tensor, capacitance_tril: Tensor, x: Tensor
) -> Tensor:
    r"""
    Uses the Woodbury matrix identity

    .. math::
        (W W^T + D)^{-1} = D^{-1} - D^{-1} W C^{-1} W^T D^{-1},

    where :math:`C` is the capacitance matrix :math:`I + W^T D^{-1} W`, to
    compute the squared Mahalanobis distance :math:`x^T (W W^T + D)^{-1} x`.

    Parameters
    ----------
    F
    W
        (..., dim, rank)
    D
        (..., dim)
    capacitance_tril
        (..., rank, rank)
    x
        (..., dim)

    Returns
    -------
    Tensor
        Squared Mahalanobis distance :math:`x^T (W W^T + D)^{-1} x`,
        shape (...).
    """
    xx = x.expand_dims(axis=-1)

    # (..., rank, 1)
    Wt_Dinv_x = F.linalg_gemm2(
        F.broadcast_div(W, D.expand_dims(axis=-1)), xx, transpose_a=True
    )

    # compute x^T D^-1 x, (...,)
    mahalanobis_D_inv = F.broadcast_div(x.square(), D).sum(axis=-1)

    # (..., rank)
    L_inv_Wt_Dinv_x = F.linalg_trsm(capacitance_tril, Wt_Dinv_x).squeeze(
        axis=-1
    )

    mahalanobis_L = L_inv_Wt_Dinv_x.square().sum(axis=-1).squeeze()

    return F.broadcast_minus(mahalanobis_D_inv, mahalanobis_L)
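# NumPy sketch (illustrative check, not from the original tests) comparing
# the Woodbury route used above against a direct solve with
# Sigma = W W^T + D: both give the same squared Mahalanobis distance.
import numpy as np

rng = np.random.default_rng(1)
dim, rank = 5, 2
W = rng.normal(size=(dim, rank))
D = rng.uniform(0.5, 2.0, size=dim)
x = rng.normal(size=dim)

Sigma = W @ W.T + np.diag(D)
direct = x @ np.linalg.solve(Sigma, x)

C = np.eye(rank) + W.T @ np.diag(1.0 / D) @ W        # capacitance matrix
L = np.linalg.cholesky(C)
v = np.linalg.solve(L, W.T @ (x / D))                # L^{-1} W^T D^{-1} x
woodbury = x @ (x / D) - v @ v

assert np.isclose(direct, woodbury)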
def quantile(self, level: Tensor) -> Tensor:
    F = self.F
    for _ in range(self.all_dim):
        level = level.expand_dims(axis=-1)

    condition = F.broadcast_greater(level, level.zeros_like() + 0.5)
    u = F.where(condition, F.log(2.0 * level), -F.log(2.0 - 2.0 * level))

    return F.broadcast_add(self.mu, F.broadcast_mul(self.b, u))
def quantile(self, level: Tensor):
    F = self.F
    # we consider level to be an independent axis and so expand it
    # to shape (num_levels, 1, 1, ...)
    for _ in range(self.all_dim):
        level = level.expand_dims(axis=-1)

    x_shifted = F.broadcast_div(F.power(1 - level, -self.xi) - 1, self.xi)
    x = F.broadcast_mul(x_shifted, self.beta)
    return x
def quantile(self, level: Tensor) -> Tensor:
    F = self.F
    # we consider level to be an independent axis and so expand it
    # to shape (num_levels, 1, 1, ...)
    for _ in range(self.all_dim):
        level = level.expand_dims(axis=-1)

    return F.broadcast_add(
        self.mu,
        F.broadcast_mul(
            self.sigma, math.sqrt(2.0) * F.erfinv(2.0 * level - 1.0)
        ),
    )
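# Quick numerical check (SciPy-based sketch, not part of the original code)
# that mu + sigma * sqrt(2) * erfinv(2 * level - 1) inverts the Gaussian CDF,
# i.e. Phi((q - mu) / sigma) recovers the requested level.
import math
from scipy.special import erf, erfinv

mu, sigma, level = 1.5, 2.0, 0.9
q = mu + sigma * math.sqrt(2.0) * erfinv(2.0 * level - 1.0)
cdf_at_q = 0.5 * (1.0 + erf((q - mu) / (sigma * math.sqrt(2.0))))
assert abs(cdf_at_q - level) < 1e-10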
def quantile(self, level: Tensor) -> Tensor:
    F = self.F

    # self.bin_probs.shape = (batch_shape, num_bins)
    probs = self.bin_probs.transpose()  # (num_bins, batch_shape.T)

    # (batch_shape)
    zeros_batch_size = F.zeros_like(
        F.slice_axis(self.bin_probs, axis=-1, begin=0, end=1).squeeze(axis=-1)
    )

    level = level.expand_dims(axis=0)
    # cdf shape (batch_size.T, levels)
    zeros_cdf = F.broadcast_add(
        zeros_batch_size.transpose().expand_dims(axis=-1),
        level.zeros_like(),
    )
    start_state = (zeros_cdf, zeros_cdf.astype("int32"))

    def step(p, state):
        cdf, idx = state
        cdf = F.broadcast_add(cdf, p.expand_dims(axis=-1))
        idx = F.where(F.broadcast_greater(cdf, level), idx, idx + 1)
        return zeros_batch_size, (cdf, idx)

    _, states = F.contrib.foreach(step, probs, start_state)
    _, idx = states

    # idx.shape = (batch.T, levels)
    # centers.shape = (batch, num_bins)
    #
    # expand centers to shape -> (levels, batch, num_bins)
    # so we can use pick with idx.T.shape = (levels, batch)
    #
    # zeros_cdf.shape (batch.T, levels)
    centers_expanded = F.broadcast_add(
        self.bin_centers.transpose().expand_dims(axis=-1),
        zeros_cdf.expand_dims(axis=0),
    ).transpose()

    # centers_expanded.shape = (levels, batch, num_bins)
    # idx.shape (batch.T, levels)
    a = centers_expanded.pick(idx.transpose(), axis=-1)
    return a
def test_mixture(
    distr1: Distribution, distr2: Distribution, p: Tensor, serialize_fn
) -> None:
    # sample from component distributions, and select samples
    samples1 = distr1.sample(num_samples=NUM_SAMPLES_LARGE)
    samples2 = distr2.sample(num_samples=NUM_SAMPLES_LARGE)

    # TODO: for multivariate case, test should not sample elements from
    # different components in the event_dim dimension
    rand = mx.nd.random.uniform(shape=(NUM_SAMPLES_LARGE, *p.shape))
    choice = (rand < p.expand_dims(axis=0)).broadcast_like(samples1)
    samples_ref = mx.nd.where(choice, samples1, samples2)

    # construct mixture distribution and sample from it
    mixture_probs = mx.nd.stack(p, 1.0 - p, axis=-1)

    mixture = MixtureDistribution(
        mixture_probs=mixture_probs, components=[distr1, distr2]
    )
    mixture = serialize_fn(mixture)

    samples_mix = mixture.sample(num_samples=NUM_SAMPLES_LARGE)

    # check that shapes are right
    assert (
        samples1.shape
        == samples2.shape
        == samples_mix.shape
        == samples_ref.shape
    )

    # check mean and stddev
    calc_mean = mixture.mean.asnumpy()
    calc_std = mixture.stddev.asnumpy()
    sample_mean = samples_mix.asnumpy().mean(axis=0)
    sample_std = samples_mix.asnumpy().std(axis=0)

    assert np.allclose(calc_mean, sample_mean, atol=1e-1)
    assert np.allclose(calc_std, sample_std, atol=2e-1)

    # check that histograms are close
    assert (
        diff(
            histogram(samples_mix.asnumpy()), histogram(samples_ref.asnumpy())
        )
        < 0.05
    )

    # can only calculate cdf for gaussians currently
    if isinstance(distr1, Gaussian) and isinstance(distr2, Gaussian):
        emp_cdf, edges = empirical_cdf(samples_mix.asnumpy())
        calc_cdf = mixture.cdf(mx.nd.array(edges)).asnumpy()
        assert np.allclose(calc_cdf[1:, :], emp_cdf, atol=1e-2)
def cdf(self, x: Tensor) -> Tensor:
    r"""
    Computes the quantile level :math:`\alpha` such that
    :math:`q(\alpha) = x`.

    Parameters
    ----------
    x
        Tensor of shape gamma.shape

    Returns
    -------
    Tensor
        Tensor of shape gamma.shape
    """
    F = self.F
    gamma, b, knot_positions = self.gamma, self.b, self.knot_positions

    quantiles_at_knots = self.quantile_internal(knot_positions, axis=-2)

    # Mask to nullify the terms corresponding to knots larger than l_0,
    # which is the largest knot (quantile level) such that the quantile
    # at l_0, s(l_0), is smaller than x. Shape: (..., num_pieces)
    mask = F.broadcast_lesser(quantiles_at_knots, x.expand_dims(axis=-1))

    slope_l0 = F.sum(b * mask, axis=-1, keepdims=False)

    # slope_l0 can be zero in which case a_tilde = 0. The following is to
    # circumvent mxnet issue with "where" operator which returns nan even
    # if the statement you are interested in does not result in nan
    # (but the "else" statement evaluates to nan).
    slope_l0_nz = F.where(
        slope_l0 == F.zeros_like(slope_l0), F.ones_like(x), slope_l0
    )

    a_tilde = F.where(
        slope_l0 == F.zeros_like(slope_l0),
        F.zeros_like(x),
        (
            x
            - gamma
            + F.sum(b * knot_positions * mask, axis=-1, keepdims=False)
        )
        / slope_l0_nz,
    )

    return F.broadcast_minimum(F.ones_like(a_tilde), a_tilde)
def _assemble_inputs(
    self,
    F,
    target: Tensor,
    static_features: Tensor,
    dynamic_features: Tensor,
) -> Tensor:
    """
    Assemble features from target, static features, and the dynamic
    features.

    Parameters
    ----------
    F
        A module that can either refer to the Symbol API or the NDArray
        API in MXNet.
    target
        target time series,
        shape (batch_size, sequence_length, 1)
    static_features
        static features,
        shape (batch_size, num_feat_static)
    dynamic_features
        dynamic_features,
        shape (batch_size, sequence_length, num_feat_dynamic)

    Returns
    -------
    Tensor
        combined features,
        shape (batch_size, sequence_length,
               num_feat_static + num_feat_dynamic + 1)
    """
    helper_ones = F.ones_like(target)  # Ones of (N, T, 1)
    tiled_static_features = F.batch_dot(
        helper_ones, static_features.expand_dims(1)
    )  # (N, T, C)
    inputs = F.concat(
        target, tiled_static_features, dynamic_features, dim=2
    )  # (N, T, C)
    return inputs
def make_nd_diag(F, x: Tensor, d: int) -> Tensor:
    """
    Make a diagonal tensor, given the diagonal.

    Parameters
    ----------
    F
        The function space to use.
    x
        Diagonal to use, shape :math:`(..., d)`.
    d
        Last dimension of `x`.

    Returns
    -------
    Tensor
        A tensor y of shape :math:`(..., d, d)` such that
        :math:`y[..., i, i] = x[..., i]`.
    """
    return F.broadcast_mul(F.eye(d), x.expand_dims(axis=-1))
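# Minimal usage sketch for `make_nd_diag` with the NDArray API as `F`
# (values are illustrative): each length-d slice of x becomes the diagonal
# of a d x d matrix.
import mxnet as mx

x = mx.nd.array([[1.0, 2.0, 3.0]])     # shape (1, 3)
diag = make_nd_diag(mx.nd, x, d=3)     # shape (1, 3, 3), x on the diagonal
print(diag.asnumpy())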
def quantile(self, level: Tensor) -> Tensor:
    F = self.F
    # we consider level to be an independent axis and so expand it
    # to shape (num_levels, 1, 1, ...)
    for _ in range(self.all_dim):
        level = level.expand_dims(axis=-1)

    quantiles = F.broadcast_mul(self.value, level.ones_like())
    level = F.broadcast_mul(quantiles.ones_like(), level)

    minus_inf = -quantiles.ones_like() / 0.0
    quantiles = F.where(
        F.broadcast_logical_or(level != 0, F.contrib.isnan(quantiles)),
        quantiles,
        minus_inf,
    )

    nans = level.zeros_like() / 0.0
    quantiles = F.where(level != level, nans, quantiles)

    return quantiles
def prepare_inputs_imputation_step(
    self,
    F,
    begin_state: List[Tensor],
    imputed_sequence: Tensor,
    sequence_length: int,
    subsequences_length: int,
    scale: Tensor,
    target: Tensor,
    target_observed_values: Tensor,
    time_feat: Tensor,
    repeated_static_feat: Tensor,
    is_padded_indicator: Tensor,
    state,
    i: int,
) -> Tuple[Tensor, Tensor, Tensor, Tensor, Tensor, Tensor, Tensor]:
    """
    Prepares inputs for the next LSTM unrolling step at step i.
    """
    lags = self.get_lagged_subsequences(
        F=F,
        sequence=imputed_sequence,
        sequence_length=sequence_length,
        indices=self.lags_seq,
        subsequences_length=subsequences_length,
    )
    # (batch_size, sub_seq_len, *target_shape, num_lags)
    lags_scaled = F.broadcast_div(lags, scale.expand_dims(axis=-1))

    # from (batch_size, sub_seq_len, *target_shape, num_lags)
    # to (batch_size, sub_seq_len, prod(target_shape) * num_lags)
    input_lags = F.reshape(
        data=lags_scaled,
        shape=(
            -1,
            subsequences_length,
            len(self.lags_seq) * prod(self.target_shape),
        ),
    )
    # (batch_size, sub_seq_len, input_dim)
    inputs = F.concat(input_lags, time_feat, repeated_static_feat, dim=-1)

    is_pad = is_padded_indicator.slice_axis(axis=1, begin=i, end=i + 1)

    current_observed_indicator = target_observed_values.slice_axis(
        axis=1, begin=i, end=i + 1
    )

    current_target = target.slice_axis(axis=1, begin=i, end=i + 1)

    pre_sequence = imputed_sequence.slice_axis(
        axis=1, begin=0, end=-subsequences_length + i
    )

    post_sequence = imputed_sequence.slice_axis(
        axis=1, begin=-subsequences_length + i + 1, end=None
    )
    # Reset the state to the begin state if the current target is padded
    state = [
        F.where(is_pad.repeat(repeats=self.num_cells, axis=1), bs, s)
        for bs, s in zip(begin_state, state)
    ]
    return (
        inputs,
        is_pad,
        current_observed_indicator,
        current_target,
        pre_sequence,
        post_sequence,
        state,
    )
def kalman_filter_step(
    F,
    target: Tensor,
    prior_mean: Tensor,
    prior_cov: Tensor,
    emission_coeff: Tensor,
    residual: Tensor,
    noise_std: Tensor,
    latent_dim: int,
    output_dim: int,
):
    """
    One step of the Kalman filter.

    This function computes the filtered state (mean and covariance) given the
    linear system coefficients, the prior state (mean and covariance), as
    well as observations.

    Parameters
    ----------
    F
    target
        Observations of the system output, shape (batch_size, output_dim)
    prior_mean
        Prior mean of the latent state, shape (batch_size, latent_dim)
    prior_cov
        Prior covariance of the latent state, shape
        (batch_size, latent_dim, latent_dim)
    emission_coeff
        Emission coefficient, shape (batch_size, output_dim, latent_dim)
    residual
        Residual component, shape (batch_size, output_dim)
    noise_std
        Standard deviation of the output noise, shape
        (batch_size, output_dim)
    latent_dim
        Dimension of the latent state vector
    output_dim
        Dimension of the output vector

    Returns
    -------
    Tensor
        Filtered_mean, shape (batch_size, latent_dim)
    Tensor
        Filtered_covariance, shape (batch_size, latent_dim, latent_dim)
    Tensor
        Log probability, shape (batch_size, )
    """
    # output_mean: mean of the target (batch_size, obs_dim)
    output_mean = F.linalg_gemm2(
        emission_coeff, prior_mean.expand_dims(axis=-1)
    ).squeeze(axis=-1)

    # noise covariance
    noise_cov = make_nd_diag(F=F, x=noise_std * noise_std, d=output_dim)

    S_hh_x_A_tr = F.linalg_gemm2(prior_cov, emission_coeff, transpose_b=True)

    # covariance of the target
    output_cov = F.linalg_gemm2(emission_coeff, S_hh_x_A_tr) + noise_cov

    # compute the Cholesky decomposition output_cov = LL^T
    L_output_cov = F.linalg_potrf(output_cov)

    # Compute Kalman gain matrix K:
    # K = S_hh X with X = A^T output_cov^{-1}
    # We have X = A^T output_cov^{-1} => X output_cov = A^T => X LL^T = A^T
    # We can thus obtain X by solving two linear systems involving L
    kalman_gain = F.linalg_trsm(
        L_output_cov,
        F.linalg_trsm(
            L_output_cov, S_hh_x_A_tr, rightside=True, transpose=True
        ),
        rightside=True,
    )

    # compute the error
    target_minus_residual = target - residual
    delta = target_minus_residual - output_mean

    # filtered estimates
    filtered_mean = prior_mean.expand_dims(axis=-1) + F.linalg_gemm2(
        kalman_gain, delta.expand_dims(axis=-1)
    )
    filtered_mean = filtered_mean.squeeze(axis=-1)

    # Joseph's symmetrized update for covariance:
    ImKA = F.broadcast_sub(
        F.eye(latent_dim), F.linalg_gemm2(kalman_gain, emission_coeff)
    )

    filtered_cov = F.linalg_gemm2(
        ImKA, F.linalg_gemm2(prior_cov, ImKA, transpose_b=True)
    ) + F.linalg_gemm2(
        kalman_gain, F.linalg_gemm2(noise_cov, kalman_gain, transpose_b=True)
    )

    # likelihood term: (batch_size,)
    log_p = MultivariateGaussian(output_mean, L_output_cov).log_prob(
        target_minus_residual
    )

    return filtered_mean, filtered_cov, log_p
def unroll(
    self,
    F,
    lags: Tensor,
    scale: Tensor,
    time_feat: Tensor,
    target_dimension_indicator: Tensor,
    unroll_length: int,
    begin_state: Optional[List[Tensor]],
) -> Tuple[Tensor, Tensor, Tensor, Tensor]:
    """
    Prepares the input to the RNN and unrolls it the given number of time
    steps.

    Parameters
    ----------
    F
    lags
        Input lags (batch_size, sub_seq_len, target_dim, num_lags)
    scale
        Mean scale (batch_size, 1, target_dim)
    time_feat
        Additional time features
    target_dimension_indicator
        Indices of the target dimension (batch_size, target_dim)
    unroll_length
        length to unroll
    begin_state
        State to start the unrolling of the RNN

    Returns
    -------
    outputs
        RNN outputs (batch_size, seq_len, num_cells)
    states
        RNN states. Nested list with (batch_size, num_cells) tensors with
        dimensions target_dim x num_layers x (batch_size, num_cells)
    lags_scaled
        Scaled lags (batch_size, sub_seq_len, target_dim, num_lags)
    inputs
        inputs to the RNN
    """
    # (batch_size, sub_seq_len, target_dim, num_lags)
    lags_scaled = F.broadcast_div(lags, scale.expand_dims(axis=-1))

    assert_shape(
        lags_scaled,
        (-1, unroll_length, self.target_dim, len(self.lags_seq)),
    )

    input_lags = F.reshape(
        data=lags_scaled,
        shape=(-1, unroll_length, len(self.lags_seq) * self.target_dim),
    )

    # (batch_size, target_dim, embed_dim)
    index_embeddings = self.embed(target_dimension_indicator)
    assert_shape(index_embeddings, (-1, self.target_dim, self.embed_dim))

    # (batch_size, seq_len, target_dim * embed_dim)
    repeated_index_embeddings = (
        index_embeddings.expand_dims(axis=1)
        .repeat(axis=1, repeats=unroll_length)
        .reshape((-1, unroll_length, self.target_dim * self.embed_dim))
    )

    # (batch_size, sub_seq_len, input_dim)
    inputs = F.concat(
        input_lags, repeated_index_embeddings, time_feat, dim=-1
    )

    # unroll encoder
    outputs, state = self.rnn.unroll(
        inputs=inputs,
        length=unroll_length,
        layout="NTC",
        merge_outputs=True,
        begin_state=begin_state,
    )

    assert_shape(outputs, (-1, unroll_length, self.num_cells))
    for s in state:
        assert_shape(s, (-1, self.num_cells))

    assert_shape(
        lags_scaled,
        (-1, unroll_length, self.target_dim, len(self.lags_seq)),
    )

    return outputs, state, lags_scaled, inputs
def cumsum(
    F, x: Tensor, exclusive: bool = False, reverse: bool = False
) -> Tensor:
    r"""
    Find cumulative sum on the last axis by multiplying with lower triangular
    ones-matrix:

    .. math::

       \operatorname{cumsum}(x) =
       \begin{cases}
         \operatorname{ltr\_ones} \times x
           & \text{for cumulative sum}\\
         x \times \operatorname{ltr\_ones}
           & \text{for cumulative sum in the reverse order}
       \end{cases}

    Also supports `exclusive` flag to start the cumsum with zero.
    For example, if :math:`x = [a, b, c]`, we have

    .. math::

       \operatorname{cumsum}(x) =
       \begin{cases}
         [a, a + b, a + b + c]
           & \text{if }\mathit{reverse = False, exclusive = False}\\
         [0, a, a + b]
           & \text{if }\mathit{reverse = False, exclusive = True}\\
         [a + b + c, b + c, c]
           & \text{if }\mathit{reverse = True, exclusive = False}\\
         [b + c, c, 0]
           & \text{if }\mathit{reverse = True, exclusive = True}\\
       \end{cases}

    Parameters
    ----------
    F
        The function space to use.
    x
        A tensor with shape :math:`(..., n)`.
    exclusive
        If `True`, the cumulative sum starts with zero.
    reverse
        If `True`, the cumulative sum is performed in the opposite direction.

    Returns
    -------
    Tensor:
        A modified tensor with identical shape and cumulative sums in the
        last axis.
    """
    # Create a new axis (for matrix multiplication) either at last location
    # or last-but-one location (for reverse mode)
    exp_dim = -2 if reverse else -1
    # (..., 1, n) if reverse is True and (..., n, 1) otherwise
    x = x.expand_dims(axis=exp_dim)

    # Ones_matrix (..., n, n)
    ones_matrix = F.linalg_gemm2(
        F.ones_like(x),
        F.ones_like(x),
        transpose_a=reverse,
        transpose_b=not reverse,
    )
    cumulative_sum = F.linalg_trmm(ones_matrix, x, rightside=reverse)

    if exclusive:
        cumulative_sum = cumulative_sum - x

    return cumulative_sum.squeeze(axis=exp_dim)
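# Usage sketch of the triangular-matrix cumsum above, assuming the helper is
# in scope and using the NDArray API as `F` (values are illustrative):
import mxnet as mx

x = mx.nd.array([[1.0, 2.0, 3.0]])
print(cumsum(mx.nd, x).asnumpy())                   # [[1. 3. 6.]]
print(cumsum(mx.nd, x, exclusive=True).asnumpy())   # [[0. 1. 3.]]
print(cumsum(mx.nd, x, reverse=True).asnumpy())     # [[6. 5. 3.]]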
def reconcile_samples(self, samples: Tensor) -> Tensor:
    """
    Computes coherent samples by projecting unconstrained `samples` using
    the matrix `self.M`.

    Parameters
    ----------
    samples
        Unconstrained samples

    Returns
    -------
    Coherent samples
        Tensor, shape same as that of `samples`.
    """
    proj_matrix_shape = self.M.shape  # (num_ts, num_ts)

    num_iter_dims = len(self.seq_axis) if self.seq_axis else 0

    # Expand `M` depending on the shape of samples:
    # If seq_axis = None, during training the first axis is only
    # `batch_size`, in which case `M` would be expanded 3 times; during
    # prediction it would be expanded 2 times since the first axis is
    # `batch_size x num_parallel_samples`.
    M_expanded = self.M
    for i in range(len(samples.shape[num_iter_dims:-1])):
        M_expanded = M_expanded.expand_dims(axis=0)

    # If seq_axis = None broadcast M to
    # (num_samples, batch_size, seq_len, m, m) during training
    # and to (num_samples * batch_size, seq_len, m, m) during prediction.
    # Else broadcast to the appropriate remaining dimension.
    _shape = (
        list(samples.shape[:-1])
        if not self.seq_axis
        else [
            samples.shape[i]
            for i in range(len(samples.shape[:-1]))
            if i not in self.seq_axis
        ]
    )
    self.M_broadcast = mx.nd.broadcast_to(
        M_expanded,
        shape=_shape + list(proj_matrix_shape),
    )

    if self.seq_axis:
        # bring the axis to iterate in the beginning
        samples = mx.nd.moveaxis(
            samples, self.seq_axis, list(range(len(self.seq_axis)))
        )

        out = []
        for idx in product(
            *[
                range(x)
                for x in [
                    samples.shape[d] for d in range(len(self.seq_axis))
                ]
            ]
        ):
            s = samples[idx]
            out.append(
                mx.nd.linalg.gemm2(
                    self.M_broadcast, s.expand_dims(-1)
                ).squeeze(axis=-1)
            )

        # put the axis in the correct order again
        out = mx.nd.concat(*out, dim=0).reshape(samples.shape)
        out = mx.nd.moveaxis(
            out, list(range(len(self.seq_axis))), self.seq_axis
        )
        return out
    else:
        return mx.nd.linalg.gemm2(
            self.M_broadcast, samples.expand_dims(-1)
        ).squeeze(axis=-1)
def cdf(self, x: Tensor) -> Tensor:
    F = self.F
    x = x.expand_dims(axis=-1)
    # left_edges = self.bin_edges.slice_axis(axis=-1, begin=0, end=-1)
    mask = F.broadcast_lesser_equal(self.bin_centers, x)
    return F.broadcast_mul(self.bin_probs, mask).sum(axis=-1)
def hybrid_forward(
    self,
    F,
    feat_static_cat: Tensor,
    past_target: Tensor,
    past_observed_values: Tensor,
    past_time_feat: Tensor,
    future_time_feat: Tensor,
    scale: Tensor,
) -> Tensor:
    """
    Computes prediction samples for the wavenet model.

    Parameters
    ----------
    F
    feat_static_cat
        Static categorical features: (batch_size, num_cat_features)
    past_target
        Past target: (batch_size, receptive_field)
    past_observed_values
        Observed value indicator for the past target:
        (batch_size, receptive_field)
    past_time_feat
        Past time features: (batch_size, num_time_features, receptive_field)
    future_time_feat
        Future time features: (batch_size, num_time_features, pred_length)
    scale
        scale of the time series: (batch_size, 1)

    Returns
    -------
    Tensor
        Prediction samples with shape (batch_size, num_samples, pred_length)
    """

    def blow_up(u):
        """
        Expand to (batch_size x num_samples)
        """
        return F.repeat(u, repeats=self.num_samples, axis=0)

    past_target = past_target.astype("int32")

    full_features = self.get_full_features(
        F,
        feat_static_cat=feat_static_cat,
        past_observed_values=past_observed_values,
        past_time_feat=past_time_feat,
        future_time_feat=future_time_feat,
        future_observed_values=None,
        scale=scale,
    )

    # To compute queues for the first step, we need features from
    # -self.pred_length - self.receptive_field + 1 to -self.pred_length + 1
    features_end_ix = (
        -self.pred_length + 1 if self.pred_length > 1 else None
    )
    queues = self.get_initial_conv_queues(
        F,
        past_target=F.slice_axis(
            past_target, begin=-self.receptive_field, end=None, axis=-1
        ),
        features=F.slice_axis(
            full_features,
            begin=-self.pred_length - self.receptive_field + 1,
            end=features_end_ix,
            axis=-1,
        ),
    )
    queues = [blow_up(queue) for queue in queues]

    res = F.slice_axis(past_target, begin=-2, end=None, axis=-1)
    res = blow_up(res)

    for n in range(self.pred_length):
        # Generate one-step ahead predictions. The input consists of target
        # and features corresponding to the last two time steps.
        current_target = F.slice_axis(res, begin=-2, end=None, axis=-1)
        current_features = F.slice_axis(
            full_features,
            begin=self.receptive_field + n - 1,
            end=self.receptive_field + n + 1,
            axis=-1,
        )
        embedding = self.target_feature_embedding(
            F,
            target=current_target,
            features=blow_up(current_features),
        )

        # (batch_size, 1, num_bins) where 1 corresponds to the time axis.
        unnormalized_outputs, queues = self.base_net(
            F, embedding, one_step_prediction=True, queues=queues
        )

        if self.temperature > 0:
            # (batch_size, 1, num_bins) where 1 corresponds to the time
            # axis.
            probs = F.softmax(
                unnormalized_outputs / self.temperature, axis=-1
            )
            # (batch_size, 1)
            y = F.sample_multinomial(probs)
        else:
            # (batch_size, 1)
            y = F.argmax(unnormalized_outputs, axis=-1)
        y = y.astype("int32")
        res = F.concat(res, y, num_args=2, dim=-1)

    samples = F.slice_axis(res, begin=-self.pred_length, end=None, axis=-1)
    samples = samples.reshape(
        shape=(-1, self.num_samples, self.pred_length)
    )
    samples = self.post_transform(samples)
    samples = F.broadcast_mul(scale.expand_dims(axis=1), samples)
    return samples
def unroll_encoder(
    self,
    F,
    past_time_feat: Tensor,
    past_target_cdf: Tensor,
    past_observed_values: Tensor,
    past_is_pad: Tensor,
    future_time_feat: Optional[Tensor],
    future_target_cdf: Optional[Tensor],
    target_dimension_indicator: Tensor,
) -> Tuple[Tensor, List[Tensor], Tensor, Tensor, Tensor]:
    """
    Unrolls the RNN encoder over past and, if present, future data.

    Returns outputs and state of the encoder, plus the scale of
    past_target_cdf and a vector of static features that was constructed
    and fed as input to the encoder. All tensor arguments should have NTC
    layout.

    Parameters
    ----------
    F
    past_time_feat
        Past time features (batch_size, history_length, num_features)
    past_target_cdf
        Past marginal CDF transformed target values (batch_size,
        history_length, target_dim)
    past_observed_values
        Indicator whether or not the values were observed (batch_size,
        history_length, target_dim)
    past_is_pad
        Indicator whether the past target values have been padded
        (batch_size, history_length)
    future_time_feat
        Future time features (batch_size, prediction_length, num_features)
    future_target_cdf
        Future marginal CDF transformed target values (batch_size,
        prediction_length, target_dim)
    target_dimension_indicator
        Indices of the target dimension (batch_size, target_dim)

    Returns
    -------
    outputs
        RNN outputs (batch_size, seq_len, num_cells)
    states
        RNN states. Nested list with (batch_size, num_cells) tensors with
        dimensions target_dim x num_layers x (batch_size, num_cells)
    scale
        Mean scales for the time series (batch_size, 1, target_dim)
    lags_scaled
        Scaled lags (batch_size, sub_seq_len, target_dim, num_lags)
    inputs
        inputs to the RNN
    """
    past_observed_values = F.broadcast_minimum(
        past_observed_values, 1 - past_is_pad.expand_dims(axis=-1)
    )

    if future_time_feat is None or future_target_cdf is None:
        time_feat = past_time_feat.slice_axis(
            axis=1, begin=-self.context_length, end=None
        )
        sequence = past_target_cdf
        sequence_length = self.history_length
        subsequences_length = self.context_length
    else:
        time_feat = F.concat(
            past_time_feat.slice_axis(
                axis=1, begin=-self.context_length, end=None
            ),
            future_time_feat,
            dim=1,
        )
        sequence = F.concat(past_target_cdf, future_target_cdf, dim=1)
        sequence_length = self.history_length + self.prediction_length
        subsequences_length = self.context_length + self.prediction_length

    # (batch_size, sub_seq_len, target_dim, num_lags)
    lags = self.get_lagged_subsequences(
        F=F,
        sequence=sequence,
        sequence_length=sequence_length,
        indices=self.lags_seq,
        subsequences_length=subsequences_length,
    )

    # scale is computed on the context length last units of the past target
    # scale shape is (batch_size, 1, target_dim)
    _, scale = self.scaler(
        past_target_cdf.slice_axis(
            axis=1, begin=-self.context_length, end=None
        ),
        past_observed_values.slice_axis(
            axis=1, begin=-self.context_length, end=None
        ),
    )

    outputs, states, lags_scaled, inputs = self.unroll(
        F=F,
        lags=lags,
        scale=scale,
        time_feat=time_feat,
        target_dimension_indicator=target_dimension_indicator,
        unroll_length=subsequences_length,
        begin_state=None,
    )

    return outputs, states, scale, lags_scaled, inputs
def _tensor_cdf_bisection(
    self, level: Tensor, tol=1e-6, max_iter=120
) -> Tensor:
    r"""
    Returns a Tensor of shape (len(level), *batch_size) with the
    corresponding quantiles.
    """
    F = self.F
    local_max_support_val = min(1e16, MAX_SUPPORT_VAL)

    try:
        support_lb, support_ub = self.support_min_max
        support_lb = F.broadcast_maximum(
            F.broadcast_minimum(
                support_lb,
                F.ones(self.batch_shape) * local_max_support_val,
            ),
            F.ones(self.batch_shape) * -local_max_support_val,
        )
        support_ub = F.broadcast_maximum(
            F.broadcast_minimum(
                support_ub,
                F.ones(self.batch_shape) * local_max_support_val,
            ),
            F.ones(self.batch_shape) * -local_max_support_val,
        )

        upper_bound = F.broadcast_like(
            support_ub.expand_dims(axis=0), level, lhs_axes=0, rhs_axes=0
        )
        lower_bound = F.broadcast_like(
            support_lb.expand_dims(axis=0), level, lhs_axes=0, rhs_axes=0
        )
    except NotImplementedError:
        # default to the real line if the support is not defined
        upper_bound = (
            F.ones((len(level), *self.batch_shape)) * local_max_support_val
        )
        lower_bound = (
            F.ones((len(level), *self.batch_shape)) * -local_max_support_val
        )

    for _ in range(self.all_dim):
        level = level.expand_dims(axis=-1)

    q = 0.5 * F.broadcast_add(upper_bound, lower_bound)
    val = self.cdf(q) - level

    cnt = 0
    while F.sum(F.abs(val) > tol) > 0 and cnt < max_iter:
        mask_g = F.greater(val, tol)
        mask_l = F.lesser(val, -tol)
        mask_done = F.lesser_equal(F.abs(val), tol)

        # where the CDF is too high, move the upper bound down to q;
        # where it is too low, move the lower bound up to q
        upper_bound = (
            F.broadcast_mul(q, mask_g)
            + F.broadcast_mul(upper_bound, mask_l)
            + F.broadcast_mul(q, mask_done)
        )
        lower_bound = (
            F.broadcast_mul(q, mask_l)
            + F.broadcast_mul(lower_bound, mask_g)
            + F.broadcast_mul(q, mask_done)
        )
        q = 0.5 * F.broadcast_add(upper_bound, lower_bound)
        val = self.cdf(q) - level
        cnt += 1
    return q
def train_hybrid_forward(
    self,
    F,
    target_dimension_indicator: Tensor,
    past_time_feat: Tensor,
    past_target_cdf: Tensor,
    past_observed_values: Tensor,
    past_is_pad: Tensor,
    future_time_feat: Tensor,
    future_target_cdf: Tensor,
    future_observed_values: Tensor,
) -> Tuple[Tensor, ...]:
    """
    Computes the loss for training DeepVAR; all input tensors representing
    time series have NTC layout.

    Parameters
    ----------
    F
    target_dimension_indicator
        Indices of the target dimension (batch_size, target_dim)
    past_time_feat
        Dynamic features of past time series (batch_size, history_length,
        num_features)
    past_target_cdf
        Past marginal CDF transformed target values (batch_size,
        history_length, target_dim)
    past_observed_values
        Indicator whether or not the values were observed (batch_size,
        history_length, target_dim)
    past_is_pad
        Indicator whether the past target values have been padded
        (batch_size, history_length)
    future_time_feat
        Future time features (batch_size, prediction_length, num_features)
    future_target_cdf
        Future marginal CDF transformed target values (batch_size,
        prediction_length, target_dim)
    future_observed_values
        Indicator whether or not the future values were observed
        (batch_size, prediction_length, target_dim)

    Returns
    -------
    loss
        Loss with shape (batch_size, 1)
    likelihoods
        Likelihoods for each time step
        (batch_size, context + prediction_length, 1)
    distr_args
        Distribution arguments (context + prediction_length,
        number_of_arguments)
    """
    seq_len = self.context_length + self.prediction_length

    # unroll the decoder in "training mode", i.e. by providing future data
    # as well
    rnn_outputs, _, scale, lags_scaled, inputs = self.unroll_encoder(
        F=F,
        past_time_feat=past_time_feat,
        past_target_cdf=past_target_cdf,
        past_observed_values=past_observed_values,
        past_is_pad=past_is_pad,
        future_time_feat=future_time_feat,
        future_target_cdf=future_target_cdf,
        target_dimension_indicator=target_dimension_indicator,
    )

    # put together target sequence
    # (batch_size, seq_len, target_dim)
    target = F.concat(
        past_target_cdf.slice_axis(
            axis=1, begin=-self.context_length, end=None
        ),
        future_target_cdf,
        dim=1,
    )

    # assert_shape(target, (-1, seq_len, self.target_dim))

    distr, distr_args = self.distr(
        time_features=inputs,
        rnn_outputs=rnn_outputs,
        scale=scale,
        lags_scaled=lags_scaled,
        target_dimension_indicator=target_dimension_indicator,
        seq_len=self.context_length + self.prediction_length,
    )

    # we sum the last axis to have the same shape for all likelihoods
    # (batch_size, subseq_length, 1)
    likelihoods = -distr.log_prob(target).expand_dims(axis=-1)

    assert_shape(likelihoods, (-1, seq_len, 1))

    past_observed_values = F.broadcast_minimum(
        past_observed_values, 1 - past_is_pad.expand_dims(axis=-1)
    )

    # (batch_size, subseq_length, target_dim)
    observed_values = F.concat(
        past_observed_values.slice_axis(
            axis=1, begin=-self.context_length, end=None
        ),
        future_observed_values,
        dim=1,
    )

    # mask the loss at one time step if one or more observations is missing
    # in the target dimensions (batch_size, subseq_length, 1)
    loss_weights = observed_values.min(axis=-1, keepdims=True)

    assert_shape(loss_weights, (-1, seq_len, 1))

    loss = weighted_average(F=F, x=likelihoods, weights=loss_weights, axis=1)

    assert_shape(loss, (-1, -1, 1))

    self.distribution = distr

    return (loss, likelihoods) + distr_args
def process_static_real(self, F, feature: Tensor) -> Tensor:
    return F.tile(feature.expand_dims(axis=1), reps=(1, self.T, 1))
def predict_hybrid_forward(
    self,
    F,
    target_dimension_indicator: Tensor,
    past_time_feat: Tensor,
    past_target_cdf: Tensor,
    past_observed_values: Tensor,
    past_is_pad: Tensor,
    future_time_feat: Tensor,
) -> Tensor:
    """
    Predicts samples given the trained DeepVAR model. All tensors should
    have NTC layout.

    Parameters
    ----------
    F
    target_dimension_indicator
        Indices of the target dimension (batch_size, target_dim)
    past_time_feat
        Dynamic features of past time series (batch_size, history_length,
        num_features)
    past_target_cdf
        Past marginal CDF transformed target values (batch_size,
        history_length, target_dim)
    past_observed_values
        Indicator whether or not the values were observed (batch_size,
        history_length, target_dim)
    past_is_pad
        Indicator whether the past target values have been padded
        (batch_size, history_length)
    future_time_feat
        Future time features (batch_size, prediction_length, num_features)

    Returns
    -------
    sample_paths : Tensor
        A tensor containing sampled paths
        (1, num_sample_paths, prediction_length, target_dim).
    """
    # mark padded data as unobserved
    # (batch_size, target_dim, seq_len)
    past_observed_values = F.broadcast_minimum(
        past_observed_values, 1 - past_is_pad.expand_dims(axis=-1)
    )

    # unroll the decoder in "prediction mode", i.e. with past data only
    _, state, scale, _, inputs = self.unroll_encoder(
        F=F,
        past_time_feat=past_time_feat,
        past_target_cdf=past_target_cdf,
        past_observed_values=past_observed_values,
        past_is_pad=past_is_pad,
        future_time_feat=None,
        future_target_cdf=None,
        target_dimension_indicator=target_dimension_indicator,
    )

    return self.sampling_decoder(
        F=F,
        past_target_cdf=past_target_cdf,
        target_dimension_indicator=target_dimension_indicator,
        time_feat=future_time_feat,
        scale=scale,
        begin_states=state,
    )