def sampling_decoder(
    self,
    F,
    static_feat: Tensor,
    past_target: Tensor,
    time_feat: Tensor,
    scale: Tensor,
    enc_out: Tensor,
) -> Tensor:
    """
    Computes sample paths by unrolling the decoder, starting with an
    initial input and the encoder output.

    Parameters
    ----------
    static_feat : Tensor
        static features. Shape: (batch_size, num_static_features).
    past_target : Tensor
        target history. Shape: (batch_size, history_length, 1).
    time_feat : Tensor
        time features. Shape: (batch_size, prediction_length,
        num_time_features).
    scale : Tensor
        tensor containing the scale of each element in the batch.
        Shape: (batch_size, ).
    enc_out : Tensor
        output of the encoder. Shape: (batch_size, num_cells).

    Returns
    -------
    sample_paths : Tensor
        a tensor containing sampled paths.
        Shape: (batch_size, num_sample_paths, prediction_length).
    """

    # blow up the dimension of each tensor to
    # batch_size * self.num_parallel_samples for increased parallelism
    repeated_past_target = past_target.repeat(
        repeats=self.num_parallel_samples, axis=0
    )
    repeated_time_feat = time_feat.repeat(
        repeats=self.num_parallel_samples, axis=0
    )
    repeated_static_feat = static_feat.repeat(
        repeats=self.num_parallel_samples, axis=0
    ).expand_dims(axis=1)
    repeated_enc_out = enc_out.repeat(
        repeats=self.num_parallel_samples, axis=0
    ).expand_dims(axis=1)
    repeated_scale = scale.repeat(
        repeats=self.num_parallel_samples, axis=0
    )

    future_samples = []

    # for each future time unit, draw new samples for this time unit
    # and update the state
    for k in range(self.prediction_length):
        lags = self.get_lagged_subsequences(
            F=F,
            sequence=repeated_past_target,
            sequence_length=self.history_length + k,
            indices=self.shifted_lags,
            subsequences_length=1,
        )

        # (batch_size * num_samples, 1, *target_shape, num_lags)
        lags_scaled = F.broadcast_div(
            lags, repeated_scale.expand_dims(axis=-1)
        )

        # from (batch_size * num_samples, 1, *target_shape, num_lags)
        # to (batch_size * num_samples, 1, prod(target_shape) * num_lags)
        input_lags = F.reshape(
            data=lags_scaled,
            shape=(-1, 1, prod(self.target_shape) * len(self.lags_seq)),
        )

        # (batch_size * num_samples, 1, prod(target_shape) * num_lags
        #  + num_time_features + num_static_features)
        dec_input = F.concat(
            input_lags,
            repeated_time_feat.slice_axis(axis=1, begin=k, end=k + 1),
            repeated_static_feat,
            dim=-1,
        )

        dec_output = self.decoder(dec_input, repeated_enc_out, None, False)

        distr_args = self.proj_dist_args(dec_output)

        # compute likelihood of target given the predicted parameters
        distr = self.distr_output.distribution(
            distr_args, scale=repeated_scale
        )

        # (batch_size * num_samples, 1, *target_shape)
        new_samples = distr.sample()

        # (batch_size * num_samples, seq_len, *target_shape)
        repeated_past_target = F.concat(
            repeated_past_target, new_samples, dim=1
        )
        future_samples.append(new_samples)

    # reset the cache of the decoder
    self.decoder.cache_reset()

    # (batch_size * num_samples, prediction_length, *target_shape)
    samples = F.concat(*future_samples, dim=1)

    # (batch_size, num_samples, *target_shape, prediction_length)
    return samples.reshape(
        shape=(
            (-1, self.num_parallel_samples)
            + self.target_shape
            + (self.prediction_length,)
        )
    )
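
# Illustrative sketch (not part of the model): the repeat/reshape
# bookkeeping used by sampling_decoder above. Repeating along axis 0
# stores the num_parallel_samples copies of each batch element
# consecutively, so reshaping the concatenated samples with a leading
# (-1, num_parallel_samples) recovers the per-batch grouping. All names
# and values below are made up for the demonstration.
def _demo_repeat_reshape():
    import mxnet as mx

    batch_size, num_parallel_samples, prediction_length = 2, 3, 4

    # one scalar history entry per batch element: (batch_size, 1)
    past_target = mx.nd.array([[10.0], [20.0]])

    repeated = past_target.repeat(repeats=num_parallel_samples, axis=0)
    # (6, 1): rows are b0, b0, b0, b1, b1, b1
    print(repeated.asnumpy())

    # stand-in for the concatenated sample paths:
    # (batch_size * num_samples, prediction_length)
    samples = mx.nd.arange(
        batch_size * num_parallel_samples * prediction_length
    ).reshape((batch_size * num_parallel_samples, prediction_length))

    # (batch_size, num_samples, prediction_length)
    paths = samples.reshape((-1, num_parallel_samples, prediction_length))
    print(paths.shape)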
def create_network_input(
    self,
    F,
    feat_static_cat: Tensor,  # (batch_size, num_features)
    past_time_feat: Tensor,  # (batch_size, history_length, num_features)
    past_target: Tensor,  # (batch_size, history_length, 1)
    past_observed_values: Tensor,  # (batch_size, history_length)
    future_time_feat: Optional[
        Tensor
    ],  # (batch_size, prediction_length, num_features)
    future_target: Optional[Tensor],  # (batch_size, prediction_length)
) -> Tuple[Tensor, Tensor, Tensor]:
    """
    Creates inputs for the transformer network.

    All tensor arguments should have NTC layout.
    """

    if future_time_feat is None or future_target is None:
        time_feat = past_time_feat.slice_axis(
            axis=1,
            begin=self.history_length - self.context_length,
            end=None,
        )
        sequence = past_target
        sequence_length = self.history_length
        subsequences_length = self.context_length
    else:
        time_feat = F.concat(
            past_time_feat.slice_axis(
                axis=1,
                begin=self.history_length - self.context_length,
                end=None,
            ),
            future_time_feat,
            dim=1,
        )
        sequence = F.concat(past_target, future_target, dim=1)
        sequence_length = self.history_length + self.prediction_length
        subsequences_length = self.context_length + self.prediction_length

    # (batch_size, sub_seq_len, *target_shape, num_lags)
    lags = self.get_lagged_subsequences(
        F=F,
        sequence=sequence,
        sequence_length=sequence_length,
        indices=self.lags_seq,
        subsequences_length=subsequences_length,
    )

    # scale is computed on the last context_length units of the past target
    # scale shape is (batch_size, 1, *target_shape)
    _, scale = self.scaler(
        past_target.slice_axis(
            axis=1, begin=-self.context_length, end=None
        ),
        past_observed_values.slice_axis(
            axis=1, begin=-self.context_length, end=None
        ),
    )

    embedded_cat = self.embedder(feat_static_cat)

    # in addition to the embedded features, use the log scale as it can
    # help prediction too
    # (batch_size, num_features + prod(target_shape))
    static_feat = F.concat(
        embedded_cat,
        F.log(scale)
        if len(self.target_shape) == 0
        else F.log(scale.squeeze(axis=1)),
        dim=1,
    )

    repeated_static_feat = static_feat.expand_dims(axis=1).repeat(
        axis=1, repeats=subsequences_length
    )

    # (batch_size, sub_seq_len, *target_shape, num_lags)
    lags_scaled = F.broadcast_div(lags, scale.expand_dims(axis=-1))

    # from (batch_size, sub_seq_len, *target_shape, num_lags)
    # to (batch_size, sub_seq_len, prod(target_shape) * num_lags)
    input_lags = F.reshape(
        data=lags_scaled,
        shape=(
            -1,
            subsequences_length,
            len(self.lags_seq) * prod(self.target_shape),
        ),
    )

    # (batch_size, sub_seq_len, input_dim)
    inputs = F.concat(input_lags, time_feat, repeated_static_feat, dim=-1)

    return inputs, scale, static_feat
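
# Illustrative sketch of what get_lagged_subsequences computes, as used
# above. This is a hypothetical simplified version for a univariate
# target; the real method also validates lengths and handles arbitrary
# target shapes. For each lag l, it slices the subsequence of length
# subsequences_length that ends l steps before the end of the sequence,
# then stacks the lags in a trailing dimension.
def _demo_lagged_subsequences():
    import mxnet as mx

    def lagged_subsequences(sequence, indices, subsequences_length):
        # sequence: (batch_size, sequence_length)
        lagged = []
        for lag in indices:
            begin = -lag - subsequences_length
            end = -lag if lag > 0 else None
            lagged.append(sequence.slice_axis(axis=1, begin=begin, end=end))
        # (batch_size, subsequences_length, num_lags)
        return mx.nd.stack(*lagged, axis=2)

    seq = mx.nd.arange(10).reshape((1, 10))  # one series: 0..9
    out = lagged_subsequences(seq, indices=[1, 2], subsequences_length=3)
    print(out.shape)  # (1, 3, 2)
    # lag-1 column: [6, 7, 8]; lag-2 column: [5, 6, 7]
    print(out.asnumpy())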
def sampling_decoder(
    self,
    F,
    static_feat: Tensor,
    past_target: Tensor,
    time_feat: Tensor,
    scale: Tensor,
    begin_states: List,
) -> Tensor:
    """
    Computes sample paths by unrolling the LSTM starting with an initial
    input and state.

    Parameters
    ----------
    static_feat : Tensor
        static features. Shape: (batch_size, num_static_features).
    past_target : Tensor
        target history. Shape: (batch_size, history_length).
    time_feat : Tensor
        time features. Shape: (batch_size, prediction_length,
        num_time_features).
    scale : Tensor
        tensor containing the scale of each element in the batch.
        Shape: (batch_size, 1, 1).
    begin_states : List
        list of initial states for the LSTM layers. The shape of each
        tensor of the list should be (batch_size, num_cells).

    Returns
    -------
    Tensor
        A tensor containing sampled paths.
        Shape: (batch_size, num_sample_paths, prediction_length).
    """

    # blow up the dimension of each tensor to
    # batch_size * self.num_parallel_samples for increased parallelism
    repeated_past_target = past_target.repeat(
        repeats=self.num_parallel_samples, axis=0
    )
    repeated_time_feat = time_feat.repeat(
        repeats=self.num_parallel_samples, axis=0
    )
    repeated_static_feat = static_feat.repeat(
        repeats=self.num_parallel_samples, axis=0
    ).expand_dims(axis=1)
    repeated_scale = scale.repeat(
        repeats=self.num_parallel_samples, axis=0
    )
    repeated_states = [
        s.repeat(repeats=self.num_parallel_samples, axis=0)
        for s in begin_states
    ]

    future_samples = []

    # for each future time unit, draw new samples for this time unit
    # and update the state
    for k in range(self.prediction_length):
        # (batch_size * num_samples, 1, *target_shape, num_lags)
        lags = self.get_lagged_subsequences(
            F=F,
            sequence=repeated_past_target,
            sequence_length=self.history_length + k,
            indices=self.shifted_lags,
            subsequences_length=1,
        )

        # (batch_size * num_samples, 1, *target_shape, num_lags)
        lags_scaled = F.broadcast_div(
            lags, repeated_scale.expand_dims(axis=-1)
        )

        # from (batch_size * num_samples, 1, *target_shape, num_lags)
        # to (batch_size * num_samples, 1, prod(target_shape) * num_lags)
        input_lags = F.reshape(
            data=lags_scaled,
            shape=(-1, 1, prod(self.target_shape) * len(self.lags_seq)),
        )

        # (batch_size * num_samples, 1, prod(target_shape) * num_lags
        #  + num_time_features + num_static_features)
        decoder_input = F.concat(
            input_lags,
            repeated_time_feat.slice_axis(axis=1, begin=k, end=k + 1),
            # observed_values.expand_dims(axis=1),
            repeated_static_feat,
            dim=-1,
        )

        # output shape: (batch_size * num_samples, 1, num_cells)
        # state shape: (batch_size * num_samples, num_cells)
        rnn_outputs, repeated_states = self.rnn.unroll(
            inputs=decoder_input,
            length=1,
            begin_state=repeated_states,
            layout="NTC",
            merge_outputs=True,
        )

        distr_args = self.proj_distr_args(rnn_outputs)

        # compute likelihood of target given the predicted parameters
        distr = self.distr_output.distribution(
            distr_args, scale=repeated_scale
        )

        # (batch_size * num_samples, 1, *target_shape)
        new_samples = distr.sample(dtype=self.dtype)

        # (batch_size * num_samples, seq_len, *target_shape)
        repeated_past_target = F.concat(
            repeated_past_target, new_samples, dim=1
        )
        future_samples.append(new_samples)

    # (batch_size * num_samples, prediction_length, *target_shape)
    samples = F.concat(*future_samples, dim=1)

    # (batch_size, num_samples, prediction_length, *target_shape)
    return samples.reshape(
        shape=(
            (-1, self.num_parallel_samples)
            + (self.prediction_length,)
            + self.target_shape
        )
    )
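
# Toy sketch of the autoregressive sampling loop above: each drawn
# sample is appended to the history so the next step's lags can see it.
# A dummy "sampler" (last value plus Gaussian noise) stands in for the
# RNN and the predicted distribution; all names and values here are
# made up for the demonstration.
def _demo_autoregressive_loop():
    import mxnet as mx

    batch, num_samples, pred_len = 2, 3, 4
    past = mx.nd.ones((batch, 5, 1))  # (batch, history_length, 1)

    repeated_past = past.repeat(repeats=num_samples, axis=0)
    future_samples = []
    for k in range(pred_len):
        # last observed/sampled value: (batch * num_samples, 1, 1)
        last = repeated_past.slice_axis(axis=1, begin=-1, end=None)
        # stand-in for distr.sample()
        new = last + mx.nd.random.normal(scale=0.1, shape=last.shape)
        # grow the history so step k + 1 conditions on this sample
        repeated_past = mx.nd.concat(repeated_past, new, dim=1)
        future_samples.append(new)

    samples = mx.nd.concat(*future_samples, dim=1)
    print(samples.shape)  # (batch * num_samples, pred_len, 1)
    print(samples.reshape((-1, num_samples, pred_len)).shape)  # (2, 3, 4)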
def unroll_encoder_default(
    self,
    F,
    feat_static_cat: Tensor,  # (batch_size, num_features)
    feat_static_real: Tensor,  # (batch_size, num_features)
    past_time_feat: Tensor,  # (batch_size, history_length, num_features)
    past_target: Tensor,  # (batch_size, history_length, *target_shape)
    past_observed_values: Tensor,  # (batch_size, history_length, *target_shape)
    past_is_pad: Tensor,
    future_observed_values: Optional[Tensor],
    future_time_feat: Optional[
        Tensor
    ],  # (batch_size, prediction_length, num_features)
    future_target: Optional[
        Tensor
    ],  # (batch_size, prediction_length, *target_shape)
) -> Tuple[Tensor, List, Tensor, Tensor, Tensor]:
    """
    Unrolls the LSTM encoder over past and, if present, future data.
    Returns the encoder outputs and state, the scale of past_target, the
    vector of static features that was constructed and fed as input to
    the encoder, and the target sequence. All tensor arguments should
    have NTC layout.
    """

    if future_time_feat is None or future_target is None:
        time_feat = past_time_feat.slice_axis(
            axis=1,
            begin=self.history_length - self.context_length,
            end=None,
        )
        is_padded_indicator = past_is_pad.slice_axis(
            axis=1,
            begin=self.history_length - self.context_length,
            end=None,
        )
        sequence = past_target
        sequence_length = self.history_length
        subsequences_length = self.context_length
    else:
        time_feat = F.concat(
            past_time_feat.slice_axis(
                axis=1,
                begin=self.history_length - self.context_length,
                end=None,
            ),
            future_time_feat,
            dim=1,
        )
        is_padded_indicator = F.concat(
            past_is_pad.slice_axis(
                axis=1,
                begin=self.history_length - self.context_length,
                end=None,
            ),
            F.zeros_like(future_observed_values),
            dim=1,
        )
        sequence = F.concat(past_target, future_target, dim=1)
        sequence_length = self.history_length + self.prediction_length
        subsequences_length = self.context_length + self.prediction_length

    # (batch_size, sub_seq_len, *target_shape, num_lags)
    lags = self.get_lagged_subsequences(
        F=F,
        sequence=sequence,
        sequence_length=sequence_length,
        indices=self.lags_seq,
        subsequences_length=subsequences_length,
    )

    # scale is computed on the last context_length units of the past target
    # scale shape is (batch_size, 1, *target_shape)
    _, scale = self.scaler(
        past_target.slice_axis(
            axis=1, begin=-self.context_length, end=None
        ),
        past_observed_values.slice_axis(
            axis=1, begin=-self.context_length, end=None
        ),
    )

    # (batch_size, num_features)
    embedded_cat = self.embedder(feat_static_cat)

    # in addition to the embedded features, use the log scale as it can
    # help prediction too
    # (batch_size, num_features + prod(target_shape))
    static_feat = F.concat(
        embedded_cat,
        feat_static_real,
        F.log(scale)
        if len(self.target_shape) == 0
        else F.log(scale.squeeze(axis=1)),
        dim=1,
    )

    # (batch_size, subsequences_length, num_features + 1)
    repeated_static_feat = static_feat.expand_dims(axis=1).repeat(
        axis=1, repeats=subsequences_length
    )

    # (batch_size, sub_seq_len, *target_shape, num_lags)
    lags_scaled = F.broadcast_div(lags, scale.expand_dims(axis=-1))

    # from (batch_size, sub_seq_len, *target_shape, num_lags)
    # to (batch_size, sub_seq_len, prod(target_shape) * num_lags)
    input_lags = F.reshape(
        data=lags_scaled,
        shape=(
            -1,
            subsequences_length,
            len(self.lags_seq) * prod(self.target_shape),
        ),
    )

    # (batch_size, sub_seq_len, input_dim)
    inputs = F.concat(input_lags, time_feat, repeated_static_feat, dim=-1)

    begin_state = self.rnn.begin_state(
        func=F.zeros,
        dtype=self.dtype,
        batch_size=inputs.shape[0]
        if isinstance(inputs, mx.nd.NDArray)
        else 0,
    )
    state = begin_state

    # This is a dummy computation to avoid the deferred initialization
    # error when past_is_pad is not used in the computation graph in
    # default unrolling mode.
    state = [
        F.where(
            is_padded_indicator.slice_axis(axis=1, begin=0, end=1).repeat(
                repeats=self.num_cells, axis=1
            ),
            bs,
            s,
        )
        for bs, s in zip(begin_state, state)
    ]

    # unroll encoder
    outputs, state = self.rnn.unroll(
        inputs=inputs,
        length=subsequences_length,
        layout="NTC",
        merge_outputs=True,
        begin_state=state,
    )

    # outputs: (batch_size, seq_len, num_cells)
    # state: list of (batch_size, num_cells) tensors
    # scale: (batch_size, 1, *target_shape)
    # static_feat: (batch_size, num_features + prod(target_shape))
    return outputs, state, scale, static_feat, sequence
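
# Minimal sketch of the F.where state-reset trick used above: the
# padding indicator (batch, 1) is repeated across the cell dimension so
# it matches the state shape, and wherever it is 1 the running state is
# replaced by the (zero) begin state. Shapes and values here are made up
# for the demonstration.
def _demo_state_reset():
    import mxnet as mx

    num_cells = 4
    is_pad = mx.nd.array([[1.0], [0.0]])       # (batch, 1)
    begin_state = mx.nd.zeros((2, num_cells))  # fresh state
    state = mx.nd.ones((2, num_cells))         # running state

    reset = mx.nd.where(
        is_pad.repeat(repeats=num_cells, axis=1),  # (batch, num_cells)
        begin_state,
        state,
    )
    # row 0 is reset to zeros, row 1 keeps its ones
    print(reset.asnumpy())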
def unroll_encoder_imputation(
    self,
    F,
    feat_static_cat: Tensor,  # (batch_size, num_features)
    feat_static_real: Tensor,  # (batch_size, num_features)
    past_time_feat: Tensor,  # (batch_size, history_length, num_features)
    past_target: Tensor,  # (batch_size, history_length, *target_shape)
    past_observed_values: Tensor,  # (batch_size, history_length, *target_shape)
    past_is_pad: Tensor,  # (batch_size, history_length, *target_shape)
    future_observed_values: Optional[
        Tensor
    ],  # (batch_size, prediction_length, *target_shape)
    future_time_feat: Optional[
        Tensor
    ],  # (batch_size, prediction_length, num_features)
    future_target: Optional[
        Tensor
    ],  # (batch_size, prediction_length, *target_shape)
) -> Tuple[Tensor, List, Tensor, Tensor, Tensor]:
    """
    Unrolls the RNN encoder in "imputation mode", which fills missing
    target values with samples from the DeepAR model.
    """

    if future_time_feat is None or future_target is None:
        time_feat = past_time_feat.slice_axis(
            axis=1,
            begin=self.history_length - self.context_length,
            end=None,
        )
        is_padded_indicator = past_is_pad.slice_axis(
            axis=1,
            begin=self.history_length - self.context_length,
            end=None,
        )
        target = past_target.slice_axis(
            axis=1,
            begin=self.history_length - self.context_length,
            end=None,
        )
        target_observed_values = past_observed_values.slice_axis(
            axis=1,
            begin=self.history_length - self.context_length,
            end=None,
        )
        sequence = past_target
        sequence_length = self.history_length
        subsequences_length = self.context_length
    else:
        time_feat = F.concat(
            past_time_feat.slice_axis(
                axis=1,
                begin=self.history_length - self.context_length,
                end=None,
            ),
            future_time_feat,
            dim=1,
        )
        is_padded_indicator = F.concat(
            past_is_pad.slice_axis(
                axis=1,
                begin=self.history_length - self.context_length,
                end=None,
            ),
            F.zeros_like(future_observed_values),
            dim=1,
        )
        target = F.concat(
            past_target.slice_axis(
                axis=1,
                begin=self.history_length - self.context_length,
                end=None,
            ),
            future_target,
            dim=1,
        )
        target_observed_values = F.concat(
            past_observed_values.slice_axis(
                axis=1,
                begin=self.history_length - self.context_length,
                end=None,
            ),
            future_observed_values,
            dim=1,
        )
        sequence = F.concat(past_target, future_target, dim=1)
        sequence_length = self.history_length + self.prediction_length
        subsequences_length = self.context_length + self.prediction_length

    # (batch_size, sub_seq_len, *target_shape, num_lags)
    lags = self.get_lagged_subsequences(
        F=F,
        sequence=sequence,
        sequence_length=sequence_length,
        indices=self.lags_seq,
        subsequences_length=subsequences_length,
    )

    # scale is computed on the last context_length units of the past target
    # scale shape is (batch_size, 1, *target_shape)
    _, scale = self.scaler(
        past_target.slice_axis(
            axis=1, begin=-self.context_length, end=None
        ),
        past_observed_values.slice_axis(
            axis=1, begin=-self.context_length, end=None
        ),
    )

    # (batch_size, num_features)
    embedded_cat = self.embedder(feat_static_cat)

    # in addition to the embedded features, use the log scale as it can
    # help prediction too
    # (batch_size, num_features + prod(target_shape))
    static_feat = F.concat(
        embedded_cat,
        feat_static_real,
        F.log(scale)
        if len(self.target_shape) == 0
        else F.log(scale.squeeze(axis=1)),
        dim=1,
    )

    # (batch_size, subsequences_length, num_features + 1)
    repeated_static_feat = static_feat.expand_dims(axis=1).repeat(
        axis=1, repeats=subsequences_length
    )

    # (batch_size, sub_seq_len, *target_shape, num_lags)
    lags_scaled = F.broadcast_div(lags, scale.expand_dims(axis=-1))

    # from (batch_size, sub_seq_len, *target_shape, num_lags)
    # to (batch_size, sub_seq_len, prod(target_shape) * num_lags)
    input_lags = F.reshape(
        data=lags_scaled,
        shape=(
            -1,
            subsequences_length,
            len(self.lags_seq) * prod(self.target_shape),
        ),
    )

    # (batch_size, sub_seq_len, input_dim)
    inputs = F.concat(input_lags, time_feat, repeated_static_feat, dim=-1)

    # set initial state
    begin_state = self.rnn.begin_state(
        func=F.zeros,
        dtype=self.dtype,
        batch_size=inputs.shape[0]
        if isinstance(inputs, mx.nd.NDArray)
        else 0,
    )

    unroll_results = self.imputation_rnn_unroll(
        F,
        begin_state=begin_state,
        sequence=sequence,
        sequence_length=sequence_length,
        subsequences_length=subsequences_length,
        scale=scale,
        target=target,
        target_observed_values=target_observed_values,
        time_feat=time_feat,
        repeated_static_feat=repeated_static_feat,
        is_padded_indicator=is_padded_indicator,
    )
    outputs, state, imputed_sequence = unroll_results

    # outputs: (batch_size, seq_len, num_cells)
    # state: list of (batch_size, num_cells) tensors
    # scale: (batch_size, 1, *target_shape)
    # static_feat: (batch_size, num_features + prod(target_shape))
    out = F.concat(*outputs, dim=1)

    return out, state, scale, static_feat, imputed_sequence
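
# Sketch of the value-imputation idea behind "imputation mode" (the
# actual replacement happens inside imputation_rnn_unroll, which is not
# shown here): wherever the observed indicator is 0, the target value is
# replaced by a model sample. The values below are made up; -999 just
# marks a missing entry.
def _demo_imputation_where():
    import mxnet as mx

    observed = mx.nd.array([[1.0, 0.0, 1.0]])
    target = mx.nd.array([[2.0, -999.0, 4.0]])  # -999 marks a missing value
    sample = mx.nd.array([[2.1, 3.0, 3.9]])     # stand-in model draw

    imputed = mx.nd.where(observed, target, sample)
    print(imputed.asnumpy())  # [[2.0, 3.0, 4.0]]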
def prepare_inputs_imputation_step(
    self,
    F,
    begin_state: List[Tensor],
    imputed_sequence: Tensor,
    sequence_length: int,
    subsequences_length: int,
    scale: Tensor,
    target: Tensor,
    target_observed_values: Tensor,
    time_feat: Tensor,
    repeated_static_feat: Tensor,
    is_padded_indicator: Tensor,
    state,
    i: int,
) -> Tuple[Tensor, Tensor, Tensor, Tensor, Tensor, Tensor, Tensor]:
    """
    Prepares the inputs for the next LSTM unrolling step at step i.
    """

    lags = self.get_lagged_subsequences(
        F=F,
        sequence=imputed_sequence,
        sequence_length=sequence_length,
        indices=self.lags_seq,
        subsequences_length=subsequences_length,
    )

    # (batch_size, sub_seq_len, *target_shape, num_lags)
    lags_scaled = F.broadcast_div(lags, scale.expand_dims(axis=-1))

    # from (batch_size, sub_seq_len, *target_shape, num_lags)
    # to (batch_size, sub_seq_len, prod(target_shape) * num_lags)
    input_lags = F.reshape(
        data=lags_scaled,
        shape=(
            -1,
            subsequences_length,
            len(self.lags_seq) * prod(self.target_shape),
        ),
    )

    # (batch_size, sub_seq_len, input_dim)
    inputs = F.concat(input_lags, time_feat, repeated_static_feat, dim=-1)

    is_pad = is_padded_indicator.slice_axis(axis=1, begin=i, end=i + 1)

    current_observed_indicator = target_observed_values.slice_axis(
        axis=1, begin=i, end=i + 1
    )

    current_target = target.slice_axis(axis=1, begin=i, end=i + 1)

    pre_sequence = imputed_sequence.slice_axis(
        axis=1, begin=0, end=-subsequences_length + i
    )

    post_sequence = imputed_sequence.slice_axis(
        axis=1, begin=-subsequences_length + i + 1, end=None
    )

    # reset the state to the begin state if the current target is padded
    state = [
        F.where(is_pad.repeat(repeats=self.num_cells, axis=1), bs, s)
        for bs, s in zip(begin_state, state)
    ]

    return (
        inputs,
        is_pad,
        current_observed_indicator,
        current_target,
        pre_sequence,
        post_sequence,
        state,
    )
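
# Sketch of the pre/post splice prepared above: pre_sequence ends just
# before the step at position -subsequences_length + i, post_sequence
# starts just after it, so the caller can presumably rebuild the
# sequence as concat(pre_sequence, value, post_sequence) with the
# original or imputed value in the middle. Values below are made up.
def _demo_pre_post_splice():
    import mxnet as mx

    imputed_sequence = mx.nd.arange(6).reshape((1, 6))  # [[0 1 2 3 4 5]]
    subsequences_length, i = 4, 1  # replaces the step at index -3, i.e. 3

    pre = imputed_sequence.slice_axis(
        axis=1, begin=0, end=-subsequences_length + i
    )
    post = imputed_sequence.slice_axis(
        axis=1, begin=-subsequences_length + i + 1, end=None
    )
    value = mx.nd.array([[99.0]])

    # [[0 1 2 99 4 5]]
    print(mx.nd.concat(pre, value, post, dim=1).asnumpy())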