def lstm_unit(hidden_t_prev, cell_t_prev, gates, seq_lengths, timestep,
              forget_bias=0.0, drop_states=False):
    '''
    Computes one LSTM step with NumPy.

    Shapes (as read or enforced by the code):
    cell_t_prev: 3-D, last axis D, reshaped to (N, D)
    gates: 3-D with gates.shape[1] == N and gates.shape[2] == 4 * D
    seq_lengths: array that can be reshaped to (N, 1)
    hidden_t_prev: presumably (1, N, D) like the returned hidden state;
        it is only combined by broadcasting -- TODO confirm

    The four D-wide slices of the gate axis are used, in order, as the
    input, forget, output and candidate pre-activations. forget_bias is
    added to the forget slice before the sigmoid.

    Batch rows where timestep is not below the row's sequence length keep
    their previous hidden/cell values, or become zero when drop_states is
    truthy.

    Returns (hidden_t, cell_t), each reshaped to (1, N, D).
    '''
    hidden_dim = cell_t_prev.shape[2]
    gate_width = gates.shape[2]
    batch = gates.shape[1]
    grid = (batch, hidden_dim)

    # Expand the step index and the per-row lengths to full (N, D) integer
    # grids so the validity mask is an element-wise comparison.
    step_grid = (np.ones(shape=grid) * timestep).astype(np.int32)
    assert step_grid.shape == grid
    length_grid = (
        seq_lengths.reshape(batch, 1) * np.ones(shape=grid)
    ).astype(np.int32)
    assert length_grid.shape == grid
    assert gate_width == 4 * hidden_dim

    # Use explicit (N, 4, D) / (N, D) layouts instead of relying on
    # broadcasting against the leading axis.
    split = gates.reshape(batch, 4, hidden_dim)
    prev_cell = cell_t_prev.reshape(grid)
    in_pre, forget_pre, out_pre, cand_pre = (
        split[:, k, :].reshape(grid) for k in range(4)
    )

    in_gate = sigmoid(in_pre)
    forget_gate = sigmoid(forget_pre + forget_bias)
    out_gate = sigmoid(out_pre)
    candidate = tanh(cand_pre)

    # live == 1 where the row is still inside its sequence, else 0.
    live = (step_grid < length_grid).astype(np.int32)
    assert live.shape == grid
    frozen = 1 - live
    # 1 keeps the previous state for finished rows, 0 zeroes it.
    keep_scale = 1 - drop_states

    cell_t = (forget_gate * prev_cell + in_gate * candidate) * live \
        + frozen * prev_cell * keep_scale
    assert cell_t.shape == grid

    hidden_t = (out_gate * tanh(cell_t)) * live \
        + hidden_t_prev * frozen * keep_scale

    out_shape = (1, batch, hidden_dim)
    return hidden_t.reshape(out_shape), cell_t.reshape(out_shape)
def basic_rnn_reference(input, hidden_initial, i2h_w, i2h_b, gate_w, gate_b,
                        seq_lengths, drop_states, use_sequence_lengths):
    '''
    Runs a plain tanh RNN over every time step of input with NumPy.

    For each step t the new hidden state is
        tanh(input[t] . i2h_w^T + i2h_b + hidden_prev . gate_w^T + gate_b)

    input is indexed as (T, N, ...): axis 0 is iterated as time and
    input.shape[1] is taken as the batch size N. D is the last axis of
    hidden_initial.

    When seq_lengths is not None it is reshaped to (N, 1); rows whose
    length is not above the current step keep their previous hidden state,
    or become zero when drop_states is truthy. use_sequence_lengths is
    accepted but not read by this function.

    Returns a list with one hidden-state array per time step.
    '''
    hidden_dim = hidden_initial.shape[-1]
    batch = input.shape[1]
    grid = (batch, hidden_dim)

    masked = seq_lengths is not None
    if masked:
        # Per-row lengths repeated across the hidden axis as int32.
        seq_lengths = (
            seq_lengths.reshape(batch, 1) * np.ones(shape=grid)
        ).astype(np.int32)

    outputs = []
    state = hidden_initial
    for step, frame in enumerate(input):
        from_input = np.dot(frame, i2h_w.T) + i2h_b
        from_state = np.dot(state, gate_w.T) + gate_b
        new_state = tanh(from_input + from_state)

        if masked:
            live = (step < seq_lengths).astype(np.int32)
            assert live.shape == grid, (live.shape, grid)
            # Finished rows carry the previous state (or zero it out).
            new_state = new_state * live \
                + state * (1 - live) * (1 - drop_states)

        outputs.append(new_state)
        state = new_state
    return outputs
def basic_rnn_reference(input, hidden_initial, i2h_w, i2h_b, gate_w, gate_b,
                        seq_lengths, drop_states, use_sequence_lengths):
    '''
    Runs a plain tanh RNN over every time step of input with NumPy.

    For each step t the new hidden state is
        tanh(input[t] . i2h_w^T + i2h_b + hidden_prev . gate_w^T + gate_b)

    input is indexed as (T, N, ...): axis 0 is iterated as time and
    input.shape[1] is taken as the batch size N. D is the last axis of
    hidden_initial.

    When seq_lengths is not None it is reshaped to (N, 1); rows whose
    length is not above the current step keep their previous hidden state,
    or become zero when drop_states is truthy. use_sequence_lengths is
    accepted but not read by this function.

    Returns a list with one hidden-state array per time step.
    '''
    hidden_dim = hidden_initial.shape[-1]
    batch = input.shape[1]
    grid = (batch, hidden_dim)

    masked = seq_lengths is not None
    if masked:
        # Per-row lengths repeated across the hidden axis as int32.
        seq_lengths = (
            seq_lengths.reshape(batch, 1) * np.ones(shape=grid)
        ).astype(np.int32)

    outputs = []
    state = hidden_initial
    for step, frame in enumerate(input):
        from_input = np.dot(frame, i2h_w.T) + i2h_b
        from_state = np.dot(state, gate_w.T) + gate_b
        new_state = tanh(from_input + from_state)

        if masked:
            live = (step < seq_lengths).astype(np.int32)
            assert live.shape == grid, (live.shape, grid)
            # Finished rows carry the previous state (or zero it out).
            new_state = new_state * live \
                + state * (1 - live) * (1 - drop_states)

        outputs.append(new_state)
        state = new_state
    return outputs
def gru_unit(*args, **kwargs):
    '''
    Implements one GRU unit, for one time step

    Positional arguments:
    (hidden_t_prev, gates_out_t, seq_lengths, timestep) when the
    sequence_lengths keyword is truthy (the default), otherwise
    (hidden_t_prev, gates_out_t, timestep).

    Keyword arguments:
    drop_states (default False): when truthy, rows that are past their
        sequence length get zeros instead of the previous hidden state.
    sequence_lengths (default True): selects the positional layout above
        and whether rows are masked by seq_lengths.

    Shapes:
    hidden_t_prev.shape = (1, N, D)
    gates_out_t.shape = (1, N, G), with G == 3 * D
    seq_lengths.shape = (N,)

    Returns a one-element tuple holding hidden_t with shape (1, N, D).
    '''
    drop_states = kwargs.get('drop_states', False)
    sequence_lengths = kwargs.get('sequence_lengths', True)

    if sequence_lengths:
        hidden_t_prev, gates_out_t, seq_lengths, timestep = args
    else:
        hidden_t_prev, gates_out_t, timestep = args

    N = hidden_t_prev.shape[1]
    D = hidden_t_prev.shape[2]
    G = gates_out_t.shape[2]
    assert G == 3 * D

    # Work on explicit (N, D) arrays so no step relies on broadcasting
    # against the leading axis of length 1.
    hidden_prev = hidden_t_prev.reshape(N, D)
    gates_out_t = gates_out_t.reshape(N, 3, D)

    # Only the update (index 1) and output (index 2) slices are consumed
    # here; the reset slice (index 0) does not influence the result.
    # NOTE(review): presumably the caller has already applied the reset
    # gate when forming the output pre-activation -- confirm.
    update_gate_t = sigmoid(gates_out_t[:, 1, :].reshape(N, D))
    output_gate_t = tanh(gates_out_t[:, 2, :].reshape(N, D))

    if sequence_lengths:
        # The timestep grid is only needed for the length mask.
        t = (timestep * np.ones(shape=(N, D))).astype(np.int32)
        assert t.shape == (N, D)
        seq_lengths = (np.ones(shape=(N, D)) *
                       seq_lengths.reshape(N, 1)).astype(np.int32)
        assert seq_lengths.shape == (N, D)
        valid = (t < seq_lengths).astype(np.int32)
    else:
        valid = np.ones(shape=(N, D))
    assert valid.shape == (N, D)

    hidden_t = update_gate_t * hidden_prev + \
        (1 - update_gate_t) * output_gate_t
    # Rows past their sequence length carry the previous state (or zeros
    # when drop_states is set) instead of the freshly computed one.
    hidden_t = hidden_t * valid + \
        hidden_prev * (1 - valid) * (1 - drop_states)
    hidden_t = hidden_t.reshape(1, N, D)

    return (hidden_t, )
def gru_unit(*args, **kwargs):
    '''
    Implements one GRU unit, for one time step

    Positional arguments:
    (hidden_t_prev, gates_out_t, seq_lengths, timestep) when the
    sequence_lengths keyword is truthy (the default), otherwise
    (hidden_t_prev, gates_out_t, timestep).

    Keyword arguments:
    drop_states (default False): when truthy, rows that are past their
        sequence length get zeros instead of the previous hidden state.
    sequence_lengths (default True): selects the positional layout above
        and whether rows are masked by seq_lengths.

    Shapes:
    hidden_t_prev.shape = (1, N, D)
    gates_out_t.shape = (1, N, G), with G == 3 * D
    seq_lengths.shape = (N,)

    Returns a one-element tuple holding hidden_t with shape (1, N, D).
    '''
    drop_states = kwargs.get('drop_states', False)
    sequence_lengths = kwargs.get('sequence_lengths', True)

    if sequence_lengths:
        hidden_t_prev, gates_out_t, seq_lengths, timestep = args
    else:
        hidden_t_prev, gates_out_t, timestep = args

    N = hidden_t_prev.shape[1]
    D = hidden_t_prev.shape[2]
    G = gates_out_t.shape[2]
    assert G == 3 * D

    # Work on explicit (N, D) arrays so no step relies on broadcasting
    # against the leading axis of length 1.
    hidden_prev = hidden_t_prev.reshape(N, D)
    gates_out_t = gates_out_t.reshape(N, 3, D)

    # Only the update (index 1) and output (index 2) slices are consumed
    # here; the reset slice (index 0) does not influence the result.
    # NOTE(review): presumably the caller has already applied the reset
    # gate when forming the output pre-activation -- confirm.
    update_gate_t = sigmoid(gates_out_t[:, 1, :].reshape(N, D))
    output_gate_t = tanh(gates_out_t[:, 2, :].reshape(N, D))

    if sequence_lengths:
        # The timestep grid is only needed for the length mask.
        t = (timestep * np.ones(shape=(N, D))).astype(np.int32)
        assert t.shape == (N, D)
        seq_lengths = (np.ones(shape=(N, D)) *
                       seq_lengths.reshape(N, 1)).astype(np.int32)
        assert seq_lengths.shape == (N, D)
        valid = (t < seq_lengths).astype(np.int32)
    else:
        valid = np.ones(shape=(N, D))
    assert valid.shape == (N, D)

    hidden_t = update_gate_t * hidden_prev + \
        (1 - update_gate_t) * output_gate_t
    # Rows past their sequence length carry the previous state (or zeros
    # when drop_states is set) instead of the freshly computed one.
    hidden_t = hidden_t * valid + \
        hidden_prev * (1 - valid) * (1 - drop_states)
    hidden_t = hidden_t.reshape(1, N, D)

    return (hidden_t, )