def test_stacked_birnn_construction(recurrent_input, output_size, weight_initializer,
                                    sum_outputs, concatenate_outputs):
    """
    Check that two BiRNN layers can be chained in every output configuration.

    Building the graph raises if stacking is not possible, so the test
    contains no explicit assertions.
    """
    def build_layer():
        # Every call creates a fresh layer with its own Tanh activation.
        return BiRNN(output_size,
                     init=weight_initializer,
                     activation=Tanh(),
                     reset_cells=True,
                     return_sequence=True,
                     sum_out=sum_outputs,
                     concat_out=concatenate_outputs)

    bottom_layer = build_layer()
    top_layer = build_layer()

    # Feed the output of the first layer straight into the second one.
    top_layer(bottom_layer(recurrent_input))
def define_recurrent_layers(out_axes=None, celltype='RNN', recurrent_units=(32,),
                            init=GlorotInit(), return_sequence=True):
    """
    Build a stack of recurrent layers, optionally followed by an Affine layer.

    Arguments:
        out_axes: axes passed to the trailing Affine layer. When None, no
            Affine layer is appended.
        celltype (str): 'RNN' creates Recurrent layers, 'LSTM' creates LSTM
            layers. Any other value creates no recurrent layer.
        recurrent_units: sized iterable holding the ``nout`` of each layer,
            in stacking order. The default is an immutable tuple so that the
            default value can never be shared and mutated between calls.
        init: initializer used for every recurrent layer and for both the
            weights and the bias of the Affine layer.
        return_sequence: applied to the last recurrent layer only; all
            earlier layers always return the full sequence.

    Returns:
        list: the created layers, in stacking order.
    """
    layers = []
    last_index = len(recurrent_units) - 1
    for index, nout in enumerate(recurrent_units):
        # Intermediate layers must emit the whole sequence so that the next
        # recurrent layer has something to iterate over.
        layer_return_sequence = index < last_index or return_sequence
        if celltype == 'RNN':
            layers.append(Recurrent(nout=nout, init=init, backward=False,
                                    activation=Tanh(),
                                    return_sequence=layer_return_sequence))
        elif celltype == 'LSTM':
            layers.append(LSTM(nout=nout, init=init, backward=False,
                               activation=Tanh(), gate_activation=Logistic(),
                               return_sequence=layer_return_sequence))

    if out_axes is not None:
        layers.append(Affine(weight_init=init, bias_init=init,
                             activation=Identity(), axes=out_axes))
    return layers
def test_change_recurrent_axis_length(recurrent_layer_cls, batch_size, sequence_length,
                                      input_size, hidden_size):
    """
    A recurrent layer must accept inputs whose REC axis length changes between
    calls (needed by seq2seq inference).
    """
    # Weight values shared by all three layers so they compute the same thing.
    input_weights = np.random.normal(size=(hidden_size, input_size))
    recurrent_weights = np.random.normal(size=(hidden_size, hidden_size))

    def build_layer():
        return recurrent_layer_cls(nout=hidden_size,
                                   init=ConstantInit(input_weights),
                                   init_inner=ConstantInit(recurrent_weights),
                                   activation=Tanh())

    switching_layer = build_layer()
    long_only_layer = build_layer()
    short_only_layer = build_layer()

    # Input with the full sequence length.
    batch_axis = ng.make_axis(length=batch_size, name='N')
    long_rec_axis = ng.make_axis(length=sequence_length, name='REC')
    feature_axis = ng.make_axis(length=input_size, name='M')
    long_axes = ng.make_axes([feature_axis, long_rec_axis, batch_axis])
    long_input = ng.placeholder(axes=long_axes)
    long_value = np.random.normal(size=(input_size, sequence_length, batch_size))

    # Input with a sequence length of one.
    short_rec_axis = ng.make_axis(length=1, name='REC')
    short_axes = ng.make_axes([feature_axis, short_rec_axis, batch_axis])
    short_input = ng.placeholder(axes=short_axes)
    short_value = np.random.normal(size=(input_size, 1, batch_size))

    # One layer sees both REC lengths ...
    switched_long = switching_layer(long_input)
    switched_short = switching_layer(short_input)
    # ... and is compared against layers that only ever see a single length.
    plain_long = long_only_layer(long_input)
    plain_short = short_only_layer(short_input)

    with ExecutorFactory() as ex:
        switched_long_fun = ex.executor(switched_long, long_input)
        switched_short_fun = ex.executor(switched_short, short_input)
        plain_long_fun = ex.executor(plain_long, long_input)
        plain_short_fun = ex.executor(plain_short, short_input)

        ng.testing.assert_allclose(switched_long_fun(long_value),
                                   plain_long_fun(long_value))
        ng.testing.assert_allclose(switched_short_fun(short_value),
                                   plain_short_fun(short_value))
def test_rnn_fprop(sequence_length, input_size, hidden_size, batch_size,
                   return_sequence, weight_initializer, bias_initializer,
                   init_state, extra_axes, backward, transformer_factory):
    """
    Compare the forward pass of the ngraph Recurrent layer with the numpy
    reference RNN.
    """
    assert batch_size == 1, "the recurrent reference implementation only support batch size 1"

    # Input placeholder together with a matching numpy value
    input_placeholder, input_value = make_placeholder(input_size,
                                                      sequence_length,
                                                      batch_size,
                                                      extra_axes=extra_axes)

    # Weights and, if requested, the initial state
    W_in, W_rec, b, init_state, init_state_value = make_weights(
        input_placeholder, hidden_size, weight_initializer, bias_initializer, init_state)

    # Reference numpy RNN loaded with the same weights
    rnn_ref = RefRecurrent(input_size, hidden_size, return_sequence=return_sequence)
    rnn_ref.set_weights(W_in.reshape(rnn_ref.Wxh.shape),
                        W_rec,
                        b.reshape(rnn_ref.bh.shape))

    # Reference forward pass; the first two axes of the input are swapped first
    reference_input = input_value.reshape(
        (input_size, sequence_length, batch_size)).transpose([1, 0, 2])
    h_ref_list = rnn_ref.fprop_only(reference_input,
                                    init_states=init_state_value,
                                    backward=backward)

    # ngraph layer under test
    rnn_ng = Recurrent(hidden_size, init=W_in, init_inner=W_rec, activation=Tanh(),
                       reset_cells=True, return_sequence=return_sequence,
                       backward=backward)
    out_ng = rnn_ng(input_placeholder, init_state=init_state)

    # The initial state is an extra graph input only when one was created.
    graph_inputs = [input_placeholder]
    feed_values = [input_value]
    if init_state is not None:
        graph_inputs.append(init_state)
        feed_values.append(init_state_value)

    with ExecutorFactory() as ex:
        fprop_neon_fun = ex.executor(out_ng, *graph_inputs)
        fprop_neon = fprop_neon_fun(*feed_values)

        # Drop the trailing axis of the sequence output before comparing
        if return_sequence is True:
            fprop_neon = fprop_neon[:, :, 0]
        ng.testing.assert_allclose(fprop_neon, h_ref_list,
                                   rtol=fprop_rtol, atol=fprop_atol)
def test_birnn_fprop(sequence_length, input_size, hidden_size, batch_size,
                     return_sequence, weight_initializer, bias_initializer,
                     init_state, sum_out, concat_out, transformer_factory):
    """
    Compare the forward pass of the ngraph BiRNN layer with the numpy
    bidirectional reference implementation.
    """
    assert batch_size == 1, "the recurrent reference implementation only support batch size 1"

    # Input placeholder together with a matching numpy value
    input_placeholder, input_value = make_placeholder(input_size,
                                                      sequence_length,
                                                      batch_size)

    # Weights and, if requested, the initial state
    W_in, W_rec, b, init_state, init_state_value = make_weights(
        input_placeholder, hidden_size, weight_initializer, bias_initializer, init_state)

    # Reference numpy bidirectional RNN loaded with the same weights
    rnn_ref = RefBidirectional(input_size, hidden_size,
                               return_sequence=return_sequence,
                               sum_out=sum_out, concat_out=concat_out)
    rnn_ref.set_weights(W_in, W_rec, b.reshape(rnn_ref.fwd_rnn.bh.shape))
    h_ref_list = rnn_ref.fprop(input_value.transpose([1, 0, 2]),
                               init_states=init_state_value)

    # ngraph layer under test
    rnn_ng = BiRNN(hidden_size, init=W_in, init_inner=W_rec, activation=Tanh(),
                   reset_cells=True, return_sequence=return_sequence,
                   sum_out=sum_out, concat_out=concat_out)
    out_ng = rnn_ng(input_placeholder, init_state=init_state)

    # The initial state is an extra graph input only when one was created.
    graph_inputs = [input_placeholder]
    feed_values = [input_value]
    if init_state is not None:
        graph_inputs.append(init_state)
        feed_values.append(init_state_value)

    with ExecutorFactory() as ex:
        fprop_neon_fun = ex.executor(out_ng, *graph_inputs)
        fprop_neon = fprop_neon_fun(*feed_values)

        # A non-tuple result is wrapped, together with its reference, so that
        # both cases go through the same comparison loop.
        if not isinstance(fprop_neon, tuple):
            fprop_neon = [fprop_neon]
            h_ref_list = [h_ref_list]

        for position, output in enumerate(fprop_neon):
            if return_sequence is True:
                output = output[:, :, 0]
            ng.testing.assert_allclose(output, h_ref_list[position],
                                       rtol=fprop_rtol, atol=fprop_atol)
def test_birnn_output_types(recurrent_input, output_size, weight_initializer,
                            sum_outputs, concatenate_outputs):
    """
    Tests that a BiRNN returns an op (or tuple) of the type implied by its
    output configuration.
    """
    layer = BiRNN(output_size, init=weight_initializer, activation=Tanh(),
                  reset_cells=True, return_sequence=True,
                  sum_out=sum_outputs, concat_out=concatenate_outputs)
    out = layer(recurrent_input)

    # Concatenation takes precedence over summation; with neither option the
    # two directions come back as a plain tuple.
    if concatenate_outputs:
        expected_type = ng.ConcatOp
    elif sum_outputs:
        expected_type = ng.Add
    else:
        expected_type = tuple

    assert isinstance(out, expected_type), \
        "Output is of type {} instead of {}".format(type(out), expected_type)
def test_birnn_deriv_numerical(sequence_length, input_size, hidden_size, batch_size,
                               return_sequence, weight_initializer, bias_initializer,
                               sum_out, concat_out):
    """
    Compare symbolic and numerical derivatives of a BiRNN with respect to the
    input weights, recurrent weights and bias of both directions.
    """
    # Input placeholder together with a matching numpy value
    input_placeholder, input_value = make_placeholder(input_size,
                                                      sequence_length,
                                                      batch_size)

    # Weights only; no initial state is requested here
    W_in, W_rec, b, _, _ = make_weights(input_placeholder, hidden_size,
                                        weight_initializer, bias_initializer)

    # ngraph layer under test
    rnn_ng = BiRNN(hidden_size, init=W_in, init_inner=W_rec, activation=Tanh(),
                   reset_cells=True, return_sequence=return_sequence,
                   sum_out=sum_out, concat_out=concat_out)
    out_ng = rnn_ng.train_outputs(input_placeholder)

    def params_with_values(direction):
        # Pair each parameter op of one direction with its numpy value.
        return [(direction.W_input, W_in),
                (direction.W_recur, W_rec),
                (direction.b, b)]

    params_f = params_with_values(rnn_ng.fwd_rnn)
    params_b = params_with_values(rnn_ng.bwd_rnn)

    if sum_out or concat_out:
        # A single combined output depends on the parameters of both directions.
        outputs = [out_ng]
        params_per_output = [params_f + params_b]
    else:
        # One output per direction, each checked against its own parameters.
        outputs = out_ng
        params_per_output = [params_f, params_b]

    with ExecutorFactory() as ex:
        # Build all derivative computations first, then execute them.
        checks = []
        for output, dependents in zip(outputs, params_per_output):
            for param_op, param_value in dependents:
                symbolic = ex.derivative(output, param_op, input_placeholder)
                numeric = ex.numeric_derivative(output, param_op, delta,
                                                input_placeholder)
                checks.append((symbolic, numeric, param_value))

        for symbolic, numeric, param_value in checks:
            ng.testing.assert_allclose(symbolic(param_value, input_value),
                                       numeric(param_value, input_value),
                                       rtol=num_rtol, atol=num_atol)
def test_rnn_deriv_numerical(sequence_length, input_size, hidden_size, batch_size,
                             return_sequence, weight_initializer, bias_initializer,
                             backward, init_state, transformer_factory):
    """
    Compare symbolic and numerical derivatives of a Recurrent layer with
    respect to its input weights, recurrent weights and bias.
    """
    # Input placeholder together with a matching numpy value
    input_placeholder, input_value = make_placeholder(input_size,
                                                      sequence_length,
                                                      batch_size)

    # Weights and, if requested, the initial state
    W_in, W_rec, b, init_state, init_state_value = make_weights(
        input_placeholder, hidden_size, weight_initializer, bias_initializer, init_state)

    # ngraph layer under test
    rnn_ng = Recurrent(hidden_size, init=W_in, init_inner=W_rec, activation=Tanh(),
                       reset_cells=True, return_sequence=return_sequence,
                       backward=backward)
    out_ng = rnn_ng(input_placeholder, init_state=init_state)

    # The initial state is an extra graph input only when one was created.
    graph_inputs = [input_placeholder]
    feed_values = [input_value]
    if init_state is not None:
        graph_inputs.append(init_state)
        feed_values.append(init_state_value)

    params = [(rnn_ng.W_input, W_in),
              (rnn_ng.W_recur, W_rec),
              (rnn_ng.b, b)]

    with ExecutorFactory() as ex:
        # Build all derivative computations first, then execute them.
        checks = []
        for param_op, param_value in params:
            symbolic = ex.derivative(out_ng, param_op, *graph_inputs)
            numeric = ex.numeric_derivative(out_ng, param_op, delta, *graph_inputs)
            checks.append((symbolic, numeric, param_value))

        for symbolic, numeric, param_value in checks:
            ng.testing.assert_allclose(symbolic(param_value, *feed_values),
                                       numeric(param_value, *feed_values),
                                       rtol=num_rtol, atol=num_atol)
def make_generator(bn=True):
    """
    Build the "Generator" Sequential container from seven Deconvolution layers.

    Arguments:
        bn (bool): batch_norm setting of every layer except the last one,
            which never uses batch norm.

    Returns:
        Sequential: the generator network.
    """
    # TODO
    # add affine before conv once that is corrected
    # https://github.com/NervanaSystems/private-ngraph/issues/2054

    def relu_deconv(filter_shape, stride, **extra):
        # All layers but the last share init, padding, activation and batch norm.
        return Deconvolution(filter_shape, filter_init, strides=stride, padding=0,
                             activation=relu, batch_norm=bn, **extra)

    # (filter shape, stride, deconv_out_shape) of the middle layers
    middle_specs = (
        ((3, 3, 192), 1, (1, 5, 5)),
        ((3, 3, 192), 2, (1, 11, 11)),
        ((3, 3, 192), 1, (1, 13, 13)),
        ((3, 3, 96), 2, (1, 27, 27)),
        ((3, 3, 96), 1, (1, 28, 28)),
    )

    generator_layers = [relu_deconv((1, 1, 16), 1)]
    for filter_shape, stride, out_shape in middle_specs:
        generator_layers.append(
            relu_deconv(filter_shape, stride, deconv_out_shape=out_shape))

    # Output layer: Tanh activation, padding of 1 and no batch norm.
    generator_layers.append(
        Deconvolution((3, 3, 1), filter_init, strides=1, padding=1,
                      activation=Tanh(), batch_norm=False,
                      deconv_out_shape=(1, 28, 28)))

    return Sequential(generator_layers, name="Generator")
def test_rnn_deriv_ref(sequence_length, input_size, hidden_size, batch_size,
                       return_sequence, weight_initializer, bias_initializer,
                       transformer_factory):
    """
    Compare the parameter gradients of the ngraph Recurrent layer with those
    of the numpy reference RNN for a random error tensor.
    """
    assert batch_size == 1, "the recurrent reference implementation only support batch size 1"
    assert return_sequence is True, "the reference rnn only supports sequences for deriv"

    # Input placeholder together with a matching numpy value
    input_placeholder, input_value = make_placeholder(input_size,
                                                      sequence_length,
                                                      batch_size)

    # Weights; init_state_value is forwarded to the reference below
    W_in, W_rec, b, init_state, init_state_value = make_weights(
        input_placeholder, hidden_size, weight_initializer, bias_initializer)

    # Reference numpy RNN loaded with the same weights
    rnn_ref = RefRecurrent(input_size, hidden_size, return_sequence=return_sequence)
    rnn_ref.set_weights(W_in, W_rec, b.reshape(rnn_ref.bh.shape))

    # Random error tensor that is back-propagated through both implementations
    deltas = np.random.randn(hidden_size, sequence_length, batch_size)

    # the reference code expects these shapes:
    # input_shape: (seq_len, input_size, batch_size)
    # output_shape: (seq_len, hidden_size, batch_size)
    reference_grads = rnn_ref.lossFun(input_value.transpose([1, 0, 2]),
                                      deltas.copy().transpose([1, 0, 2]),
                                      init_states=init_state_value)[:3]
    dW_in, dW_rec, db = reference_grads

    # ngraph layer under test
    rnn_ng = Recurrent(hidden_size, init=W_in, init_inner=W_rec, activation=Tanh(),
                       reset_cells=True, return_sequence=return_sequence)
    out_ng = rnn_ng.train_outputs(input_placeholder)
    deltas_constant = ng.constant(deltas, axes=out_ng.axes)

    params = [(rnn_ng.W_input, W_in),
              (rnn_ng.W_recur, W_rec),
              (rnn_ng.b, b)]

    with ExecutorFactory() as ex:
        # One gradient computation per parameter, in the order of `params`
        grad_funs = [
            ex.executor(ng.deriv(out_ng, param_op, error=deltas_constant),
                        input_placeholder)
            for param_op, _ in params
        ]

        for grad_fun, ref_val in zip(grad_funs, [dW_in, dW_rec, db]):
            ng.testing.assert_allclose(grad_fun(input_value),
                                       ref_val.squeeze(),
                                       rtol=bprop_rtol, atol=bprop_atol)
def make_generator_gp(bn=True, n_extra_layers=0, bias_init=None):
    """
    Build the "Generator" Sequential container: four Deconvolution layers,
    an optional run of Convolution layers and a final Tanh Deconvolution.

    Arguments:
        bn (bool): batch_norm setting of every layer except the last one.
        n_extra_layers (int): number of 3x3 Convolution layers inserted
            before the output layer.
        bias_init: bias initializer handed to every layer.

    Returns:
        Sequential: the generator network.
    """
    # (number of filters, stride, padding) of the leading deconvolutions
    deconv_specs = ((512, 1, 0), (256, 2, 1), (128, 2, 1), (64, 2, 1))

    generator_layers = [
        Deconvolution((4, 4, nfilters), filter_init, strides=stride, padding=pad,
                      activation=relu, batch_norm=bn, bias_init=bias_init)
        for nfilters, stride, pad in deconv_specs
    ]

    # Optional extra convolutions, each built as a separate layer object.
    generator_layers.extend(
        Convolution((3, 3, 64), filter_init, strides=1, padding=1,
                    activation=lrelu, batch_norm=bn, bias_init=bias_init)
        for _ in range(n_extra_layers))

    # Output layer: Tanh activation and no batch norm.
    generator_layers.append(
        Deconvolution((4, 4, 3), filter_init, strides=2, padding=1,
                      activation=Tanh(), batch_norm=False, bias_init=bias_init))

    return Sequential(generator_layers, name="Generator")
def test_rnn_deriv_numerical(sequence_length, input_size, hidden_size, batch_size,
                             return_sequence, weight_initializer, bias_initializer,
                             backward, init_state):
    """
    Compare symbolic and numerical derivatives of an unrolled RNNCell with
    respect to its input-to-hidden and hidden-to-hidden weights.
    """
    # Input placeholder together with a matching numpy value
    input_placeholder, input_value = make_placeholder(input_size,
                                                      sequence_length,
                                                      batch_size)

    # Weights and, if requested, the initial state
    W_in, W_rec, b, init_state, init_state_value = make_weights(
        input_placeholder, hidden_size, weight_initializer, bias_initializer, init_state)

    # ngraph cell under test
    rnn_ng = RNNCell(hidden_size, init=W_in, init_h2h=W_rec, activation=Tanh(),
                     reset_cells=True)

    # Unroll the cell over the full recurrent axis of the input
    num_steps = input_placeholder.axes.recurrent_axis().length
    if init_state is not None:
        init_states = {'h': init_state}
    else:
        init_states = init_state
    out_ng = unroll(rnn_ng, num_steps, input_placeholder,
                    init_states=init_states, return_sequence=return_sequence)

    # The bias pair (rnn_ng.i2h.bias.W, b) is currently left out of the check.
    params = [(rnn_ng.i2h.linear.W, W_in),
              (rnn_ng.h2h.W, W_rec)]

    # The initial state is an extra graph input only when one was created.
    graph_inputs = [input_placeholder]
    feed_values = [input_value]
    if init_state is not None:
        graph_inputs.append(init_state)
        feed_values.append(init_state_value)

    with ExecutorFactory() as ex:
        # Build all derivative computations first, then execute them.
        checks = []
        for param_op, param_value in params:
            symbolic = ex.derivative(out_ng, param_op, *graph_inputs)
            numeric = ex.numeric_derivative(out_ng, param_op, delta, *graph_inputs)
            checks.append((symbolic, numeric, param_value))

        for symbolic, numeric, param_value in checks:
            ng.testing.assert_allclose(symbolic(param_value, *feed_values),
                                       numeric(param_value, *feed_values),
                                       rtol=num_rtol, atol=num_atol)
def make_generator(bn=True, bias_init=None):
    """
    Build the "Generator" Sequential container: an Affine layer, three relu
    Deconvolution layers and a final Tanh Deconvolution.

    Arguments:
        bn (bool): batch_norm setting of the three relu deconvolutions.
        bias_init: bias initializer handed to every Deconvolution layer.

    Returns:
        Sequential: the generator network.
    """
    # Affine layer producing the C/H/W axes consumed by the deconvolutions
    projection = Affine(weight_init=filter_init, activation=None, batch_norm=False,
                        axes=ng.make_axes({"C": 1024, "H": 4, "W": 4}))

    # Three deconvolutions that differ only in their number of filters
    upsampling = [
        Deconvolution((4, 4, nfilters), filter_init, strides=2, padding=1,
                      activation=relu, batch_norm=bn, bias_init=bias_init)
        for nfilters in (512, 256, 128)
    ]

    # Output layer: Tanh activation and no batch norm.
    output_layer = Deconvolution((4, 4, 3), filter_init, strides=2, padding=1,
                                 activation=Tanh(), batch_norm=False,
                                 bias_init=bias_init)

    return Sequential([projection] + upsampling + [output_layer], name="Generator")
def __init__(self, input_placeholder, output_size, RNN, bn_params):
    """
    Create a batch-normalized RNN and a reference RNN sharing recurrent weights.

    Arguments:
        input_placeholder: placeholder whose axes unpack into exactly three
            axes (feature, time, batch).
        output_size (int): length of the hidden axes and ``nout`` of both RNNs.
        RNN: layer class to instantiate.
        bn_params (dict): attributes copied onto every batch norm object of
            the batch-normalized RNN.
    """
    # Axes: three taken from the input plus two new hidden axes
    feature_axis, time_axis, batch_axis = tuple(input_placeholder.axes)
    hidden_axis = ng.make_axis(length=output_size, name="hidden")
    hidden_tmp_axis = ng.make_axis(length=output_size, name="hidden_tmp")

    self.input_placeholder = input_placeholder

    # Input of the reference RNN already lives on the hidden axis
    self.reference_input = ng.placeholder(axes=[hidden_axis, time_axis, batch_axis])

    # Random recurrent and input weights; identity input weights for the reference
    recurrent_axes = ng.make_axes([hidden_axis, hidden_tmp_axis])
    input_axes = ng.make_axes([hidden_axis, feature_axis])
    self.W_rec = rng.uniform(-1, 1, recurrent_axes)
    self.W_in = rng.uniform(-1, 1, input_axes)
    self.W_id = np.eye(output_size).astype("float32")

    # Keyword arguments (including the activation object) shared by both RNNs
    self.rnn_args = dict(nout=output_size,
                         init_inner=self.W_rec,
                         return_sequence=True,
                         activation=Tanh())

    self.reference_rnn = RNN(init=self.W_id, **self.rnn_args)
    self.rnn = RNN(init=self.W_in, batch_norm=True, **self.rnn_args)

    # Gated layers expose a mapping of batch norm objects; wrap a single one
    # under the key 'gate' so both cases can be handled alike.
    layer_batch_norm = self.rnn.batch_norm
    self.batch_norm_dict = (layer_batch_norm if self.has_gates
                            else {'gate': layer_batch_norm})
    self.default_gate = list(self.batch_norm_dict.keys())[0]

    # Overwrite the attributes of every batch norm object with bn_params
    for batch_norm in self.batch_norm_dict.values():
        batch_norm.__dict__.update(bn_params)
valid_set = SequentialArrayIterator(ptb_data['valid'], batch_size=args.batch_size, time_steps=time_steps) inputs = train_set.make_placeholders() ax.Y.length = len(tree_bank_data.vocab) def expand_onehot(x): return ng.one_hot(x, axis=ax.Y) # weight initialization init = UniformInit(low=-0.08, high=0.08) if args.layer_type == "lstm": rlayer1 = LSTM(hidden_size, init, activation=Tanh(), gate_activation=Logistic(), return_sequence=True) rlayer2 = LSTM(hidden_size, init, activation=Tanh(), gate_activation=Logistic(), return_sequence=True) # model initialization seq1 = Sequential([Preprocess(functor=expand_onehot), rlayer1, rlayer2, Affine(init, activation=Softmax(), bias_init=init, axes=(ax.Y,))]) optimizer = RMSProp(gradient_clip_value=gradient_clip_value) train_prob = seq1(inputs['inp_txt']) train_loss = ng.cross_entropy_multi(train_prob, ng.one_hot(inputs['tgt_txt'], axis=ax.Y),
def expand_onehot(x): return ng.one_hot(x, axis=ax.Y) # weight initialization init = UniformInit(low=-0.08, high=0.08) if args.use_lut: layer_0 = LookupTable(50, 100, init, update=True, pad_idx=0) else: layer_0 = Preprocess(functor=lambda x: ng.one_hot(x, axis=ax.Y)) if args.layer_type == "rnn": rlayer = Recurrent(hidden_size, init, activation=Tanh()) elif args.layer_type == "birnn": rlayer = BiRNN(hidden_size, init, activation=Tanh(), return_sequence=True, sum_out=True) if args.use_lut: layer_0 = LookupTable(50, 100, init, update=False) else: layer_0 = Preprocess(functor=expand_onehot) # model initialization seq1 = Sequential([layer_0, rlayer, Affine(init, activation=Softmax(), bias_init=init, axes=(ax.Y,))]) optimizer = RMSProp()
def __init__(self):
    """Wrap a 16-unit BiRNN with zero-initialized weights and Tanh activation."""
    super(BiRNNLayer, self).__init__()
    zero_init = ConstantInit(0.0)
    tanh = Tanh()
    self.layer = BiRNN(nout=16, init=zero_init, activation=tanh)
def test_seq2seq_deriv_ref(batch_size, sequence_length_enc, sequence_length_dec,
                           input_size, hidden_size, weight_initializer,
                           bias_initializer, transformer_factory):
    """
    Compare forward pass and parameter gradients of an encoder/decoder pair of
    Recurrent layers with the numpy seq2seq reference implementation.
    """
    # TODO: are these assumptions true?
    assert batch_size == 1, "the seq2seq reference implementation only support batch size 1"

    # Input placeholders and matching numpy values for encoder and decoder
    input_placeholder_enc, input_value_enc = make_placeholder(
        input_size, sequence_length_enc, batch_size)
    input_placeholder_dec, input_value_dec = make_placeholder(
        input_size, sequence_length_dec, batch_size)

    # Encoder weights
    W_in_enc, W_rec_enc, b_enc, _, _ = make_weights(
        input_placeholder_enc, hidden_size, weight_initializer, bias_initializer,
        init_state=False)

    # Decoder weights
    W_in_dec, W_rec_dec, b_dec, _, _ = make_weights(
        input_placeholder_dec, hidden_size, weight_initializer, bias_initializer,
        init_state=False)

    # Reference numpy seq2seq loaded with the same weights
    seq2seq_ref = RefSeq2Seq(input_size, hidden_size, decoder_return_sequence=True)
    seq2seq_ref.set_weights(W_in_enc, W_rec_enc,
                            b_enc.reshape(seq2seq_ref.bh_enc.shape),
                            W_in_dec, W_rec_dec,
                            b_dec.reshape(seq2seq_ref.bh_dec.shape))

    # Random error tensor that is back-propagated through both implementations
    deltas = np.random.randn(hidden_size, sequence_length_dec, batch_size)

    # the reference code expects these shapes:
    # input_shape: (seq_len, input_size, batch_size)
    # output_shape: (seq_len, hidden_size, batch_size)
    reference_result = seq2seq_ref.lossFun(input_value_enc.transpose([1, 0, 2]),
                                           input_value_dec.transpose([1, 0, 2]),
                                           deltas.copy().transpose([1, 0, 2]))
    (dW_in_enc, dW_rec_enc, db_enc,
     dW_in_dec, dW_rec_dec, db_dec,
     encoding_ref, hs_return_dec) = reference_result

    # ngraph encoder returns its last state only, the decoder the full sequence
    rnn_enc_ng = Recurrent(hidden_size, init=W_in_enc, init_inner=W_rec_enc,
                           activation=Tanh(), reset_cells=True,
                           return_sequence=False)
    rnn_dec_ng = Recurrent(hidden_size, init=W_in_dec, init_inner=W_rec_dec,
                           activation=Tanh(), reset_cells=True,
                           return_sequence=True)

    # ngraph fprop graph: the encoder output seeds the decoder state
    encoding_ng = rnn_enc_ng(input_placeholder_enc, init_state=None)
    output_ng = rnn_dec_ng(input_placeholder_dec, init_state=encoding_ng)
    deltas_constant = ng.constant(deltas, axes=output_ng.axes)

    # Each parameter op paired with its reference gradient
    params = [(rnn_dec_ng.b, db_dec),
              (rnn_dec_ng.W_input, dW_in_dec),
              (rnn_dec_ng.W_recur, dW_rec_dec),
              (rnn_enc_ng.b, db_enc),
              (rnn_enc_ng.W_input, dW_in_enc),
              (rnn_enc_ng.W_recur, dW_rec_enc)]

    graph_inputs = (input_placeholder_enc, input_placeholder_dec)
    feed_values = (input_value_enc, input_value_dec)

    with ExecutorFactory() as ex:
        # fprop computation
        fprop_fun = ex.executor([encoding_ng, output_ng], *graph_inputs)

        # gradient computations, one per parameter in the order of `params`
        update_funs = [
            ex.executor(ng.deriv(output_ng, param_op, error=deltas_constant),
                        *graph_inputs)
            for param_op, _ in params
        ]

        # check forward pass
        encoding, output = fprop_fun(*feed_values)
        ng.testing.assert_allclose(encoding, encoding_ref)
        ng.testing.assert_allclose(np.squeeze(output), np.squeeze(hs_return_dec))

        # check gradient computations
        for update_fun, (_, deriv_ref_val) in zip(update_funs, params):
            grad_neon = update_fun(*feed_values)
            ng.testing.assert_allclose(grad_neon, deriv_ref_val.squeeze(),
                                       rtol=bprop_rtol, atol=1e-4)
train_set = ArrayIterator(imdb_data['train'], batch_size=args.batch_size, total_iterations=args.num_iterations) valid_set = ArrayIterator(imdb_data['valid'], batch_size=args.batch_size) inputs = train_set.make_placeholders() ax.Y.length = imdb_dataset.nclass # weight initialization init = UniformInit(low=-0.08, high=0.08) if args.layer_type == "rnn": rlayer = Recurrent(hidden_size, init, activation=Tanh(), reset_cells=True, return_sequence=False) else: rlayer = BiRNN(hidden_size, init, activation=Tanh(), reset_cells=True, return_sequence=False, sum_out=True) # model initialization seq1 = Sequential([ LookupTable(vocab_size, embed_size, init, update=True, pad_idx=pad_idx), rlayer, Affine(init, activation=Softmax(), bias_init=init, axes=(ax.Y, ))
# Iterators over the 'train' and 'valid' splits of ptb_data. Only the training
# iterator is given a total iteration count.
train_set = SequentialArrayIterator(ptb_data['train'],
                                    batch_size=args.batch_size,
                                    time_steps=time_steps,
                                    total_iterations=args.num_iterations)
valid_set = SequentialArrayIterator(ptb_data['valid'],
                                    batch_size=args.batch_size,
                                    time_steps=time_steps)

# weight initialization
init = UniformInit(low=-0.08, high=0.08)

# model initialization: one-hot encoding along ax.Y, a Tanh recurrent layer
# and a Softmax Affine layer whose output axes are (ax.Y, ax.REC)
seq1 = Sequential([
    Preprocess(functor=lambda x: ng.one_hot(x, axis=ax.Y)),
    Recurrent(hidden_size, init, activation=Tanh()),
    Affine(weight_init=init, activation=Softmax(), bias_init=init, axes=(ax.Y, ax.REC))
])

# Bind axes lengths:
# (done after the layers above were created with references to these axes)
ax.Y.length = len(tree_bank_data.vocab)
ax.REC.length = time_steps
ax.N.length = args.batch_size

# placeholders with descriptive names
inputs = dict(inp_txt=ng.placeholder([ax.REC, ax.N]),
              tgt_txt=ng.placeholder([ax.REC, ax.N]))
def __init__(self):
    """Wrap a 16-unit LSTM with zero-initialized weights.

    Both the cell activation and the gate activation are Tanh.
    """
    super(LSTMLayer, self).__init__()
    zero_init = ConstantInit(0.0)
    cell_activation = Tanh()
    gate_activation = Tanh()
    self.layer = LSTM(nout=16, init=zero_init, activation=cell_activation,
                      gate_activation=gate_activation)
train_set = SequentialArrayIterator(ptb_data['train'], batch_size=args.batch_size, time_steps=time_steps, total_iterations=args.num_iterations) valid_set = SequentialArrayIterator(ptb_data['valid'], batch_size=args.batch_size, time_steps=time_steps) # weight initialization init = UniformInit(low=-0.08, high=0.08) # model initialization seq1 = Sequential([ Preprocess(functor=lambda x: ng.one_hot(x, axis=ax.Y)), Recurrent(hidden_size, init, activation=Tanh(), reset_cells=False), Affine(init, activation=Softmax(), bias_init=init, axes=(ax.Y, ax.REC)) ]) # Bind axes lengths: ax.Y.length = len(tree_bank_data.vocab) ax.REC.length = time_steps ax.N.length = args.batch_size # placeholders with descriptive names inputs = dict(inp_txt=ng.placeholder([ax.REC, ax.N]), tgt_txt=ng.placeholder([ax.REC, ax.N])) optimizer = RMSProp(decay_rate=0.95, learning_rate=2e-3, epsilon=1e-6,
def check_rnn(seq_len, input_size, hidden_size, batch_size,
              init_func, return_seq=True):
    """
    Check a Recurrent layer in two ways: its symbolic weight derivatives
    against numerical derivatives, and its forward output against the numpy
    reference RNN.

    Arguments:
        seq_len (int): length of the recurrent axis
        input_size (int): length of the input feature axis
        hidden_size (int): number of hidden units
        batch_size (int): must be 1
        init_func: initializer for the model params
        return_seq (bool): passed to the layer as return_sequence; also
            selects how output and reference are sliced before comparison
    """
    # init_func is the initializer for the model params
    assert batch_size == 1, "the recurrent reference implementation only support batch size 1"

    # ========== neon model ==========
    Cin = ng.make_axis(input_size)
    REC = ng.make_axis(seq_len, recurrent=True)
    N = ng.make_axis(batch_size, batch=True)
    H = ng.make_axis(hidden_size)
    ax_s = ng.make_axes([H, N])

    # NOTE(review): the factory is not used as a context manager and is never
    # closed inside this function.
    ex = ExecutorFactory()
    np.random.seed(0)

    rnn_ng = Recurrent(hidden_size, init_func, activation=Tanh(),
                       reset_cells=True, return_sequence=return_seq)

    inp_ng = ng.placeholder([Cin, REC, N])
    init_state_ng = ng.placeholder(ax_s)

    # fprop graph
    out_ng = rnn_ng.train_outputs(inp_ng, init_state=init_state_ng)
    out_ng.input = True

    # The weight ops are flagged as inputs; below they are passed to the
    # derivative computations as explicit arguments.
    rnn_W_input = rnn_ng.W_input
    rnn_W_input.input = True
    rnn_W_recur = rnn_ng.W_recur
    rnn_W_recur.input = True
    rnn_b = rnn_ng.b
    rnn_b.input = True

    fprop_neon_fun = ex.executor(out_ng, inp_ng, init_state_ng)

    # Symbolic (_s_) and numerical (_n_) derivative of the output with respect
    # to each parameter; the remaining two parameters are extra inputs.
    dWrecur_s_fun = ex.derivative(out_ng, rnn_W_recur, inp_ng, rnn_W_input, rnn_b)
    dWrecur_n_fun = ex.numeric_derivative(out_ng, rnn_W_recur, delta, inp_ng, rnn_W_input, rnn_b)
    dWinput_s_fun = ex.derivative(out_ng, rnn_W_input, inp_ng, rnn_W_recur, rnn_b)
    dWinput_n_fun = ex.numeric_derivative(out_ng, rnn_W_input, delta, inp_ng, rnn_W_recur, rnn_b)
    dWb_s_fun = ex.derivative(out_ng, rnn_b, inp_ng, rnn_W_input, rnn_W_recur)
    dWb_n_fun = ex.numeric_derivative(out_ng, rnn_b, delta, inp_ng, rnn_W_input, rnn_W_recur)

    # fprop on random inputs
    input_value = rng.uniform(-1, 1, inp_ng.axes)
    init_state_value = rng.uniform(-1, 1, init_state_ng.axes)
    fprop_neon = fprop_neon_fun(input_value, init_state_value).copy()

    # after the rnn graph has been executed, can get the W values. Get copies so
    # shared values don't confuse derivatives
    Wxh_neon = rnn_ng.W_input.value.get(None).copy()
    Whh_neon = rnn_ng.W_recur.value.get(None).copy()
    bh_neon = rnn_ng.b.value.get(None).copy()

    # bprop derivs: symbolic and numerical results must agree per parameter
    dWrecur_s = dWrecur_s_fun(Whh_neon, input_value, Wxh_neon, bh_neon)
    dWrecur_n = dWrecur_n_fun(Whh_neon, input_value, Wxh_neon, bh_neon)
    np.testing.assert_allclose(dWrecur_s, dWrecur_n, rtol=rtol, atol=atol)

    dWb_s = dWb_s_fun(bh_neon, input_value, Wxh_neon, Whh_neon)
    dWb_n = dWb_n_fun(bh_neon, input_value, Wxh_neon, Whh_neon)
    np.testing.assert_allclose(dWb_s, dWb_n, rtol=rtol, atol=atol)

    dWinput_s = dWinput_s_fun(Wxh_neon, input_value, Whh_neon, bh_neon)
    dWinput_n = dWinput_n_fun(Wxh_neon, input_value, Whh_neon, bh_neon)
    np.testing.assert_allclose(dWinput_s, dWinput_n, rtol=rtol, atol=atol)

    # ========= reference model ==========
    output_shape = (hidden_size, seq_len * batch_size)

    # generate random deltas tensor
    deltas = np.random.randn(*output_shape)

    # the reference code expects these shapes:
    # input_shape: (seq_len, input_size, batch_size)
    # output_shape: (seq_len, hidden_size, batch_size)
    deltas_ref = deltas.copy().T.reshape(
        seq_len, batch_size, hidden_size).swapaxes(1, 2)

    inp_ref = input_value.transpose([1, 0, 2])

    # reference numpy RNN, loaded with the weights read back from the layer
    rnn_ref = RefRecurrent(input_size, hidden_size)
    rnn_ref.Wxh[:] = Wxh_neon
    rnn_ref.Whh[:] = Whh_neon
    rnn_ref.bh[:] = bh_neon.reshape(rnn_ref.bh.shape)

    # NOTE(review): only h_ref_list is used below; the reference gradients
    # returned here are not compared with anything.
    (dWxh_ref, dWhh_ref, db_ref, h_ref_list,
     dh_ref_list, d_out_ref) = rnn_ref.lossFun(inp_ref, deltas_ref,
                                               init_states=init_state_value)

    # comparing outputs
    if return_seq is False:
        # keep only the last column of the reference, as a column vector
        h_ref_list = h_ref_list[:, -1].reshape(-1, 1)
    else:
        # drop the trailing axis of the layer output
        fprop_neon = fprop_neon[:, :, 0]
    np.testing.assert_allclose(fprop_neon, h_ref_list, rtol=0.0, atol=1.0e-5)

    return
out_axes = ng.make_axes([batch_axis, out_axis]) # Build placeholders for the created axes inputs = { 'X': ng.placeholder(in_axes), 'y': ng.placeholder(out_axes), 'iteration': ng.placeholder(axes=()) } # Network Definition seq1 = Sequential([ LSTM(nout=recurrent_units, init=init_uni, backward=False, activation=Logistic(), gate_activation=Tanh(), return_sequence=predict_seq), Affine(weight_init=init_uni, bias_init=init_uni, activation=Identity(), axes=out_axis) ]) # Optimizer # Following policy will set the initial learning rate to 0.05 (base_lr) # At iteration (num_iterations // 5), learning rate is multiplied by gamma (new lr = .005) # At iteration (num_iterations // 2), it will be reduced by gamma again (new lr = .0005) schedule = [num_iterations // 5, num_iterations // 2] learning_rate_policy = { 'name': 'schedule', 'schedule': schedule,
# number of classes ax.Y.length = time_steps # create iterator and placeholders for training data train_set = TSPSequentialArrayIterator(data_arrays=tsp_data['train'], nfeatures=num_features, batch_size=args.batch_size, time_steps=time_steps, total_iterations=args.num_iterations) inputs = train_set.make_placeholders() # weight initializationn init = UniformInit(low=-0.08, high=0.08) # build computational graph enc = LSTM(args.hs, init, activation=Tanh(), reset_cells=True, gate_activation=Logistic(), return_sequence=True) dec = LSTM(args.hs, init, activation=Tanh(), reset_cells=True, gate_activation=Logistic(), return_sequence=True) if args.emb is True: # encoder input embedding hidden_feature_axis = ng.make_axis(length=args.hs, name='hidden_feature_axis') feature_axis = ng.make_axis(length=num_features, name='feature_axis') W_emb = ng.variable(axes=[hidden_feature_axis, feature_axis], initial_value=init) emb_enc_inputs = ng.dot(W_emb, inputs['inp_txt']) # decoder input embedding emb_dec_input = [] ax.N.length = args.batch_size
def check_lstm(seq_len, input_size, hidden_size, batch_size, init_func,
               return_seq=True, backward=False, reset_cells=False, num_iter=2):
    """
    Compare the forward pass of the ngraph LSTM layer with the numpy reference LSTM.

    The layer is run ``num_iter`` times on random inputs, its weights are then
    copied into a ``RefLSTM`` and the same inputs are replayed through the
    reference. Each pair of outputs must agree within the module-level
    ``rtol`` / ``atol`` tolerances.

    Arguments:
        seq_len: length of the recurrent axis (named 'R')
        input_size: length of the input feature axis
        hidden_size: hidden size given to both the LSTM layer and the reference
        batch_size: length of the batch axis (named 'N')
        init_func: initializer passed to the LSTM layer
        return_seq: forwarded to the layer as ``return_sequence``
        backward: forwarded to the layer as ``backward``
        reset_cells: forwarded to the layer; when False the reference also
            carries its cell/hidden state from one iteration to the next
        num_iter: number of forward passes to compare
    """
    feature_axis = ng.make_axis(input_size)
    time_axis = ng.make_axis(seq_len, name='R')
    batch_axis = ng.make_axis(batch_size, name='N')

    with ExecutorFactory() as ex:
        np.random.seed(0)

        x_placeholder = ng.placeholder([feature_axis, time_axis, batch_axis])
        layer = LSTM(hidden_size, init_func, activation=Tanh(),
                     gate_activation=Logistic(), reset_cells=reset_cells,
                     return_sequence=return_seq, backward=backward)
        layer_out = layer.train_outputs(x_placeholder)
        run_fprop = ex.executor(layer_out, x_placeholder)

        # ---- ngraph forward passes on random inputs ----
        ng_outputs = []
        fed_inputs = []
        for _ in range(num_iter):
            x_value = rng.uniform(-1, 1, x_placeholder.axes)
            y_value = run_fprop(x_value).copy()
            if return_seq is True:
                y_value = y_value[:, :, 0]

            fed_inputs.append(x_value)
            ng_outputs.append(y_value)

            if reset_cells is False:
                # Without a reset, the stored initial hidden state must equal
                # the last hidden state that was just produced.
                assert ng.testing.allclose(y_value[:, -1].reshape(-1, 1),
                                           layer.h_init.value.get(None),
                                           rtol=rtol, atol=atol)

        # The weight values are available once the graph has been executed.
        # Copies are taken so the snapshots are independent of the shared
        # values. Gates are gathered in i, f, o, g order.
        gate_order = ['i', 'f', 'o', 'g']
        w_input = [layer.W_input[g].value.get(None).copy().T for g in gate_order]
        w_recur = [layer.W_recur[g].value.get(None).copy().T for g in gate_order]
        biases = [layer.b[g].value.get(None).copy() for g in gate_order]

        # ---- numpy reference LSTM loaded with the same parameters ----
        reference = RefLSTM()
        ref_weights = reference.init(input_size, hidden_size)
        # Row 0 holds the biases, the next input_size rows the input weights,
        # and the remaining rows the recurrent weights.
        ref_weights[0, :] = np.concatenate(biases)
        ref_weights[1:input_size + 1, :] = np.concatenate(w_input, 1)
        ref_weights[input_size + 1:] = np.concatenate(w_recur, 1)

        ref_outputs = []
        cell_state = hidden_state = None
        for x_value in fed_inputs:
            # Move the leading (feature) axis of the input to the end.
            x_ref = x_value.copy().transpose([1, 2, 0])
            h_seq, cell_last, hidden_last, _ = reference.forward(
                x_ref, ref_weights, cell_state, hidden_state)
            if reset_cells is False:
                cell_state, hidden_state = cell_last, hidden_last

            # The reference output needs a transpose as well.
            ref_outputs.append(h_seq.reshape(seq_len * batch_size, hidden_size).T)

        for got, expected in zip(ng_outputs, ref_outputs):
            assert ng.testing.allclose(got, expected, rtol=rtol, atol=atol)
# Toy GAN settings.
h_dim = 4
minibatch_discrimination = False
num_iterations = 600
batch_size = 12
num_examples = num_iterations * batch_size

# Generator: one rectified-linear hidden layer followed by a linear output.
generator_layers = []
generator_layers.append(affine_layer(h_dim, Rectlin(), name='g0'))
generator_layers.append(affine_layer(1, Identity(), name='g1'))
generator = Sequential(generator_layers)

# Discriminator: tanh hidden layers followed by a single logistic output.
discriminator_layers = [
    affine_layer(2 * h_dim, Tanh(), name='d0'),
    affine_layer(2 * h_dim, Tanh(), name='d1'),
]
if minibatch_discrimination:
    # Minibatch discrimination is not available in this example.
    raise NotImplementedError
discriminator_layers += [
    affine_layer(2 * h_dim, Tanh(), name='d2'),
    affine_layer(1, Logistic(), name='d3'),
]
discriminator = Sequential(discriminator_layers)

# TODO discriminator pre-training

# Data loader; seed numpy before the data set is created.
np.random.seed(1)
toy_gan_data = ToyGAN(batch_size, num_iterations)
train_data = toy_gan_data.load_data()
def expand_onehot(x):
    """Return the one-hot encoding of ``x`` along the ``ax.Y`` axis."""
    return ng.one_hot(x, axis=ax.Y)


# weight initialization
init = UniformInit(low=-0.08, high=0.08)

# Input layer: either a lookup table or a one-hot preprocessing step.
if args.use_lut:
    layer_0 = LookupTable(50, 100, init, update=True, pad_idx=0)
else:
    # Reuse the helper above instead of an identical inline lambda.
    layer_0 = Preprocess(functor=expand_onehot)

# Recurrent layer selected by the --layer_type style option.
if args.layer_type == "rnn":
    rlayer = Recurrent(hidden_size, init, activation=Tanh())
elif args.layer_type == "birnn":
    rlayer = BiRNN(hidden_size, init, activation=Tanh(),
                   return_sequence=True, sum_out=True)
else:
    # Without this branch `rlayer` stays unbound and the Sequential call below
    # fails with a NameError that does not mention the real cause.
    raise ValueError(
        "unsupported layer_type {!r}; expected 'rnn' or 'birnn'".format(args.layer_type))

# model initialization
seq1 = Sequential([
    layer_0,
    rlayer,
    Affine(init, activation=Softmax(), bias_init=init, axes=(ax.Y, )),
])

optimizer = RMSProp()
def check_stacked_lstm(seq_len, input_size, hidden_size, batch_size, init_func,
                       return_seq=True, backward=False, reset_cells=False,
                       num_iter=2):
    """
    Compare two stacked ngraph LSTM layers with two stacked numpy reference LSTMs.

    The second layer consumes the output of the first. The stack is run
    ``num_iter`` times on random inputs, the weights of both layers are copied
    into two ``RefLSTM`` instances and the same inputs are replayed through the
    reference stack. Outputs of the second layer must agree within the
    module-level ``rtol`` / ``atol`` tolerances.

    Arguments:
        seq_len: length of the recurrent axis (named 'R')
        input_size: length of the input feature axis
        hidden_size: hidden size given to both layers and both references
        batch_size: length of the batch axis (named 'N')
        init_func: initializer passed to both LSTM layers
        return_seq: forwarded to the layers as ``return_sequence``
        backward: forwarded to the layers as ``backward``
        reset_cells: forwarded to the layers; when False the references also
            carry their cell/hidden states from one iteration to the next
        num_iter: number of forward passes to compare
    """
    feature_axis = ng.make_axis(input_size)
    time_axis = ng.make_axis(seq_len, name='R')
    batch_axis = ng.make_axis(batch_size, name='N')

    def _new_layer():
        # Both stacked layers share exactly the same configuration.
        return LSTM(hidden_size, init_func, activation=Tanh(),
                    gate_activation=Logistic(), reset_cells=reset_cells,
                    return_sequence=return_seq, backward=backward)

    # Gates are gathered in i, f, o, g order.
    gate_order = ['i', 'f', 'o', 'g']

    def _gathered_params(layer):
        # Copies are taken so the snapshots are independent of the shared values.
        w_input = np.concatenate(
            [layer.W_input[g].value.get(None).copy().T for g in gate_order], 1)
        w_recur = np.concatenate(
            [layer.W_recur[g].value.get(None).copy().T for g in gate_order], 1)
        bias = np.concatenate(
            [layer.b[g].value.get(None).copy() for g in gate_order])
        return w_input, w_recur, bias

    with ExecutorFactory() as ex:
        np.random.seed(0)

        x_placeholder = ng.placeholder([feature_axis, time_axis, batch_axis])
        first_layer = _new_layer()
        second_layer = _new_layer()
        first_out = first_layer.train_outputs(x_placeholder)
        stacked_out = second_layer.train_outputs(first_out)
        run_fprop = ex.executor(stacked_out, x_placeholder)

        # ---- ngraph forward passes on random inputs ----
        ng_outputs = []
        fed_inputs = []
        for _ in range(num_iter):
            x_value = rng.uniform(-1, 1, x_placeholder.axes)
            y_value = run_fprop(x_value).copy()
            if return_seq is True:
                y_value = y_value[:, :, 0]

            fed_inputs.append(x_value)
            ng_outputs.append(y_value)

            if reset_cells is False:
                # Without a reset, the second layer's stored initial hidden
                # state must equal the last hidden state just produced.
                assert ng.testing.allclose(y_value[:, -1].reshape(-1, 1),
                                           second_layer.h_init.value.get(None),
                                           rtol=rtol, atol=atol)

        # The weight values are available once the graph has been executed.
        w_input_1, w_recur_1, bias_1 = _gathered_params(first_layer)
        w_input_2, w_recur_2, bias_2 = _gathered_params(second_layer)

        # ---- numpy reference LSTMs loaded with the same parameters ----
        reference_1 = RefLSTM()
        reference_2 = RefLSTM()
        ref_weights_1 = reference_1.init(input_size, hidden_size)
        ref_weights_2 = reference_2.init(hidden_size, hidden_size)

        # Row 0 holds the biases, then the input weights, then the recurrent
        # weights. The second reference takes hidden_size inputs.
        ref_weights_1[0, :] = bias_1
        ref_weights_1[1:input_size + 1, :] = w_input_1
        ref_weights_1[input_size + 1:] = w_recur_1
        ref_weights_2[0, :] = bias_2
        ref_weights_2[1:hidden_size + 1, :] = w_input_2
        ref_weights_2[hidden_size + 1:] = w_recur_2

        ref_outputs = []
        cell_1 = hidden_1 = None
        cell_2 = hidden_2 = None
        for x_value in fed_inputs:
            # Move the leading (feature) axis of the input to the end.
            x_ref = x_value.copy().transpose([1, 2, 0])
            h_seq_1, cell_last_1, hidden_last_1, _ = reference_1.forward(
                x_ref, ref_weights_1, cell_1, hidden_1)
            h_seq_2, cell_last_2, hidden_last_2, _ = reference_2.forward(
                h_seq_1, ref_weights_2, cell_2, hidden_2)

            if reset_cells is False:
                cell_1, hidden_1 = cell_last_1, hidden_last_1
                cell_2, hidden_2 = cell_last_2, hidden_last_2

            # The reference output needs a transpose as well.
            ref_outputs.append(h_seq_2.reshape(seq_len * batch_size, hidden_size).T)

        for got, expected in zip(ng_outputs, ref_outputs):
            assert ng.testing.allclose(got, expected, rtol=rtol, atol=atol)
get_prev_target=True) inputs = train_set.make_placeholders() ax.Y.length = len(tree_bank_data.vocab) def expand_onehot(x): return ng.one_hot(x, axis=ax.Y) # weight initialization init = UniformInit(low=-0.08, high=0.08) # model initialization one_hot_enc = Preprocess(functor=expand_onehot) enc = Recurrent(hidden_size, init, activation=Tanh(), reset_cells=True, return_sequence=False) one_hot_dec = Preprocess(functor=expand_onehot) dec = Recurrent(hidden_size, init, activation=Tanh(), reset_cells=True, return_sequence=True) linear = Affine(init, activation=Softmax(), bias_init=init, axes=(ax.Y)) optimizer = RMSProp(decay_rate=0.95, learning_rate=2e-3, epsilon=1e-6, gradient_clip_value=gradient_clip_value) # build network graph one_hot_enc_out = one_hot_enc(inputs['inp_txt']) one_hot_dec_out = one_hot_dec(inputs['prev_tgt']) enc_out = enc(one_hot_enc_out) dec_out = dec(one_hot_dec_out, init_state=enc_out) output_prob = linear(dec_out) loss = ng.cross_entropy_multi(output_prob,