def test_reward_match_label(self):
    """Check ``generator.reward_match_label`` in exact and partial modes.

    The reference labels have 8 time steps and the sample has 9; both hold
    one sequence per column, and the last column is all zeros.
    """
    target = np.array([
        [12, 10, 10, 0],
        [12, 8, 7, 0],
        [13, 4, 11, 0],
        [8, 13, 9, 0],
        [7, 5, 11, 0],
        [0, 12, 12, 0],
        [0, 10, 0, 0],
        [0, 0, 0, 0],
    ])
    batch = dstruct.BatchTuple(
        dstruct.Seq2SeqFeatureTuple(None, None, None, None),
        dstruct.SeqLabelTuple(target, None, None),
        None,
        False,
    )
    sample = np.array([
        [12, 10, 10, 0],
        [12, 8, 7, 0],
        [11, 4, 11, 0],
        [8, 13, 9, 0],
        [7, 5, 11, 0],
        [0, 12, 12, 0],
        [0, 1, 0, 0],
        [0, 1, 0, 0],
        [0, 0, 0, 0],
    ])

    # Exact match: a single reward of 1 at row 6 of the third column.
    exact_match = np.zeros_like(sample)
    exact_match[6, 2] = 1

    # Partial match: per-token hits, later divided by the sample length.
    parti_match = np.array([
        [1, 1, 1, 0],
        [1, 1, 1, 0],
        [0, 1, 1, 0],
        [1, 1, 1, 0],
        [1, 1, 1, 0],
        [1, 1, 1, 0],
        [0, 0, 1, 0],
        [0, 0, 0, 0],
        [0, 0, 0, 0],
    ])
    sample_len = np.array([6., 9., 7., 0.])
    # Same lengths with the zero replaced by 1, used only as a divisor.
    divisor = np.where(sample_len > 0, sample_len, 1.)
    expected_partial = parti_match / divisor

    reward, avg = generator.reward_match_label(
        sample, batch, partial_match=False)
    np.testing.assert_array_equal(
        reward, exact_match, 'label exact match reward')
    self.assertEqual(avg, 1 / 3, 'average correct')

    reward, avg = generator.reward_match_label(
        sample, batch, partial_match=True)
    np.testing.assert_array_equal(
        reward, expected_partial, 'label match reward')
    expected_avg = np.sum(expected_partial) / np.sum(sample_len > 0)
    self.assertEqual(avg, expected_avg, 'average correct')
def test_get_batch_data(self):
    """Check the batch produced by ``generator.get_batch_data``.

    The same array is used as encoder input and as the generated output, so
    the new decoder inputs should be that array shifted down by one step.
    """
    data = np.array([
        [12, 10, 10, 0],
        [12, 8, 7, 0],
        [13, 4, 11, 0],
        [8, 13, 9, 0],
        [7, 5, 11, 0],
        [0, 12, 12, 0],
        [0, 10, 0, 0],
        [0, 0, 0, 0],
    ])
    seq_len = np.array([6, 8, 7, 0])
    batch = dstruct.BatchTuple(
        dstruct.Seq2SeqFeatureTuple(data, seq_len, None, None),
        dstruct.SeqLabelTuple(None, None, None),
        None,
        False,
    )

    new_batch = generator.get_batch_data(
        batch,
        data,
        start_id=1,
        seq_len_idx=1,
        input_key='dec_inputs',
        seq_len_key='dec_seq_len',
        unmasked_token_weight=np.full_like(data, 2),
    )
    new_features, new_labels, num_tokens = new_batch[0], new_batch[1], new_batch[2]

    # Encoder fields must be passed through untouched (same objects).
    self.assertIs(data, new_features.enc_inputs,
                  'enc data is the same object.')
    self.assertIs(seq_len, new_features.enc_seq_len,
                  'enc seq len is the same object.')

    np.testing.assert_array_equal(
        data[:-1, :], new_features.dec_inputs[1:, :],
        err_msg='dec input is shifted data.')
    np.testing.assert_array_equal(
        data, new_labels.label, err_msg='dec output is data.')
    np.testing.assert_array_equal(
        seq_len, new_features.dec_seq_len,
        err_msg='dec seq len is correct.')
    self.assertEqual(seq_len.sum(), num_tokens, 'num tokens is correct.')

    # A token is unmasked when its row index is below the column's length;
    # every unmasked token carries the supplied weight of 2.
    steps = np.arange(data.shape[0])[:, np.newaxis]
    expected_weight = (steps < seq_len) * 2
    np.testing.assert_array_equal(
        expected_weight, new_batch.labels.label_weight,
        err_msg='token weight')
def get_batch_data(batch, y_arr, unmasked_token_weight=None,
                   unmasked_seq_weight=None, start_id=1, seq_len_idx=1,
                   input_key='inputs', seq_len_key='seq_len'):
    """Build a new batch that uses ``y_arr`` as labels and shifted inputs.

    Args:
        batch: source batch; its ``features`` and ``keep_state`` are reused.
        y_arr: 2-D array with one sequence per column (lengths are taken
            along axis 0).
        unmasked_token_weight: optional multiplier applied in place to the
            token weights.
        unmasked_seq_weight: optional multiplier applied in place to the
            sequence weights.
        start_id: value placed in the first input row.
        seq_len_idx: index into ``batch.features`` of the length field used
            to detect empty sequences.
        input_key: feature field replaced by the shifted inputs.
        seq_len_key: feature field replaced by the computed lengths.

    Returns:
        ``ds.BatchTuple`` of (features, labels, num_tokens, keep_state).
    """
    # Length = position of the first column minimum, plus one.
    lengths = np.argmin(y_arr, axis=0) + 1
    # Sequences marked empty in the source batch stay empty.
    empty = batch.features[seq_len_idx] <= 0
    lengths[empty] = 0

    seq_weight = np.where(lengths > 0, 1, 0).astype(np.float32)
    if unmasked_seq_weight is not None:
        seq_weight *= unmasked_seq_weight

    token_weight, num_tokens = util.masked_full_like(y_arr, 1, lengths)
    if unmasked_token_weight is not None:
        token_weight *= unmasked_token_weight

    # First input row: the start id scaled by the sequence weight, then
    # truncated back to int32 (so zero-weight sequences get 0).
    first_row = np.full((1, len(lengths)), start_id, dtype=np.int32)
    first_row = (first_row * seq_weight).astype(np.int32)
    stacked = np.vstack((first_row, y_arr))
    shifted_inputs = stacked[:-1, :]

    replacements = {input_key: shifted_inputs, seq_len_key: lengths}
    new_features = batch.features._replace(**replacements)
    new_labels = ds.SeqLabelTuple(y_arr, token_weight, seq_weight)
    return ds.BatchTuple(
        new_features, new_labels, num_tokens, batch.keep_state)
def lseq2seq_batch_iter(enc_data, dec_data, label_data, mask_data,
                        batch_size=1, shuffle=True):
    """Yield seq2seq batches that also carry a label and a mask feature.

    Each group from ``batch_iter`` is stacked with ``util.hstack_list``; the
    decoder array is split into inputs (all but the last row) and outputs
    (all but the first row).
    """
    streams = (enc_data, dec_data, label_data, mask_data)
    groups = batch_iter(batch_size, shuffle, *streams, pad=[[], [], 0, 2])
    for enc_seqs, dec_seqs, raw_labels, raw_masks in groups:
        enc, enc_len = util.hstack_list(enc_seqs)
        dec, dec_len = util.hstack_list(dec_seqs)
        label = np.array(raw_labels, dtype=np.int32)
        mask = np.array(raw_masks, dtype=np.int32)

        decoder_in, decoder_out = dec[:-1, :], dec[1:, :]

        # Non-empty sequences lose one step because of the input/output
        # shift; the test must be taken before the lengths are decremented.
        non_empty = dec_len > 0
        dec_len -= np.where(non_empty, 1, 0)
        token_weight, num_tokens = util.masked_full_like(
            decoder_out, 1, num_non_padding=dec_len)
        seq_weight = np.where(non_empty, 1, 0).astype(np.float32)

        yield ds.BatchTuple(
            ds.LSeq2SeqFeatureTuple(
                enc, enc_len, decoder_in, dec_len, label, mask),
            ds.SeqLabelTuple(decoder_out, token_weight, seq_weight),
            num_tokens,
            False,
        )
def seq_batch_iter(in_data, out_data, weights, batch_size=1, shuffle=True,
                   keep_sentence=True):
    """Wrap ``batch_iter`` to format sequence data, optionally weighted.

    Args:
        in_data: input sequences, forwarded to ``batch_iter``.
        out_data: output sequences, forwarded to ``batch_iter``.
        weights: optional per-sequence weights. ``None`` or an empty
            collection selects the unweighted path, where every weight is 1.
            A NumPy array is accepted as well as a list.
        batch_size: forwarded to ``batch_iter``.
        shuffle: forwarded to ``batch_iter``.
        keep_sentence: the yielded batches carry
            ``keep_state = not keep_sentence``.

    Yields:
        ``ds.BatchTuple`` of (features, labels, num_tokens, keep_state).
    """
    keep_state = not keep_sentence

    # A bare ``if weights:`` raises ValueError for a multi-element NumPy
    # array, so decide by length and fall back to plain truthiness for
    # objects without a length (``None``, scalars).
    try:
        weighted = len(weights) > 0
    except TypeError:
        weighted = bool(weights)

    if weighted:
        # The weight stream is batched with the data; 0 is the pad value
        # passed for it.
        batches = batch_iter(batch_size, shuffle, in_data, out_data, weights,
                             pad=[[], [], 0])
    else:
        # No weights: pair every group with a constant weight of 1.
        batches = (
            (x, y, 1)
            for x, y in batch_iter(batch_size, shuffle, in_data, out_data,
                                   pad=[[], []])
        )

    for x, y, w in batches:
        x_arr, x_len = util.hstack_list(x)
        y_arr, y_len = util.hstack_list(y)
        # Empty sequences get weight 0; the rest get their input weight.
        seq_weight = np.where(y_len > 0, w, 0).astype(np.float32)
        token_weight, num_tokens = util.masked_full_like(
            y_arr, w, num_non_padding=y_len)
        features = ds.SeqFeatureTuple(x_arr, x_len)
        labels = ds.SeqLabelTuple(y_arr, token_weight, seq_weight)
        yield ds.BatchTuple(features, labels, num_tokens, keep_state)
def concat_seq_batch(batch1, batch2):
    """Concatenate two sequence batches into one ``ds.BatchTuple``.

    Inputs, labels and label weights are joined with
    ``util.hstack_with_padding``; sequence lengths are concatenated. The
    sequence weights contributed by ``batch2`` are all zero in the result.
    The token counts are summed and ``keep_state`` is taken from ``batch1``.

    Neither input batch is modified.

    Args:
        batch1: first batch, a 4-tuple (features, labels, num_tokens,
            keep_state).
        batch2: second batch, same layout.

    Returns:
        The combined ``ds.BatchTuple``.
    """
    _f1, _l1, _n1, _k1 = batch1
    _f2, _l2, _n2, _k2 = batch2
    inputs = util.hstack_with_padding(_f1.inputs, _f2.inputs)
    seq_len = np.concatenate((_f1.seq_len, _f2.seq_len))
    f = ds.SeqFeatureTuple(inputs, seq_len)
    label = util.hstack_with_padding(_l1.label, _l2.label)
    label_weight = util.hstack_with_padding(_l1.label_weight,
                                            _l2.label_weight)
    # Use a fresh zero array rather than zeroing batch2's weights in place,
    # so the caller's batch2 is left intact.
    seq_weight = np.concatenate(
        (_l1.seq_weight, np.zeros_like(_l2.seq_weight)))
    l = ds.SeqLabelTuple(label, label_weight, seq_weight)
    return ds.BatchTuple(f, l, _n1 + _n2, _k1)
def _format_word2def(x, w, c, y, sw):
    """Pack word-to-definition data into a ``ds.BatchTuple``.

    Args:
        x: encoder sequences, stacked with ``util.hstack_list``.
        w: word ids, converted to an int32 array.
        c: character sequences, stacked with ``util.vstack_list``.
        y: decoder sequences, stacked with ``util.hstack_list`` and then
            split into inputs (all but the last row) and outputs (all but
            the first row).
        sw: sequence weights, converted to a float32 array.

    Returns:
        ``ds.BatchTuple`` with ``keep_state`` set to ``False``.
    """
    enc, enc_len = util.hstack_list(x)
    dec, dec_len = util.hstack_list(y)
    word = np.array(w, dtype=np.int32)
    char, char_len = util.vstack_list(c)

    decoder_in = dec[:-1, :]
    decoder_out = dec[1:, :]
    seq_weight = np.array(sw, dtype=np.float32)

    # Non-empty sequences lose one step because of the input/output shift.
    dec_len -= np.where(dec_len > 0, 1, 0)
    token_weight, num_tokens = util.masked_full_like(
        decoder_out, 1, num_non_padding=dec_len)

    return ds.BatchTuple(
        ds.Word2DefFeatureTuple(
            enc, enc_len, word, char, char_len, decoder_in, dec_len),
        ds.SeqLabelTuple(decoder_out, token_weight, seq_weight),
        num_tokens,
        False,
    )
def concat_word2def_batch(batch1, batch2):
    """Concatenate two word-to-definition batches into one.

    2-D sequence fields are joined with ``util.hstack_with_padding``
    (``chars`` with ``util.vstack_with_padding``); 1-D fields are joined
    with ``np.concatenate``. Token counts are summed and ``keep_state`` is
    always ``False``.
    """
    feat_a, label_a, tokens_a, _ = batch1
    feat_b, label_b, tokens_b, _ = batch2
    hpad = util.hstack_with_padding
    join = np.concatenate

    features = ds.Word2DefFeatureTuple(
        hpad(feat_a.enc_inputs, feat_b.enc_inputs),
        join((feat_a.enc_seq_len, feat_b.enc_seq_len)),
        join((feat_a.words, feat_b.words)),
        util.vstack_with_padding(feat_a.chars, feat_b.chars),
        join((feat_a.char_len, feat_b.char_len)),
        hpad(feat_a.dec_inputs, feat_b.dec_inputs),
        join((feat_a.dec_seq_len, feat_b.dec_seq_len)),
    )
    labels = ds.SeqLabelTuple(
        hpad(label_a.label, label_b.label),
        hpad(label_a.label_weight, label_b.label_weight),
        join((label_a.seq_weight, label_b.seq_weight)),
    )
    return ds.BatchTuple(features, labels, tokens_a + tokens_b, False)
def seq2seq_batch_iter(enc_data, dec_data, batch_size=1, shuffle=True):
    """Yield formatted seq2seq batches built on top of ``batch_iter``.

    The decoder array is split into inputs (all but the last row) and
    outputs (all but the first row); every batch has ``keep_state=False``.
    """
    groups = batch_iter(batch_size, shuffle, enc_data, dec_data,
                        pad=[[], []])
    for enc_seqs, dec_seqs in groups:
        enc, enc_len = util.hstack_list(enc_seqs)
        dec, dec_len = util.hstack_list(dec_seqs)
        decoder_in, decoder_out = dec[:-1, :], dec[1:, :]

        # Non-empty sequences lose one step because of the input/output
        # shift; the test must be taken before the lengths are decremented.
        non_empty = dec_len > 0
        dec_len -= np.where(non_empty, 1, 0)
        token_weight, num_tokens = util.masked_full_like(
            decoder_out, 1, num_non_padding=dec_len)
        seq_weight = np.where(non_empty, 1, 0).astype(np.float32)

        yield ds.BatchTuple(
            ds.Seq2SeqFeatureTuple(enc, enc_len, decoder_in, dec_len),
            ds.SeqLabelTuple(decoder_out, token_weight, seq_weight),
            num_tokens,
            False,
        )
def _build_logit(self, opt, reuse_scope, collect_kwargs, emb_vars, cell_output):
    """Build the logit layer, label placeholders and prediction fetches.

    Args:
        opt: option mapping; reads ``'share:input_emb_logit'`` and every
            key starting with ``'logit:'``.
        reuse_scope: mapping indexed by ``self._RSK_LOGIT_`` to get the
            value handed to ``tfg.maybe_scope``.
        collect_kwargs: extra keyword arguments forwarded to the ``tfg``
            builders.
        emb_vars: used as the logit weight when
            ``opt['share:input_emb_logit']`` is truthy.
        cell_output: input to the logit layer.

    Returns:
        Tuple ``(logit_, label_feed, predict_fetch, nodes)``. ``nodes`` is
        built from this function's local variables, so the local names
        ending in ``_`` are part of the result.
    """
    # logit
    # Share the embedding variables as logit weights only when requested.
    logit_w_ = emb_vars if opt['share:input_emb_logit'] else None
    logit_opt = util.dict_with_key_startswith(opt, 'logit:')
    with tfg.maybe_scope(reuse_scope[self._RSK_LOGIT_]) as scope:
        logit_, temperature_, logit_w_, logit_b_ = tfg.get_logit_layer(
            cell_output, logit_w=logit_w_, **logit_opt, **collect_kwargs)
    dist_, dec_max_, dec_sample_ = tfg.select_from_logit(logit_)
    # label
    label_, token_weight_, seq_weight_ = tfg.get_seq_label_placeholders(
        label_dtype=tf.int32, **collect_kwargs)
    # format
    predict_fetch = {
        'logit': logit_, 'dist': dist_, 'dec_max': dec_max_,
        'dec_max_id': dec_max_.index, 'dec_sample': dec_sample_,
        'dec_sample_id': dec_sample_.index}
    label_feed = dstruct.SeqLabelTuple(label_, token_weight_, seq_weight_)
    # NOTE: this reads locals(); renaming or adding a local whose name ends
    # with '_' changes the contents of ``nodes``.
    nodes = util.dict_with_key_endswith(locals(), '_')
    return logit_, label_feed, predict_fetch, nodes
def seq_batch_iter(in_data, out_data, batch_size=1, shuffle=True,
                   keep_sentence=True):
    """Yield formatted sequence batches built on top of ``batch_iter``.

    Every yielded ``ds.BatchTuple`` carries ``keep_state = not
    keep_sentence``; sequence weights are 1 for non-empty output sequences
    and 0 otherwise.
    """
    keep_state = not keep_sentence
    groups = batch_iter(batch_size, shuffle, in_data, out_data, pad=[[], []])
    for in_seqs, out_seqs in groups:
        inputs, input_len = util.hstack_list(in_seqs)
        outputs, output_len = util.hstack_list(out_seqs)
        seq_weight = np.where(output_len > 0, 1, 0).astype(np.float32)
        token_weight, num_tokens = util.masked_full_like(
            outputs, 1, num_non_padding=output_len)
        yield ds.BatchTuple(
            ds.SeqFeatureTuple(inputs, input_len),
            ds.SeqLabelTuple(outputs, token_weight, seq_weight),
            num_tokens,
            keep_state,
        )