def linear(args, output_size, bias, bias_start=0.0, scope=None, squeeze=False,
           wd=0.0, input_keep_prob=1.0, is_train=None):
    if args is None or (isinstance(args, (tuple, list)) and not args):
        raise ValueError("`args` must be specified")
    if not isinstance(args, (tuple, list)):
        args = [args]

    flat_args = [flatten(arg, 1) for arg in args]  # for dense layer [(-1, d)]
    if input_keep_prob < 1.0:
        assert is_train is not None
        flat_args = [
            tf.cond(is_train,
                    lambda: tf.nn.dropout(arg, input_keep_prob),
                    lambda: arg)  # apply dropout only at training time
            for arg in flat_args
        ]
    flat_out = _linear(flat_args, output_size, bias,
                       bias_start=bias_start, scope=scope)  # dense
    out = reconstruct(flat_out, args[0], 1)  # restore the leading dims of args[0]
    if squeeze:
        out = tf.squeeze(out, [len(args[0].get_shape().as_list()) - 1])
    if wd:
        add_reg_without_bias()
    return out
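# Usage sketch (hypothetical tensors; `flatten`, `reconstruct`, `_linear` and
# `add_reg_without_bias` are helpers defined elsewhere in this repo): project a
# rank-3 tensor [bs, sl, vec] to [bs, sl, hn], with input dropout that is only
# active when `is_train` evaluates to True.
#
#   proj = linear([rep_tensor], hn, bias=True, scope='proj',
#                 input_keep_prob=0.8, is_train=is_train)  # [bs, sl, hn]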
def contextual_bi_rnn(tensor_rep, mask_rep, hn, cell_type, only_final=False,
                      wd=0., keep_prob=1., is_train=None, scope=None):
    """
    fuse contextual information using a bi-directional rnn
    :param tensor_rep: [..., sl, vec]
    :param mask_rep: [..., sl]
    :param hn: hidden units per direction
    :param cell_type: 'gru', 'lstm', 'basic_lstm' or 'basic_rnn'
    :param only_final: True or False
    :param wd: weight decay
    :param keep_prob:
    :param is_train:
    :param scope:
    :return: [..., sl, 2*hn], or [..., 2*hn] if only_final
    """
    with tf.variable_scope(scope or 'contextual_bi_rnn'):
        reuse = None if not tf.get_variable_scope().reuse else True
        if cell_type == 'gru':
            cell_fw = tf.contrib.rnn.GRUCell(hn, reuse=reuse)
            cell_bw = tf.contrib.rnn.GRUCell(hn, reuse=reuse)
        elif cell_type == 'lstm':
            cell_fw = tf.contrib.rnn.LSTMCell(hn, reuse=reuse)
            cell_bw = tf.contrib.rnn.LSTMCell(hn, reuse=reuse)
        elif cell_type == 'basic_lstm':
            cell_fw = tf.contrib.rnn.BasicLSTMCell(hn, reuse=reuse)
            cell_bw = tf.contrib.rnn.BasicLSTMCell(hn, reuse=reuse)
        elif cell_type == 'basic_rnn':
            cell_fw = tf.contrib.rnn.BasicRNNCell(hn, reuse=reuse)
            cell_bw = tf.contrib.rnn.BasicRNNCell(hn, reuse=reuse)
        else:
            raise AttributeError('no cell type \'%s\'' % cell_type)
        cell_dp_fw = SwitchableDropoutWrapper(cell_fw, is_train, keep_prob)
        cell_dp_bw = SwitchableDropoutWrapper(cell_bw, is_train, keep_prob)

        tensor_len = tf.reduce_sum(tf.cast(mask_rep, tf.int32), -1)  # [bs]
        (outputs_fw, outputs_bw), _ = bidirectional_dynamic_rnn(
            cell_dp_fw, cell_dp_bw, tensor_rep, tensor_len, dtype=tf.float32)
        rnn_outputs = tf.concat([outputs_fw, outputs_bw], -1)  # [..., sl, 2hn]

        if wd > 0:
            add_reg_without_bias()
        if not only_final:
            return rnn_outputs  # [..., sl, 2hn]
        else:
            return get_last_state(rnn_outputs, mask_rep)  # [..., 2hn]
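# Usage sketch (hypothetical tensors): encode a batch of sentences with a
# bi-directional GRU; `rep_mask` is a boolean [bs, sl] mask whose row sums
# give the true sequence lengths.
#
#   seq_enc = contextual_bi_rnn(rep_tensor, rep_mask, hn=100, cell_type='gru',
#                               is_train=is_train, keep_prob=0.8)  # [bs, sl, 200]
#   sent_enc = contextual_bi_rnn(rep_tensor, rep_mask, hn=100, cell_type='gru',
#                                only_final=True, scope='final')   # [bs, 200]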
def one_direction_rnn(tensor_rep, mask_rep, hn, cell_type, only_final=False,
                      wd=0., keep_prob=1., is_train=None, is_forward=True,
                      scope=None):
    assert not is_forward  # todo: the forward direction is waiting to be implemented
    with tf.variable_scope(scope or '%s_rnn' % ('forward' if is_forward else 'backward')):
        reuse = None if not tf.get_variable_scope().reuse else True
        if cell_type == 'gru':
            cell = tf.contrib.rnn.GRUCell(hn, reuse=reuse)
        elif cell_type == 'lstm':
            cell = tf.contrib.rnn.LSTMCell(hn, reuse=reuse)
        elif cell_type == 'basic_lstm':
            cell = tf.contrib.rnn.BasicLSTMCell(hn, reuse=reuse)
        elif cell_type == 'basic_rnn':
            cell = tf.contrib.rnn.BasicRNNCell(hn, reuse=reuse)
        else:
            raise AttributeError('no cell type \'%s\'' % cell_type)
        cell_dp = SwitchableDropoutWrapper(cell, is_train, keep_prob)

        tensor_len = tf.reduce_sum(tf.cast(mask_rep, tf.int32), -1)  # [bs]
        rnn_outputs, _ = dynamic_rnn(cell_dp, tensor_rep, tensor_len,
                                     dtype=tf.float32)

        if wd > 0:
            add_reg_without_bias()
        if not only_final:
            return rnn_outputs  # [..., sl, hn]
        else:
            return get_last_state(rnn_outputs, mask_rep)  # [..., hn]
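# Usage sketch (hypothetical tensors): the assert above means only the
# backward direction is currently usable.
#
#   bw_enc = one_direction_rnn(rep_tensor, rep_mask, hn=100, cell_type='gru',
#                              is_train=is_train, is_forward=False)  # [bs, sl, 100]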
def multi_head_attention(rep_tensor, rep_mask, head_num=8, hidden_units_num=64,
                         scope=None, is_train=None, keep_prob=1., wd=0.):
    bs, sl = tf.shape(rep_tensor)[0], tf.shape(rep_tensor)[1]
    ivec = rep_tensor.get_shape().as_list()[2]
    with tf.variable_scope(scope or 'multi_head_attention'):
        with tf.variable_scope('positional_encoding'):
            seq_idxs = tf.tile(tf.expand_dims(tf.range(sl), 1), [1, ivec])  # sl,ivec
            feature_idxs = tf.tile(tf.expand_dims(tf.range(ivec), 0), [sl, 1])  # sl,ivec
            # sinusoidal encoding: sine on even feature indices, cosine on odd ones
            pos_enc = tf.where(
                tf.equal(tf.mod(feature_idxs, 2), 0),
                tf.sin(tf.cast(seq_idxs, tf.float32) /
                       tf.pow(10000., 2.0 * tf.cast(feature_idxs, tf.float32) / (1.0 * ivec))),
                tf.cos(tf.cast(seq_idxs, tf.float32) /
                       tf.pow(10000., 2.0 * tf.cast(feature_idxs - 1, tf.float32) / (1.0 * ivec))),
            )
            rep_tensor_pos = mask_for_high_rank(rep_tensor + pos_enc, rep_mask)  # bs,sl,ivec

        with tf.variable_scope('multi_head_attention'):
            # one shared weight tensor projects the input to Q, K and V for every head
            W = tf.get_variable('W', [3, head_num, ivec, hidden_units_num], tf.float32)
            rep_tile = tf.tile(
                tf.expand_dims(tf.expand_dims(rep_tensor_pos, 0), 0),
                [3, head_num, 1, 1, 1])  # 3,head_num,bs,sl,ivec
            rep_tile_reshape = tf.reshape(
                rep_tile, [3, head_num, bs * sl, ivec])  # 3,head_num,bs*sl,ivec
            maps = tf.reshape(  # 3,head_num,bs*sl,hn -> 3,head_num,bs,sl,hn
                tf.matmul(dropout(rep_tile_reshape, keep_prob, is_train), W),
                [3, head_num, bs, sl, hidden_units_num])
            Q_map, K_map, V_map = tf.split(maps, 3, 0)
            Q_map = tf.squeeze(Q_map, [0])  # head_num,bs,sl,hn
            K_map = tf.squeeze(K_map, [0])  # head_num,bs,sl,hn
            V_map = tf.squeeze(V_map, [0])  # head_num,bs,sl,hn

            # scaled dot-product attention scores: head_num,bs,sl,sl
            similarity_mat = tf.matmul(
                Q_map, tf.transpose(K_map, [0, 1, 3, 2])) / math.sqrt(1. * hidden_units_num)

            # mask: bs,sl -> head_num,bs,sl,sl
            multi_mask = tf.tile(tf.expand_dims(rep_mask, 0), [head_num, 1, 1])  # head_num,bs,sl
            multi_mask_tile_1 = tf.expand_dims(multi_mask, 2)  # head_num,bs,1,sl
            multi_mask_tile_2 = tf.expand_dims(multi_mask, 3)  # head_num,bs,sl,1
            multi_mask_tile = tf.logical_and(
                multi_mask_tile_1, multi_mask_tile_2)  # head_num,bs,sl,sl
            similarity_mat_masked = exp_mask(similarity_mat, multi_mask_tile)  # head_num,bs,sl,sl
            prob_dist = tf.nn.softmax(similarity_mat_masked)  # head_num,bs,sl,sl
            prob_dist_dp = dropout(prob_dist, keep_prob, is_train)

            attn_res = tf.matmul(prob_dist_dp, V_map)  # head_num,bs,sl,hn
            attn_res_tran = tf.transpose(attn_res, [1, 2, 0, 3])  # bs,sl,head_num,hn
            output = tf.reshape(attn_res_tran,
                                [bs, sl, head_num * hidden_units_num])

            if wd > 0.:
                add_reg_without_bias()
            return output
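# A minimal numpy sketch of the positional encoding above, for intuition only
# (the graph code is authoritative): even feature indices receive a sine and
# odd indices a cosine at the wavelength of the preceding even index, in the
# spirit of the Transformer's sinusoidal encoding.
import numpy as np

def sinusoid_pos_enc(sl, d):
    pos = np.arange(sl, dtype=np.float32)[:, None]  # sl,1
    feat = np.arange(d)[None, :]                    # 1,d
    even = feat % 2 == 0
    # mirror the tf.where above: odd indices reuse the exponent of feat - 1
    exponent = 2. * np.where(even, feat, feat - 1) / float(d)
    angle = pos / np.power(10000., exponent)
    return np.where(even, np.sin(angle), np.cos(angle))  # sl,d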
def build_tree_structure(normal_data, op_lists, reduce_mats,
                         method='dy_tree_lstm.v1', hn=None, wd=0.,
                         is_train=None, keep_prob=1., swap_memory=False,
                         scope=None):
    """
    get the shift-reduce stacked mat from data and tree info
    :param normal_data: rank is 3 with shape [bs,sl,vec]
    :param op_lists: rank is 2 with shape [bs,ol]; 1 for shift, 2 for reduce and 3 for padding
    :param reduce_mats: rank is 3 with shape [bs,ol,mc], indicating the reduce indices in the stack matrix; -1 for padding
    :param method: generation method name, matched against each Gene* class's method_type (default 'dy_tree_lstm.v1')
    :param hn: hidden size used by some generation methods
    :param wd: weight decay
    :param is_train:
    :param keep_prob:
    :param swap_memory: allow tf.scan to swap tensors to host memory
    :param scope:
    :return: [bs,ol,hn]
    """
    # todo: add new generation methods
    method_class_list = [
        GeneBiLSTM, GeneBTTreeLSTM, GeneBTMerge, GeneDyTreeLSTMv0,
        GeneDyTreeLSTMv1
    ]
    with tf.variable_scope(scope or 'build_tree_structure', reuse=None):
        # transpose to time-major for tf.scan
        op_lists = tf.transpose(op_lists, [1, 0])  # [ol,bs]
        reduce_mats = tf.transpose(reduce_mats, [1, 0, 2])  # [ol,bs,mc]
        # length parameters
        bs, sl, d = tf.shape(normal_data)[0], tf.shape(normal_data)[1], tf.shape(normal_data)[2]
        ol = tf.shape(op_lists)[0]
        mc = tf.shape(reduce_mats)[2]

        gene = None
        for gene_class in method_class_list:
            if gene_class.method_type == method:
                gene = gene_class(hn, keep_prob, is_train, wd)
                break
        assert gene is not None, 'no shift reduce method %s' % method
        hn = gene.update_tree_hn()

        # elems for scan
        elems_tensors = [op_lists, reduce_mats]
        # non-sequence tensors
        batch_indices = tf.range(0, bs, dtype=tf.int32)  # bs
        batch_indices_mat = tf.tile(tf.expand_dims(batch_indices, 1), [1, mc])  # bs,mc
        data_extend = tf.concat(
            [normal_data, tf.zeros(shape=[bs, 1, d], dtype=tf.float32)],
            axis=1)  # the pointer will reach 'data_len + 1' at the end

        # scan variable init
        t_init = tf.constant(0, tf.int32)  # index into the stack mat
        data_pointer_init = tf.zeros([bs], tf.int32)  # next datum to shift onto the stack
        stack_mat_init = tf.zeros([ol, bs, hn], tf.float32)
        scan_init = (t_init, data_pointer_init, stack_mat_init)

        def main_scan_body(iter_vars, elems_vars):
            # iter vars: 1. t  2. data_pointer  3. stack_mat
            t = iter_vars[0]
            data_pointer = iter_vars[1]
            stack_mat = iter_vars[2]  # ol,bs,hn
            # elems: 1. op_list  2. reduce_mat
            op_list = elems_vars[0]  # bs
            reduce_mat = elems_vars[1]  # bs,mc

            # for shift
            shift_data_coordinates = tf.stack([batch_indices, data_pointer], axis=1)  # bs,2
            data_for_shift = tf.gather_nd(
                data_extend, shift_data_coordinates)  # coord: [bs,2], data: [bs,sl+1,d] -> bs,d
            # TODO: add processing for shifted data
            processed_shifted_data = gene.do_shift(data_for_shift)
            assert processed_shifted_data is not None
            # zero out rows whose op is not a shift
            masked_shifted_data = tf.where(
                tf.equal(op_list, tf.ones_like(op_list, tf.int32)),
                processed_shifted_data,
                tf.zeros_like(processed_shifted_data))  # bs,hn
            # advance the data pointer only for shift ops
            data_pointer = tf.where(
                tf.equal(op_list, tf.ones_like(op_list, tf.int32)),
                data_pointer + 1, data_pointer)

            # for reduce
            reduce_data_coordinates = tf.stack([reduce_mat, batch_indices_mat], axis=2)  # bs,mc,2
            data_for_reduce = tf.gather_nd(stack_mat, reduce_data_coordinates)  # bs,mc,hn
            mask_for_reduce = tf.not_equal(
                reduce_mat, tf.ones_like(reduce_mat) * -1)  # (reduce_mats[t] != -1)  # [bs,mc]
            # TODO: add processing for reduced data
            processed_reduced_data = gene.do_reduce(data_for_reduce, mask_for_reduce)
            masked_reduced_data = tf.where(
                tf.equal(op_list, tf.ones_like(op_list, tf.int32) * 2),
                processed_reduced_data,
                tf.zeros_like(processed_reduced_data))  # bs,hn

            # write the shift/reduce result into row t of the stack matrix
            sr_data = masked_shifted_data + masked_reduced_data  # bs,hn
            sr_data = tf.scatter_nd(indices=[[t]], updates=[sr_data],
                                    shape=[ol, bs, hn])
            stack_mat = stack_mat + sr_data
            return t + 1, data_pointer, stack_mat

        output = tf.scan(main_scan_body, elems_tensors, scan_init,
                         parallel_iterations=1, swap_memory=swap_memory)
        output_stack_mats = output[2]  # ol,ol,bs,hn
        output_stack_mat = tf.transpose(output_stack_mats[-1], [1, 0, 2])  # bs,ol,hn
        output_stack_mat = gene.fetch_output(output_stack_mat)
        if wd > 0:
            add_reg_without_bias()
        return output_stack_mat
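# Usage sketch (hypothetical shapes; the Gene* classes are defined elsewhere
# in this repo). op_lists encodes a post-order shift-reduce traversal:
# 1 = shift the next leaf, 2 = reduce the stack rows named in reduce_mats,
# 3 = padding. For the 3-leaf tree ((a b) c), one example row could be:
#   op_list    = [1, 1, 2, 1, 2]  # shift a, shift b, reduce(a,b), shift c, reduce
#   reduce_mat = [[-1, -1], [-1, -1], [0, 1], [-1, -1], [2, 3]]
#
#   tree_rep = build_tree_structure(token_states,  # [bs, sl, vec]
#                                   op_lists,      # [bs, ol]
#                                   reduce_mats,   # [bs, ol, mc]
#                                   method='dy_tree_lstm.v1', hn=100,
#                                   is_train=is_train, keep_prob=0.8)  # [bs, ol, hn]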