def forward(self):
    """Build the graph-LSTM tagging loss on top of tree-LSTM hiddens.

    Returns:
        (loss, metrics, metrics_num, vars_graph) where vars_graph is the
        list of variables the caller should optimize, or None when
        config['all_vars_trained'] is set (train everything).
    """
    config = self.config
    tree_lstm = self.tree_lstm
    # per-node hidden states produced by the underlying tree LSTM
    hiddens = tree_lstm.generate_hiddens()
    nodes_size = tree_lstm.nodes_size
    self.nodes_size = nodes_size
    output_layer = tree_lstm.output_layer
    max_l = tree_lstm.max_l
    # zero out padding positions beyond each instance's real node count
    mask = tf.sequence_mask(nodes_size, max_l, dtype=tf.float32)
    with tf.variable_scope('graph_lstm'):
        encoder = graph_encoder_utils.GraphEncoder(
            (hiddens, mask), nodes_size, self.is_train, self.config)
        self.encoder = encoder
        graph_hidden = encoder.graph_hiddens
    # flatten [batch, len, dim] -> [batch*len, dim] so one projection
    # serves every node position
    hidden_shape = tf.shape(graph_hidden)
    hidden_flt_shape = [hidden_shape[0] * hidden_shape[1], hidden_shape[-1]]
    logits_flt = tf.nn.xw_plus_b(
        tf.reshape(graph_hidden, hidden_flt_shape),
        output_layer._weights, output_layer._bias)
    # logits = tf.reshape(logits, [tf.shape(logits)[0]*tf.shape(logits)[1], tf.shape(logits)[-1]])
    labels_flt = tf.reshape(self.labels, [-1])
    mask_flt = tf.reshape(mask, [-1])
    # summed (not averaged) masked cross-entropy over the real nodes
    loss_grp = tf.nn.sparse_softmax_cross_entropy_with_logits(
        logits=logits_flt, labels=labels_flt)
    loss_grp = tf.reduce_sum(loss_grp * mask_flt)
    # L2 regularization over every trainable variable except biases
    rglz_items = [tf.nn.l2_loss(v) for v in tf.trainable_variables()
                  if 'bias' not in v.name and 'b_' not in v.name]
    loss_grp += tf.add_n(rglz_items) * 0.0005
    if config['all_vars_trained']:
        # None tells the caller to update every trainable variable
        vars_graph = None
    else:
        # restrict updates to the graph-LSTM scope plus the output projection
        vars_graph = tf.get_collection(tf.GraphKeys.GLOBAL_VARIABLES,
                                       scope="graph_lstm")
        vars_graph += [output_layer._weights, output_layer._bias]
    metrics, metrics_num = self.eval_function(
        logits_flt, labels_flt, mask_flt, nodes_size, self.labels)
    return loss_grp, metrics, metrics_num, vars_graph
def __init__(self, word_vocab, char_vocab, Edgelabel_vocab, options=None, mode='ce_train'):
    """Build the graph-to-sequence generation model for one mode.

    here 'mode', whose value can be:
    'ce_train',
    'rl_train',
    'evaluate',
    'evaluate_bleu',
    'decode'.
    it is different from 'mode_gen' in generator_utils.py
    value of 'mode_gen' can be ['ce_loss', 'rl_loss', 'greedy' or 'sample']
    """
    self.mode = mode

    # is_training controls whether to use dropout
    is_training = True if mode in ('ce_train', ) else False

    self.options = options
    self.word_vocab = word_vocab

    # encode the input instance
    # encoder.graph_hidden [batch, node_num, vsize]
    # encoder.graph_cell [batch, node_num, vsize]
    self.encoder = graph_encoder_utils.GraphEncoder(
        word_vocab=word_vocab,
        edge_label_vocab=Edgelabel_vocab,
        char_vocab=char_vocab,
        is_training=is_training,
        options=options)

    # ============== Choices of attention memory ================
    if options.attention_type == 'hidden':
        self.encoder_dim = options.neighbor_vector_dim
        self.encoder_states = self.encoder.graph_hiddens
    elif options.attention_type == 'hidden_cell':
        self.encoder_dim = options.neighbor_vector_dim * 2
        self.encoder_states = tf.concat(
            [self.encoder.graph_hiddens, self.encoder.graph_cells], 2)
    elif options.attention_type == 'hidden_embed':
        self.encoder_dim = options.neighbor_vector_dim + self.encoder.input_dim
        self.encoder_states = tf.concat(
            [self.encoder.graph_hiddens, self.encoder.node_representations], 2)
    else:
        assert False, '%s not supported yet' % options.attention_type

    # ============== Choices of initializing decoder state =============
    if options.way_init_decoder == 'zero':
        # start decoding from an all-zero LSTM state
        new_c = tf.zeros([self.encoder.batch_size, options.gen_hidden_size])
        new_h = tf.zeros([self.encoder.batch_size, options.gen_hidden_size])
    elif options.way_init_decoder == 'all':
        # sum the states of every graph node
        new_c = tf.reduce_sum(self.encoder.graph_cells, axis=1)
        new_h = tf.reduce_sum(self.encoder.graph_hiddens, axis=1)
    elif options.way_init_decoder == 'root':
        # use the state of node 0 (assumed to be the root — confirm upstream)
        new_c = self.encoder.graph_cells[:, 0, :]
        new_h = self.encoder.graph_hiddens[:, 0, :]
    else:
        assert False, 'way to initial decoder (%s) not supported' % options.way_init_decoder
    self.init_decoder_state = tf.contrib.rnn.LSTMStateTuple(new_c, new_h)

    # prepare AMR-side input for decoder
    self.nodes = self.encoder.passage_nodes
    self.nodes_num = self.encoder.passage_nodes_size
    if options.with_char:
        self.nodes_chars = self.encoder.passage_nodes_chars
        self.nodes_chars_num = self.encoder.passage_nodes_chars_size
    self.nodes_mask = self.encoder.passage_nodes_mask

    self.in_neigh_indices = self.encoder.passage_in_neighbor_indices
    self.in_neigh_edges = self.encoder.passage_in_neighbor_edges
    self.in_neigh_mask = self.encoder.passage_in_neighbor_mask

    self.out_neigh_indices = self.encoder.passage_out_neighbor_indices
    self.out_neigh_edges = self.encoder.passage_out_neighbor_edges
    self.out_neigh_mask = self.encoder.passage_out_neighbor_mask

    self.create_placeholders(options)

    # 1.0 for real decoding steps, 0.0 for padding
    loss_weights = tf.sequence_mask(
        self.answer_len, options.max_answer_len,
        dtype=tf.float32)  # [batch_size, gen_steps]

    with variable_scope.variable_scope("generator"):
        # create generator
        self.generator = generator_utils.CovCopyAttenGen(
            self, options, word_vocab)
        # calculate encoder_features
        self.encoder_features = self.generator.calculate_encoder_features(
            self.encoder_states, self.encoder_dim)

        if mode == 'decode':
            # single-step decoding: previous context/coverage/word are fed
            # in through placeholders by the beam-search driver
            self.context_t_1 = tf.placeholder(
                tf.float32, [None, self.encoder_dim],
                name='context_t_1')  # [batch_size, encoder_dim]
            self.coverage_t_1 = tf.placeholder(
                tf.float32, [None, None],
                name='coverage_t_1')  # [batch_size, encoder_dim]
            self.word_t = tf.placeholder(
                tf.int32, [None], name='word_t')  # [batch_size]

            (self.state_t, self.context_t, self.coverage_t, self.attn_dist_t,
             self.p_gen_t, self.ouput_t, self.topk_log_probs, self.topk_ids,
             self.greedy_prediction,
             self.multinomial_prediction) = self.generator.decode_mode(
                 word_vocab, options.beam_size, self.init_decoder_state,
                 self.context_t_1, self.coverage_t_1, self.word_t,
                 self.encoder_states, self.encoder_features, self.nodes,
                 self.nodes_mask)
            # not building training op for this mode
            return
        elif mode == 'evaluate_bleu':
            _, _, self.greedy_words = self.generator.train_mode(
                word_vocab, self.encoder_dim, self.encoder_states,
                self.encoder_features, self.nodes, self.nodes_mask,
                self.init_decoder_state, self.answer_inp, self.answer_ref,
                loss_weights, mode_gen='greedy')
            # not building training op for this mode
            return
        elif mode in ('ce_train', 'evaluate', ):
            self.accu, self.loss, _ = self.generator.train_mode(
                word_vocab, self.encoder_dim, self.encoder_states,
                self.encoder_features, self.nodes, self.nodes_mask,
                self.init_decoder_state, self.answer_inp, self.answer_ref,
                loss_weights, mode_gen='ce_loss')
            if mode == 'evaluate':
                return  # not building training op for evaluation
        elif mode == 'rl_train':
            _, self.loss, _ = self.generator.train_mode(
                word_vocab, self.encoder_dim, self.encoder_states,
                self.encoder_features, self.nodes, self.nodes_mask,
                self.init_decoder_state, self.answer_inp, self.answer_ref,
                loss_weights, mode_gen='rl_loss')
            # reuse the same generator variables for the sampled and greedy
            # rollouts needed by the RL reward
            tf.get_variable_scope().reuse_variables()
            _, _, self.sampled_words = self.generator.train_mode(
                word_vocab, self.encoder_dim, self.encoder_states,
                self.encoder_features, self.nodes, self.nodes_mask,
                self.init_decoder_state, self.answer_inp, self.answer_ref,
                None, mode_gen='sample')
            _, _, self.greedy_words = self.generator.train_mode(
                word_vocab, self.encoder_dim, self.encoder_states,
                self.encoder_features, self.nodes, self.nodes_mask,
                self.init_decoder_state, self.answer_inp, self.answer_ref,
                None, mode_gen='greedy')

    # ============== Training op (ce_train / rl_train only) =============
    if options.optimize_type == 'adadelta':
        clipper = 50
        optimizer = tf.train.AdadeltaOptimizer(
            learning_rate=options.learning_rate)
        tvars = tf.trainable_variables()
        if options.lambda_l2 > 0.0:
            # L2-regularize weight matrices only (ndims > 1 skips biases)
            l2_loss = tf.add_n([
                tf.nn.l2_loss(v) for v in tvars if v.get_shape().ndims > 1
            ])
            self.loss = self.loss + options.lambda_l2 * l2_loss
        grads, _ = tf.clip_by_global_norm(
            tf.gradients(self.loss, tvars), clipper)
        self.train_op = optimizer.apply_gradients(zip(grads, tvars))
    elif options.optimize_type == 'adam':
        clipper = 50
        optimizer = tf.train.AdamOptimizer(
            learning_rate=options.learning_rate)
        tvars = tf.trainable_variables()
        if options.lambda_l2 > 0.0:
            l2_loss = tf.add_n([
                tf.nn.l2_loss(v) for v in tvars if v.get_shape().ndims > 1
            ])
            self.loss = self.loss + options.lambda_l2 * l2_loss
        grads, _ = tf.clip_by_global_norm(
            tf.gradients(self.loss, tvars), clipper)
        self.train_op = optimizer.apply_gradients(zip(grads, tvars))

    extra_train_ops = []
    train_ops = [self.train_op] + extra_train_ops
    self.train_op = tf.group(*train_ops)
def __init__(self, word_vocab, char_vocab, Edgelabel_vocab, options=None, mode='train'):
    """Build an entity classifier on top of a single graph encoder.

    the value of 'mode' can be:
    'train',
    'evaluate'
    """
    self.mode = mode

    # is_training controls whether to use dropout
    is_training = True if mode in ('train', ) else False

    self.options = options
    self.word_vocab = word_vocab

    # encode the input instance
    # encoder.graph_hidden [batch, node_num, vsize]
    # encoder.graph_cell [batch, node_num, vsize]
    self.encoder = graph_encoder_utils.GraphEncoder(
        word_vocab=word_vocab,
        edge_label_vocab=Edgelabel_vocab,
        char_vocab=char_vocab,
        is_training=is_training,
        options=options)

    # ============== Choices of attention memory ================
    if options.attention_type == 'hidden':
        self.encoder_dim = options.neighbor_vector_dim
        self.encoder_states = self.encoder.graph_hiddens
    elif options.attention_type == 'hidden_cell':
        self.encoder_dim = options.neighbor_vector_dim * 2
        self.encoder_states = tf.concat(
            [self.encoder.graph_hiddens, self.encoder.graph_cells], 2)
    elif options.attention_type == 'hidden_embed':
        self.encoder_dim = options.neighbor_vector_dim + self.encoder.input_dim
        self.encoder_states = tf.concat(
            [self.encoder.graph_hiddens, self.encoder.node_representations], 2)
    else:
        assert False, '%s not supported yet' % options.attention_type

    # expose the encoder's placeholders so the feeder can reach them
    self.nodes = self.encoder.passage_nodes
    self.nodes_num = self.encoder.passage_nodes_size
    if options.with_char:
        self.nodes_chars = self.encoder.passage_nodes_chars
        self.nodes_chars_num = self.encoder.passage_nodes_chars_size
    self.nodes_mask = self.encoder.passage_nodes_mask

    self.in_neigh_indices = self.encoder.passage_in_neighbor_indices
    self.in_neigh_edges = self.encoder.passage_in_neighbor_edges
    self.in_neigh_mask = self.encoder.passage_in_neighbor_mask

    self.out_neigh_indices = self.encoder.passage_out_neighbor_indices
    self.out_neigh_edges = self.encoder.passage_out_neighbor_edges
    self.out_neigh_mask = self.encoder.passage_out_neighbor_mask

    ## generating prediction results
    # node indices of each entity mention: [batch, entity_num, entity_size]
    self.entity_indices = tf.placeholder(
        tf.int32, [None, None, None], name="entity_indices")
    self.entity_indices_mask = tf.placeholder(
        tf.float32, [None, None, None], name="entity_indices_mask")
    batch_size = tf.shape(self.encoder_states)[0]
    node_num = tf.shape(self.encoder_states)[1]
    dim = tf.shape(self.encoder_states)[2]
    entity_num = tf.shape(self.entity_indices)[1]
    entity_size = tf.shape(self.entity_indices)[2]
    # self.encoder_states [batch, node_num, encoder_dim]
    # entity_states [batch, 3, 5, dim]
    entity_states = collect_by_indices(self.encoder_states, self.entity_indices)
    # applying mask
    entity_states = entity_states * tf.expand_dims(
        self.entity_indices_mask, axis=-1)
    # average within each entity: [batch, 3, encoder_dim]
    entity_states = tf.reduce_mean(entity_states, axis=2)
    # flatten: [batch, 3*encoder_dim]
    entity_states = tf.reshape(entity_states, [batch_size, entity_num*dim])

    w_linear = tf.get_variable(
        "w_linear", [options.entity_num*self.encoder_dim, options.class_num],
        dtype=tf.float32)
    b_linear = tf.get_variable(
        "b_linear", [options.class_num], dtype=tf.float32)
    # [batch, class_num]
    prediction = tf.nn.softmax(tf.matmul(entity_states, w_linear) + b_linear)
    # clip away zeros so the manual log-loss below stays finite
    prediction = _clip_and_normalize(prediction, 1.0e-6)
    self.output = tf.argmax(prediction, axis=-1, output_type=tf.int32)

    ## calculating accuracy
    self.refs = tf.placeholder(tf.int32, [None, ])
    # count (not rate) of correct predictions; caller divides by total
    self.accu = tf.reduce_sum(
        tf.cast(tf.equal(self.output, self.refs), dtype=tf.float32))

    ## calculating loss
    # xent: [batch]
    xent = -tf.reduce_sum(
        tf.one_hot(self.refs, options.class_num)*tf.log(prediction), axis=-1)
    self.loss = tf.reduce_mean(xent)

    if mode != 'train':
        print('Return from here, just evaluate')
        return

    if options.optimize_type == 'adadelta':
        clipper = 50
        optimizer = tf.train.AdadeltaOptimizer(
            learning_rate=options.learning_rate)
        tvars = tf.trainable_variables()
        if options.lambda_l2 > 0.0:
            # L2-regularize weight matrices only (ndims > 1 skips biases)
            l2_loss = tf.add_n(
                [tf.nn.l2_loss(v) for v in tvars if v.get_shape().ndims > 1])
            self.loss = self.loss + options.lambda_l2 * l2_loss
        grads, _ = tf.clip_by_global_norm(
            tf.gradients(self.loss, tvars), clipper)
        self.train_op = optimizer.apply_gradients(zip(grads, tvars))
    elif options.optimize_type == 'adam':
        clipper = 50
        optimizer = tf.train.AdamOptimizer(
            learning_rate=options.learning_rate)
        tvars = tf.trainable_variables()
        if options.lambda_l2 > 0.0:
            l2_loss = tf.add_n(
                [tf.nn.l2_loss(v) for v in tvars if v.get_shape().ndims > 1])
            self.loss = self.loss + options.lambda_l2 * l2_loss
        grads, _ = tf.clip_by_global_norm(
            tf.gradients(self.loss, tvars), clipper)
        self.train_op = optimizer.apply_gradients(zip(grads, tvars))

    extra_train_ops = []
    train_ops = [self.train_op] + extra_train_ops
    self.train_op = tf.group(*train_ops)
def __init__(self, word_vocab, char_vocab, Edgelabel_vocab, options=None, mode='train'):
    """Build an entity classifier over forward and reverse graph encoders.

    the value of 'mode' can be:
    'train',
    'evaluate'
    """
    self.mode = mode

    # is_training controls whether to use dropout
    is_training = True if mode in ('train', ) else False

    self.options = options
    self.word_vocab = word_vocab

    # encode the input instance
    # encoder.graph_hidden [batch, node_num, vsize]
    # encoder.graph_cell [batch, node_num, vsize]
    with tf.variable_scope('encoder'):
        self.encoder = graph_encoder_utils.GraphEncoder(
            word_vocab=word_vocab,
            edge_label_vocab=Edgelabel_vocab,
            char_vocab=char_vocab,
            is_training=is_training,
            options=options)
    # second encoder over the reversed graph, with its own variables
    with tf.variable_scope('rev_encoder'):
        self.encoder_rev = graph_encoder_utils.GraphEncoder(
            word_vocab=word_vocab,
            edge_label_vocab=Edgelabel_vocab,
            char_vocab=char_vocab,
            is_training=is_training,
            options=options)
    # pool node hiddens into per-entity representations
    with tf.variable_scope('entity_repre'):
        self.entity = entity_utils.Entity(self.encoder.graph_hiddens)
        self.entity_rev = entity_utils.Entity(self.encoder_rev.graph_hiddens)

    batch_size = tf.shape(self.encoder.graph_hiddens)[0]
    node_num = tf.shape(self.encoder.graph_hiddens)[1]
    dim = tf.shape(self.encoder.graph_hiddens)[2]
    entity_num = tf.shape(self.entity.entity_indices)[1]
    entity_size = tf.shape(self.entity.entity_indices)[2]
    # forward + reverse hiddens are concatenated, hence the * 2
    self.encoder_dim = options.neighbor_vector_dim * 2

    # [batch, 3, encoder_dim]
    entity_states = tf.concat(
        [self.entity.entity_states, self.entity_rev.entity_states], 2)
    # [batch, 3*encoder_dim]
    entity_states = tf.reshape(entity_states, [batch_size, entity_num*dim*2])

    # placeholders
    self.nodes = self.encoder.passage_nodes
    self.nodes_num = self.encoder.passage_nodes_size
    if options.with_char:
        self.nodes_chars = self.encoder.passage_nodes_chars
        self.nodes_chars_num = self.encoder.passage_nodes_chars_size
    self.nodes_mask = self.encoder.passage_nodes_mask
    self.in_neigh_indices = self.encoder.passage_in_neighbor_indices
    self.in_neigh_hidden_indices = self.encoder.passage_in_neighbor_hidden_indices
    self.in_neigh_edges = self.encoder.passage_in_neighbor_edges
    self.in_neigh_mask = self.encoder.passage_in_neighbor_mask

    # rev placeholders
    self.rev_nodes = self.encoder_rev.passage_nodes
    self.rev_nodes_num = self.encoder_rev.passage_nodes_size
    if options.with_char:
        self.rev_nodes_chars = self.encoder_rev.passage_nodes_chars
        self.rev_nodes_chars_num = self.encoder_rev.passage_nodes_chars_size
    self.rev_nodes_mask = self.encoder_rev.passage_nodes_mask
    self.rev_in_neigh_indices = self.encoder_rev.passage_in_neighbor_indices
    self.rev_in_neigh_hidden_indices = self.encoder_rev.passage_in_neighbor_hidden_indices
    self.rev_in_neigh_edges = self.encoder_rev.passage_in_neighbor_edges
    self.rev_in_neigh_mask = self.encoder_rev.passage_in_neighbor_mask

    w_linear = tf.get_variable(
        "w_linear", [options.entity_num*self.encoder_dim, options.class_num],
        dtype=tf.float32)
    b_linear = tf.get_variable(
        "b_linear", [options.class_num], dtype=tf.float32)
    # [batch, class_num]
    logits = tf.matmul(entity_states, w_linear) + b_linear
    self.output = tf.argmax(logits, axis=-1, output_type=tf.int32)

    ## calculating accuracy
    self.answers = tf.placeholder(tf.int32, [None, ])
    # count (not rate) of correct predictions; caller divides by total
    self.accu = tf.reduce_sum(
        tf.cast(
            tf.equal(tf.argmax(logits, axis=-1, output_type=tf.int32),
                     self.answers),
            dtype=tf.float32))

    ## calculating loss
    self.loss = tf.reduce_mean(tf.nn.softmax_cross_entropy_with_logits(
        logits=logits, labels=tf.one_hot(self.answers, options.class_num)))

    if mode != 'train':
        print('Return from here, just evaluate')
        return

    if options.optimize_type == 'adadelta':
        clipper = 50
        optimizer = tf.train.AdadeltaOptimizer(
            learning_rate=options.learning_rate)
        tvars = tf.trainable_variables()
        if options.lambda_l2 > 0.0:
            # L2-regularize weight matrices only (ndims > 1 skips biases)
            l2_loss = tf.add_n(
                [tf.nn.l2_loss(v) for v in tvars if v.get_shape().ndims > 1])
            self.loss = self.loss + options.lambda_l2 * l2_loss
        grads, _ = tf.clip_by_global_norm(
            tf.gradients(self.loss, tvars), clipper)
        self.train_op = optimizer.apply_gradients(zip(grads, tvars))
    elif options.optimize_type == 'adam':
        clipper = 50
        optimizer = tf.train.AdamOptimizer(
            learning_rate=options.learning_rate)
        tvars = tf.trainable_variables()
        if options.lambda_l2 > 0.0:
            l2_loss = tf.add_n(
                [tf.nn.l2_loss(v) for v in tvars if v.get_shape().ndims > 1])
            self.loss = self.loss + options.lambda_l2 * l2_loss
        grads, _ = tf.clip_by_global_norm(
            tf.gradients(self.loss, tvars), clipper)
        self.train_op = optimizer.apply_gradients(zip(grads, tvars))

    extra_train_ops = []
    train_ops = [self.train_op] + extra_train_ops
    self.train_op = tf.group(*train_ops)
def __init__(self, word_vocab, char_vocab=None, POS_vocab=None, NER_vocab=None, options=None, \
             has_ref=True, is_training=True):
    """Build the passage/question QA model with optional GRN/GCN entity encoding.

    Args:
        word_vocab / char_vocab / POS_vocab / NER_vocab: vocabulary objects
            passed through to the sequence encoders.
        options: config namespace (with_grn, with_gcn, merge_dim, grn_dim,
            num_grn_step, optimize_type, learning_rate, lambda_l2, ...).
        has_ref: has_ref distinguish 'dev' evaluation from 'final test'
            evaluation; when False the reference/loss part is skipped.
        is_training: controls whether to use dropout and update parameters.
    """
    self.is_training = is_training
    self.has_ref = has_ref
    self.options = options
    self.word_vocab = word_vocab

    # separately encode passage and question; question reuses embeddings
    self.passage_encoder = encoder_utils.SeqEncoder(
        options, word_vocab, POS_vocab=POS_vocab, NER_vocab=NER_vocab)
    self.question_encoder = encoder_utils.SeqEncoder(
        options, word_vocab, POS_vocab=POS_vocab, NER_vocab=NER_vocab,
        embed_reuse=True)
    with tf.variable_scope('passage'):
        passage_dim, passage_repre, passage_mask = self.passage_encoder.encode(
            is_training=is_training)
    with tf.variable_scope('question'):
        question_dim, question_repre, question_mask = self.question_encoder.encode(
            is_training=is_training)

    # modeling entities: each entity is a [start, end] span over the passage
    self.entity_starts = tf.placeholder(tf.int32, [None, None], 'entity_starts')
    self.entity_ends = tf.placeholder(tf.int32, [None, None], 'entity_ends')
    self.entity_lengths = tf.placeholder(tf.int32, [None], 'entity_lengths')
    batch_size = tf.shape(self.entity_starts)[0]
    entity_len_max = tf.shape(self.entity_starts)[1]
    entity_mask = tf.sequence_mask(
        self.entity_lengths, entity_len_max, dtype=tf.float32)  # [batch, entity]
    # an entity is represented by its boundary token representations
    entity_st_rep = operation_utils.collect_node(
        passage_repre, self.entity_starts)  # [batch, entity, rep_dim]
    entity_ed_rep = operation_utils.collect_node(
        passage_repre, self.entity_ends)  # [batch, entity, rep_dim]
    entity_rep = tf.concat([entity_st_rep, entity_ed_rep],
                           axis=2)  # [batch, entity, rep_dim * 2]
    entity_dim = passage_dim * 2

    # whole-question representation: first step + final step
    qfull_st_rep = question_repre[:, 0, :]  # [batch, rep_dim]
    qfull_ed_rep = operation_utils.collect_final_step(
        question_repre,
        self.question_encoder.sequence_lengths - 1)  # [batch, rep_dim]
    qfull_rep = tf.concat([qfull_st_rep, qfull_ed_rep],
                          axis=1)  # [batch, rep_dim * 2]
    qfull_dim = question_dim * 2

    matching_results = []
    rst_seq = self.perform_matching(entity_rep, entity_dim, entity_mask,
                                    question_repre, qfull_rep, question_dim,
                                    question_mask, scope_name='seq_match',
                                    options=options, is_training=is_training)
    matching_results.append(rst_seq)

    # encode entity representation with GRN
    if options.with_grn or options.with_gcn:
        # merge question representation into passage
        q4p_rep = tf.tile(
            tf.expand_dims(qfull_rep, 1),  # [batch, 1, rep_dim * 2]
            [1, entity_len_max, 1])  # [batch, entity, rep_dim * 2]
        entity_rep = tf.concat([entity_rep, q4p_rep], axis=2)
        entity_dim = entity_dim + qfull_dim
        # compress before going to GRN
        merge_w = tf.get_variable('merge_w', [entity_dim, options.merge_dim])
        merge_b = tf.get_variable('merge_b', [options.merge_dim])
        entity_rep = tf.reshape(entity_rep, [-1, entity_dim])
        entity_rep = tf.matmul(entity_rep, merge_w) + merge_b
        entity_rep = tf.reshape(
            entity_rep, [batch_size, entity_len_max, options.merge_dim])
        entity_rep = entity_rep * tf.expand_dims(entity_mask, axis=-1)
        entity_dim = options.merge_dim
        # main part: encoding
        scope_name = 'GRN' if options.with_grn else 'GCN'
        with tf.variable_scope(scope_name):
            self.edges = tf.placeholder(tf.int32, [None, None, None], 'edges')
            self.edges_mask = tf.placeholder(tf.float32, [None, None, None],
                                             'edges_mask')
            if options.with_grn:
                print("With Graph recurrent network as the graph encoder")
                self.graph_encoder = graph_encoder_utils.GraphEncoder(
                    entity_rep, entity_mask, entity_dim, self.edges,
                    self.edges_mask, is_training=is_training, options=options)
            else:
                print("With GCN as the graph encoder")
                self.graph_encoder = gcn_encoder_utils.GCNEncoder(
                    entity_rep, entity_mask, entity_dim, self.edges,
                    self.edges_mask, is_training=is_training, options=options)
            # match the question against the entity states of every GRN step
            for i in range(options.num_grn_step):
                if options.grn_rep_type == 'hidden':
                    entity_grn_rep = self.graph_encoder.grn_historys[
                        i]  # [batch, entity, grn_dim]
                    entity_grn_dim = options.grn_dim
                elif options.grn_rep_type == 'hidden_embed':
                    entity_grn_rep = tf.concat(
                        [self.graph_encoder.grn_historys[i], entity_rep],
                        2)  # [batch, entity, grn_dim + merge_dim]
                    entity_grn_dim = options.grn_dim + entity_dim
                else:
                    assert False, '%s not supported yet' % options.grn_rep_type
                if options.with_multi_perspective:
                    assert entity_grn_dim == question_dim
                rst_grn = self.perform_matching(
                    entity_grn_rep, entity_grn_dim, entity_mask,
                    question_repre, qfull_rep, question_dim, question_mask,
                    scope_name='grn%d_match' % i, options=options,
                    is_training=is_training)
                matching_results.append(rst_grn)

    self.candidates = tf.placeholder(
        tf.int32, [None, None, None], 'candidates')  # [batch, c_num, c_occur]
    self.candidates_len = tf.placeholder(tf.float32, [None],
                                         'candidates_len')  # [batch]
    self.candidates_occur_mask = tf.placeholder(
        tf.float32, [None, None, None],
        'candidates_occur_mask')  # [batch, c_num, c_occur]

    # matching_results: list of [batch, cands]
    self.attn_dist = self.perform_integration(matching_results,
                                              scope_name='integration',
                                              options=options,
                                              is_training=is_training)
    cand_num = tf.shape(self.candidates)[1]
    self.topk_probs, self.topk_ids = tf.nn.top_k(self.attn_dist, k=cand_num,
                                                 name='topK')
    self.out = tf.argmax(self.attn_dist, axis=-1, output_type=tf.int32)

    if not has_ref:
        # final-test decoding: no reference, no loss, no training op
        return

    self.ref = tf.placeholder(tf.int32, [None], 'ref')
    # count (not rate) of correct predictions; caller divides by total
    self.accu = tf.reduce_sum(
        tf.cast(tf.equal(self.out, self.ref), dtype=tf.float32))
    xent = -tf.reduce_sum(
        tf.one_hot(self.ref, cand_num) * tf.log(self.attn_dist), axis=-1)
    self.loss = tf.reduce_mean(xent)

    if not is_training:
        return

    with tf.variable_scope("training_op"), tf.device("/gpu:1"):
        if options.optimize_type == 'adadelta':
            optimizer = tf.train.AdadeltaOptimizer(
                learning_rate=options.learning_rate)
        elif options.optimize_type == 'adam':
            optimizer = tf.train.AdamOptimizer(
                learning_rate=options.learning_rate)
        # NOTE: the original used Python-2-only dict.has_key(); .get() with a
        # default is the equivalent, Python-3-compatible spelling.
        clipper = options.__dict__.get("max_grad_norm", 50)
        print("Max gradient norm {}".format(clipper))
        tvars = tf.trainable_variables()
        if options.lambda_l2 > 0.0:
            # L2-regularize weight matrices only (ndims > 1 skips biases)
            l2_loss = tf.add_n([
                tf.nn.l2_loss(v) for v in tvars if v.get_shape().ndims > 1
            ])
            self.loss = self.loss + options.lambda_l2 * l2_loss
        grads, _ = tf.clip_by_global_norm(
            tf.gradients(self.loss, tvars), clipper)
        self.train_op = optimizer.apply_gradients(zip(grads, tvars))
        extra_train_ops = []
        train_ops = [self.train_op] + extra_train_ops
        self.train_op = tf.group(*train_ops)
def __init__(self, word_vocab, char_vocab, pos_vocab, edgelabel_vocab, options, mode='train'):
    """Build a relation classifier over a sequence encoder + graph encoder.

    the value of 'mode' can be:
    'train',
    'evaluate'
    """
    self.mode = mode

    # is_training controls whether to use dropout
    is_training = True if mode in ('train', ) else False

    self.options = options
    self.word_vocab = word_vocab
    self.char_vocab = char_vocab
    self.pos_vocab = pos_vocab

    # sequential encoder that can take 0 LSTM layers
    self.encoder = encoder_utils.SeqEncoder(options, word_vocab, char_vocab,
                                            pos_vocab)
    word_repres, word_dim, sentence_repres, sentence_dim, seq_mask = \
        self.encoder.encode(is_training=is_training)

    # encode the input instance
    # encoder.graph_hidden [batch, node_num, vsize]
    # encoder.graph_cell [batch, node_num, vsize]
    self.graph_encoder = graph_encoder_utils.GraphEncoder(
        options, word_repres, word_dim, sentence_repres, sentence_dim,
        seq_mask, edgelabel_vocab, is_training=is_training)

    # collect placeholders
    self.sentence_words = self.encoder.sentence_words
    self.sentence_lengths = self.encoder.sentence_lengths
    if options.with_char:
        self.sentence_chars = self.encoder.sentence_chars
        self.sentence_chars_lengths = self.encoder.sentence_chars_lengths
    if options.with_POS:
        self.sentence_POSs = self.encoder.sentence_POSs
    self.in_neigh_indices = self.graph_encoder.in_neighbor_indices
    self.in_neigh_edges = self.graph_encoder.in_neighbor_edges
    self.in_neigh_mask = self.graph_encoder.in_neighbor_mask
    self.out_neigh_indices = self.graph_encoder.out_neighbor_indices
    self.out_neigh_edges = self.graph_encoder.out_neighbor_edges
    self.out_neigh_mask = self.graph_encoder.out_neighbor_mask
    if options.forest_prob_aware and options.forest_type != '1best':
        # edge probabilities are only meaningful for forest (n-best) inputs
        self.in_neigh_prob = self.graph_encoder.in_neighbor_prob
        self.out_neigh_prob = self.graph_encoder.out_neighbor_prob
    # node indices of the two entity mentions: [batch, 2, indices]
    self.entity_indices = tf.placeholder(tf.int32, [None, None, None],
                                         name="entity_indices")
    self.entity_indices_mask = tf.placeholder(tf.float32, [None, None, None],
                                              name="entity_indices_mask")

    # collect inputs for final classifier
    final_repres = self.graph_encoder.graph_hiddens
    final_shape = tf.shape(final_repres)
    batch_size = final_shape[0]
    sentence_size_max = final_shape[1]
    # [batch, 2, indices, sentence_dim]
    entity_repres = collect_by_indices(final_repres, self.entity_indices)
    entity_repres = entity_repres * tf.expand_dims(
        self.entity_indices_mask, axis=-1)
    # [batch, 2, sentence_dim]
    entity_repres = tf.reduce_mean(entity_repres, axis=2)
    # [batch, 2*sentence_dim]
    h_final = tf.reshape(entity_repres, [batch_size, 2 * sentence_dim])

    ### regarding Zhang et al., EMNLP 2018
    #h_sent = tf.reduce_max(final_repres, axis=1)
    #hsent_loss = None
    #if options.lambda_l2_hsent > 0.0:
    #    hsent_loss = tf.reduce_mean(
    #        tf.reduce_sum(h_sent * h_sent, axis=-1), axis=-1)
    #h_s = tf.reduce_max(
    #    range_repres(final_repres, sentence_size_max, self.sbj_starts, self.sbj_ends),
    #    axis=1)
    #h_o = tf.reduce_max(
    #    range_repres(final_repres, sentence_size_max, self.obj_starts, self.obj_ends),
    #    axis=1)
    #h_final = tf.concat([h_sent, h_s, h_o], axis=1) # [batch, sentence_dim*3]
    #h_final = tf.layers.dense(h_final, options.ffnn_size, name="ffnn_1", activation=tf.nn.relu) # [batch, ffnn_size]
    #h_final = tf.layers.dense(h_final, options.ffnn_size, name="ffnn_2", activation=tf.nn.relu) # [batch, ffnn_size]

    ## [batch, class_num]
    # clip away zeros so the manual log-loss below stays finite
    self.distribution = _clip_and_normalize(
        tf.layers.dense(h_final, options.num_relations, name="ffnn_out",
                        activation=tf.nn.softmax), 1.0e-6)
    self.rsts = tf.argmax(self.distribution, axis=-1, output_type=tf.int32)

    ## calculating accuracy
    self.refs = tf.placeholder(tf.int32, [None, ])
    # count (not rate) of correct predictions; caller divides by total
    self.accu = tf.reduce_sum(
        tf.cast(tf.equal(self.rsts, self.refs), dtype=tf.float32))

    ## calculating loss
    # xent: [batch]
    xent = -tf.reduce_sum(tf.one_hot(self.refs, options.num_relations) *
                          tf.log(self.distribution), axis=-1)
    self.loss = tf.reduce_mean(xent)

    if mode != 'train':
        print('Return from here, just evaluate')
        return

    #if options.lambda_l2_hsent > 0.0:
    #    self.loss += hsent_loss * options.lambda_l2_hsent

    clipper = 5
    tvars = tf.trainable_variables()
    if options.lambda_l2 > 0.0:
        # L2-regularize weight matrices only (ndims > 1 skips biases)
        l2_loss = tf.add_n(
            [tf.nn.l2_loss(v) for v in tvars if v.get_shape().ndims > 1])
        self.loss += options.lambda_l2 * l2_loss

    # optional learning-rate decay schedules, stepped by global_step
    if hasattr(options, "decay") and options.decay != "none":
        global_step = tf.Variable(0, trainable=False)
        if options.decay == 'piece':
            # 10 plateaus, each 0.9x the previous rate
            values, bounds = [options.learning_rate, ], []
            for i in range(10):
                values.append(values[-1] * 0.9)
                bounds.append(options.trn_bch_num * 10 * i)
            learning_rate = tf.train.piecewise_constant(
                global_step, bounds, values)
        elif options.decay == 'poly':
            decay_steps = options.trn_bch_num * options.max_epochs
            learning_rate = tf.train.polynomial_decay(
                options.learning_rate, global_step, decay_steps,
                end_learning_rate=0.1 * options.learning_rate, power=0.5)
        elif options.decay == 'cos':
            decay_steps = options.trn_bch_num * options.max_epochs
            learning_rate = tf.train.cosine_decay(
                options.learning_rate, global_step, decay_steps, alpha=0.1)
        else:
            assert False, 'not supported'
    else:
        global_step = None
        learning_rate = options.learning_rate

    if options.optimize_type == 'adadelta':
        optimizer = tf.train.AdadeltaOptimizer(learning_rate=learning_rate)
    elif options.optimize_type == 'adam':
        optimizer = tf.train.AdamOptimizer(learning_rate=learning_rate)
    else:
        assert False, 'not supported optimize type'
    grads, _ = tf.clip_by_global_norm(tf.gradients(self.loss, tvars), clipper)
    train_op = optimizer.apply_gradients(zip(grads, tvars),
                                         global_step=global_step)

    extra_train_ops = []
    train_ops = [train_op] + extra_train_ops
    self.train_op = tf.group(*train_ops)