def loss_layer(self, project_logits, lengths, name=None):
    """Build the CRF negative log-likelihood loss over the projected scores.

    One synthetic "start" step is prepended to every sequence and one extra
    tag (index ``self.num_tags``) is appended to the tag axis, so the
    transition matrix has shape ``[num_tags + 1, num_tags + 1]``.

    :param project_logits: per-step tag scores. It is concatenated on the last
        axis with a ``[batch_size, num_steps, 1]`` tensor, so it is expected
        to be ``[batch_size, num_steps, num_tags]``.
    :param lengths: sequence lengths; ``lengths + 1`` is handed to the CRF to
        account for the prepended step.
    :param name: optional variable-scope name; ``"crf_loss"`` is used when
        it is falsy.
    :return: scalar tensor holding the mean of ``-log_likelihood``.
    """
    scope_name = name if name else "crf_loss"
    with tf.variable_scope(scope_name):
        very_negative = -1000.0

        # Scores of the synthetic first step: every real tag gets the large
        # negative score, the extra tag gets 0.
        start_real_tags = very_negative * tf.ones(
            shape=[self.batch_size, 1, self.num_tags])
        start_extra_tag = tf.zeros(shape=[self.batch_size, 1, 1])
        start_logits = tf.concat([start_real_tags, start_extra_tag], axis=-1)

        # On the real steps the extra tag gets the large negative score.
        extra_tag_column = tf.cast(
            very_negative * tf.ones([self.batch_size, self.num_steps, 1]),
            tf.float32)
        widened_logits = tf.concat([project_logits, extra_tag_column], axis=-1)
        full_logits = tf.concat([start_logits, widened_logits], axis=1)

        # Gold label of the synthetic first step is the extra tag.
        start_targets = tf.cast(
            self.num_tags * tf.ones([self.batch_size, 1]), tf.int32)
        full_targets = tf.concat([start_targets, self.targets], axis=-1)

        self.trans = tf.get_variable(
            "transitions",
            shape=[self.num_tags + 1, self.num_tags + 1],
            initializer=self.initializer)
        log_likelihood, self.trans = crf_log_likelihood(
            inputs=full_logits,
            tag_indices=full_targets,
            transition_params=self.trans,
            sequence_lengths=lengths + 1)
        return tf.reduce_mean(-log_likelihood)
def __init__(self, config, embeddings, ntags, nchars):
    """Build the char-BiLSTM -> word-BiLSTM -> highway -> CRF tagging graph.

    :param config: project config object; this method reads ``logger``,
        ``train_embeddings``, ``dim_char``, ``char_hidden_size`` and
        ``hidden_size`` from it.
    :param embeddings: initial value of the word-embedding variable.
    :param ntags: number of output tags (width of the projection).
    :param nchars: number of rows of the character-embedding table.
    """
    self.config = config
    self.embeddings = embeddings
    self.nchars = nchars
    self.ntags = ntags
    self.logger = config.logger

    # ---- placeholders ----
    self.word_ids = tf.placeholder(
        tf.int32, shape=[None, None], name="word_ids")
    self.sequence_lengths = tf.placeholder(
        tf.int32, shape=[None], name="sequence_lengths")
    self.char_ids = tf.placeholder(
        tf.int32, shape=[None, None, None], name="char_ids")
    self.word_lengths = tf.placeholder(
        tf.int32, shape=[None, None], name="word_lengths")
    self.labels = tf.placeholder(
        tf.int32, shape=[None, None], name="labels")
    self.dropout = tf.placeholder(dtype=tf.float32, shape=[], name="dropout")
    self.lr = tf.placeholder(dtype=tf.float32, shape=[], name="lr")

    # NOTE(review): the original indentation was lost; the extent of each
    # scope below was reconstructed from the statement order - confirm.
    with tf.variable_scope("words"):
        word_table = tf.Variable(
            self.embeddings,
            name="_word_embeddings",
            dtype=tf.float32,
            trainable=self.config.train_embeddings)
        word_vectors = tf.nn.embedding_lookup(
            word_table, self.word_ids, name="word_embeddings")
        print(word_vectors)

        with tf.variable_scope("chars"):
            char_table = tf.get_variable(
                name="_char_embeddings",
                dtype=tf.float32,
                shape=[self.nchars, self.config.dim_char])
            char_vectors = tf.nn.embedding_lookup(
                char_table, self.char_ids, name="char_embeddings")

            # Fold the leading axes together so that every word becomes one
            # row of the character-level RNN batch.
            char_shape = tf.shape(char_vectors)
            char_vectors = tf.reshape(
                char_vectors,
                shape=[-1, char_shape[-2], self.config.dim_char])
            flat_word_lengths = tf.reshape(self.word_lengths, shape=[-1])

            char_cell_fw = tf.contrib.rnn.LSTMCell(
                self.config.char_hidden_size, state_is_tuple=True)
            char_cell_bw = tf.contrib.rnn.LSTMCell(
                self.config.char_hidden_size, state_is_tuple=True)
            _, (final_fw, final_bw) = tf.nn.bidirectional_dynamic_rnn(
                char_cell_fw,
                char_cell_bw,
                inputs=char_vectors,
                sequence_length=flat_word_lengths,
                dtype=tf.float32)
            # Only the second element of each final state pair is used.
            char_summary = tf.concat([final_fw[1], final_bw[1]], axis=-1)
            char_summary = tf.reshape(
                char_summary,
                shape=[-1, char_shape[1], 2 * self.config.char_hidden_size])

    # Only the character-level summary is fed onwards; the word-level lookup
    # above is built (and printed) but is not concatenated in.
    self.word_embeddings = tf.nn.dropout(char_summary, self.dropout)

    with tf.variable_scope("bi-lstm"):
        word_cell_fw = tf.contrib.rnn.LSTMCell(self.config.hidden_size)
        word_cell_bw = tf.contrib.rnn.LSTMCell(self.config.hidden_size)
        (context_fw, context_bw), _ = tf.nn.bidirectional_dynamic_rnn(
            word_cell_fw,
            word_cell_bw,
            self.word_embeddings,
            sequence_length=self.sequence_lengths,
            dtype=tf.float32)
        context = tf.concat([context_fw, context_bw], axis=-1)
        context = tf.nn.dropout(context, self.dropout)

    with tf.variable_scope("proj"):
        proj_w = tf.get_variable(
            "W",
            shape=[2 * self.config.hidden_size, self.ntags],
            dtype=tf.float32)
        proj_b = tf.get_variable(
            "b",
            shape=[self.ntags],
            dtype=tf.float32,
            initializer=tf.zeros_initializer())

        # Remember the dynamic time dimension before flattening for matmul.
        ntime_steps = tf.shape(context)[1]
        flat_context = tf.reshape(context, [-1, 2 * self.config.hidden_size])

        # Highway layer, then a linear map to one score per tag.
        flat_context = self.highway(
            flat_context, 2 * self.config.hidden_size, tf.nn.relu)
        flat_scores = tf.matmul(flat_context, proj_w) + proj_b
        self.logits = tf.reshape(flat_scores, [-1, ntime_steps, self.ntags])

    log_likelihood, self.transition_params = crf_log_likelihood(
        self.logits, self.labels, self.sequence_lengths)
    self.loss = tf.reduce_mean(-log_likelihood)
    tf.summary.scalar("loss", self.loss)

    with tf.variable_scope("train_step"):
        optimizer = tf.train.AdamOptimizer(self.lr)
        self.train_op = optimizer.minimize(self.loss)

    self.init = tf.global_variables_initializer()
def __init__(self, config):
    """Load vocabularies, build the tagging graph and open a session.

    :param config: mapping of settings. Keys read here: "checkpoint_path",
        "summary_path", "vocab", "max_to_keep", "word_embedding",
        "char_emb_dim", "filter_sizes", "channel_sizes" and "num_units".
    """
    self.cfg = config

    # Make sure the output folders exist.
    for folder_key in ("checkpoint_path", "summary_path"):
        if not os.path.exists(self.cfg[folder_key]):
            os.makedirs(self.cfg[folder_key])

    # The log file lives next to the checkpoints.
    log_file = os.path.join(self.cfg["checkpoint_path"], "log.txt")
    self.logger = get_logger(log_file)

    # Load the vocabularies, then drop the container they came in.
    dict_data = load_data(self.cfg["vocab"])
    self.word_dict = dict_data["word_dict"]
    self.char_dict = dict_data["char_dict"]
    self.label_dict = dict_data["label_dict"]
    del dict_data
    self.word_vocab_size = len(self.word_dict)
    self.char_vocab_size = len(self.char_dict)
    self.label_vocab_size = len(self.label_dict)

    self.max_to_keep = self.cfg["max_to_keep"]
    self.checkpoint_path = self.cfg["checkpoint_path"]
    self.summary_path = self.cfg["summary_path"]
    self.word_embedding = self.cfg["word_embedding"]
    self.sess = None
    self.saver = None

    # ---- input placeholders ----
    # words / labels: shape = (batch_size, max_time)
    self.words = tf.placeholder(tf.int32, shape=[None, None], name="words")
    self.labels = tf.placeholder(tf.int32, shape=[None, None], name="label")
    self.seq_len = tf.placeholder(tf.int32, shape=[None], name="seq_len")
    # chars: shape = (batch_size, max_time, max_word_length)
    self.chars = tf.placeholder(
        tf.int32, shape=[None, None, None], name="chars")
    self.char_seq_len = tf.placeholder(
        tf.int32, shape=[None, None], name="char_seq_len")

    # ---- hyper-parameter placeholders ----
    self.is_train = tf.placeholder(tf.bool, shape=[], name="is_train")
    self.batch_size = tf.placeholder(tf.int32, name="batch_size")
    self.keep_prob = tf.placeholder(tf.float32, name="keep_probability")
    self.drop_rate = tf.placeholder(tf.float32, name="dropout_rate")
    self.lr = tf.placeholder(tf.float32, name="learning_rate")

    # ---- embedding layer ----
    # NOTE(review): the original indentation was lost; everything up to the
    # concatenated word representation is assumed to sit inside the
    # "embeddings" variable scope - confirm against saved checkpoints.
    with tf.variable_scope("embeddings"):
        pretrained = np.load(self.cfg["word_embedding"])["embeddings"]
        self.word_embeddings = tf.Variable(
            pretrained, name="embedding", dtype=tf.float32, trainable=False)
        word_vectors = tf.nn.embedding_lookup(
            self.word_embeddings, self.words, name="word_emb")
        print("Word embedding shape: {}".format(
            word_vectors.get_shape().as_list()))

        self.char_embeddings = tf.get_variable(
            name="char_embedding",
            dtype=tf.float32,
            trainable=True,
            shape=[self.char_vocab_size, self.cfg["char_emb_dim"]])
        char_vectors = tf.nn.embedding_lookup(
            self.char_embeddings, self.chars, name="chars_emb")
        char_features = multi_conv1d(
            char_vectors,
            self.cfg["filter_sizes"],
            self.cfg["channel_sizes"],
            drop_rate=self.drop_rate,
            is_train=self.is_train)
        print("Chars representation shape: {}".format(
            char_features.get_shape().as_list()))

        # Word vectors and character features side by side on the last axis.
        word_vectors = tf.concat([word_vectors, char_features], axis=-1)
        self.word_emb = tf.layers.dropout(
            word_vectors, rate=self.drop_rate, training=self.is_train)
        print("Word and chars concatenation shape: {}".format(
            self.word_emb.get_shape().as_list()))

    # ---- model ops ----
    with tf.name_scope("BiLSTM"):
        with tf.variable_scope('forward'):
            lstm_fw_cell = tf.keras.layers.LSTMCell(self.cfg["num_units"])
        with tf.variable_scope('backward'):
            lstm_bw_cell = tf.keras.layers.LSTMCell(self.cfg["num_units"])
        rnn_outs, *_ = bidirectional_dynamic_rnn(
            lstm_fw_cell,
            lstm_bw_cell,
            self.word_emb,
            sequence_length=self.seq_len,
            dtype=tf.float32)

        # The two directions come back separately; join them per step.
        outputs = tf.concat(rnn_outs, axis=-1)
        print("Output shape: {}".format(outputs.get_shape().as_list()))

        self.logits = tf.layers.dense(
            outputs, units=self.label_vocab_size, use_bias=True)
        print("Logits shape: {}".format(self.logits.get_shape().as_list()))

    # ---- loss and optimizer ----
    crf_loss, self.trans_params = crf_log_likelihood(
        self.logits, self.labels, self.seq_len)
    self.loss = tf.reduce_mean(-crf_loss)
    tf.summary.scalar("loss", self.loss)

    optimizer = tf.train.AdamOptimizer(learning_rate=self.lr)
    self.train_op = optimizer.minimize(self.loss)

    param_counts = [
        np.prod(var.get_shape().as_list())
        for var in tf.trainable_variables()
    ]
    print('Params number: {}'.format(np.sum(param_counts)))

    # ---- session ----
    sess_config = tf.ConfigProto()
    sess_config.gpu_options.allow_growth = True
    self.sess = tf.Session(config=sess_config)
    self.saver = tf.train.Saver(max_to_keep=self.max_to_keep)
    self.sess.run(tf.global_variables_initializer())
def __init__(self, vocab_size, word_dim, hidden_dim,
             pad_word, init_embedding=None,
             num_classes=4, clip=5,
             lr=0.001, l2_reg_lamda=0.0, num_layers=1,
             rnn_cell='lstm', bi_direction=False,
             hidden_dim2=128, hyper_embedding_size=16):
    """Build a bidirectional HyperLSTM + CRF tagger over windowed inputs.

    :param vocab_size: rows of the embedding table (used only when
        ``init_embedding`` is None).
    :param word_dim: embedding width.
    :param hidden_dim: units of the main HyperLSTM cell.
    :param pad_word: id marking padding; used to derive lengths and weights.
    :param init_embedding: optional initial value for the embedding table.
    :param num_classes: number of output tags.
    :param clip: global-norm gradient clipping threshold.
    :param lr: Adam learning rate.
    :param l2_reg_lamda: multiplier of the L2 penalty.
    :param num_layers: not used in this method.
    :param rnn_cell: not used in this method.
    :param bi_direction: not used in this method.
    :param hidden_dim2: units of the hyper network inside the cell.
    :param hyper_embedding_size: hyper embedding size passed to the cell.
    """
    self.x = tf.placeholder(
        dtype=tf.int32, shape=[None, None, 9], name='input_x')
    self.y = tf.placeholder(
        dtype=tf.int32, shape=[None, None], name='input_y')
    self.dict = tf.placeholder(
        dtype=tf.float32, shape=[None, None, 8], name='dict')
    self.dropout_keep_prob = tf.placeholder(
        dtype=tf.float32, name='dropout_keep_prob')

    # Example (batch_size=2): x_batch has shape (2, 163, 9).
    # Sentence lengths for the batch, e.g. seq_length = [153, 163]: count the
    # positions whose feature at index 2 differs from the padding id.
    length_mask = tf.not_equal(
        self.x[:, :, 2], tf.ones_like(self.x[:, :, 2]) * pad_word)
    self.seq_length = tf.reduce_sum(tf.cast(length_mask, tf.int32), 1)

    # Same comparison turned into 0/1 floats: 0 where there is no character.
    weight_mask = tf.not_equal(
        self.x[:, :, 2], tf.ones_like(self.x[:, :, 2]) * pad_word)
    self.weights = tf.cast(weight_mask, tf.float32)

    self.batch_size = tf.shape(self.x)[0]

    # Embedding table: fresh variable or the supplied initial value.
    if init_embedding is None:
        self.embedding = tf.get_variable(
            shape=[vocab_size, word_dim], dtype=tf.float32, name='embedding')
    else:
        self.embedding = tf.Variable(
            init_embedding, dtype=tf.float32, name='embedding')

    # Turn ids into vectors, then merge the 9 window slots of every position
    # into one vector: (batch, n, 9 * word_dim).
    with tf.variable_scope('embedding'):
        x = tf.nn.embedding_lookup(self.embedding, self.x)
        x = tf.reshape(x, [self.batch_size, -1, 9 * word_dim])

    def lstm_cell(dim):
        # Plain LSTM with output dropout (not called in this method).
        cell = rnn.BasicLSTMCell(dim)
        cell = rnn.DropoutWrapper(
            cell, output_keep_prob=self.dropout_keep_prob)
        return cell

    def hyperlstm_cell(dim):
        # The cell size comes from the enclosing ``hidden_dim``; ``dim`` is
        # accepted but not read.
        cell = HyperLSTMCell(
            num_units=hidden_dim,
            forget_bias=1.0,
            use_recurrent_dropout=False,
            dropout_keep_prob=1.0,
            use_layer_norm=False,
            hyper_num_units=hidden_dim2,
            hyper_embedding_size=hyper_embedding_size,
            hyper_use_recurrent_dropout=False)
        # Wrap the cell so dropout is applied to its outputs.
        cell = rnn.DropoutWrapper(
            cell, output_keep_prob=self.dropout_keep_prob)
        return cell

    # First layer: the (9 * word_dim + 8)-wide input goes into the
    # bidirectional network.
    with tf.variable_scope('first_layer'):
        # Concatenate on the innermost axis, e.g. (?, ?, 900) with (?, ?, 8)
        # gives (?, ?, 908).
        inputx = tf.concat([x, self.dict], axis=2)
        (forward_output, backword_output), _ = \
            tf.nn.bidirectional_dynamic_rnn(
                cell_fw=hyperlstm_cell(hidden_dim),  # forward cell
                cell_bw=hyperlstm_cell(hidden_dim),  # backward cell
                inputs=inputx,
                sequence_length=self.seq_length,  # true length of each row
                dtype=tf.float32)
        # Join both directions on the feature axis.
        output = tf.concat([forward_output, backword_output], axis=2)

    with tf.variable_scope('loss'):
        # Linear projection to tag scores (activation_fn=None disables the
        # default relu).
        self.output = layers.fully_connected(
            inputs=output,
            num_outputs=num_classes,
            activation_fn=None)
        # CRF log-likelihood and its transition matrix.
        log_likelihood, self.transition_params = crf.crf_log_likelihood(
            self.output, self.y, self.seq_length)
        loss = tf.reduce_mean(-log_likelihood)

    with tf.variable_scope('train_op'):
        self.optimizer = tf.train.AdamOptimizer(learning_rate=lr)
        tvars = tf.trainable_variables()
        # L2 penalty over every trainable variable with more than one axis.
        l2_terms = [tf.nn.l2_loss(v) for v in tvars
                    if v.get_shape().ndims > 1]
        l2_loss = tf.add_n(l2_terms)
        self.loss = loss + l2_reg_lamda * l2_loss
        grads, _ = tf.clip_by_global_norm(
            tf.gradients(self.loss, tvars), clip)
        self.train_op = self.optimizer.apply_gradients(zip(grads, tvars))
def __init__(self, settings):
    """Build the embedding -> BiLSTM -> two dense layers -> CRF graph.

    :param settings: object whose attributes supply every hyper-parameter
        copied onto ``self`` at the top of this method.

    NOTE(review): ``self.x_inputs``, ``self.y_inputs``, ``self.seq_inputs``,
    ``self.sentence_lengths`` and ``self.weights_decay`` are read without the
    leading underscore they are stored under; presumably they are exposed as
    properties elsewhere in the class - confirm.
    NOTE(review): the original indentation was lost; scope extents below were
    reconstructed from statement order - confirm against summary tags.
    """
    # ---- hyper-parameters ----
    self.embedding_size = settings.embedding_size
    self.time_step = settings.time_step
    self.hidden_size = settings.hidden_size
    self.seq_dim = settings.seq_dim
    self.layers_num = settings.layers_num
    self.n_classes = settings.n_classes
    self.n_seq = settings.n_seq
    self.vocabulary_size = settings.vocabulary_size
    self._weights_decay = settings.weights_decay
    self._embed_dropout_prob = settings.embed_dropout_prob
    self._cnn_kernel_outdim = settings.cnn_kernel_outdim
    self._cnn_kernel_size = settings.cnn_kernel_size
    self._cnn_stride = settings.cnn_stride

    self._global_steps = tf.Variable(0, trainable=False, name='Global_Step')
    self.initializer = initializers.xavier_initializer()
    self._dropout_prob = tf.placeholder(tf.float32, [])

    # ---- input placeholders ----
    with tf.name_scope('Inputs'):
        self._sentence_lengths = tf.placeholder(
            tf.int32, [None], name='sentence_lengths')
        self._x_inputs = tf.placeholder(
            tf.int32, [None, self.time_step], name='x_input')
        self._y_inputs = tf.placeholder(
            tf.int32, [None, self.time_step], name='y_input')
        self._seq_inputs = tf.placeholder(
            tf.int32, [None, self.time_step], name='seq_input')
        self._batch_size = tf.placeholder(tf.int32, [], name='batch_size')

    # ---- three feature sources, concatenated on the last axis ----
    self._embedding = []
    with tf.variable_scope('embedding'):
        self._char_embedding = tf.get_variable(
            shape=[self.vocabulary_size + 1, self.embedding_size],
            initializer=self.initializer,
            dtype=tf.float32,
            trainable=True,
            name='char_embedding')
        self.char_embedding = tf.nn.embedding_lookup(
            self._char_embedding, self.x_inputs)
        self._embedding.append(self.char_embedding)

    with tf.variable_scope('seq_embedding'):
        self.seq_embedding = tf.get_variable(
            shape=[self.n_seq, self.seq_dim],
            initializer=self.initializer,
            dtype=tf.float32,
            trainable=True,
            name='seq_embedding')
        seq_vectors = tf.nn.embedding_lookup(
            self.seq_embedding, self.seq_inputs)
        self._embedding.append(seq_vectors)

    with tf.variable_scope('cnn_embedding'):
        self.cnn_embedding = self.cnn()
        self._embedding.append(self.cnn_embedding)

    self.embedding = tf.concat(self._embedding, axis=-1)
    # NOTE(review): the value is passed as the second positional argument of
    # tf.nn.dropout while the attribute name says "dropout prob" - confirm
    # which meaning is intended.
    self.embedding = tf.nn.dropout(self.embedding, self._embed_dropout_prob)

    # ---- recurrent encoder ----
    with tf.variable_scope('bi_lstm'):
        bi_lstm_output = self.inference(self.embedding)
        bi_lstm_output = tf.nn.dropout(bi_lstm_output, self._dropout_prob)

    # ---- hidden dense layer (tanh) on the flattened steps ----
    with tf.variable_scope('flatten_middle'):
        flatten_input = tf.reshape(bi_lstm_output, [-1, self.hidden_size * 2])
        middle_w = self._variable_with_weight_decay(
            'weights_middle',
            [self.hidden_size * 2, self.hidden_size],
            self.weights_decay)
        tf.summary.histogram('weights_middle', middle_w)
        middle_b = self._variable_on_cpu(
            'biases_middle', [self.hidden_size], tf.zeros_initializer())
        tf.summary.histogram('biases_middle', middle_b)
        _flatten_middle = tf.matmul(flatten_input, middle_w) + middle_b
        flatten_middle = tf.tanh(_flatten_middle)

    # ---- output dense layer: one score per class ----
    with tf.variable_scope('flatten_out'):
        out_w = self._variable_with_weight_decay(
            'weights_out',
            [self.hidden_size, self.n_classes],
            self.weights_decay)
        tf.summary.histogram('weights_out', out_w)
        out_b = self._variable_on_cpu(
            'biases_out', [self.n_classes], tf.zeros_initializer())
        tf.summary.histogram('biases_out', out_b)
        flatten_out = tf.nn.xw_plus_b(flatten_middle, out_w, out_b)

    # ---- CRF loss (a name_scope, not a variable_scope, is used here) ----
    with tf.name_scope('crf'):
        self.logits = tf.reshape(
            flatten_out, [-1, self.time_step, self.n_classes])
        self.transition_params = tf.get_variable(
            'transitions',
            shape=[self.n_classes, self.n_classes],
            initializer=self.initializer)
        log_likelihood, self.transition_params = crf_log_likelihood(
            inputs=self.logits,
            tag_indices=self.y_inputs,
            transition_params=self.transition_params,
            sequence_lengths=self.sentence_lengths)
        self._crf_loss = -tf.reduce_mean(log_likelihood)
        tf.summary.scalar('crf_lost', self._crf_loss)
        # Total loss = CRF loss + everything in the 'losses' collection.
        self.lost = self._crf_loss + tf.add_n(tf.get_collection('losses'))
        tf.summary.scalar('lost', self.lost)

    # ---- decoding and accuracy ----
    with tf.name_scope('predict'):
        self.predict_sentence, self.best_score = crf_decode(
            self.logits, self.transition_params, self.sentence_lengths)
        self._correct_predict = tf.equal(self.predict_sentence, self.y_inputs)
        # Mean over every element of the comparison.
        self.accuracy = tf.reduce_mean(
            tf.cast(self._correct_predict, 'float'))
        tf.summary.scalar('accuracy', self.accuracy)

    self.saver = tf.train.Saver(max_to_keep=2)
def loss(self,):
    """Return the CRF loss for ``self.logits`` against ``self.input_y``.

    Side effect: stores the transition matrix returned by
    ``crf_log_likelihood`` in ``self.transition_params``.

    :return: scalar tensor, the negated mean log-likelihood.
    """
    crf_result = crf_log_likelihood(
        inputs=self.logits,
        tag_indices=self.input_y,
        sequence_lengths=self.sequence_lengths)
    log_likelihood, self.transition_params = crf_result
    return -tf.reduce_mean(log_likelihood)
def __init__(self, config, features, dropout_keep_prob, init_embeddings=None):
    """Build the bidirectional HyperLSTM + CRF graph from a feature dict.

    :param config: project config; this method reads ``embedding_size``,
        ``dict_hidden_size``, ``hyper_embedding_size``, ``hidden_size``,
        ``num_classes`` and ``multitag`` from it.
    :param features: mapping with the keys "input_ids", "input_dicts",
        "seq_length" and "label_ids".
    :param dropout_keep_prob: keep probability for the input dropout and for
        the RNN output dropout wrapper.
    :param init_embeddings: optional initial embeddings, forwarded to
        ``model_utils.input_embedding``.
    """
    # The original ``super(DictHyperModel).__init__()`` only created an
    # unbound super object, so the base-class initializer was never run.
    super().__init__()

    input_ids = features["input_ids"]
    input_dicts = features["input_dicts"]
    seq_length = features["seq_length"]
    label_ids = features["label_ids"]

    self.label_ids = label_ids
    self.dict = input_dicts
    self.seq_length = seq_length

    # Embed the ids, then merge the per-position feature slots into a single
    # vector of width feat_size * embedding_size.
    x, batch_size, feat_size = model_utils.input_embedding(
        input_ids, config, init_embeddings=init_embeddings)
    x = tf.reshape(x, [batch_size, -1, feat_size * config.embedding_size])
    x = tf.nn.dropout(x, dropout_keep_prob)

    def hyperlstm_cell(dim, input_main_dim, input_hyper_dim):
        # One HyperLSTM cell with dropout on its outputs.
        cell = HyperLSTMCell(
            num_units=dim,
            input_main_dim=input_main_dim,
            input_hyper_dim=input_hyper_dim,
            forget_bias=1.0,
            use_recurrent_dropout=False,
            dropout_keep_prob=1.0,
            use_layer_norm=False,
            hyper_num_units=config.dict_hidden_size,
            hyper_embedding_size=config.hyper_embedding_size,
            hyper_use_recurrent_dropout=False)
        cell = tf.nn.rnn_cell.DropoutWrapper(
            cell, output_keep_prob=dropout_keep_prob)
        return cell

    with tf.variable_scope('hyper'):
        self.dict = tf.cast(self.dict, dtype=tf.float32)
        # Record both widths before concatenating; they are handed to the
        # cell separately.
        input_main_dim = model_utils.get_shape_list(x, expected_rank=3)[2]
        input_hyper_dim = model_utils.get_shape_list(
            self.dict, expected_rank=3)[2]
        x = tf.concat([x, self.dict], axis=2)

        (forward_output, backword_output), _ = \
            tf.nn.bidirectional_dynamic_rnn(
                cell_fw=hyperlstm_cell(
                    config.hidden_size, input_main_dim, input_hyper_dim),
                cell_bw=hyperlstm_cell(
                    config.hidden_size, input_main_dim, input_hyper_dim),
                inputs=x,
                sequence_length=self.seq_length,
                dtype=tf.float32)
        output = tf.concat([forward_output, backword_output], axis=2)

    with tf.variable_scope('output'):
        scores = layers.fully_connected(
            inputs=output,
            num_outputs=config.num_classes,
            activation_fn=None)
        transition_param = tf.get_variable(
            "transitions", [config.num_classes, config.num_classes])
        self.prediction, _ = crf.crf_decode(
            scores, transition_param, self.seq_length)

    with tf.variable_scope('loss'):
        if config.multitag:
            # The multi-tag likelihood takes boolean labels.
            self.label_ids = tf.cast(self.label_ids, dtype=tf.bool)
            self.log_likelihood, _ = model_utils.crf_multitag_log_likelihood(
                scores, self.label_ids, self.seq_length, transition_param)
        else:
            self.log_likelihood, _ = crf.crf_log_likelihood(
                scores, self.label_ids, self.seq_length, transition_param)
        self.loss = tf.reduce_mean(-self.log_likelihood)
def CRF_layer(self):
    """Attach a CRF loss to the logits exposed by ``self.bilstm``.

    Sets ``self.logit`` (taken from ``self.bilstm.logit``),
    ``self.transition`` (the matrix returned by the CRF op) and ``self.cost``
    (the negated mean log-likelihood).
    """
    encoder = self.bilstm
    self.logit = encoder.logit
    with tf.name_scope('crf'):
        crf_result = crf.crf_log_likelihood(
            self.logit, encoder.input_y, encoder.seq_lengths)
        log_likelihood_, self.transition = crf_result
        self.cost = -tf.reduce_mean(log_likelihood_)
def __init__(self,
             vocab_size,
             word_dim,
             hidden_dim,
             pad_word,
             init_embedding=None,
             num_classes=4,
             clip=5,
             lr=0.001,
             l2_reg_lamda=0.0,
             num_layers=1,
             rnn_cell='lstm',
             bi_direction=False,
             hidden_dim2=128,
             hyper_embedding_size=16):
    """Build a two-branch BiLSTM + CRF tagger.

    One bidirectional LSTM reads the embedded input, a second one reads the
    ``dict`` features; their outputs are concatenated and projected to tag
    scores.

    :param vocab_size: rows of the embedding table (used only when
        ``init_embedding`` is None).
    :param word_dim: embedding width.
    :param hidden_dim: LSTM units of the embedded-input branch.
    :param pad_word: id marking padding; used to derive lengths and weights.
    :param init_embedding: optional initial value for the embedding table.
    :param num_classes: number of output tags.
    :param clip: global-norm gradient clipping threshold.
    :param lr: Adam learning rate.
    :param l2_reg_lamda: multiplier of the L2 penalty.
    :param num_layers: not used in this method.
    :param rnn_cell: not used in this method.
    :param bi_direction: not used in this method.
    :param hidden_dim2: LSTM units of the ``dict`` branch.
    :param hyper_embedding_size: not used in this method.
    """
    self.x = tf.placeholder(
        dtype=tf.int32, shape=[None, None, 9], name='input_x')
    self.y = tf.placeholder(
        dtype=tf.int32, shape=[None, None], name='input_y')
    self.dict = tf.placeholder(
        dtype=tf.float32, shape=[None, None, 8], name='dict')
    self.dropout_keep_prob = tf.placeholder(
        dtype=tf.float32, name='dropout_keep_prob')

    # A position counts as real when its feature at index 2 is not the
    # padding id; summing that mask gives the per-row length.
    length_mask = tf.not_equal(
        self.x[:, :, 2], tf.ones_like(self.x[:, :, 2]) * pad_word)
    self.seq_length = tf.reduce_sum(tf.cast(length_mask, tf.int32), 1)
    weight_mask = tf.not_equal(
        self.x[:, :, 2], tf.ones_like(self.x[:, :, 2]) * pad_word)
    self.weights = tf.cast(weight_mask, tf.float32)

    self.batch_size = tf.shape(self.x)[0]

    if init_embedding is None:
        self.embedding = tf.get_variable(
            shape=[vocab_size, word_dim], dtype=tf.float32, name='embedding')
    else:
        self.embedding = tf.Variable(
            init_embedding, dtype=tf.float32, name='embedding')

    # NOTE(review): the original indentation was lost; scope extents below
    # were reconstructed from statement order - confirm.
    with tf.variable_scope('embedding'):
        embedded = tf.nn.embedding_lookup(self.embedding, self.x)
        # Merge the 9 window slots of each position into one vector.
        embedded = tf.reshape(embedded, [self.batch_size, -1, 9 * word_dim])
        embedded = tf.nn.dropout(embedded, self.dropout_keep_prob)

    def lstm_cell(dim):
        # Basic LSTM cell with dropout on its outputs.
        base_cell = rnn.BasicLSTMCell(dim)
        return rnn.DropoutWrapper(
            base_cell, output_keep_prob=self.dropout_keep_prob)

    with tf.variable_scope('character'):
        (char_fw, char_bw), _ = tf.nn.bidirectional_dynamic_rnn(
            cell_fw=lstm_cell(hidden_dim),
            cell_bw=lstm_cell(hidden_dim),
            inputs=embedded,
            sequence_length=self.seq_length,
            dtype=tf.float32)
        char_output = tf.concat([char_fw, char_bw], axis=2)

    with tf.variable_scope('dict'):
        (dict_fw, dict_bw), _ = tf.nn.bidirectional_dynamic_rnn(
            cell_fw=lstm_cell(hidden_dim2),
            cell_bw=lstm_cell(hidden_dim2),
            inputs=self.dict,
            sequence_length=self.seq_length,
            dtype=tf.float32)
        dict_output = tf.concat([dict_fw, dict_bw], axis=2)

    # Dict branch first, embedded-input branch second.
    merged_output = tf.concat([dict_output, char_output], axis=2)

    with tf.variable_scope('loss'):
        self.output = layers.fully_connected(
            inputs=merged_output,
            num_outputs=num_classes,
            activation_fn=None)
        # CRF log-likelihood and its transition matrix.
        log_likelihood, self.transition_params = crf.crf_log_likelihood(
            self.output, self.y, self.seq_length)
        loss = tf.reduce_mean(-log_likelihood)

    with tf.variable_scope('train_op'):
        self.optimizer = tf.train.AdamOptimizer(learning_rate=lr)
        tvars = tf.trainable_variables()
        # L2 penalty over every trainable variable with more than one axis.
        l2_terms = [tf.nn.l2_loss(v) for v in tvars
                    if v.get_shape().ndims > 1]
        l2_loss = tf.add_n(l2_terms)
        self.loss = loss + l2_loss * l2_reg_lamda
        grads, _ = tf.clip_by_global_norm(
            tf.gradients(self.loss, tvars), clip)
        self.train_op = self.optimizer.apply_gradients(zip(grads, tvars))
def bilstm_crf(self):
    """Build the embedding -> BiLSTM -> projection graph, loss and train op.

    Reads from ``self.config``: ``embeddings``, ``update_embedding``,
    ``hidden_dim``, ``num_tags``, ``CRF``, ``optimizer`` and ``clip_grad``.

    Sets ``self.word_embeddings``, ``self.logits``, ``self.loss`` and
    ``self.train_op``; additionally ``self.transition_params`` when the CRF
    is enabled, or ``self.labels_softmax_`` when it is not.
    """
    # NOTE(review): the original indentation was lost; scope extents below
    # were reconstructed from statement order - confirm.
    with tf.device('/cpu:0'):
        _word_embeddings = tf.Variable(
            self.config.embeddings,
            dtype=tf.float32,
            trainable=self.config.update_embedding)
        word_embeddings = tf.nn.embedding_lookup(
            params=_word_embeddings, ids=self.word_ids)
    self.word_embeddings = word_embeddings

    with tf.variable_scope("bi-lstm"):
        cell_fw = LSTMCell(self.config.hidden_dim)
        cell_bw = LSTMCell(self.config.hidden_dim)
        (output_fw_seq, output_bw_seq), _ = tf.nn.bidirectional_dynamic_rnn(
            cell_fw=cell_fw,
            cell_bw=cell_bw,
            inputs=self.word_embeddings,
            sequence_length=self.sequence_lengths,
            dtype=tf.float32)
        output = tf.concat([output_fw_seq, output_bw_seq], axis=-1)
        output = tf.nn.dropout(output, self.dropout_pl)

    with tf.variable_scope("proj"):
        W = tf.get_variable(
            name="W",
            shape=[2 * self.config.hidden_dim, self.config.num_tags],
            initializer=tf.contrib.layers.xavier_initializer(),
            dtype=tf.float32)
        b = tf.get_variable(
            name="b",
            shape=[self.config.num_tags],
            initializer=tf.zeros_initializer(),
            dtype=tf.float32)

        # Flatten to 2-D for the matmul, then restore the time dimension.
        s = tf.shape(output)
        output = tf.reshape(output, [-1, 2 * self.config.hidden_dim])
        pred = tf.matmul(output, W) + b
        self.logits = tf.reshape(pred, [-1, s[1], self.config.num_tags])

    if not self.config.CRF:
        # Without a CRF the prediction is the per-step argmax.
        self.labels_softmax_ = tf.argmax(self.logits, axis=-1)
        self.labels_softmax_ = tf.cast(self.labels_softmax_, tf.int32)

    with tf.variable_scope("loss"):
        if self.config.CRF:
            log_likelihood, self.transition_params = crf_log_likelihood(
                inputs=self.logits,
                tag_indices=self.labels,
                sequence_lengths=self.sequence_lengths)
            self.loss = -tf.reduce_mean(log_likelihood)
        else:
            # Per-step cross entropy, keeping only the steps selected by the
            # sequence-length mask.
            losses = tf.nn.sparse_softmax_cross_entropy_with_logits(
                logits=self.logits, labels=self.labels)
            mask = tf.sequence_mask(self.sequence_lengths)
            losses = tf.boolean_mask(losses, mask)
            self.loss = tf.reduce_mean(losses)

    with tf.variable_scope("optimizer"):
        optimizer_name = self.config.optimizer
        if optimizer_name == 'Momentum':
            # The only choice that takes an extra constructor argument.
            optim = tf.train.MomentumOptimizer(
                learning_rate=self.lr_pl, momentum=0.9)
        else:
            optimizer_classes = {
                'Adam': tf.train.AdamOptimizer,
                'Adadelta': tf.train.AdadeltaOptimizer,
                'Adagrad': tf.train.AdagradOptimizer,
                'RMSProp': tf.train.RMSPropOptimizer,
                'SGD': tf.train.GradientDescentOptimizer,
            }
            # Unknown names fall back to plain gradient descent.
            optimizer_cls = optimizer_classes.get(
                optimizer_name, tf.train.GradientDescentOptimizer)
            optim = optimizer_cls(learning_rate=self.lr_pl)

        grads_and_vars = optim.compute_gradients(self.loss)
        # compute_gradients yields a None gradient for variables the loss
        # does not depend on; clip_by_value cannot take None, so those pairs
        # are dropped instead of crashing graph construction.
        clip_limit = self.config.clip_grad
        grads_and_vars_clip = [
            [tf.clip_by_value(g, -clip_limit, clip_limit), v]
            for g, v in grads_and_vars
            if g is not None
        ]
        self.train_op = optim.apply_gradients(
            grads_and_vars_clip, global_step=self.global_step)
def __init__(self,
             config: DictConcatConfig,
             is_training,
             features,
             init_embedding=None):
    """Build the two-branch (input + dict) BiLSTM + CRF graph.

    :param config: model config; this method reads ``vocab_size``,
        ``embedding_size``, ``embedding_dropout_prob``,
        ``hidden_dropout_prob``, ``hidden_size``, ``dict_hidden_size`` and
        ``num_classes``. It is no longer modified by this method.
    :param is_training: when falsy, both dropout probabilities are treated
        as 0.0.
    :param features: mapping with the keys "input_ids", "input_dicts",
        "seq_length" and "label_ids".
    :param init_embedding: optional initial value for the embedding table.
    """
    # The original ``super(DictConcatModel).__init__()`` only created an
    # unbound super object, so the base-class initializer was never run.
    super().__init__()

    input_ids = features["input_ids"]
    input_dicts = features["input_dicts"]
    seq_length = features["seq_length"]
    label_ids = features["label_ids"]

    self.input_ids = input_ids
    self.label_ids = label_ids
    self.dict = input_dicts
    self.seq_length = seq_length
    self.is_training = is_training

    input_shape = model_utils.get_shape_list(input_ids, expected_rank=3)
    self.batch_size = input_shape[0]
    self.max_length = input_shape[1]
    self.window_size = input_shape[2]

    # Disable dropout outside training via local values. The original
    # overwrote the fields on the caller's ``config``, which silently turned
    # dropout off for any model built later from the same config object.
    if is_training:
        embedding_dropout_prob = config.embedding_dropout_prob
        hidden_dropout_prob = config.hidden_dropout_prob
    else:
        embedding_dropout_prob = 0.0
        hidden_dropout_prob = 0.0

    if init_embedding is None:
        self.embedding = tf.get_variable(
            shape=[config.vocab_size, config.embedding_size],
            dtype=tf.float32,
            name='embedding',
            initializer=tf.truncated_normal_initializer(stddev=0.02))
    else:
        self.embedding = tf.Variable(
            init_embedding, dtype=tf.float32, name='embedding')

    with tf.variable_scope('embedding'):
        x = tf.nn.embedding_lookup(self.embedding, self.input_ids)
        # Merge the window slots of each position into one vector.
        feat_size = self.window_size
        x = tf.reshape(
            x, [self.batch_size, -1, feat_size * config.embedding_size])
        x = model_utils.dropout(x, embedding_dropout_prob)

    def lstm_cell(dim):
        # LSTM cell with dropout on its outputs.
        cell = tf.nn.rnn_cell.LSTMCell(dim, name='basic_lstm_cell')
        cell = rnn.DropoutWrapper(
            cell, output_keep_prob=1.0 - hidden_dropout_prob)
        return cell

    with tf.variable_scope('character'):
        (forward_output, backword_output), _ = \
            tf.nn.bidirectional_dynamic_rnn(
                cell_fw=lstm_cell(config.hidden_size),
                cell_bw=lstm_cell(config.hidden_size),
                inputs=x,
                sequence_length=self.seq_length,
                dtype=tf.float32)
        output = tf.concat([forward_output, backword_output], axis=2)

    with tf.variable_scope('dict'):
        self.dict = tf.cast(self.dict, dtype=tf.float32)
        (forward_output, backword_output), _ = \
            tf.nn.bidirectional_dynamic_rnn(
                cell_fw=lstm_cell(config.dict_hidden_size),
                cell_bw=lstm_cell(config.dict_hidden_size),
                inputs=self.dict,
                sequence_length=self.seq_length,
                dtype=tf.float32)
        dict_output = tf.concat([forward_output, backword_output], axis=2)

    with tf.variable_scope('output'):
        # Dict branch first, input branch second.
        output = tf.concat([dict_output, output], axis=2)
        scores = layers.fully_connected(
            inputs=output,
            num_outputs=config.num_classes,
            activation_fn=None)
        transition_param = tf.get_variable(
            "transitions", [config.num_classes, config.num_classes])
        self.prediction, _ = crf.crf_decode(
            scores, transition_param, self.seq_length)

    with tf.variable_scope('loss'):
        # CRF log-likelihood against the gold labels.
        self.log_likelihood, _ = crf.crf_log_likelihood(
            scores, self.label_ids, self.seq_length, transition_param)
        self.loss = tf.reduce_mean(-self.log_likelihood)
def Model(self):
    """Build the embedding -> BiLSTM -> dense -> CRF graph and its train op.

    Sets ``self.embedding``, ``self.logits``, ``self.transition_params``,
    ``self.loss`` and ``self.optimizer`` (the op that applies the clipped
    gradients). Hyper-parameters are read from the module-level ``pm``.
    """
    # tf.device pins the ops to a device; tf.name_scope groups them by name.
    with tf.device('/cpu:0'), tf.name_scope('embedding'):
        # Pre-computed vectors loaded from pm.word_vec_path serve as the
        # embedding table.
        embedding_ = get_bert_vec(pm.word_vec_path)
        # Look the input ids up in the table.
        looked_up = tf.nn.embedding_lookup(embedding_, self.input_x)
        # Dropout as regularisation against overfitting.
        self.embedding = tf.nn.dropout(looked_up, pm.keep_pro)

    with tf.name_scope('biLSTM'):
        # Bidirectional LSTM (tf.nn.rnn_cell.LSTMCell is the same cell as
        # tf.contrib.rnn.LSTMCell).
        cell_fw = tf.contrib.rnn.LSTMCell(pm.hidden_dim)
        cell_bw = tf.contrib.rnn.LSTMCell(pm.hidden_dim)
        # Dynamic version of the bidirectional recurrent network.
        outputs, outstats = tf.nn.bidirectional_dynamic_rnn(
            cell_fw=cell_fw,
            cell_bw=cell_bw,
            inputs=self.embedding,
            sequence_length=self.seq_length,
            dtype=tf.float32)
        # Join the two directions on the feature axis.
        outputs = tf.concat(outputs, 2)

    with tf.name_scope('output'):
        dynamic_shape = tf.shape(outputs)
        # Fully connected layer; its output width equals pm.num_tags.
        flat = tf.reshape(outputs, [-1, 2 * pm.hidden_dim])
        flat = tf.layers.dense(flat, pm.num_tags)
        # TODO: dropout against overfitting; keep_pro is the probability of
        # keeping an element.
        flat = tf.contrib.layers.dropout(flat, pm.keep_pro)
        self.logits = tf.reshape(flat, [-1, dynamic_shape[1], pm.num_tags])

    with tf.name_scope('crf'):
        # log_likelihood is the log-likelihood; transition_params is the
        # transition matrix.
        log_likelihood, self.transition_params = crf_log_likelihood(
            inputs=self.logits,
            tag_indices=self.input_y,
            sequence_lengths=self.seq_length)

    with tf.name_scope('loss'):
        # Negate the likelihood so that it can be minimised by gradient
        # descent; keepdims=False reduces the result to a scalar.
        self.loss = tf.reduce_mean(-log_likelihood, keepdims=False)

    with tf.name_scope('optimizer'):
        # Adam optimizer.
        optimizer = tf.train.AdamOptimizer(pm.learning_rate)
        # TODO: gradient clipping.
        grads_and_vars = optimizer.compute_gradients(self.loss)
        gradients, variable = zip(*grads_and_vars)
        gradients, _ = tf.clip_by_global_norm(gradients, pm.clip)
        self.optimizer = optimizer.apply_gradients(
            zip(gradients, variable), global_step=self.global_step)
V1 = tf.Variable(tf.truncated_normal(stddev=0.01, shape=[hidden_num, num_tags])) V2 = tf.Variable(tf.truncated_normal(stddev=0.01, shape=[hidden_num, num_tags])) #生成bi-lstm网络 pred_p, y_label = lstm(x, y, A, Wc, bc, V1, V2) #crf的log似然损失函数 #print crf_log_likelihood的参数 print("#" * 40) print(pred_p) print(y_label) print(seq_lengths) print(A) cost, A = crf.crf_log_likelihood(inputs=pred_p, tag_indices=y_label, sequence_lengths=seq_lengths) cost = tf.reduce_mean(-cost) train = tf.train.AdamOptimizer(train_rate).minimize(cost) sess = tf.Session() sess.run(tf.initialize_all_variables()) step = 1 while step < train_step: batch_x, batch_y, batch_seq_lengths = dataGenerator.next_train_batch( batch_size) # batch_x=tf.reshape(batch_x,shape=[batch_size,sequence_length,frame_size]) _loss, __ = sess.run([cost, train], feed_dict={ x: batch_x, y: batch_y,
def __init__(self):
    """Build the iterated dilated-convolution tagging graph.

    Creates the input placeholders, an embedding lookup, an initial
    convolution followed by ``config.block_num`` blocks of dilated
    convolutions (weights shared across blocks via ``AUTO_REUSE``), a dense
    projection to ``config.num_classes`` scores, and either a CRF or a
    masked cross-entropy loss, plus an Adam train op.

    Sets ``self.logits``, ``self.loss``, ``self.predict``, ``self.optimizer``
    and, in CRF mode, ``self.transition_params`` and ``self.viterbi_score``.
    """
    self.config = Config()  # hyper-parameters

    # Token ids, gold tag ids, true sequence lengths and dropout keep-probability.
    self.input_x = tf.placeholder(shape=[None, self.config.seq_length],
                                  dtype=tf.int32,
                                  name='input-x')
    self.input_y = tf.placeholder(shape=[None, self.config.seq_length],
                                  dtype=tf.int32,
                                  name='input-y')
    self.input_length = tf.placeholder(shape=[None],
                                       dtype=tf.int32,
                                       name='input-length')
    self.input_keep_prob = tf.placeholder(dtype=tf.float32,
                                          name='input-keep-prob')

    # Embedding layer.
    embedding = tf.get_variable(
        shape=[self.config.vocab_size, self.config.embedding_dim],
        dtype=tf.float32,
        name='embedding')
    embedding_x = tf.nn.embedding_lookup(params=embedding, ids=self.input_x)
    # Insert a height-1 axis so the 2-D convolutions leave the sequence
    # length unchanged: (batch, 1, seq_length, embedding_dim).
    embedding_x = tf.expand_dims(embedding_x, axis=1)

    # Initial convolution.
    conv = tf.layers.conv2d(
        inputs=embedding_x,
        filters=self.config.hidden_dim,
        kernel_size=[1, self.config.kernel_size],
        strides=1,
        padding='SAME',
        activation='relu',
        use_bias=True,
        kernel_initializer=tf.contrib.layers.xavier_initializer(),
        bias_initializer=tf.contrib.layers.xavier_initializer())

    final_output_layer_list = []  # one entry per block
    for block_i in range(self.config.block_num):
        for dilation in self.config.dilation_size:
            # The scope name depends only on the dilation, so AUTO_REUSE
            # shares each dilated layer's weights across all blocks.
            with tf.variable_scope(name_or_scope='atrous-conv-layer-%d' %
                                   dilation,
                                   reuse=tf.AUTO_REUSE):
                # NOTE(review): the kernel spans `embedding_dim` positions
                # along the sequence axis, unlike the initial convolution's
                # `[1, kernel_size]` — confirm this is intended.
                conv = tf.layers.conv2d(
                    inputs=conv,
                    filters=self.config.hidden_dim,
                    kernel_size=[
                        self.config.kernel_size, self.config.embedding_dim
                    ],
                    strides=1,
                    dilation_rate=dilation,
                    padding='SAME',
                    activation='relu',
                    use_bias=True,
                    kernel_initializer=tf.contrib.layers.xavier_initializer(),
                    bias_initializer=tf.contrib.layers.xavier_initializer())
        # Keep the output of the current block.
        final_output_layer_list.append(conv)

    # Concatenate the block outputs along the channel axis.
    final_output = tf.concat(final_output_layer_list, axis=-1)
    final_output = tf.nn.dropout(final_output,
                                 keep_prob=self.input_keep_prob)
    # Drop the height-1 axis added before the convolutions.
    final_output = tf.squeeze(input=final_output, axis=1)

    # Output layer: per-token scores over num_classes.
    self.logits = tf.layers.dense(inputs=final_output,
                                  units=self.config.num_classes,
                                  name='logits')

    if self.config.crf:
        log_likelihood, self.transition_params = crf.crf_log_likelihood(
            inputs=self.logits,
            tag_indices=self.input_y,
            sequence_lengths=self.input_length)
        self.loss = -tf.reduce_mean(log_likelihood)
        self.predict, self.viterbi_score = crf.crf_decode(
            potentials=self.logits,
            transition_params=self.transition_params,
            sequence_length=self.input_length)
    else:
        # input_y holds integer class ids, so the sparse variant is required;
        # the dense variant expects one-hot labels shaped like the logits.
        cross_entropy = tf.nn.sparse_softmax_cross_entropy_with_logits(
            labels=self.input_y, logits=self.logits)
        # Pin the mask width to the padded length so it always matches
        # cross_entropy, even when no sequence in the batch is full length.
        mask = tf.sequence_mask(lengths=self.input_length,
                                maxlen=self.config.seq_length)
        losses = tf.boolean_mask(cross_entropy, mask=mask)
        self.loss = tf.reduce_mean(losses)
        # Arg-max over the class axis (the last one), not the time axis.
        # Softmax is monotonic, so taking it on the raw logits is equivalent.
        self.predict = tf.argmax(self.logits, axis=-1, name='predict')

    # Optimizer.
    self.optimizer = tf.train.AdamOptimizer(
        learning_rate=self.config.learning_rate).minimize(loss=self.loss)
def association(hidden,
                pool_idx,
                targets,
                n_targets,
                config,
                train=False,
                reuse=None,
                **kwargs):
    """
    An attention-based sequence labeler model with association.

    :param hidden: The output of the featurizer. [batch_size, sequence_length, embed_dim]
    :param pool_idx: The index of the classify tokens along the sequence dimension. [batch_size]
    :param targets: A dict containing:
        'labels': The sequence labeling targets. [batch_size, sequence_length],
        'associations': A matrix of class ids for the associations
            [batch_size, sequence_length, sequence_length]
    :param n_targets: A python int containing the number of classes that the model should be learning to predict over.
    :param config: A config object, containing all parameters for the featurizer.
    :param train: If this flag is true, dropout and losses are added to the graph.
    :param reuse: Should reuse be set within this scope.
    :param kwargs: Spare arguments. ``class_weights`` (optional) and ``max_length`` are read here.
    :return: dict containing:
        "logits": A dict with the "sequence" and "association" un-normalised scores.
            For usable sequence predictions, sampling from this distribution is not
            sufficient and a viterbi decoding method should be used.
        "losses": The negative log likelihood for the sequence targets plus the
            weighted association loss.
        "predict_params": A dictionary of params to be fed to the viterbi decode function.
    """
    with tf.variable_scope("sequence-labeler", reuse=reuse):
        nx = config.n_embed
        length = config.max_length
        # One extra class on top of the configured association types.
        num_associations = len(config.association_types) + 1

        def seq_lab_internal(hidden):
            attn_fn = functools.partial(
                attn,
                scope="seq_label_attn",
                n_state=nx,
                n_head=config.seq_num_heads,
                resid_pdrop=config.resid_p_drop,
                attn_pdrop=config.attn_p_drop,
                train=train,
                scale=False,
                mask=False,
            )
            n = norm(attn_fn(hidden) + hidden, "seq_label_residual")
            flat_logits = tf.layers.dense(n, n_targets)
            logits = tf.reshape(
                flat_logits, tf.concat([tf.shape(hidden)[:2], [n_targets]], 0))

            association_head = tf.layers.dense(n, nx)
            association_head = tf.reshape(
                association_head, tf.concat([tf.shape(hidden)[:2], [nx]], 0))

            # Pairwise features: broadcast the head along two different axes
            # so every (i, j) token pair gets difference, product and both
            # raw representations.
            a = tf.expand_dims(association_head, 1)
            b = tf.expand_dims(association_head, 2)
            features = tf.concat(
                [
                    a - b,
                    a * b,
                    tf.tile(a, [1, length, 1, 1]),
                    tf.tile(b, [1, 1, length, 1]),
                    # TODO: Think about using prediction as a feature for associations.
                ],
                axis=-1,
            )
            associations_flat = tf.layers.dense(
                tf.reshape(features, shape=[-1, nx * 4]), num_associations)
            associations = tf.reshape(
                associations_flat, [-1, length, length, num_associations])
            return logits, associations_flat, associations

        with tf.variable_scope("seq_lab_attn"):
            if config.low_memory_mode and train:
                seq_lab_internal = recompute_grad(seq_lab_internal,
                                                  use_entire_scope=True)
            logits, associations_flat, associations = seq_lab_internal(hidden)

        log_likelihood = 0.0
        association_loss = 0.0
        class_weights = kwargs.get("class_weights")
        if class_weights is not None:
            logits = class_reweighting(class_weights)(logits)

        transition_params = tf.get_variable("Transition_matrix",
                                            shape=[n_targets, n_targets])
        if targets is not None:
            # Sequence lengths must be integers; tf.ones defaults to float32.
            sequence_lengths = kwargs.get("max_length") * tf.ones(
                tf.shape(targets["labels"])[0], dtype=tf.int32)
            log_likelihood, _ = crf_log_likelihood(
                logits,
                targets["labels"],
                sequence_lengths,
                transition_params=transition_params,
            )
            # Only token pairs where both positions fall inside
            # pool_idx + 1 contribute to the association loss.
            sequence_mask = tf.sequence_mask(pool_idx + 1,
                                             maxlen=length,
                                             dtype=tf.float32)
            mask = tf.expand_dims(sequence_mask, 1) * tf.expand_dims(
                sequence_mask, 2)
            association_loss = tf.losses.sparse_softmax_cross_entropy(
                logits=associations_flat,
                labels=tf.reshape(targets["associations"], shape=[-1]),
                weights=tf.reshape(mask, shape=[-1]),
            )

        return {
            "logits": {
                "sequence": logits,
                "association": associations
            },
            "losses":
            -log_likelihood + config.assocation_loss_weight *
            association_loss,  # TODO: think about weighting.
            "predict_params": {
                "transition_matrix": transition_params
            },
        }
def loss_layer(self, project_logits):
    """
    Calculate the CRF loss with explicit start and end tags.

    Two synthetic tags are appended to the tag set (index ``num_tags`` for
    "start" and ``num_tags + 1`` for "end"). A start step is prepended and
    an end step appended to every sequence, so the CRF sees
    ``num_steps + 2`` steps over ``num_tags + 2`` tags.

    :param project_logits: per-token tag scores; concatenated with a
        ``[batch_size, num_steps, 2]`` padding tensor on the last axis, so
        presumably ``[batch_size, num_steps, num_tags]`` — TODO confirm.
    :return: scalar loss (mean negative log-likelihood over the batch)
    """
    with tf.variable_scope("crf_loss"):
        # Large negative score that makes a tag effectively impossible.
        small = -1000.0
        # pad logits for crf loss
        # Start step: every real tag and the end tag get `small`,
        # the start tag (index num_tags) gets 0.
        start_logits = tf.concat([
            tf.constant(small, shape=[1, self.num_tags]),
            tf.zeros([1, 1]),
            tf.constant(small, shape=[1, 1])
        ], -1)
        start_logits = tf.expand_dims(start_logits, 0)
        # Repeat the start step once per batch element.
        start_logits = tf.tile(
            start_logits,
            tf.concat(
                [tf.expand_dims(self.batch_size, 0),
                 tf.constant([1, 1])], 0))
        # End step: only the end tag (index num_tags + 1) gets 0.
        end_logits = tf.concat([
            tf.constant(small, shape=[1, self.num_tags + 1]),
            tf.zeros([1, 1])
        ], -1)
        end_logits = tf.expand_dims(end_logits, 0)
        end_logits = tf.tile(
            end_logits,
            tf.concat(
                [tf.expand_dims(self.batch_size, 0),
                 tf.constant([1, 1])], 0))
        # Real steps can never emit the start or end tag.
        pad_logits = tf.cast(
            small * tf.ones([self.batch_size, self.num_steps, 2]),
            tf.float32)
        logits = tf.concat([project_logits, pad_logits], axis=-1)
        logits = tf.concat([start_logits, logits, end_logits], axis=1)
        #targets = tf.expand_dims(self.targets, axis=0)
        # Gold tags with the start tag prepended and the end tag appended.
        # NOTE(review): the end tag is placed after the padded length, not
        # after each sequence's true length — confirm this is intended for
        # sequences shorter than num_steps.
        targets = tf.concat([
            tf.ones([tf.shape(self.targets)[0], 1], tf.int32) *
            self.num_tags, self.targets,
            tf.ones([tf.shape(self.targets)[0], 1], tf.int32) *
            tf.add(self.num_tags, 1)
        ],
                            axis=-1)
        self.trans = tf.get_variable(
            "transitions",
            shape=[self.num_tags + 2, self.num_tags + 2],
            initializer=self.initializer)
        # The stacked reduce_sum below adds 2 to every sequence length
        # (one for the start step, one for the end step).
        log_likelihood, self.trans = crf_log_likelihood(
            inputs=logits,
            tag_indices=targets,
            transition_params=self.trans,
            sequence_lengths=tf.reduce_sum(
                tf.concat([
                    tf.expand_dims(self.sequence_length, 0),
                    tf.expand_dims(
                        tf.ones([self.batch_size], tf.int32) * 2, 0)
                ], 0), 0))
        return tf.reduce_mean(-log_likelihood)
def __init__(self,
             config: BaselineConfig,
             is_training,
             features,
             init_embedding=None):
    """Constructor for the baseline BiLSTM-CRF model.

    Args:
      config: `BaselineConfig` instance. When `is_training` is false its
        `embedding_dropout_prob` and `hidden_dropout_prob` are set to 0.0
        in place.
      is_training: bool. True for training model, false for eval model.
        Controls whether dropout will be applied.
      features: dict with the entries "input_ids" (rank-3 tensor, checked
        with `expected_rank=3`), "seq_length" and "label_ids".
      init_embedding: (optional) initial value for the embedding table. When
        None, a `[vocab_size, embedding_size]` variable is created instead.

    Sets `self.prediction` (CRF-decoded tags), `self.log_likelihood` and
    `self.loss` (mean negative log-likelihood).
    """
    # `super(BaselineModel).__init__()` builds an unbound super object and
    # never runs the parent initializer; bind it to `self`.
    super(BaselineModel, self).__init__()

    input_ids = features["input_ids"]
    seq_length = features["seq_length"]
    label_ids = features["label_ids"]

    self.input_ids = input_ids
    self.label_ids = label_ids
    self.seq_length = seq_length
    self.is_training = is_training

    input_shape = model_utils.get_shape_list(input_ids, expected_rank=3)
    self.batch_size = input_shape[0]
    self.max_length = input_shape[1]
    self.window_size = input_shape[2]

    # Disable dropout for evaluation.
    if not is_training:
        config.embedding_dropout_prob = 0.0
        config.hidden_dropout_prob = 0.0

    if init_embedding is None:
        self.embedding = tf.get_variable(
            shape=[config.vocab_size, config.embedding_size],
            dtype=tf.float32,
            name='embedding',
            initializer=tf.truncated_normal_initializer(stddev=0.02))
    else:
        self.embedding = tf.Variable(init_embedding,
                                     dtype=tf.float32,
                                     name='embedding')

    with tf.variable_scope('embedding'):
        x = tf.nn.embedding_lookup(self.embedding, self.input_ids)
        feat_size = self.window_size
        # Merge the per-position window of embeddings into one feature vector.
        x = tf.reshape(
            x, [self.batch_size, -1, feat_size * config.embedding_size])

    x = model_utils.dropout(x, config.embedding_dropout_prob)

    def lstm_cell(dim):
        # Build a distinct cell per layer. `[cell] * n` would put the SAME
        # cell object in every layer, so all layers would share (or clash
        # over) a single set of weights.
        cells = []
        for _ in range(config.num_hidden_layers):
            cell = tf.nn.rnn_cell.LSTMCell(dim, name='basic_lstm_cell')
            cell = rnn.DropoutWrapper(cell,
                                      output_keep_prob=1.0 -
                                      config.hidden_dropout_prob)
            cells.append(cell)
        return tf.nn.rnn_cell.MultiRNNCell(cells)

    with tf.variable_scope('rnn'):
        (forward_output,
         backword_output), _ = tf.nn.bidirectional_dynamic_rnn(
             cell_fw=lstm_cell(config.hidden_size),
             cell_bw=lstm_cell(config.hidden_size),
             inputs=x,
             sequence_length=self.seq_length,
             dtype=tf.float32)
        output = tf.concat([forward_output, backword_output], axis=2)

    with tf.variable_scope('output'):
        scores = layers.fully_connected(inputs=output,
                                        num_outputs=config.num_classes,
                                        activation_fn=None)
        transition_param = tf.get_variable(
            "transitions", [config.num_classes, config.num_classes])
        self.prediction, _ = crf.crf_decode(scores, transition_param,
                                            self.seq_length)

    with tf.variable_scope('loss'):
        # crf
        self.log_likelihood, _ = crf.crf_log_likelihood(
            scores, self.label_ids, self.seq_length, transition_param)
        self.loss = tf.reduce_mean(-self.log_likelihood)
def __init__(self):
    """Build the Bi-LSTM / Bi-GRU tagging graph.

    Creates the input placeholders, an embedding lookup, a bidirectional
    RNN whose cells come from ``self.get_rnn``, dropout, a dense projection
    to ``config.num_classes`` scores, and either a CRF or a masked
    cross-entropy loss, plus an Adam train op.

    Sets ``self.logits``, ``self.loss``, ``self.predict``, ``self.optimizer``
    and, in CRF mode, ``self.transition_params`` and ``self.viterbi_score``.
    """
    self.config = Config()  # hyper-parameters

    # Token ids, gold tag ids, true sequence lengths and dropout keep-probability.
    self.input_x = tf.placeholder(shape=[None, self.config.seq_length],
                                  dtype=tf.int32,
                                  name='input-x')
    self.input_y = tf.placeholder(shape=[None, self.config.seq_length],
                                  dtype=tf.int32,
                                  name='input-y')
    self.input_length = tf.placeholder(shape=[None],
                                       dtype=tf.int32,
                                       name='input-length')
    self.input_keep_prob = tf.placeholder(dtype=tf.float32,
                                          name='input-keep-prob')

    # Embedding layer.
    embedding = tf.get_variable(
        shape=[self.config.vocab_size, self.config.embedding_dim],
        dtype=tf.float32,
        name='embedding')
    embedding_x = tf.nn.embedding_lookup(params=embedding, ids=self.input_x)

    # Bi-LSTM / Bi-GRU.
    cell_fw = self.get_rnn(self.config.rnn_type)  # forward cell
    cell_bw = self.get_rnn(self.config.rnn_type)  # backward cell
    # Pass the true lengths so the backward cell does not start from padding.
    outputs, states = tf.nn.bidirectional_dynamic_rnn(
        cell_fw=cell_fw,
        cell_bw=cell_bw,
        inputs=embedding_x,
        sequence_length=self.input_length,
        dtype=tf.float32)
    # Concatenate the forward and backward outputs on the feature axis.
    outputs = tf.concat(values=outputs, axis=2)
    # The placeholder holds a KEEP probability. tf.layers.dropout takes a
    # DROP rate and does nothing unless training=True, so use tf.nn.dropout.
    outputs = tf.nn.dropout(outputs, keep_prob=self.input_keep_prob)

    # Output layer: per-token scores over num_classes.
    self.logits = tf.layers.dense(inputs=outputs,
                                  units=self.config.num_classes,
                                  name='logits')

    if self.config.crf:
        log_likelihood, self.transition_params = crf.crf_log_likelihood(
            inputs=self.logits,
            tag_indices=self.input_y,
            sequence_lengths=self.input_length)
        self.loss = -tf.reduce_mean(log_likelihood)
        self.predict, self.viterbi_score = crf.crf_decode(
            potentials=self.logits,
            transition_params=self.transition_params,
            sequence_length=self.input_length)
    else:
        # input_y holds integer class ids, so the sparse variant is required;
        # the dense variant expects one-hot labels shaped like the logits.
        cross_entropy = tf.nn.sparse_softmax_cross_entropy_with_logits(
            labels=self.input_y, logits=self.logits)
        # Pin the mask width to the padded length so it always matches
        # cross_entropy, even when no sequence in the batch is full length.
        mask = tf.sequence_mask(lengths=self.input_length,
                                maxlen=self.config.seq_length)
        losses = tf.boolean_mask(cross_entropy, mask=mask)
        self.loss = tf.reduce_mean(losses)
        # Arg-max over the class axis (the last one), not the time axis.
        # Softmax is monotonic, so taking it on the raw logits is equivalent.
        self.predict = tf.argmax(self.logits, axis=-1, name='predict')

    # Optimizer.
    self.optimizer = tf.train.AdamOptimizer(
        learning_rate=self.config.learning_rate).minimize(loss=self.loss)
def __init__(self,
             batch_size,
             tag_nums,
             hidden_nums,
             sentence_len,
             word_embeddings,
             device='/gpu:1'):
    """Build a BiLSTM-CRF tagger over fixed-length sentences.

    Args:
        batch_size: stored on the instance; the graph no longer hard-codes
            it, so batches of any size can be fed.
        tag_nums: number of tags scored per token.
        hidden_nums: number of units in each LSTM direction.
        sentence_len: padded sentence length of the id/label placeholders.
        word_embeddings: initial value of the trainable embedding table.
        device: TensorFlow device string the graph is placed on.

    Sets the placeholders ``input_x``, ``input_y``, ``sequence_lengths`` and
    ``dropout_keep_prob``, plus ``self.logit`` (per-token tag scores),
    ``self.cost`` (mean negative CRF log-likelihood) and ``self.crf_labels``
    (CRF-decoded tags).
    """
    self.batch_size = batch_size
    self.tag_nums = tag_nums
    self.hidden_nums = hidden_nums
    self.sentence_len = sentence_len
    self.word_embeddings = word_embeddings
    self.device = device
    with tf.device(device):
        # Embedding table, fine-tuned during training.
        word_embeddings = tf.Variable(initial_value=word_embeddings,
                                      trainable=True)
        # Input placeholders.
        self.input_x = tf.placeholder(dtype=tf.int32,
                                      shape=[None, self.sentence_len],
                                      name='input_word_id')  # word ids
        self.input_y = tf.placeholder(dtype=tf.int32,
                                      shape=[None, self.sentence_len],
                                      name='input_labels')
        self.sequence_lengths = tf.placeholder(
            dtype=tf.int32, shape=[None], name='sequence_lengths_vector')
        self.dropout_keep_prob = tf.placeholder(tf.float32,
                                                name="dropout_keep_prob")

        with tf.name_scope('projection'):
            # Projection layer: map word ids to their word vectors.
            word_id = self.input_x
            word_vectors = tf.nn.embedding_lookup(word_embeddings,
                                                  ids=word_id,
                                                  name='word_vectors')

        with tf.name_scope('bi-lstm'):
            # Use -1 for the batch axis: hard-coding self.batch_size made
            # any batch of a different size (e.g. the last, partial one)
            # fail at run time although the placeholders allow it.
            labels = tf.reshape(self.input_y,
                                shape=[-1, self.sentence_len],
                                name='labels')
            fw_lstm_cell = tf.nn.rnn_cell.LSTMCell(self.hidden_nums)
            bw_lstm_cell = tf.nn.rnn_cell.LSTMCell(self.hidden_nums)
            # Bidirectional pass.
            output, _state = tf.nn.bidirectional_dynamic_rnn(
                fw_lstm_cell,
                bw_lstm_cell,
                inputs=word_vectors,
                sequence_length=self.sequence_lengths,
                dtype=tf.float32)
            fw_output = output[0]
            bw_output = output[1]

            # A separate square linear map for each direction.
            V1 = tf.get_variable(
                'V1',
                dtype=tf.float32,
                initializer=tf.contrib.layers.xavier_initializer(),
                shape=[self.hidden_nums, self.hidden_nums])
            V2 = tf.get_variable(
                'V2',
                dtype=tf.float32,
                initializer=tf.contrib.layers.xavier_initializer(),
                shape=[self.hidden_nums, self.hidden_nums])
            fw_output = tf.reshape(tf.matmul(
                tf.reshape(fw_output, [-1, self.hidden_nums], name='Lai'),
                V1),
                                   shape=tf.shape(output[0]))
            bw_output = tf.reshape(tf.matmul(
                tf.reshape(bw_output, [-1, self.hidden_nums], name='Rai'),
                V2),
                                   shape=tf.shape(output[1]))
            # Join both directions on the feature axis, then apply dropout.
            contact = tf.concat([fw_output, bw_output],
                                -1,
                                name='bi_lstm_concat')
            contact = tf.nn.dropout(contact, self.dropout_keep_prob)
            contact_reshape = tf.reshape(contact,
                                         shape=[-1, 2 * self.hidden_nums],
                                         name='contact')

            W_lstm = tf.get_variable(
                'W_lstm',
                dtype=tf.float32,
                initializer=tf.contrib.layers.xavier_initializer(),
                shape=[2 * self.hidden_nums, self.tag_nums],
                trainable=True)
            b_lstm = tf.get_variable(
                'b_lstm', initializer=tf.zeros(shape=[self.tag_nums]))
            # NOTE(review): the ReLU keeps every tag score non-negative
            # before the CRF — confirm this is intended.
            p = tf.nn.relu(tf.matmul(contact_reshape, W_lstm) + b_lstm)
            self.logit = tf.reshape(
                p,
                shape=[-1, self.sentence_len, self.tag_nums],
                name='omit_matrix')

        with tf.name_scope("crf"):
            log_likelihood, transition_matrix = crf.crf_log_likelihood(
                self.logit, labels, sequence_lengths=self.sequence_lengths)
            self.cost = -tf.reduce_mean(log_likelihood)
            # The first value returned by crf_decode holds the decoded tags.
            self.crf_labels, _ = crf.crf_decode(
                self.logit,
                transition_matrix,
                sequence_length=self.sequence_lengths)
def bils_crf(self):
    """Assemble the BiLSTM-CRF graph: embedding, dropout-wrapped BiLSTM, projection, CRF loss, optimizer.

    Reads ``self.input_x``, ``self.input_y``, ``self.seq_length``,
    ``self.keep_pro`` and ``self.global_step``; sets ``self.embedding``,
    ``self.logits``, ``self.log_likelihood``, ``self.transition_params``,
    ``self.loss`` and ``self.optimizer``.
    """
    with tf.device('/cpu:0'):
        with tf.name_scope('embedding'):
            # NOTE(review): truncated_normal's 2nd/3rd positional arguments
            # are mean and stddev, not a value range — confirm -0.25 / 0.25
            # are meant that way.
            initial_table = tf.truncated_normal(
                [pm.vacab_size, pm.embedding_size], mean=-0.25, stddev=0.25)
            table = tf.Variable(initial_table, name='embedding')
            looked_up = tf.nn.embedding_lookup(table, self.input_x)
            self.embedding = tf.nn.dropout(looked_up,
                                           keep_prob=self.keep_pro)

    with tf.name_scope('Cell'):
        # One dropout-wrapped LSTM cell per direction.
        raw_forward = tf.nn.rnn_cell.LSTMCell(pm.hidden_dim,
                                              state_is_tuple=True)
        forward_cell = tf.contrib.rnn.DropoutWrapper(raw_forward,
                                                     self.keep_pro)
        raw_backward = tf.nn.rnn_cell.LSTMCell(pm.hidden_dim,
                                               state_is_tuple=True)
        backward_cell = tf.contrib.rnn.DropoutWrapper(raw_backward,
                                                      self.keep_pro)

    with tf.name_scope('biLSTM'):
        directional_outputs, _ = tf.nn.bidirectional_dynamic_rnn(
            cell_fw=forward_cell,
            cell_bw=backward_cell,
            inputs=self.embedding,
            sequence_length=self.seq_length,
            dtype=tf.float32)
        merged = tf.concat(directional_outputs, 2)

    with tf.name_scope('output'):
        dynamic_shape = tf.shape(merged)
        # Flatten time into the batch axis, project to pm.num_tags scores,
        # then restore the [batch, time, tags] layout.
        flat = tf.reshape(merged, [-1, 2 * pm.hidden_dim])
        flat = tf.layers.dense(flat, pm.num_tags)
        flat = tf.contrib.layers.dropout(flat, self.keep_pro)
        self.logits = tf.reshape(flat, [-1, dynamic_shape[1], pm.num_tags])

    with tf.name_scope('crf'):
        # inputs are the per-token tag scores, tag_indices the gold tags and
        # sequence_lengths the true lengths; the call returns the
        # log-likelihood and the transition matrix it used.
        crf_result = crf_log_likelihood(inputs=self.logits,
                                        tag_indices=self.input_y,
                                        sequence_lengths=self.seq_length)
        self.log_likelihood, self.transition_params = crf_result

    with tf.name_scope('loss'):
        # Negative log-likelihood, so it can be minimized by gradient descent.
        self.loss = tf.reduce_mean(-self.log_likelihood)

    with tf.name_scope('optimizer'):
        adam = tf.train.AdamOptimizer(pm.learning_rate)
        # Clip by global norm, then apply; apply_gradients bumps global_step.
        raw_grads, train_vars = zip(*adam.compute_gradients(self.loss))
        clipped_grads, _ = tf.clip_by_global_norm(raw_grads, pm.clip)
        self.optimizer = adam.apply_gradients(zip(clipped_grads, train_vars),
                                              global_step=self.global_step)


def build_gcn(self, input_shape):
    """Create the zero-initialized weight ``self.wei`` of shape [fea_dim, out_dim].

    ``input_shape`` is accepted but not used.
    """
    self.wei = self.add_variable(name='wei',
                                 shape=[self.fea_dim, self.out_dim],
                                 initializer=tf.zeros_initializer())


def call_gcn(self, inputs, support):
    """Return ``sigmoid((support @ inputs) @ self.wei)`` computed in float32."""
    features = tf.cast(inputs, dtype=tf.float32)
    adjacency = tf.cast(support, dtype=tf.float32)
    propagated = tf.matmul(adjacency, features)
    return tf.sigmoid(tf.matmul(propagated, self.wei))
def loss_op(self):
    """Set ``self.loss`` to the negated mean CRF log-likelihood.

    Also stores the transition matrix returned by ``crf_log_likelihood`` in
    ``self.transition_params``.
    """
    crf_result = crf_log_likelihood(
        inputs=self.logits,
        tag_indices=self.labels,
        sequence_lengths=self.sequence_lengths)
    sequence_log_likelihood, self.transition_params = crf_result
    mean_log_likelihood = tf.reduce_mean(sequence_log_likelihood)
    self.loss = -mean_log_likelihood
# -*- coding: utf-8 -*-
def build_tagging_graph(self, inputs, hidden_layers, channels, num_tags,
                        use_crf, lamd, dropout_emb, dropout_hidden,
                        kernel_size, use_bn, use_wn, active_type):
    """
    Build a deep convolutional model for sequence tagging.

    Each hidden layer computes two 1-D convolutions ``w`` and ``v`` of the
    previous output and combines them according to ``active_type``.

    :param inputs: embedded input; its last static dimension is used as the
        channel count of the first layer.
    :param hidden_layers: number of convolutional layers.
    :param channels: output channel count per layer, indexed by layer.
    :param num_tags: number of output tags.
    :param use_crf: use a CRF loss if true, masked softmax cross-entropy otherwise.
    :param lamd: L2 penalty coefficient; no penalty when not positive.
    :param dropout_emb: dropout rate on the inputs (falsy disables it).
    :param dropout_hidden: dropout rate on each hidden output (falsy disables it).
    :param kernel_size: width of the convolution filters.
    :param use_bn: apply batch normalization to ``w`` and ``v``.
    :param use_wn: apply weight normalization to the filters.
    :param active_type: one of 'glu', 'relu', 'gtu', 'tanh', 'linear',
        'bilinear'; any other value leaves the layer input unchanged.
    :return: (stag_ids, seq_lengths, is_train, cost, train_cost, scores, summaries)
    """
    stag_ids = tf.placeholder(dtype=INT_TYPE,
                              shape=[None, None],
                              name='stag_ids')
    seq_lengths = tf.placeholder(dtype=INT_TYPE,
                                 shape=[None],
                                 name='seq_lengths')

    # Default is not train.
    is_train = tf.placeholder(dtype=tf.bool, shape=[], name='is_train')

    # 1.0 for real tokens, 0.0 for padding.
    masks = tf.cast(tf.sequence_mask(seq_lengths), FLOAT_TYPE)

    # Dropout on embedding output (training only).
    if dropout_emb:
        inputs = tf.cond(is_train,
                         lambda: tf.nn.dropout(inputs, 1 - dropout_emb),
                         lambda: inputs)

    hidden_output = inputs
    pre_channels = inputs.get_shape()[-1].value
    for i in range(hidden_layers):
        k = kernel_size
        cur_channels = channels[i]
        filter_w = tf.get_variable('filter_w_%d' % i,
                                   shape=[k, pre_channels, cur_channels],
                                   dtype=FLOAT_TYPE)
        filter_v = tf.get_variable('filter_v_%d' % i,
                                   shape=[k, pre_channels, cur_channels],
                                   dtype=FLOAT_TYPE)
        bias_b = tf.get_variable(
            'bias_b_%d' % i,
            shape=[cur_channels],
            initializer=tf.zeros_initializer(dtype=FLOAT_TYPE))
        bias_c = tf.get_variable(
            'bias_c_%d' % i,
            shape=[cur_channels],
            initializer=tf.zeros_initializer(dtype=FLOAT_TYPE))

        # Weight normalization: rescale each filter by a learned gain over
        # its L2 norm along the input-channel axis.
        if use_wn:
            epsilon = 1e-12
            g_w = tf.get_variable('g_w_%d' % i,
                                  shape=[k, 1, cur_channels],
                                  dtype=FLOAT_TYPE)
            g_v = tf.get_variable('g_v_%d' % i,
                                  shape=[k, 1, cur_channels],
                                  dtype=FLOAT_TYPE)
            # Perform wn
            filter_w = g_w * filter_w / (tf.sqrt(
                tf.reduce_sum(filter_w**2, 1, keep_dims=True)) + epsilon)
            filter_v = g_v * filter_v / (tf.sqrt(
                tf.reduce_sum(filter_v**2, 1, keep_dims=True)) + epsilon)

        w = tf.nn.conv1d(hidden_output, filter_w, 1, 'SAME') + bias_b
        v = tf.nn.conv1d(hidden_output, filter_v, 1, 'SAME') + bias_c

        if use_bn:
            # Normalize each branch from its OWN pre-activation. The inputs
            # were previously swapped, which discarded the `w` convolution
            # and made `v` a double-normalized copy of itself.
            w = layers.batch_norm(inputs=w,
                                  decay=0.9,
                                  is_training=is_train,
                                  center=True,
                                  scale=True,
                                  scope='BatchNorm_w_%d' % i)
            v = layers.batch_norm(inputs=v,
                                  decay=0.9,
                                  is_training=is_train,
                                  center=True,
                                  scale=True,
                                  scope='BatchNorm_v_%d' % i)

        if active_type == 'glu':
            hidden_output = w * tf.nn.sigmoid(v)
        elif active_type == 'relu':
            hidden_output = tf.nn.relu(w)
        elif active_type == 'gtu':
            hidden_output = tf.tanh(w) * tf.nn.sigmoid(v)
        elif active_type == 'tanh':
            hidden_output = tf.tanh(w)
        elif active_type == 'linear':
            hidden_output = w
        elif active_type == 'bilinear':
            hidden_output = w * v

        # Mask paddings.
        hidden_output = hidden_output * tf.expand_dims(masks, -1)
        # Dropout on hidden output (training only).
        if dropout_hidden:
            hidden_output = tf.cond(
                is_train,
                lambda: tf.nn.dropout(hidden_output, 1 - dropout_hidden),
                lambda: hidden_output)

        pre_channels = cur_channels

    # Un-scaled log probabilities.
    scores = layers.fully_connected(hidden_output, num_tags, tf.identity)

    if use_crf:
        cost, transitions = crf.crf_log_likelihood(
            inputs=scores,
            tag_indices=stag_ids,
            sequence_lengths=seq_lengths)
        cost = -tf.reduce_mean(cost)
    else:
        reshaped_scores = tf.reshape(scores, [-1, num_tags])
        reshaped_stag_ids = tf.reshape(stag_ids, [-1])
        real_distribution = layers.one_hot_encoding(reshaped_stag_ids,
                                                    num_tags)
        # Named arguments are mandatory in TF >= 1.0 and remove any doubt
        # about which tensor is the labels and which the logits.
        cost = tf.nn.softmax_cross_entropy_with_logits(
            labels=real_distribution, logits=reshaped_scores)
        # Sum over real tokens only, averaged over the batch size.
        cost = tf.reduce_sum(
            tf.reshape(cost, tf.shape(stag_ids)) * masks) / tf.cast(
                tf.shape(inputs)[0], FLOAT_TYPE)

    # Calculate L2 penalty over every trainable variable whose name does
    # not contain '/B:' or '/biases:'.
    l2_penalty = 0
    if lamd > 0:
        for var in tf.trainable_variables():
            if '/B:' not in var.name and '/biases:' not in var.name:
                l2_penalty += lamd * tf.nn.l2_loss(var)
    train_cost = cost + l2_penalty

    # Summary cost.
    tf.summary.scalar('cost', cost)

    summaries = tf.summary.merge_all()

    # Make the returned cost trigger pending update ops
    # (e.g. batch-norm moving averages).
    update_ops = tf.get_collection(tf.GraphKeys.UPDATE_OPS)
    if update_ops:
        updates = tf.group(*update_ops)
        with tf.control_dependencies([updates]):
            cost = tf.identity(cost)

    return stag_ids, seq_lengths, is_train, cost, train_cost, scores, summaries
def __init__(self,
             config,
             features,
             dropout_keep_prob,
             init_embeddings=None):
    """Build the dictionary-attention BiLSTM-CRF model.

    A character BiLSTM produces per-position features; a sigmoid layer on
    top of them gates the dictionary features, which are fed through a
    second BiLSTM. Both outputs are concatenated and projected to
    ``config.num_classes`` scores for the CRF.

    Args:
        config: model configuration; ``embedding_size``, ``hidden_size``,
            ``num_hidden_layers``, ``num_classes`` and ``multitag`` are read.
        features: dict with the entries "input_ids", "input_dicts" (rank-3,
            checked with ``expected_rank=3``), "seq_length" and "label_ids".
        dropout_keep_prob: keep probability for the input dropout and the
            LSTM cells.
        init_embeddings: (optional) passed on to
            ``model_utils.input_embedding``.

    Sets ``self.prediction`` (CRF-decoded tags), ``self.log_likelihood`` and
    ``self.loss`` (mean negative log-likelihood).
    """
    # `super(AttendedDictModel).__init__()` builds an unbound super object
    # and never runs the parent initializer; bind it to `self`.
    super(AttendedDictModel, self).__init__()

    input_ids = features["input_ids"]
    input_dicts = features["input_dicts"]
    seq_length = features["seq_length"]
    label_ids = features["label_ids"]

    self.label_ids = label_ids
    self.dict = input_dicts
    self.seq_length = seq_length

    dict_shape = model_utils.get_shape_list(input_dicts, expected_rank=3)
    self.dict_dim = dict_shape[2]

    x, batch_size, feat_size = model_utils.input_embedding(
        input_ids, config, init_embeddings=init_embeddings)
    # Merge the per-position window of embeddings into one feature vector.
    x = tf.reshape(x, [batch_size, -1, feat_size * config.embedding_size])
    x = tf.nn.dropout(x, dropout_keep_prob)

    with tf.variable_scope('character'):
        (forward_output,
         backword_output), _ = tf.nn.bidirectional_dynamic_rnn(
             cell_fw=model_utils.multi_lstm_cell(config.hidden_size,
                                                 config.num_hidden_layers,
                                                 dropout_keep_prob),
             cell_bw=model_utils.multi_lstm_cell(config.hidden_size,
                                                 config.num_hidden_layers,
                                                 dropout_keep_prob),
             inputs=x,
             sequence_length=self.seq_length,
             dtype=tf.float32)
        output = tf.concat([forward_output, backword_output], axis=2)

    with tf.variable_scope('dict_attention'):
        # Sigmoid gate in (0, 1) for every dictionary feature.
        dict_attention = layers.fully_connected(inputs=output,
                                                num_outputs=self.dict_dim,
                                                activation_fn=tf.sigmoid)
        # [B, L, D]
        self.dict = tf.cast(self.dict, dtype=tf.float32)
        attend_dict = tf.multiply(self.dict, dict_attention)

    with tf.variable_scope('dict'):
        (forward_output,
         backword_output), _ = tf.nn.bidirectional_dynamic_rnn(
             cell_fw=model_utils.multi_lstm_cell(config.hidden_size,
                                                 config.num_hidden_layers,
                                                 dropout_keep_prob),
             cell_bw=model_utils.multi_lstm_cell(config.hidden_size,
                                                 config.num_hidden_layers,
                                                 dropout_keep_prob),
             inputs=attend_dict,
             sequence_length=self.seq_length,
             dtype=tf.float32)
        dict_output = tf.concat([forward_output, backword_output], axis=2)

    with tf.variable_scope('output'):
        output = tf.concat([dict_output, output], axis=2)
        scores = layers.fully_connected(inputs=output,
                                        num_outputs=config.num_classes,
                                        activation_fn=None)
        transition_param = tf.get_variable(
            "transitions", [config.num_classes, config.num_classes])
        self.prediction, _ = crf.crf_decode(scores, transition_param,
                                            self.seq_length)

    with tf.variable_scope('loss'):
        # crf
        if config.multitag:
            # The multi-tag likelihood is given the labels cast to bool.
            self.label_ids = tf.cast(self.label_ids, dtype=tf.bool)
            self.log_likelihood, _ = model_utils.crf_multitag_log_likelihood(
                scores, self.label_ids, self.seq_length, transition_param)
        else:
            self.log_likelihood, _ = crf.crf_log_likelihood(
                scores, self.label_ids, self.seq_length, transition_param)

        self.loss = tf.reduce_mean(-self.log_likelihood)
def __init__(self, config, char_embeddings):
    """Build a character (+ optional segment feature) BiLSTM-CRF tagger.

    ``config`` supplies the hyper-parameters copied onto the instance below.
    ``char_embeddings`` is the initial character embedding table; when it is
    None a ``[num_chars, char_dim]`` variable is created instead.

    Sets ``self.logits``, ``self.transition_params``, ``self.loss``,
    ``self.global_step`` and ``self.train_op``.
    """
    # Hyper-parameters.
    self.config = config
    self.lr = config.lr
    self.l2_lamda = config.l2_lamda
    self.clip = config.clip
    self.char_dim = config.char_dim
    self.lstm_dim = config.lstm_dim
    self.seg_dim = config.seg_dim
    self.num_tags = config.num_tags
    self.num_chars = config.num_chars
    self.num_segs = config.num_segs

    # Placeholders.
    self.char_inputs = tf.placeholder(dtype=tf.int32,
                                      shape=[None, None],
                                      name='CharInputs')
    self.seg_inputs = tf.placeholder(dtype=tf.int32,
                                     shape=[None, None],
                                     name='SegInputs')
    self.tags = tf.placeholder(dtype=tf.int32,
                               shape=[None, None],
                               name='Tags')
    self.dropout_keep = tf.placeholder(dtype=tf.float32,
                                       name='Dropout_keep')

    # True length of each sequence: the count of character ids greater than 0.
    zero_ids = tf.zeros_like(self.char_inputs)
    is_real_token = tf.greater(self.char_inputs, zero_ids)
    self.lengths = tf.reduce_sum(tf.cast(is_real_token, tf.int32), 1)
    input_shape = tf.shape(self.char_inputs)
    self.batch_size = input_shape[0]
    self.max_length = tf.shape(self.char_inputs)[1]

    # Embedding layer.
    with tf.variable_scope("embedding_layer"):
        if char_embeddings is not None:
            self.char_embeddings = tf.Variable(char_embeddings,
                                               name='char_embeddings',
                                               dtype=tf.float32)
        else:
            self.char_embeddings = tf.get_variable(
                name='char_embeddings',
                shape=[self.num_chars, self.char_dim],
                dtype=tf.float32)
        embedded_chars = tf.nn.embedding_lookup(self.char_embeddings,
                                                self.char_inputs)
        features = embedded_chars
        if self.config.seg_dim > 0:
            # Append the segment-feature embedding to every character.
            self.seg_embeddings = tf.get_variable(
                name='seg_embeddings', shape=[self.num_segs, self.seg_dim])
            embedded_segs = tf.nn.embedding_lookup(self.seg_embeddings,
                                                   self.seg_inputs)
            features = tf.concat([embedded_chars, embedded_segs], axis=-1)

    # Dropout on the LSTM inputs.
    dropped_features = tf.nn.dropout(features, keep_prob=self.dropout_keep)

    # BiLSTM layer.
    with tf.variable_scope("bilstm_layer"):
        forward_cell = rnn.LSTMCell(num_units=self.lstm_dim)
        backward_cell = rnn.LSTMCell(num_units=self.lstm_dim)
        directional_outputs, _ = tf.nn.bidirectional_dynamic_rnn(
            cell_fw=forward_cell,
            cell_bw=backward_cell,
            inputs=dropped_features,
            sequence_length=self.lengths,
            dtype=tf.float32)
        forward_out, backward_out = directional_outputs
        encoded = tf.concat([forward_out, backward_out], axis=2)

    # Projection layer: linear map to per-token tag scores.
    self.logits = layers.fully_connected(inputs=encoded,
                                         num_outputs=self.num_tags,
                                         activation_fn=None,
                                         scope='project_layer')

    # CRF layer.
    with tf.variable_scope("crf_layer"):
        crf_result = crf.crf_log_likelihood(inputs=self.logits,
                                            tag_indices=self.tags,
                                            sequence_lengths=self.lengths)
        log_likelihood, self.transition_params = crf_result
        self.loss = tf.reduce_mean(-log_likelihood)

    # Summary.
    tf.summary.scalar("loss", self.loss)

    # Train op: Adam on gradients clipped by global norm.
    self.global_step = tf.Variable(0, trainable=False)
    adam = tf.train.AdamOptimizer(self.lr)
    train_vars = tf.trainable_variables()
    raw_grads = tf.gradients(self.loss, train_vars)
    clipped_grads, _ = tf.clip_by_global_norm(raw_grads, self.clip)
    self.train_op = adam.apply_gradients(zip(clipped_grads, train_vars),
                                         self.global_step)
def run(self):
    """Train the BiLSTM-CRF tagger, then decode and score the test set.

    Builds the graph, trains for ``self.EPOCH_NUM`` epochs (each batch is
    flattened into a single sequence and a checkpoint is saved after every
    batch), writes one Viterbi-decoded tag sequence per test example to
    ``resultnew.txt`` and prints precision, recall and F1.
    """
    data_loader = DataLoader(self.TRAIN_DATA_PATH, self.TRAIN_LABEL_PATH,
                             self.TEST_DATA_PATH, self.TEST_LABEL_PATH)
    x_data, y_data = data_loader.get_train_data()
    test_x_data, test_y_data = data_loader.get_test_data()
    vocab_size = data_loader.vocab_size

    test_gt_list = []
    test_res_list = []

    graph = tf.Graph()
    with graph.as_default():
        # The graph always processes a single sequence at a time.
        words = tf.placeholder(tf.int32, shape=[1, None], name="words")
        labels = tf.placeholder(tf.int32, shape=[1, None], name="labels")
        sequence_lengths = tf.placeholder(tf.int32,
                                          shape=[None],
                                          name="sequence_lengths")

        embeddings = tf.Variable(tf.random_uniform(
            [vocab_size, self.WORD_DIM], -1.0, 1.0),
                                 name="embeddings_o")
        embeddings = tf.nn.l2_normalize(embeddings,
                                        1,
                                        name="embeddings_norm")
        word_embeddings = tf.nn.embedding_lookup(embeddings,
                                                 words,
                                                 name="word_embeddings")

        cell_fw = LSTMCell(self.HIDDEN_DIM)
        cell_bw = LSTMCell(self.HIDDEN_DIM)
        (output_fw_seq,
         output_bw_seq), _ = tf.nn.bidirectional_dynamic_rnn(
             cell_fw=cell_fw,
             cell_bw=cell_bw,
             inputs=word_embeddings,
             dtype="float32")
        output = tf.concat([output_fw_seq, output_bw_seq], axis=-1)

        W = tf.get_variable(
            name="W",
            shape=[2 * self.HIDDEN_DIM, self.NUM_TAG],
            initializer=tf.contrib.layers.xavier_initializer(),
            dtype=tf.float32)
        b = tf.get_variable(name="b",
                            shape=[self.NUM_TAG],
                            initializer=tf.zeros_initializer(),
                            dtype=tf.float32)

        # Flatten time into the batch axis for the projection, then restore.
        s = tf.shape(output)
        output = tf.reshape(output, [-1, 2 * self.HIDDEN_DIM])
        pred = tf.matmul(output, W) + b
        logits = tf.reshape(pred, [-1, s[1], self.NUM_TAG], name="logits")

        # NOTE(review): this variable is overwritten below and never used by
        # the model; it is kept only so the set of saved variables does not
        # change.
        transition_params_copy = tf.get_variable(
            name="transition_params",
            shape=[self.NUM_TAG, self.NUM_TAG],
            initializer=tf.zeros_initializer(),
            dtype=tf.float32)
        log_likelihood, transition_params = crf_log_likelihood(
            inputs=logits,
            tag_indices=labels,
            sequence_lengths=sequence_lengths)
        transition_params_copy = transition_params

        loss = -tf.reduce_mean(log_likelihood)
        optimizer = tf.train.AdamOptimizer(
            learning_rate=0.001).minimize(loss)
        saver = tf.train.Saver(max_to_keep=1)

    with tf.Session(graph=graph) as sess:
        sess.run(tf.global_variables_initializer())
        for epoch in range(self.EPOCH_NUM):
            print("epoch: ", epoch)
            batch = 0
            self.BATCH_NUM = 0
            while True:
                batch += 1
                batch_x, batch_y = self.next_batch(x_data, y_data)
                self.BATCH_NUM = self.BATCH_NUM + 1

                # The whole batch is fed as ONE sequence: concatenate every
                # sentence and use the total token count as its length.
                seq_len = np.array([sum(len(seq) for seq in batch_x)])
                reshape_x = np.array(
                    [tok for seq in batch_x for tok in seq]).reshape(1, -1)
                reshape_y = np.array(
                    [tag for seq in batch_y for tag in seq]).reshape(1, -1)

                feed_dict = {
                    words: reshape_x,
                    labels: reshape_y,
                    sequence_lengths: seq_len
                }
                train_loss, _ = sess.run([loss, optimizer], feed_dict)
                print("batch: ", batch)
                print("train_loss: ", train_loss)
                saver.save(sess, 'ckpt/BiLSTM_CRF.ckpt', global_step=batch)

                # check_end is not set here; presumably next_batch raises it
                # when the epoch's data is exhausted — verify.
                if self.check_end == 1:
                    self.check_end = 0
                    break

        print("testing----------------------")
        # `with` guarantees the result file is flushed and closed, even if
        # decoding raises part-way through.
        with open("resultnew.txt", "w") as result_file:
            for i in range(test_x_data.shape[0]):
                seq_len = np.array([test_x_data[i].shape[0]])
                batch_xdata = test_x_data[i].reshape(1, -1)
                batch_ydata = test_y_data[i].reshape(1, -1)
                test_gt_list.extend(test_y_data[i])

                feed_dict = {
                    words: batch_xdata,
                    labels: batch_ydata,
                    sequence_lengths: seq_len
                }
                temp_logits, temp_transition_params = sess.run(
                    [logits, transition_params], feed_dict=feed_dict)
                viterbi_seq, _ = viterbi_decode(
                    temp_logits[0][:seq_len[0]], temp_transition_params)
                test_res_list.extend(viterbi_seq)
                result_file.write(str(viterbi_seq))
                result_file.write('\n')

        if len(test_gt_list) != len(test_res_list):
            print("test error!")

        precision, recall, f1 = self.evaluation(test_gt_list, test_res_list)
        print("Average Precision: ", precision)
        print("Average Recall: ", recall)
        print("Average F1: ", f1)
def loss_op(self):
    """Attach the CRF training loss to the graph.

    Runs ``crf_log_likelihood`` on ``self.logits`` / ``self.labels`` with
    ``self.sequence_lengths``, stores the returned transition matrix on
    ``self.transition_params``, sets ``self.loss`` to the mean negative
    log-likelihood and registers it as the scalar summary ``"loss"``.
    """
    crf_outputs = crf_log_likelihood(
        inputs=self.logits,
        tag_indices=self.labels,
        sequence_lengths=self.sequence_lengths,
    )
    sequence_ll, self.transition_params = crf_outputs
    self.loss = tf.reduce_mean(-sequence_ll)
    tf.summary.scalar("loss", self.loss)
def sequence_labeler(hidden, targets, n_targets, config, pad_id,
                     multilabel=False, train=False, reuse=None,
                     pool_idx=None, **kwargs):
    """
    CRF sequence-labeling head on top of featurizer output.

    When ``config.base_model.is_bidirectional`` is false, the features first
    pass through an extra, randomly initialised multi-head attention block
    (called with ``mask=False``, i.e. not future-masked, so labels can use
    context from both directions) with a residual connection and a norm
    layer. For bidirectional base models the features are used as they are.
    A dense layer then produces per-token class scores which feed a CRF.

    :param hidden: The output of the featurizer. [batch_size, sequence_length, embed_dim]
    :param targets: The sequence labeling targets, or None when no loss is wanted.
        [batch_size, sequence_length]; in multilabel mode it is unstacked into
        ``n_targets`` slices along its last axis.
    :param n_targets: A python int, the number of classes to predict over.
    :param config: A config object. Read here: ``n_embed``, ``use_auxiliary_info``,
        ``n_context_embed``, ``base_model``, ``seq_num_heads``, ``resid_p_drop``,
        ``attn_p_drop``, ``low_memory_mode``.
    :param pad_id: Index of the pad class; in multilabel mode its logit is paired with
        each class logit and no likelihood term is added for it.
    :param multilabel: If true, one 2-state CRF is built per class instead of a single
        ``n_targets``-state CRF.
    :param train: Forwarded to the attention block; also gates gradient recomputation
        (``config.low_memory_mode``) and class re-weighting.
    :param reuse: Should reuse be set within this scope.
    :param pool_idx: Optional per-example sequence lengths. Entries equal to 0, or a
        value of None, fall back to ``kwargs["max_length"]``.
    :param kwargs: Spare arguments. ``max_length`` is required; ``class_weights`` is
        optional.
    :return: dict containing:
        "logits": The un-normalised log probabilities of each class being in each location. For usable predictions,
            sampling from this distribution is not sufficient and a viterbi decoding method should be used.
        "losses": The negative log likelihood for the sequence targets.
        "predict_params": A dictionary of params to be fed to the viterbi decode function.
    """
    with tf.variable_scope("sequence-labeler", reuse=reuse):
        if targets is not None:
            targets = tf.cast(targets, dtype=tf.int32)

        # Width handed to the attention block as n_state.
        nx = config.n_embed
        if config.use_auxiliary_info:
            nx += config.n_context_embed

        def seq_lab_internal(features):
            if not config.base_model.is_bidirectional:
                extra_attention = functools.partial(
                    attn,
                    scope="seq_label_attn",
                    n_state=nx,
                    n_head=config.seq_num_heads,
                    resid_pdrop=config.resid_p_drop,
                    attn_pdrop=config.attn_p_drop,
                    train=train,
                    scale=False,
                    mask=False,
                )
                encoded = norm(extra_attention(features) + features,
                               "seq_label_residual")
            else:
                encoded = features
            token_scores = tf.layers.dense(encoded, n_targets)
            # Leading two dims taken from the input, last dim is n_targets.
            scores_shape = tf.concat([tf.shape(features)[:2], [n_targets]], 0)
            return tf.reshape(token_scores, scores_shape)

        with tf.variable_scope("seq_lab_attn"):
            if config.low_memory_mode and train:
                seq_lab_internal = recompute_grad(seq_lab_internal,
                                                  use_entire_scope=True)
            logits = seq_lab_internal(hidden)

        class_weights = kwargs.get("class_weights")
        if class_weights is not None and train:
            # Pick each token's weight from its target class.
            broadcast_weights = tf.reshape(class_weights, [1, 1, -1])
            weighted_one_hot = broadcast_weights * tf.one_hot(targets,
                                                              depth=n_targets)
            token_weights = tf.reduce_sum(weighted_one_hot, axis=-1,
                                          keep_dims=True)
            logits = class_reweighting(token_weights)(logits)

        log_likelihood = 0.0

        # One max_length entry per example, used where no length is given.
        fallback_lengths = kwargs.get("max_length") * tf.ones(
            tf.shape(hidden)[0], dtype=tf.int32)
        if pool_idx is not None:
            pool_idx = tf.where(
                tf.equal(pool_idx, 0),
                fallback_lengths,
                tf.cast(pool_idx, dtype=tf.int32),
            )
        else:
            pool_idx = fallback_lengths

        with tf.device("CPU:0"):
            if not multilabel:
                transition_params = tf.get_variable(
                    "Transition_matrix", shape=[n_targets, n_targets])
                if targets is not None:
                    log_likelihood, _ = crf_log_likelihood(
                        logits, targets, pool_idx,
                        transition_params=transition_params)
            else:
                transition_params = []
                class_logits = tf.unstack(logits, n_targets, axis=-1)
                if targets is not None:
                    class_targets = tf.unstack(targets, n_targets, axis=-1)
                paired_logits = []
                for i in range(n_targets):
                    class_transitions = tf.get_variable(
                        "Transition_matrix_{}".format(i), shape=[2, 2])
                    transition_params.append(class_transitions)
                    # Two-way problem per class: pad logit vs. class logit.
                    pad_vs_class = tf.stack(
                        (class_logits[pad_id], class_logits[i]), axis=-1)
                    paired_logits.append(pad_vs_class)
                    if targets is not None and i != pad_id:
                        log_likelihood += crf_log_likelihood(
                            pad_vs_class,
                            class_targets[i],
                            pool_idx,
                            transition_params=class_transitions,
                        )[0]
                logits = tf.stack(paired_logits, axis=-1)

        return {
            "logits": logits,
            "losses": -log_likelihood,
            "predict_params": {
                "transition_matrix": transition_params
            },
        }
def __init__(self, is_trainning, config):
    """Build the (optional CNN +) BiLSTM + CRF tagging graph.

    :param is_trainning: When truthy, dropout is applied (if
        ``config.keep_prob < 1``) and the training ops (clipped-gradient
        SGD plus a learning-rate update op) are added to the graph.
    :param config: Hyper-parameter object. Read here: ``batch_size``,
        ``num_steps``, ``num_classes``, ``hidden_size``, ``vocab_size``,
        ``keep_prob``, ``num_layers``, ``max_grad_norm``.

    ``FLAGS.cnn_option`` selects the architecture:
      * 1 - plain BiLSTM over the embeddings (no CNN branch is built);
      * 2 - CNN features are concatenated to the embeddings before the BiLSTM;
      * 3 - a second BiLSTM runs over the CNN features and its output is
        concatenated with the main BiLSTM output before the projection.

    Attributes set include ``_input_data`` / ``_targets`` placeholders,
    ``length`` (count of positive ids per row of ``_input_data``),
    ``tags_scores``, ``trans``, ``loss`` / ``_cost`` and, when training,
    ``_train_op``, ``_learning_rate``, ``_new_learning_rate`` and
    ``_learning_rate_update``.
    """
    self.batch_size = batch_size = config.batch_size
    self.num_steps = num_steps = config.num_steps
    self.num_classes = num_classes = config.num_classes
    self._logits = []
    size = config.hidden_size
    vocab_size = config.vocab_size
    use_dropout = is_trainning and config.keep_prob < 1

    self._input_data = tf.placeholder(tf.int32, [batch_size, num_steps])
    self._targets = tf.placeholder(tf.int32, [batch_size, num_steps])
    self.initializer = initializers.xavier_initializer()

    def _stacked_lstm(hidden_units):
        # Build one multi-layer LSTM with a *separate* cell per layer.
        # Repeating a single cell object ([cell] * n) makes every layer use
        # the same cell (shared weights, or an error on some TF 1.x
        # releases) instead of n independent layers.
        layers = []
        for _ in range(config.num_layers):
            cell = tf.nn.rnn_cell.BasicLSTMCell(
                hidden_units, forget_bias=1.0, state_is_tuple=True)
            if use_dropout:
                cell = tf.nn.rnn_cell.DropoutWrapper(
                    cell=cell, input_keep_prob=1.0,
                    output_keep_prob=config.keep_prob)
            layers.append(cell)
        return tf.nn.rnn_cell.MultiRNNCell(layers, state_is_tuple=True)

    with tf.device("/cpu:0"):
        embedding = tf.get_variable("embedding", [vocab_size, size],
                                    dtype=data_type())
        inputs = tf.nn.embedding_lookup(embedding, self._input_data)

    if FLAGS.cnn_option != 1:
        with tf.variable_scope("CNN"):
            reshaped_inputs = tf.reshape(inputs,
                                         [batch_size, num_steps, -1, 1])
            # reshaped_inputs.shape[2] is the embedding width
            # (original author's note: "actually 200").
            filter_weight = tf.get_variable(
                'weights', [3, reshaped_inputs.shape[2], 1, 1],
                initializer=tf.truncated_normal_initializer(stddev=0.1))
            biases = tf.get_variable(
                'biases', [1], initializer=tf.constant_initializer(0.0))
            conv = tf.nn.conv2d(reshaped_inputs, filter_weight,
                                strides=[1, 1, 1, 1], padding='SAME')
            relu = tf.nn.relu(tf.nn.bias_add(conv, biases))
            relu = tf.reshape(relu, [batch_size, num_steps, -1])

    # Length of each sample = number of positive ids in the row. Computed
    # for every cnn_option because the main BiLSTM and the CRF both need it.
    self.length = tf.reduce_sum(tf.sign(self._input_data),
                                reduction_indices=1)
    self.length = tf.cast(self.length, tf.int32)

    if FLAGS.cnn_option == 2:
        inputs1 = relu
        inputs = tf.concat([inputs, inputs1], 2)
        size = size * 2

    # ========================= CNN BILSTM
    if FLAGS.cnn_option == 3:
        inputs1 = relu
        if use_dropout:
            inputs1 = tf.nn.dropout(relu, config.keep_prob)

        # Stack multiple LSTM layers.
        cell_fw1 = _stacked_lstm(size)
        cell_bw1 = _stacked_lstm(size)
        self._initial_state_fw1 = initial_state_fw1 = cell_fw1.zero_state(
            batch_size, data_type())
        self._initial_state_bw1 = initial_state_bw1 = cell_bw1.zero_state(
            batch_size, data_type())

        inputs1 = tf.unstack(inputs1, num_steps, 1)
        # sequence_length is left out here: after the convolution the
        # effective per-sample length is no longer well defined.
        outputs1, _, _ = tf.contrib.rnn.static_bidirectional_rnn(
            cell_fw1, cell_bw1, inputs1,
            initial_state_fw=initial_state_fw1,
            initial_state_bw=initial_state_bw1,
            dtype=tf.float32, scope="cnn_rnn")
        output1 = tf.reshape(tf.concat(outputs1, 1), [-1, size * 2])
    # ========================= end

    if use_dropout:
        inputs = tf.nn.dropout(inputs, config.keep_prob)

    # Stack multiple LSTM layers.
    cell_fw = _stacked_lstm(size)
    cell_bw = _stacked_lstm(size)
    self._initial_state_fw = initial_state_fw = cell_fw.zero_state(
        batch_size, data_type())
    self._initial_state_bw = initial_state_bw = cell_bw.zero_state(
        batch_size, data_type())

    inputs = tf.unstack(inputs, num_steps, 1)
    outputs, _, _ = tf.contrib.rnn.static_bidirectional_rnn(
        cell_fw, cell_bw, inputs,
        initial_state_fw=initial_state_fw,
        initial_state_bw=initial_state_bw,
        dtype=tf.float32, sequence_length=self.length)

    # The RNN's own final states are discarded above; the "final state"
    # attributes set further down alias the zero initial states.
    state_fw = self._initial_state_fw
    state_bw = self._initial_state_bw

    output = tf.reshape(tf.concat(outputs, 1), [-1, size * 2])
    if FLAGS.cnn_option == 3:
        size = size * 2
        final_output = tf.concat([output, output1], 1)

    # Project to one score per class. The width must be num_classes (it
    # used to be a literal 5) because the result is reshaped to
    # [batch_size, num_steps, num_classes] right below.
    weight = tf.get_variable("weight", [size * 2, num_classes],
                             dtype=data_type())
    bias = tf.get_variable("bias", [num_classes], dtype=data_type())
    if FLAGS.cnn_option != 3:
        logits = tf.matmul(output, weight) + bias
    else:
        logits = tf.matmul(final_output, weight) + bias
    self.tags_scores = tf.reshape(logits,
                                  [batch_size, num_steps, num_classes])

    # Pad logits for the CRF loss: add one extra tag (index num_classes)
    # and prepend one step whose only non-penalised score is that extra
    # tag; real steps get `small` for the extra tag.
    small = -1000.0
    start_logits = tf.concat([
        small * tf.ones(shape=[self.batch_size, 1, self.num_classes]),
        tf.zeros(shape=[self.batch_size, 1, 1])
    ], axis=-1)
    pad_logits = tf.cast(
        small * tf.ones([self.batch_size, self.num_steps, 1]), tf.float32)
    logits = tf.concat([self.tags_scores, pad_logits], axis=-1)
    logits = tf.concat([start_logits, logits], axis=1)
    targets = tf.concat([
        tf.cast(self.num_classes * tf.ones([self.batch_size, 1]), tf.int32),
        self._targets
    ], axis=-1)

    self.trans = tf.get_variable(
        "transitions",
        shape=[self.num_classes + 1, self.num_classes + 1],
        initializer=self.initializer)
    # +1 on the lengths accounts for the prepended start step.
    log_likelihood, self.trans = crf_log_likelihood(
        inputs=logits, tag_indices=targets,
        transition_params=self.trans,
        sequence_lengths=self.length + 1)
    self.loss = loss = -tf.reduce_mean(log_likelihood)

    self._tg = self.tags_scores
    self._l = self.length
    self._tr = self.trans

    self._cost = cost = loss
    self._final_state_fw = state_fw
    self._final_state_bw = state_bw

    # Back-propagation ops are only defined when training.
    if not is_trainning:
        return

    self._learning_rate = tf.Variable(0.0, trainable=False)
    trainable_variables = tf.trainable_variables()
    grads, _ = tf.clip_by_global_norm(
        tf.gradients(loss, trainable_variables), config.max_grad_norm)

    # Plain gradient descent at the (assignable) learning rate.
    optimizer = tf.train.GradientDescentOptimizer(self._learning_rate)
    self._train_op = optimizer.apply_gradients(
        zip(grads, trainable_variables))

    self._new_learning_rate = tf.placeholder(tf.float32, shape=[],
                                             name="new_learning_rate")
    self._learning_rate_update = tf.assign(self._learning_rate,
                                           self._new_learning_rate)