def compute_logits(self):
    with tf.name_scope('compute_logits') as scope:
        self.all_votes_A = self.A_vote.forward(self.final_A_lits)  # n_A_lits x 1
        self.all_votes_L = self.L_vote.forward(self.final_L_lits)  # n_L_lits x 1
        self.all_votes_join_A = tf.concat([
            self.all_votes_A[0:self.n_A_vars],
            self.all_votes_A[self.n_A_vars:self.n_A_lits]
        ], axis=1)
        self.all_votes_join_L = tf.concat([
            self.all_votes_L[0:self.n_L_vars],
            self.all_votes_L[self.n_L_vars:self.n_L_lits]
        ], axis=1)
        self.all_votes_batched_A = tf.reshape(
            self.all_votes_join_A,
            [self.n_batches, self.n_A_vars_per_batch, 2])
        self.all_votes_batched_L = tf.reshape(
            self.all_votes_join_L,
            [self.n_batches, self.n_L_vars_per_batch, 2])
        # try to use only A_votes for logits?
        self.A_logits = self.all_votes_batched_A
        self.L_logits = self.all_votes_batched_L
        self.A_policy = tf.nn.softmax(self.A_logits)
        self.L_policy = tf.nn.softmax(self.L_logits)
def output_layer(self, seq_1, seq_2, num_units=None, scope='PointerNetwork'):
    if num_units is None:
        num_units = self.len_gs
    h_P = seq_1
    u_Q = seq_2
    cell = self.GRUCellGPU(num_units)
    with tf.variable_scope(scope):
        # initialize hidden state of answer
        # (each tf.get_variable gets a distinct name so the scope does not collide)
        attn_v_a = tf.get_variable('attn_v_a')
        w_u_Q = tf.get_variable('weight_question')
        w_v_Q = tf.get_variable('weight_answer_init')
        V_Q = tf.get_variable('embedding_answer_init')
        score_a = tf.reduce_sum(attn_v_a * tf.tanh(w_u_Q * u_Q + w_v_Q * V_Q),
                                [2])  # scores for all tokens
        alignments_a = tf.nn.softmax(score_a)
        r_Q = tf.reduce_sum(tf.matmul(alignments_a, u_Q, transpose_b=True), [2])
        attn_v = tf.get_variable('attn_v')
        w_P = tf.get_variable('weight_passage')
        w_a = tf.get_variable('weight_answer')
        h_a = tf.get_variable('embedding_answer')
        score = tf.reduce_sum(attn_v * tf.tanh(w_P * h_P + w_a * h_a),
                              [2])  # scores for all tokens
        alignments = tf.nn.softmax(score)
        alignments = tf.expand_dims(alignments, 1)
        v_P = tf.expand_dims(u_Q, 1)
        context = tf.reduce_sum(tf.matmul(alignments, v_P, transpose_b=True), [2])
        outputs, h_a = cell(h_a, context)
        self.answer = outputs
    return self.answer
def _process_3d_logits_train(logits, routing_weights, labels):
    processing_results = _process_3d_logits(logits, routing_weights, labels)
    if FLAGS.loss == 'gibbs_ce':
        probs = processing_results['weighted_probs']
        negative_log_likelihood = processing_results['weighted_gibbs_ce']
    elif FLAGS.loss == 'unweighted_gibbs_ce':
        probs = processing_results['unweighted_probs']
        negative_log_likelihood = processing_results['unweighted_gibbs_ce']
    elif FLAGS.loss == 'moe':
        probs = processing_results['weighted_probs']
        negative_log_likelihood = tf.math.reduce_mean(
            tf.keras.losses.sparse_categorical_crossentropy(
                labels, probs, from_logits=False))
    elif FLAGS.loss == 'unweighted_moe':
        probs = processing_results['unweighted_probs']
        negative_log_likelihood = tf.math.reduce_mean(
            tf.keras.losses.sparse_categorical_crossentropy(
                labels, probs, from_logits=False))
    elif FLAGS.loss == 'poe':
        probs = tf.nn.softmax(processing_results['weighted_logits'])
        negative_log_likelihood = tf.math.reduce_mean(
            tf.keras.losses.sparse_categorical_crossentropy(
                labels, processing_results['weighted_logits'], from_logits=True))
    elif FLAGS.loss == 'unweighted_poe':
        probs = tf.nn.softmax(processing_results['unweighted_logits'])
        negative_log_likelihood = tf.math.reduce_mean(
            tf.keras.losses.sparse_categorical_crossentropy(
                labels, processing_results['unweighted_logits'], from_logits=True))
    return probs, negative_log_likelihood
def lstm_block(x, v, lstm_size=512, vocab_size=52, num_words=30,
               feed_previous=False, scope='lstm_block', reuse=False,
               batch_size=4):
    with tf.variable_scope(scope, reuse=reuse):
        with tf.variable_scope('lstm_1', reuse=reuse):
            lstm_first = tf.contrib.rnn.BasicLSTMCell(lstm_size, reuse=reuse)
            state_first = lstm_first.zero_state(batch_size, tf.float32)
            o_1, state_first = lstm_first(x[:, 0, :], state_first)
        r = tf.concat([o_1, v], axis=1)
        with tf.variable_scope('lstm_2', reuse=reuse):
            lstm_second = tf.contrib.rnn.BasicLSTMCell(lstm_size, reuse=reuse)
            state_second = lstm_second.zero_state(batch_size, tf.float32)
            o_2, state_second = lstm_second(r, state_second)
        o = fullyConnected(o_2, output_units=vocab_size, std='xavier',
                           activation=tf.identity, reuse=False, scope='lstm_fc')
    with tf.variable_scope(scope, reuse=True):
        # Teacher forcing: we feed in a list of words, so we don't need to feed
        # the output of the LSTM back in unless feed_previous is set.
        outputs = []
        outputs.append(o)
        for i in range(num_words - 1):
            if not feed_previous:
                word = x[:, i + 1, :]
            else:
                word = tf.nn.softmax(o)
            with tf.variable_scope('lstm_1', reuse=True):
                o, state_first = lstm_first(word, state_first)
            o = tf.concat([o, v], axis=1)
            with tf.variable_scope('lstm_2', reuse=True):
                o, state_second = lstm_second(o, state_second)
            o = fullyConnected(o, output_units=vocab_size, std='xavier',
                               activation=tf.identity, reuse=True,
                               scope='lstm_fc')
            outputs.append(o)
    return outputs
def _apply_score_activation(logits, num_classes, activation):
    """Applies activation to logits and removes the background class.

    Note that it is assumed that the background class has index 0, which is
    sliced away after the score transformation.

    Args:
      logits: the raw logit tensor.
      num_classes: the total number of classes including one background class.
      activation: the score activation type, one of 'SIGMOID', 'SOFTMAX' and
        'IDENTITY'.

    Returns:
      scores: the tensor after applying score transformation and background
        class removal.
    """
    batch_size = tf.shape(logits)[0]
    logits = tf.reshape(logits, [batch_size, -1, num_classes])
    if activation == 'SIGMOID':
        scores = tf.sigmoid(logits)
    elif activation == 'SOFTMAX':
        scores = tf.nn.softmax(logits)
    elif activation == 'IDENTITY':
        scores = logits
    else:
        raise ValueError(
            'The score activation should be SIGMOID, SOFTMAX or IDENTITY')
    scores = scores[..., 1:]
    return scores
def add_prob_logits_samples(self):
    outputs = tf.unstack(self.returns['output'])
    batch_nums = tf.range(0, limit=self.hpm['batch_size'], dtype=tf.int64)
    argmax_seqs = []
    argmax_seqs_log_probs = []
    for i, x in enumerate(outputs):
        max_ids = tf.argmax(x, axis=-1)
        indices = tf.stack((batch_nums, max_ids), axis=-1)
        log_probs = tf.gather_nd(x, indices)
        argmax_seqs.append(max_ids)
        argmax_seqs_log_probs.append(log_probs)

    self.outputs = self.returns['output']
    if not self.hpm['pointer_gen']:
        self.outputs = tf.nn.softmax(self.outputs)

    self.argmax_seqs = tf.stack(argmax_seqs, name='argmax_seqs')
    self.argmax_seqs_log_probs = tf.stack(argmax_seqs_log_probs,
                                          name='argmax_seqs_log_probs')

    sampler = tf.distributions.Categorical(logits=outputs)
    self.samples = sampler.sample(name='samples')
    self.samples = tf.identity(self.samples, name='samples')
    self.samples_log_probs = sampler.log_prob(self.samples,
                                              name="samples_log_probs")
    self.samples_log_probs = tf.identity(self.samples_log_probs,
                                         name="samples_log_probs")
def make_attention(s, e, name):
    """
    s: sentence: slen x ssz
    e: ssz x embedsz
    a: attention coefficients.

    Here is a design choice. Either I have:
    - out[w][k] = Σw' s[w][i] a[i][j][k] s[w'][k]            | a: embedsz^3
    - out[w][k] = Σw' a[i][k] (s[w];s[w'])[i]                | a: 2 x embedsz^2
    - out[w][k] = a[0][i][k]s[w][i] + Σw' a[1][i][k]s[w'][i] | a: 2 x embedsz^2
    """
    embedsz = int(e.shape[-1])
    a = tf.Variable(tf.random_normal([2 * embedsz, embedsz]), name=name)
    # s: (slen x ssz) @ (ssz x embedsz) -> slen x embedsz
    s = tf.matmul(s, e)
    # ss: slen x (embedsz + embedsz)
    ss = tf.concat([s, s], axis=1)
    sattn = tf.matmul(ss, a)
    # softmax so we have keys into the next embedding layer
    # make sure that we know what values to use
    sattn = tf.nn.softmax(sattn / (float(embedsz) ** 0.5))
    return a, sattn
def soft_dice_loss(logits, ground_truth):
    # probabilities = tf.sigmoid(logits)
    probabilities = tf.nn.softmax(logits)
    interception_volume = tf.reduce_sum(probabilities * ground_truth)
    # soft Dice: negative of (2 * intersection + smooth) / (|gt| + |pred| + smooth)
    return -(2 * interception_volume + tf.constant(smooth)) / (
        tf.norm(ground_truth, ord=1) + tf.norm(probabilities, ord=1) +
        tf.constant(smooth))
def gated_attn(self, seq_1, seq_2, num_units=None, scope='GatedAttn'):
    '''
    gated attention for seq_2 w.r.t seq_1
    input:
        seq_1: query sequence in attention mechanism
        seq_2: encoder sequence in attention mechanism
    output:
        outputs, v_P (context)
    '''
    if num_units is None:
        num_units = self.len_gs
    u_Q = seq_1
    u_P = seq_2
    cell = self.GRUCellGPU(num_units)
    with tf.variable_scope(scope):
        attn_v = tf.get_variable('attn_v', [num_units])
        w_u_Q = tf.get_variable('weight_ques')
        w_u_P = tf.get_variable('weight_pass_orig')
        w_v_P = tf.get_variable('weight_pass_ques')
        v_P = tf.get_variable('embedding_pass_ques')
        score = tf.reduce_sum(
            attn_v * tf.tanh(w_u_Q * u_Q + w_u_P * u_P + w_v_P * v_P),
            [2])  # scores of all tokens
        alignments = tf.nn.softmax(score)
        alignments = tf.expand_dims(alignments, 1)
        u_Q = tf.expand_dims(u_Q, 1)
        context = tf.reduce_sum(tf.matmul(alignments, u_Q, transpose_b=True), [2])
        # gate
        inputs = tf.concat([u_Q, context], 1)
        w_g = tf.get_variable('weight_gate')
        g = tf.sigmoid(tf.reduce_sum(w_g * inputs))
        gated_inputs = g * inputs
        outputs, v_P = cell(gated_inputs, v_P)
    return outputs, v_P
def write(self, Args):
    row = tf.transpose(tf.nn.softmax(Args[0][:, 0:2]))  # 2 * 1
    value = tf.tile(Args[1], [2, 1])
    a_w = D(tf.tile(self.ADDR, [1, 2]),
            self.ptrs[:, -2:]) * row  # 2 * max_seq_length
    out = self.envs[-2:, :, :] * tf.expand_dims(1.0 - a_w, 2) + \
        tf.expand_dims(value, 1) * tf.expand_dims(a_w, 2)
    self.envs = tf.concat([self.envs[0:self.max_env_size - 2, :, :], out],
                          axis=0)
def capsnet(inputs):
    '''
    Construct a 3-layer capsule net with 28x28 inputs.
    '''
    ## Layer 1 is a regular convolution. We blow 1 channel up into 256 channels.
    with tf.variable_scope('conv1') as scope:
        kernel = _get_kernel('weights', [9, 9, 1, 256], stddev=5e-2, reg=0.0)
        conv = tf.nn.conv2d(inputs, kernel, [1, 1, 1, 1], padding='VALID')
        biases = tf.get_variable('biases', [256],
                                 initializer=tf.constant_initializer(0.0))
        pre_act = tf.nn.bias_add(conv, biases)
        conv1 = tf.nn.relu(pre_act, name=scope.name)

    ## Layer 2 is the first capsule layer. It amounts to 32 parallel convolutions
    ## from 256 channels down to 8 channels. Each of these 32 conv layers contains
    ## (width) * (height) capsules of length 8. The output of the layer is a
    ## [width * height * 32] * 8 matrix. Each of the [width * height * 32] rows
    ## represents a capsule.
    capsules1 = tf.zeros((0, 8))
    for i in range(0, 32):
        with tf.variable_scope('primary_caps' + str(i)) as scope:
            kernel = _get_kernel('weights' + str(i), [9, 9, 256, 8],
                                 stddev=5e-2, reg=0.0)
            conv = tf.nn.conv2d(conv1, kernel, [1, 2, 2, 1], padding='VALID')
            biases = tf.get_variable('biases' + str(i), [8],
                                     initializer=tf.constant_initializer(0.0))
            pre_act = tf.nn.bias_add(conv, biases)
            conv2 = tf.nn.relu(pre_act, name=scope.name)
            shaped = tf.reshape(conv2, [36, 8])
            capsules1 = tf.concat([capsules1, shaped], 0)

    with tf.variable_scope('coupling') as scope:
        priors = tf.get_variable('priors', shape=[capsules1.shape[0], 10],
                                 initializer=tf.constant_initializer(0.0))
        coupling_coeffs = tf.nn.softmax(priors)

    with tf.variable_scope('secondary_caps'):
        for j in range(0, NUM_CLASSES):
            routes_into_j = []
            for i in range(0, capsules1.shape[0]):
                W_ij = _get_tn_var('weights_' + str(i) + str(j), shape=[16, 8],
                                   stddev=0.04, reg=0.004)
                b_ij = tf.get_variable('biases_' + str(i) + str(j), [16],
                                       initializer=tf.constant_initializer(0.0))
                uhat = tf.add(tf.matmul(W_ij, capsules1[i]), b_ij)
                # \times c_i
                routes_into_j.append(tf.scalar_mul(coupling_coeffs[i, j], uhat))
            s_j = tf.reduce_sum(routes_into_j)
def get_h_tile(cls, s, s1):
    """
    attended vector of s1: signifies which words in s1 are most similar to
    each word in s2
    """
    t1 = s1.shape[1]
    # b_weight = F.softmax(torch.max(s, dim=2)[0], dim=-1).unsqueeze(1)  # [b, 1, t2]
    b_weight = tf.expand_dims(tf.nn.softmax(tf.reduce_max(s, 2), axis=-1), 1)
    # h_tile = torch.bmm(b_weight, s1).repeat(1, t1, 1)  # repeat to match s1  # [B, t1, D]
    h_tile = tf.tile(tf.matmul(b_weight, s1), [1, t1, 1])
    return h_tile
def run_example():
    x = tf.constant([1, 2, 3, 4, 5, 6], dtype=tf.float32)
    reshape_op = tf.reshape(x, [3, 2])
    softmax_op = tf.nn.softmax(reshape_op)
    with tf.Session() as sess:
        res = sess.run(softmax_op)
        print("Result 1 = \n{}\n".format(res))
def next_inputs_fn(time, outputs, state, sample_id):
    output_prob = tf.nn.softmax(outputs, axis=-1)
    next_embedding = tf.matmul(output_prob, embedding)
    # elements_finished = (time >= sentence_lengths)
    elements_finished = tf.tile(tf.constant([False]), [self.batch_size])
    # all_finished = tf.reduce_all(elements_finished)
    # making length different
    # all_finished = False
    next_inputs = next_embedding
    next_state = state
    return elements_finished, next_inputs, next_state
def attention_layer(input_tensor, num_attention_head, attn_head_size):
    """
    input_tensor: [B, N, S]
    num_attention_head: K
    attn_head_size: H
    """
    def transpose_for_score(input_tensor, batch_size, seq_length,
                            num_attn_head, attn_head_size):
        input_tensor = tf.reshape(
            input_tensor,
            [batch_size, seq_length, num_attn_head, attn_head_size])
        output_tensor = tf.transpose(input_tensor, [0, 2, 1, 3])
        return output_tensor

    input_shape = input_tensor.shape.as_list()
    batch_size = input_shape[0]
    seq_length = input_shape[1]
    hidden_size = input_shape[2]

    # [B*N, S]
    input_tensor = tf.reshape(input_tensor, [-1, input_shape[-1]])
    # [B*N, K*H]
    query = tf.layers.dense(input_tensor,
                            num_attention_head * attn_head_size,
                            name='query')
    # [B*N, K*H]
    key = tf.layers.dense(input_tensor,
                          num_attention_head * attn_head_size,
                          name='key')
    # [B*N, K*H]
    value = tf.layers.dense(input_tensor,
                            num_attention_head * attn_head_size,
                            name='value')

    # [B, K, N, H]
    query = transpose_for_score(query, batch_size, seq_length,
                                num_attention_head, attn_head_size)
    key = transpose_for_score(key, batch_size, seq_length,
                              num_attention_head, attn_head_size)
    attention_score = tf.matmul(query, key, transpose_b=True)
    attention_score = tf.multiply(attention_score,
                                  1.0 / math.sqrt(attn_head_size))
    attention_score = tf.nn.softmax(attention_score, axis=-1)
def _apply_score_activation(logits, num_classes, activation):
    """Applies activation to logits and removes the background class."""
    batch_size = tf.shape(logits)[0]
    if activation == 'SIGMOID':
        logits = tf.reshape(logits, [batch_size, -1, num_classes])
        scores = tf.sigmoid(logits)
    elif activation == 'SOFTMAX':
        logits = tf.reshape(logits, [batch_size, -1, num_classes + 1])
        scores = tf.nn.softmax(logits)
        # slice away the background class, which sits at index 0 of the last dim
        scores = scores[..., 1:]
    else:
        raise ValueError(
            'The score activation should be either SIGMOID or SOFTMAX.')
    return scores
def build(self):
    # build input place holder
    self._build_input_pl()
    # build feature extractor
    rpn_feature_maps, mrcnn_feature_maps = self._build_feature_map()
    # get anchors
    featmap_shape_list = self._compute_featmap_shape_list(rpn_feature_maps)
    anchors = self._generate_anchors(featmap_shape_list)

    # rpn_logits is used to get loss of it
    # rpn_cls_scores (N, all_num_anchors, 2)
    rpn_bboxes_delta, rpn_cls_scores = self._build_rpn(
        rpn_feature_maps, weight_decay=self._weight_decay)
    rpn_cls_scores_reshape = tf.reshape(
        rpn_cls_scores, (tf.shape(rpn_cls_scores)[0], -1, 2))
    # softmax over the two classes (last axis)
    rpn_cls_prob = tf.nn.softmax(rpn_cls_scores_reshape, axis=-1)
    # shape (2 * num_anchors_per_location, all_num_anchors)
    rpn_cls_prob_reshape = tf.reshape(
        rpn_cls_prob,
        (tf.shape(rpn_cls_prob)[0], -1,
         2 * self._rpn_num_anchors_per_location))

    #################################
    # AnchorTarget
    # generate labels for anchors
    #################################
    if self._training:
        (rpn_bbox_targets, rpn_bbox_labels, rpn_bbox_inside_weights,
         rpn_bbox_outside_weights) = self._generate_anchor_target(
            anchors, self._gt_boxes, self._img_shape)

    #################################
    # Proposal
    #################################
    rois = self._generate_proposal(rpn_cls_prob_reshape, rpn_bboxes_delta,
                                   anchors)
    # # decode bbox by applying deltas to anchors
    # self._bbox_decoder.decode(anchors, rpn_bboxes_delta)

    #################################
    # ProposalTarget
    #################################
    if self._training:
        self._generate_proposal_target(
            self._gt_boxes,
            self._gt_labels,
            rois,
        )
def get_logits(x):
    # cnn_input = tf.reshape(x, [deep_AS_config.batch_size, deep_AS_config.num_units, deep_AS_config.vocab_size, 1])
    softmax_weight1 = tf.get_variable(
        name="dense8_weights",
        shape=[
            deep_AS_config.FLAGS.num_units,
            deep_AS_config.FLAGS.label_class
        ],
        initializer=tf.uniform_unit_scaling_initializer(1.43))
    softmax_bias1 = tf.get_variable(
        name="dense8_biases",
        shape=[deep_AS_config.FLAGS.label_class],
        initializer=tf.constant_initializer(0.1))
    # logit = tf.nn.relu(tf.matmul(out, dense8_weight) + dense8_bias)
    logit = tf.nn.softmax(tf.matmul(x, softmax_weight1) + softmax_bias1)
    return logit
def get_u_tile(cls, s, s2):
    """
    attended vectors of s2 for each word in s1, signifying which words in s2
    are most relevant to words in s1
    """
    a_weight = tf.nn.softmax(s, axis=2)  # [B, t1, t2]
    # remove NaNs produced by a softmax over rows that are entirely -inf
    a_weight = tf.where(tf.is_nan(a_weight), tf.zeros_like(a_weight), a_weight)
    # u_tile = torch.bmm(a_weight, s2)  # [B, t1, t2] * [B, t2, D] -> [B, t1, D]
    u_tile = tf.matmul(a_weight, s2)
    return u_tile
def __init__(self, batcher, in_node, out_node, cost, train_step,
             classifier=False):
    self.batcher = batcher
    self.in_node = in_node
    self.out_node = out_node
    self.cost = cost
    self.train_step = train_step
    self.global_step = tfutils.opt.global_step()
    self.eval_cost = tf.Variable(0.0, name='eval_cost', trainable=False)
    if classifier:
        self.eval_accuracy = tf.Variable(0.0, name='eval_accuracy',
                                         trainable=False)
        predictions = tf.argmax(tf.nn.softmax(y_hat), 1)
        true = tf.argmax(y, 1)
    self.step = 1
def inference(image):
    FCN8s.logger.info("inference")
    # FCN
    net = FCN8s()
    with tf.name_scope("FCN8s"):
        net.build(image, debug=True)
        fub = tf.nn.softmax(net.upscore32, axis=3)
    # Matting
    matting_module = tf.load_op_library("matting.so")
    with tf.name_scope("matting"):
        init = tf.constant_initializer(value=100.0, dtype=tf.double)
        lamb = tf.get_variable(name="lambda", initializer=init, shape=[1])
        pred_annotation = matting_module.matting(image, fub, lamb)
    return pred_annotation
def do_episode(W):
    rewards = []
    observations = []
    pi = []
    observation = ENV.reset()
    for i in xrange(250):
        # probability of taking action 0: sigmoid of the linear score
        # (a softmax over a single scalar would always return 1)
        p_0 = 1.0 / (1.0 + np.exp(-np.dot(W, observation)))
        if npr.uniform() < p_0:
            action = 0
            pi.append(p_0)
        else:
            action = 1
            pi.append(1 - p_0)
        observation, reward, done, info = ENV.step(action)
        rewards.append(reward)
        if done:
            break
    rewards = np.array(rewards)
    pi = np.array(pi)
    return np.dot(rewards, np.log(pi))
def self_matching_attn(self, seq, scope='SelfMatchAttn'):
    '''
    self-matching attention of seq
    input: seq
    output: h_P
    '''
    v_P = seq
    with tf.variable_scope(scope):
        attn_v = tf.get_variable('attn_v')
        w_v_P = tf.get_variable('weight_passage')
        w_v_P_w = tf.get_variable('weight_passage_wave')
        score = tf.reduce_sum(attn_v * tf.tanh(w_v_P * v_P + w_v_P_w * v_P),
                              [2])  # scores for all tokens
        alignments = tf.nn.softmax(score)
        alignments = tf.expand_dims(alignments, 1)
        v_P = tf.expand_dims(v_P, 1)
        context = tf.reduce_sum(tf.matmul(alignments, v_P, transpose_b=True), [2])
        inputs = tf.concat([seq, context], 1)
        outputs = self.bidirectionalGRU(inputs, 1)
        h_P = outputs
    return h_P
def choose_action(self, observation, cur_time):
    observation = tf.stop_gradient(observation)
    layer = tf.layers.dense(
        inputs=observation,
        units=self.n_features,
        activation=tf.nn.tanh,
        kernel_initializer=tf.random_normal_initializer(mean=0, stddev=0.3),
        bias_initializer=tf.constant_initializer(0.1),
        name='fc1')
    all_act = tf.layers.dense(
        inputs=layer,
        units=self.n_actions,
        activation=tf.nn.tanh,
        kernel_initializer=tf.random_normal_initializer(mean=0, stddev=0.3),
        bias_initializer=tf.constant_initializer(0.1),
        name='fc2')
    act_prob = tf.nn.softmax(all_act, name='act_prob')
    action = tf.multinomial(tf.log(act_prob), 1)
    self.actions.append(action)
    self.all_act_prob.append(act_prob)
    return action
def __call__(self, inputs):
    # transpose to 'channels first'
    x = Lambda(lambda x: tf.transpose(x, [0, 3, 1, 2]))(inputs)
    # key, value and query
    u = Conv2D(filters=self.groups, kernel_size=1, padding='same',
               data_format='channels_first',
               kernel_initializer='glorot_uniform')(x)
    v = Conv2D(filters=self.groups, kernel_size=1, padding='same',
               data_format='channels_first',
               kernel_initializer='glorot_uniform')(x)
    z = Lambda(lambda x: tf.matmul(x[0], x[1], transpose_a=True))([u, v])
    # attention
    z = Lambda(lambda x: tf.nn.softmax(x, axis=2))(z)
    w = Conv2D(filters=self.groups, kernel_size=1, padding='same',
               data_format='channels_first',
               kernel_initializer='glorot_uniform')(x)
    w = Lambda(lambda x: tf.matmul(x[0], x[1], transpose_b=True))([z, w])
    w = Conv2D(filters=self.groups, kernel_size=1, padding='same',
               data_format='channels_first',
               kernel_initializer='glorot_uniform')(w)
    # transpose back to 'channels last'
    outputs = Lambda(lambda x: tf.transpose(x, [0, 2, 3, 1]))(w)
    return outputs
def loop_fn(time, cell_output, cell_state, loop_state):
    emit_output = cell_output  # == None for time == 0
    if cell_output is None:  # time == 0
        next_cell_state = cell.zero_state(batch_size, tf.float32)
    else:
        next_cell_state = cell_state
    prev_h = next_cell_state.h
    prev_c = next_cell_state.c
    tgt_t = tgt_ta.read(time)
    # projection of previous hidden state onto source word space
    tgt_hid_proj = slim.fully_connected(prev_h, hidden_size,
                                        scope='tgt_hid_proj')
    tgt_cel_proj = slim.fully_connected(prev_c, hidden_size,
                                        scope='tgt_cel_proj')
    tgt_emb_t = tf.nn.embedding_lookup(tgt_embeddings, tgt_t)
    # tgt_rep of shape [batch_size, hidden_size].
    tgt_rep = tgt_hid_proj + tgt_cel_proj + tgt_emb_t
    tgt_rep = tf.expand_dims(tgt_rep, 2)
    attn_scores = tf.squeeze(tf.matmul(windowed_seqpos_embs, tgt_rep), 2)
    # attn of shape [batch_size, max_time].
    conv_attn_aux = seqpos_embs * tf.nn.softmax(attn_scores)
    elements_finished = (time >= tgt_seqlen)
    finished = tf.reduce_all(elements_finished)
    next_input = tf.cond(
        finished,
        lambda: tf.zeros([batch_size, hidden_size], dtype=tf.float32),
        lambda: conv_attn_aux + tf.nn.embedding_lookup(inp_embeddings, tgt_t))
    next_loop_state = None
    return (elements_finished, next_input, next_cell_state, emit_output,
            next_loop_state)
def construct_network(self): tf.reset_default_graph() self.word_ids = tf.placeholder(tf.int32, [None, None], name="word_ids") self.sentence_lengths = tf.placeholder(tf.int32, [None], name="sentence_lengths") self.word_ids_knowledge = tf.placeholder(tf.int32, [None, None, None], name="word_ids_know") self.sentence_tokens = tf.placeholder(tf.string, [None, None], name="word_list_sentence") self.knowledge_lengths = tf.placeholder(tf.int32, [None, None], name="sentence_lengths_know") self.knowledge_tokens = tf.placeholder(tf.string, [None, None, None], name="word_list_knowledge") self.knowledge_max_lengths = tf.placeholder(tf.int32, [None, None], name="sentence_lengths_max_know") self.word_ids_context = tf.placeholder(tf.int32, [None, None], name="word_ids_context") self.context_tokens = tf.placeholder(tf.string, [None, None], name="words_list_context") self.context_lengths = tf.placeholder(tf.int32, [None], name="sentence_lengths_context") self.sentence_labels = tf.placeholder(tf.float32, [None, None], name="sentence_labels") self.batch_size = tf.Variable(0) self.max_lengths = tf.placeholder(tf.int32, [None], name="max_lengths_padding") self.weights_path = tf.placeholder(tf.float32, [None, None], name="weights_path") self.learningrate = tf.placeholder(tf.float32, name="learningrate") self.is_training = tf.placeholder(tf.bool, name="is_training") self.loss = 0.0 input_tensor = None input_vector_size = 0 #reiss= ['physiological', 'love', 'spiritual growth', 'esteem', 'stability'] if self.config["human_needs"] == "maslow": reiss=['physiological', 'love', 'spiritual growth', 'esteem', 'stability'] #human_needs =['physiological', 'love', 'spiritual growth', 'esteem', 'stability'] elif self.config["human_needs"] == "reiss": #reiss = ['status', 'approval', 'tranquility', 'competition', 'health', 'family', 'romance', 'food', 'indep', 'power', 'order', 'curiosity', 'serenity', 'honor', 'belonging', 'contact', 'savings', 'idealism', 'rest'] reiss = ['status', 'approval', 'tranquility', 'competition', 'health', 'family', 'romance', 'food', 'indep', 'power', 'order', 'curiosity', 'serenity', 'honor', 'contact', 'savings', 'idealism', 'rest'] self.initializer = None if self.config["initializer"] == "normal": self.initializer = tf.random_normal_initializer(mean=0.0, stddev=0.1) elif self.config["initializer"] == "glorot": self.initializer = tf.glorot_uniform_initializer() elif self.config["initializer"] == "xavier": self.initializer = tf.glorot_normal_initializer() ############################################################################### BILSTM ############################################################################################# if self.config["neural_network"]=="BILSTM": ############################################################################### SENTENCE BI-LSTM ############################################################################################# zeros_initializer = tf.zeros_initializer() input_tensor = None with tf.variable_scope("sentence"): word_lstm_cell_fw = tf.nn.rnn_cell.LSTMCell(self.config["word_recurrent_size"], use_peepholes=self.config["lstm_use_peepholes"], state_is_tuple=True, initializer=self.initializer, reuse=False) word_lstm_cell_bw = tf.nn.rnn_cell.LSTMCell(self.config["word_recurrent_size"], use_peepholes=self.config["lstm_use_peepholes"], state_is_tuple=True, initializer=self.initializer, reuse=False) self.word_embeddings = tf.get_variable("word_embeddings", shape=[len(self.term2index), self.config["word_embedding_size"]], initializer=(zeros_initializer if 
self.config["emb_initial_zero"] == True else self.initializer), trainable=(True if self.config["train_embeddings"] == True else False)) use_elmo = True if use_elmo: elmo = hub.Module("https://tfhub.dev/google/elmo/2", trainable=True) input_tensor = elmo(inputs={"tokens": self.sentence_tokens,"sequence_len": self.sentence_lengths},signature="tokens",as_dict=True)["elmo"] else: input_tensor = tf.nn.embedding_lookup(self.word_embeddings, self.word_ids) input_vector_size = self.config["word_embedding_size"] self.word_representations = input_tensor dropout_input = self.config["dropout_input"] * tf.cast(self.is_training, tf.float32) + (1.0 - tf.cast(self.is_training, tf.float32)) input_tensor = tf.nn.dropout(input_tensor, dropout_input, name="dropout_word") (lstm_outputs_fw, lstm_outputs_bw), ((_, lstm_output_fw), (_, lstm_output_bw)) = tf.nn.bidirectional_dynamic_rnn(word_lstm_cell_fw, word_lstm_cell_bw, input_tensor, sequence_length=self.sentence_lengths, dtype=tf.float32, time_major=False) dropout_word_lstm = self.config["dropout_word_lstm"] * tf.cast(self.is_training, tf.float32) + (1.0 - tf.cast(self.is_training, tf.float32)) lstm_outputs_fw = tf.nn.dropout(lstm_outputs_fw, dropout_word_lstm, noise_shape=tf.convert_to_tensor([tf.shape(self.word_ids)[0],1,self.config["word_recurrent_size"]], dtype=tf.int32)) lstm_outputs_bw = tf.nn.dropout(lstm_outputs_bw, dropout_word_lstm, noise_shape=tf.convert_to_tensor([tf.shape(self.word_ids)[0],1,self.config["word_recurrent_size"]], dtype=tf.int32)) lstm_outputs = tf.concat([lstm_outputs_fw, lstm_outputs_bw], -1) self.lstm_outputs = lstm_outputs if self.config["sentence_composition"] == "last": processed_tensor = lstm_outputs self.attention_weights_unnormalised = tf.zeros_like(self.word_ids, dtype=tf.float32) elif self.config["sentence_composition"] == "attention": attention_evidence = tf.layers.dense(lstm_outputs, self.config["attention_evidence_size"], activation=tf.sigmoid, kernel_initializer=self.initializer) attention_weights = tf.layers.dense(attention_evidence, 1, activation=None, kernel_initializer=self.initializer) attention_weights = tf.reshape(attention_weights, shape=tf.shape(self.word_ids)) if self.config["attention_activation"] == "sharp": attention_weights = tf.exp(attention_weights) elif self.config["attention_activation"] == "soft": attention_weights = tf.sigmoid(attention_weights) elif self.config["attention_activation"] == "linear": pass else: raise ValueError("Unknown activation for attention: " + str(self.config["attention_activation"])) self.attention_weights_unnormalised = attention_weights attention_weights = tf.where(tf.sequence_mask(self.sentence_lengths), attention_weights, tf.zeros_like(attention_weights)) attention_weights = attention_weights / tf.reduce_sum(attention_weights, 1, keep_dims=True) processed_tensor_1 = tf.reduce_sum(lstm_outputs * attention_weights[:,:,numpy.newaxis], 1) self.token_scores = [tf.where(tf.sequence_mask(self.sentence_lengths), self.attention_weights_unnormalised, tf.zeros_like(self.attention_weights_unnormalised) - 1e6)] if self.config["hidden_layer_size"] > 0: if self.config["sentence_composition"] == "attention": #processed_tensor_sentence = tf.reduce_max(lstm_outputs,1) processed_tensor_sentence = tf.layers.dense(processed_tensor_1, self.config["hidden_layer_size"], activation=tf.nn.relu, kernel_initializer=self.initializer) elif self.config["sentence_composition"] == "last": processed_tensor_sentence = tf.layers.dense(processed_tensor, self.config["hidden_layer_size"], activation=tf.nn.relu, 
kernel_initializer=self.initializer) ##################################################################### CONTEXT BI-LSTM ################################################## with tf.variable_scope("context"): context_lstm_cell_fw = tf.nn.rnn_cell.LSTMCell(self.config["word_recurrent_size"], use_peepholes=self.config["lstm_use_peepholes"], state_is_tuple=True, initializer=self.initializer, reuse=False) context_lstm_cell_bw = tf.nn.rnn_cell.LSTMCell(self.config["word_recurrent_size"], use_peepholes=self.config["lstm_use_peepholes"], state_is_tuple=True, initializer=self.initializer, reuse=False) input_vector_size = self.config["word_embedding_size"] self.word_representations = input_tensor use_elmo = True if use_elmo: elmo = hub.Module("https://tfhub.dev/google/elmo/2", trainable=True) input_tensor= elmo(inputs={"tokens": self.context_tokens,"sequence_len": self.context_lengths},signature="tokens",as_dict=True)["elmo"] else: input_tensor = tf.nn.embedding_lookup(self.word_embeddings, self.word_ids_context) dropout_input = self.config["dropout_input"] * tf.cast(self.is_training, tf.float32) + (1.0 - tf.cast(self.is_training, tf.float32)) input_tensor = tf.nn.dropout(input_tensor, dropout_input, name="dropout_word") (lstm_outputs_fw, lstm_outputs_bw), ((_, lstm_output_fw), (_, lstm_output_bw)) = tf.nn.bidirectional_dynamic_rnn(context_lstm_cell_fw, context_lstm_cell_bw, input_tensor, sequence_length=self.context_lengths, dtype=tf.float32, time_major=False) dropout_word_lstm = self.config["dropout_word_lstm"] * tf.cast(self.is_training, tf.float32) + (1.0 - tf.cast(self.is_training, tf.float32)) lstm_outputs_fw = tf.nn.dropout(lstm_outputs_fw, dropout_word_lstm, noise_shape=tf.convert_to_tensor([tf.shape(self.word_ids_context)[0],1,self.config["word_recurrent_size"]], dtype=tf.int32)) lstm_outputs_bw = tf.nn.dropout(lstm_outputs_bw, dropout_word_lstm, noise_shape=tf.convert_to_tensor([tf.shape(self.word_ids_context)[0],1,self.config["word_recurrent_size"]], dtype=tf.int32)) lstm_outputs = tf.concat([lstm_outputs_fw, lstm_outputs_bw], -1) #if self.config["hidden_layer_size"] > 0: # lstm_outputs = tf.layers.dense(lstm_outputs, self.config["hidden_layer_size"], activation=tf.nn.relu, kernel_initializer=self.initializer) self.lstm_outputs = lstm_outputs if self.config["sentence_composition"] == "last": processed_tensor_context = lstm_outputs self.attention_weights_unnormalised = tf.zeros_like(self.word_ids_context, dtype=tf.float32) elif self.config["sentence_composition"] == "attention": attention_evidence = tf.layers.dense(lstm_outputs, self.config["attention_evidence_size"], activation=tf.sigmoid, kernel_initializer=self.initializer) attention_weights = tf.layers.dense(attention_evidence, 1, activation=None, kernel_initializer=self.initializer) attention_weights = tf.reshape(attention_weights, shape=tf.shape(self.word_ids_context)) if self.config["attention_activation"] == "sharp": attention_weights = tf.softmax(attention_weights) elif self.config["attention_activation"] == "soft": attention_weights = tf.sigmoid(attention_weights) elif self.config["attention_activation"] == "linear": pass else: raise ValueError("Unknown activation for attention: " + str(self.config["attention_activation"])) self.attention_weights_unnormalised = attention_weights attention_weights = tf.where(tf.sequence_mask(self.context_lengths), attention_weights, tf.zeros_like(attention_weights)) attention_weights = attention_weights / tf.reduce_sum(attention_weights, 1, keep_dims=True) processed_tensor_context = 
tf.reduce_sum(lstm_outputs * attention_weights[:,:,numpy.newaxis], 1) if self.config["hidden_layer_size"] > 0: #processed_tensor_context = tf.reduce_mean(lstm_outputs,1) processed_tensor_context = tf.layers.dense(processed_tensor_context, self.config["hidden_layer_size"], activation=tf.nn.relu, kernel_initializer=self.initializer) ####################################################################### KNOWLEDGE Bi-LSTM #################################################################################################### processed_tensor_1 = processed_tensor_sentence with tf.variable_scope("knowledge"): knowledge_input_tensor = tf.nn.embedding_lookup(self.word_embeddings, self.word_ids_knowledge) input_vector_size = self.config["word_embedding_size"] know_lstm_cell_fw = tf.nn.rnn_cell.LSTMCell(self.config["word_embedding_size"], use_peepholes=self.config["lstm_use_peepholes"], state_is_tuple=True, initializer=self.initializer, reuse=False) know_lstm_cell_bw = tf.nn.rnn_cell.LSTMCell(self.config["word_embedding_size"], use_peepholes=self.config["lstm_use_peepholes"], state_is_tuple=True, initializer=self.initializer, reuse=False) self.word_representations = knowledge_input_tensor s = tf.shape(knowledge_input_tensor) knowledge_input_tensor = tf.reshape(knowledge_input_tensor, shape=[s[0]*s[1], s[2], self.config["word_embedding_size"]]) knowledge_lengths = tf.reshape(self.knowledge_max_lengths, shape=[s[0]*s[1]]) dropout_input = self.config["dropout_input"] * tf.cast(self.is_training, tf.float32) + (1.0 - tf.cast(self.is_training, tf.float32)) knowledge_input_tensor = tf.nn.dropout(knowledge_input_tensor, dropout_input, name="dropout_word") char_lstm_outputs = tf.nn.bidirectional_dynamic_rnn(know_lstm_cell_fw, know_lstm_cell_bw, knowledge_input_tensor, sequence_length=knowledge_lengths, dtype=tf.float32, time_major=False) _, ((_, char_output_fw), (_, char_output_bw)) = char_lstm_outputs lstm_outputs = tf.concat([char_output_fw, char_output_bw], -1) ''' if self.config["sentence_composition"] == "attention": attention_evidence = tf.layers.dense(lstm_outputs, self.config["attention_evidence_size"], activation=tf.sigmoid, kernel_initializer=self.initializer) attention_weights = tf.layers.dense(attention_evidence, 1, activation=None, kernel_initializer=self.initializer) attention_weights = tf.reshape(attention_weights, shape=tf.shape(self.word_ids_knowledge)) if self.config["attention_activation"] == "sharp": attention_weights = tf.softmax(attention_weights) elif self.config["attention_activation"] == "soft": attention_weights = tf.sigmoid(attention_weights) elif self.config["attention_activation"] == "linear": pass else: raise ValueError("Unknown activation for attention: " + str(self.config["attention_activation"])) self.attention_weights_unnormalised = attention_weights attention_weights = tf.where(tf.sequence_mask(self.knowledge_max_lengths), attention_weights, tf.zeros_like(attention_weights)) attention_weights = attention_weights / tf.reduce_sum(attention_weights, 1, keep_dims=True) atten_shape = tf.shape(attention_weights) attention_weights = tf.reshape(attention_weights, shape=[tf.shape(attention_weights)[0]*tf.shape(attention_weights)[1],tf.shape(attention_weights)[2]]) lstm_outputs = tf.reduce_sum(lstm_outputs * attention_weights[:,:,numpy.newaxis], 1) ''' lstm_outputs = tf.reshape(lstm_outputs, shape=[s[0], s[1], 2*self.config["word_embedding_size"]]) dropout_word_lstm = self.config["dropout_word_lstm"] * tf.cast(self.is_training, tf.float32) + (1.0 - tf.cast(self.is_training, tf.float32)) 
lstm_outputs = tf.nn.dropout(lstm_outputs, dropout_word_lstm) if self.config["whidden_layer_size"] > 0: lstm_outputs = tf.layers.dense(lstm_outputs, self.config["hidden_layer_size"], activation=tf.nn.relu, kernel_initializer=self.initializer) knowledge_output_vector_size = 2 * self.config["word_embedding_size"] self.lstm_outputs = lstm_outputs t_lstm_outputs = tf.transpose(lstm_outputs, [0, 2, 1]) if self.config["sentence_composition"] == "attention": processed_tensor_1 = tf.expand_dims(processed_tensor_1, -1) #batch, Dim, 1 processed_tensor_1 = tf.transpose(processed_tensor_1, [0,2,1]) attention_weights = tf.matmul(processed_tensor_1,t_lstm_outputs) #batch, length_of_sentence, number of Knowledge if self.config["attention_activation"] == "sharp": attention_weights = tf.exp(attention_weights) elif self.config["attention_activation"] == "soft": attention_weights = tf.nn.softmax(attention_weights) #pass elif self.config["attention_activation"] == "linear": pass else: raise ValueError("Unknown activation for attention: " + str(self.config["attention_activation"])) self.attention_weights_unnormalised = attention_weights #attention_weights = tf.transpose(attention_weights, [0, 2, 1])# batch, 1,number of Knowledge self.attention_weights = attention_weights #attention_weights = tf.squeeze(attention_weights) #attention_weights = tf.exp(attention_weights) sum_attention_weights = attention_weights #sum_attention_weights = tf.squeeze(attention_weights) #attention_weights = tf.reshape(attention_weights, [s[0],s[1],s[2]]) #attention_weights = tf.where(tf.sequence_mask(self.knowledge_max_lengths), attention_weights, tf.zeros_like(attention_weights)) #attention_weights = attention_weights / tf.reduce_sum(sum_attention_weights,-1, keep_dims=True) #attention_weights = tf.reshape(attention_weights, [s[0],s[1]*s[2]]) #attention_weights = tf.expand_dims(attention_weights, -1) self.attention_weights = tf.squeeze(attention_weights) #attention_weights = tf.squeeze(attention_weights) #weights_path = tf.expand_dims(self.weights_path, -1) #weights_path = tf.transpose(weights_path, [0,2,1]) #attention_weights = tf.matmul(weights_path,attention_weights) attention_weights = tf.transpose(attention_weights, [0, 2, 1]) #attention_weights = tf.expand_dims(attention_weights, -1) processed_tensor_knowledge = tf.reduce_sum(lstm_outputs * attention_weights, axis=1) # bs, d processed_tensor_knowledge = tf.layers.dense(processed_tensor_knowledge, self.config["hidden_layer_size"], activation=tf.nn.relu, kernel_initializer=self.initializer) #processed_tensor_knowledge_att = tf.expand_dims(processed_tensor_knowledge, -1) #batch, Dim, 1 #processed_tensor_knowledge_att = tf.transpose(processed_tensor_knowledge_att, [0,2,1]) #batch,1,Dim ### attention over attention for the sentence #attention_weights = tf.matmul(processed_tensor_knowledge_att, processed_tensor_1) #attention_weights = tf.nn.softmax(attention_weights) #attention_weights = tf.transpose(attention_weights, [0, 2, 1]) #processed_tensor_knowledge_sentence = tf.reduce_sum(attention_weights * tf.transpose(processed_tensor_1,[0,2,1]), axis=1) #if self.config["hidden_layer_size"] > 0: #processed_tensor_knowledge = tf.layers.dense(processed_tensor_knowledge, self.config["hidden_layer_size"], activation=tf.nn.relu, kernel_initializer=self.initializer) #processed_tensor_knowledge_sentence = tf.layers.dense(processed_tensor_knowledge_sentence, self.config["hidden_layer_size"], activation=tf.nn.relu, kernel_initializer=self.initializer) 
##################################################################################################################################################### ###############################################################CALCULATE SCORE################################################################ ##################################################################################################################################################### if self.config["sentence_composition"] == "attention": dense_input_sen_con = tf.concat([processed_tensor_sentence, processed_tensor_context],1) dense_input_sen_con = tf.layers.dense(dense_input_sen_con, self.config["hidden_layer_size"], activation=tf.nn.relu, kernel_initializer=self.initializer) dense_input = tf.concat([processed_tensor_sentence, processed_tensor_knowledge],1) #,processed_tensor_knowledge,,processed_tensor_context dense_input = tf.layers.dense(dense_input, self.config["hidden_layer_size"], activation=tf.nn.relu, kernel_initializer=self.initializer) final_score = (dense_input * processed_tensor_sentence) + (dense_input * processed_tensor_knowledge) final_score = dense_input_sen_con softmax_w = tf.get_variable('softmax_w', shape=[100, len(reiss)],initializer=tf.zeros_initializer, dtype=tf.float32) elif self.config["sentence_composition"] == "last": dense_input = tf.concat([processed_tensor_sentence, processed_tensor_context],2) #,processed_tensor_knowledge,processed_tensor_sentence,, dense_input = tf.reshape(dense_input,[self.batch_size, self.max_lengths[0] * dense_input.get_shape()[2]])#self.max_lengths[0] * dense_input.get_shape()[2]]) #dense_input = tf.concat([dense_input, processed_tensor_knowledge],1) softmax_w = tf.get_variable('softmax_w',shape = [56*200,len(reiss)], initializer=tf.zeros_initializer, dtype=tf.float32) softmax_b = tf.get_variable('softmax_b', shape=[len(reiss)],initializer=tf.zeros_initializer, dtype=tf.float32) #if self.config["hidden_layer_size"] > 0: # dense_input = tf.layers.dense(dense_input, self.config["hidden_layer_size"], activation=tf.nn.relu, kernel_initializer=self.initializer) self.sentence_scores = tf.matmul(final_score, softmax_w) + softmax_b ##################################################################################################################################################### ###############################################################CALCULATE SCORE################################################################ ##################################################################################################################################################### if self.config["human_needs"] == "maslow": w = [3.3580651133263086, 2.4043071629811266, 2.948496008202039, 2.609976477765905, 2.3545068920496965] else: #with belonging: w = [3.929112469627414, 4.352634266669815, 4.105348968927056, 4.009469417408209, 4.436903109491611, 3.4714643441750805, 4.533726764493145, 3.665643259544512, 5.264175448882736, 6.026320782448594, 3.7522367243231805, 3.8019798963053515, 7.896001211803761, 8.024995943144209, 15.275082043791086, 3.3076036095385644, 3.81662584588786, 8.618279130276653, 6.7344516295276895] #without belonging class: #w = [3.929112469627414, 4.352634266669815, 4.105348968927056, 4.009469417408209, 4.436903109491611, 3.4714643441750805, 4.533726764493145, 3.665643259544512, 5.264175448882736, 6.026320782448594, 3.7522367243231805, 3.8019798963053515, 7.896001211803761, 8.024995943144209, 3.3076036095385644, 3.81662584588786, 8.618279130276653, 6.7344516295276895] w = 
tf.convert_to_tensor(w, dtype=tf.float32) lossy = tf.nn.weighted_cross_entropy_with_logits(targets=self.sentence_labels,logits=self.sentence_scores, pos_weight=w) self.loss = tf.reduce_sum(lossy) regularizer = tf.nn.l2_loss(softmax_w) self.loss = tf.reduce_mean(self.loss+(0.01 * regularizer)) self.sentence_scores = tf.nn.sigmoid(self.sentence_scores) self.train_op = self.construct_optimizer(self.config["opt_strategy"], self.loss, self.learningrate, self.config["clip"])
def _cnn_model_function(features, labels, mode, params):
    model_func = params['model']
    model_format = params['format']
    model_dtype = params['dtype']
    momentum = params['momentum']
    learning_rate_init = params['learning_rate_init']
    learning_rate_power = params['learning_rate_power']
    decay_steps = params['decay_steps']
    weight_decay = params['weight_decay']
    loss_scale = params['loss_scale']
    larc_eta = params['larc_eta']
    larc_mode = params['larc_mode']
    deterministic = params['deterministic']
    num_classes = params['n_classes']
    dali_cpu = params['dali_cpu']
    device = '/gpu:0'
    labels = tf.reshape(labels, (-1, ))  # Squash unnecessary unary dim
    inputs = features  # TODO: Should be using feature columns?
    is_training = (mode == tf.estimator.ModeKeys.TRAIN)
    with tf.device(device):
        inputs = tf.cast(inputs, model_dtype)
        if model_format == 'channels_first':
            inputs = tf.transpose(inputs, [0, 3, 1, 2])
        with nvutils.fp32_trainable_vars(
                regularizer=tf.contrib.layers.l2_regularizer(weight_decay)):
            top_layer = model_func(inputs, training=is_training)
            logits = tf.layers.dense(top_layer, num_classes)
        predicted_classes = tf.argmax(logits, axis=1, output_type=tf.int32)
        logits = tf.cast(logits, tf.float32)
        if mode == tf.estimator.ModeKeys.PREDICT:
            probabilities = tf.nn.softmax(logits)
            predictions = {
                'class_ids': predicted_classes[:, None],
                'probabilities': probabilities,
                'logits': logits
            }
            return tf.estimator.EstimatorSpec(mode, predictions=predictions)
        loss = tf.losses.sparse_softmax_cross_entropy(logits=logits,
                                                      labels=labels)
        loss = tf.identity(
            loss, name='loss'
        )  # For access by logger (TODO: Better way to access it?)
        reg_losses = tf.get_collection(tf.GraphKeys.REGULARIZATION_LOSSES)
        loss = tf.add_n([loss] + reg_losses, name='total_loss')
        with tf.device(None):
            # Allow fallback to CPU if no GPU support for these ops
            top1_accuracy = tf.metrics.accuracy(labels=labels,
                                                predictions=predicted_classes)
            top5_accuracy = tf.metrics.mean(
                tf.nn.in_top_k(predictions=logits, targets=labels, k=5))
        tf.summary.scalar('top1_accuracy', top1_accuracy[1])
        tf.summary.scalar('top5_accuracy', top5_accuracy[1])
        if mode == tf.estimator.ModeKeys.EVAL:
            metrics = {
                'top1_accuracy': top1_accuracy,
                'top5_accuracy': top5_accuracy
            }
            return tf.estimator.EstimatorSpec(mode,
                                              loss=loss,
                                              eval_metric_ops=metrics)
        assert (mode == tf.estimator.ModeKeys.TRAIN)
        # batch_size = inputs.shape[0]
        batch_size = tf.shape(inputs)[0]
        learning_rate = tf.train.polynomial_decay(learning_rate_init,
                                                  tf.train.get_global_step(),
                                                  decay_steps=decay_steps,
                                                  end_learning_rate=0.,
                                                  power=learning_rate_power,
                                                  cycle=False,
                                                  name='learning_rate')
        opt = tf.train.MomentumOptimizer(learning_rate, momentum,
                                         use_nesterov=True)
        opt = hvd.DistributedOptimizer(opt)
        opt = nvutils.LarcOptimizer(opt, learning_rate, larc_eta,
                                    clip=larc_mode)
        opt = nvutils.LossScalingOptimizer(opt, scale=loss_scale)
        gate_gradients = (tf.train.Optimizer.GATE_OP
                          if deterministic else tf.train.Optimizer.GATE_NONE)
        train_op = opt.minimize(loss,
                                global_step=tf.train.get_global_step(),
                                gate_gradients=gate_gradients,
                                name='step_update')
        update_ops = tf.get_collection(tf.GraphKeys.UPDATE_OPS) or []
        train_op = tf.group(train_op, update_ops)
        return tf.estimator.EstimatorSpec(mode, loss=loss, train_op=train_op)
def __init__(self, sequence_length, num_classes, vocab_size, embedding_size,
             filter_sizes, num_filters, l2_reg_lambda=0.0):
    # Placeholders for input, output and dropout
    self.input_x = tf.placeholder(tf.int32, [None, sequence_length],
                                  name="input_x")
    self.input_y = tf.placeholder(tf.float32, [None, num_classes],
                                  name="input_y")
    self.dropout_keep_prob = tf.placeholder(tf.float32,
                                            name="dropout_keep_prob")

    # Keeping track of l2 regularization loss (optional)
    l2_loss = tf.constant(0.0)

    # Embedding layer
    with tf.device('/cpu:0'), tf.name_scope("embedding"):
        self.W = tf.Variable(tf.random_uniform([vocab_size, embedding_size],
                                               -1.0, 1.0),
                             name="W")
        self.embedded_chars = tf.nn.embedding_lookup(self.W, self.input_x)
        self.embedded_chars_expanded = tf.expand_dims(self.embedded_chars, -1)

    # Create a convolution + maxpool layer for each filter size
    pooled_outputs = []
    for i, filter_size in enumerate(filter_sizes):
        with tf.name_scope("conv-maxpool-%s" % filter_size):
            # Convolution Layer
            filter_shape = [filter_size, embedding_size, 1, num_filters]
            W = tf.Variable(tf.truncated_normal(filter_shape, stddev=0.1),
                            name="W")
            b = tf.Variable(tf.constant(0.1, shape=[num_filters]), name="b")
            conv = tf.nn.conv2d(self.embedded_chars_expanded,
                                W,
                                strides=[1, 1, 1, 1],
                                padding="VALID",
                                name="conv")
            # Apply nonlinearity
            h = tf.nn.relu(tf.nn.bias_add(conv, b), name="relu")
            # Maxpooling over the outputs
            pooled = tf.nn.max_pool(
                h,
                ksize=[1, sequence_length - filter_size + 1, 1, 1],
                strides=[1, 1, 1, 1],
                padding='VALID',
                name="pool")
            pooled_outputs.append(pooled)

    # Combine all the pooled features
    num_filters_total = num_filters * len(filter_sizes)
    self.h_pool = tf.concat(pooled_outputs, 3)
    self.h_pool_flat = tf.reshape(self.h_pool, [-1, num_filters_total])

    # Add dropout
    with tf.name_scope("dropout"):
        self.h_drop = tf.nn.dropout(self.h_pool_flat, self.dropout_keep_prob)

    # Final (unnormalized) scores and predictions
    with tf.name_scope("output"):
        W = tf.get_variable(
            "W",
            shape=[num_filters_total, num_classes],
            initializer=tf.contrib.layers.xavier_initializer())
        b = tf.Variable(tf.constant(0.1, shape=[num_classes]), name="b")
        l2_loss += tf.nn.l2_loss(W)
        l2_loss += tf.nn.l2_loss(b)
        self.scores = tf.nn.xw_plus_b(self.h_drop, W, b, name="scores")
        # print self.scores
        self.scored_predictions = tf.nn.softmax(self.scores,
                                                name="scored_predictions")
        self.predictions = tf.argmax(self.scores, 1, name="predictions")

    # Calculate mean cross-entropy loss
    with tf.name_scope("loss"):
        losses = tf.nn.softmax_cross_entropy_with_logits(logits=self.scores,
                                                         labels=self.input_y)
        self.loss = tf.reduce_mean(losses) + l2_reg_lambda * l2_loss

    # Accuracy
    with tf.name_scope("accuracy"):
        correct_predictions = tf.equal(self.predictions,
                                       tf.argmax(self.input_y, 1))
        self.accuracy = tf.reduce_mean(tf.cast(correct_predictions, "float"),
                                       name="accuracy")
def _build_model(self):
    """ Build computation graph """
    # placeholders
    self.x_ph = tf.placeholder(tf.float32, [None, None, self.x_dim],
                               name='inputs')
    self.y_ph = tf.placeholder(tf.float32, [None, self.y_dim], name='targets')

    # build 1D convolutional blocks for each channel
    cnn_outputs = []  # a list to collect multi-channel 1D cnn outputs
    for channel in range(self.x_dim):
        with tf.name_scope('Conv_Maxpool_{}'.format(channel)):
            # filters and biases of 1D conv layers
            with tf.variable_scope('conv_maxpool_{}'.format(channel)):
                filter1 = tf.get_variable('filter1', [5, 1, 8])
                bias1 = tf.get_variable(
                    'bias1', [8], initializer=tf.constant_initializer(0.0))
                filter2 = tf.get_variable('filter2', [3, 8, 4])
                bias2 = tf.get_variable(
                    'bias2', [4], initializer=tf.constant_initializer(0.0))

            inputs = tf.reshape(self.x_ph[:, :, channel],
                                [-1, self.seqlen, 1])

            # 1D cnn block 1, seqlen: 32 --> 14
            # filter shape [filter_width, in_channels, out_channels]
            conv1 = tf.nn.conv1d(
                value=inputs,
                filters=filter1,
                stride=1,
                padding='VALID',
                name='conv1d_1'
            )
            seqlen = self.seqlen - 4
            h1 = tf.nn.relu(tf.nn.bias_add(conv1, bias1), name='h1_relu')
            h1 = tf.reshape(h1, shape=[-1, 1, self.seqlen - 4, 8])
            avgpool1 = tf.nn.avg_pool(
                value=h1,
                ksize=[1, 1, 2, 1],
                strides=[1, 1, 2, 1],
                padding='VALID',
                name='avg_pool_1'
            )
            avgpool1 = tf.reshape(avgpool1, shape=[-1, 14, 8])

            # 1D cnn block 2, seqlen: 14 --> 6
            conv2 = tf.nn.conv1d(
                value=avgpool1,
                filters=filter2,
                stride=1,
                padding='VALID',
                name='conv1d_2'
            )
            h2 = tf.nn.relu(tf.nn.bias_add(conv2, bias2))
            h2 = tf.reshape(h2, shape=[-1, 1, 12, 4])
            avgpool2 = tf.nn.avg_pool(
                value=h2,
                ksize=[1, 1, 2, 1],
                strides=[1, 1, 2, 1],
                padding='VALID',
                name='avg_pool_2'
            )
            avgpool2 = tf.reshape(avgpool2, shape=[-1, 1, 6, 4])

            # collect multi-channel outputs
            cnn_outputs.append(avgpool2)

    # Combine all channels' cnn outputs
    cnn_outputs = tf.concat(cnn_outputs, axis=3)
    num_filters = self.x_dim * 4
    cnn_outputs_flat = tf.reshape(cnn_outputs,
                                  [-1, num_filters * 6])  # [batch, x_dim * 24]

    # fully connected layer
    with tf.name_scope('Dense'):
        with tf.variable_scope('logits'):
            dense1_w = tf.get_variable('wd1', [6 * 4 * 16, 16])
            dense1_b = tf.get_variable(
                'bd1', [16], initializer=tf.constant_initializer(0.0))
        fc1 = tf.matmul(cnn_outputs_flat, dense1_w) + dense1_b
        dense_outputs = tf.nn.relu(fc1)
        # dropout (`dropout` here is the keep probability, assumed to be defined elsewhere)
        dense1_dropout = tf.nn.dropout(dense_outputs, dropout, name='dropout')

    # final outputs
    with tf.variable_scope('Logits'):
        logits_w = tf.get_variable('logits_w', [16, self.y_dim])
        logits_b = tf.get_variable(
            'logits_b', [self.y_dim],
            initializer=tf.constant_initializer(0.0))
        logits = tf.matmul(dense1_dropout, logits_w) + logits_b

    # predictions
    with tf.name_scope('Prediction'):
        self.preds = tf.nn.softmax(logits=logits)

    # training with gradient descent, global variables
    with tf.name_scope('Global'):
        global_step = tf.Variable(0, dtype=tf.int32, trainable=False,
                                  name='global_step')
        self.learning_rate = tf.train.exponential_decay(
            learning_rate=self.initial_learning_rate,
            global_step=global_step,
            decay_steps=self.decay_steps,
            decay_rate=self.decay_rate
        )
    with tf.name_scope('Loss'):
        self.loss = tf.reduce_sum(
            tf.nn.softmax_cross_entropy_with_logits(logits=logits,
                                                    labels=self.y_ph))
    with tf.name_scope('Train'):
        self.train_op = tf.train.AdamOptimizer(self.learning_rate).minimize(
            self.loss, global_step=global_step)

    # summaries
    self.summaries = tf.summary.merge([
        tf.summary.scalar('loss', self.loss),
    ])
def forward(self, x):
    return tf.nn.softmax(x)
def res_net(x, y, activation=tf.nn.relu):
    """Builds a residual network.

    Note that if the input tensor is 2D, it must be square in order to be
    converted to a 4D tensor.

    Borrowed structure from:
    github.com/pkmital/tensorflow_tutorials/blob/master/10_residual_network.py

    Args:
      x: Input of the network
      y: Output of the network
      activation: Activation function to apply after each convolution

    Returns:
      Predictions and loss tensors.
    """
    # Configurations for each bottleneck group.
    BottleneckGroup = namedtuple('BottleneckGroup',
                                 ['num_blocks', 'num_filters',
                                  'bottleneck_size'])
    groups = [
        BottleneckGroup(3, 128, 32),
        BottleneckGroup(3, 256, 64),
        BottleneckGroup(3, 512, 128),
        BottleneckGroup(3, 1024, 256)
    ]

    input_shape = x.get_shape().as_list()

    # Reshape the input into the right shape if it's 2D tensor
    if len(input_shape) == 2:
        ndim = int(sqrt(input_shape[1]))
        x = tf.reshape(x, [-1, ndim, ndim, 1])

    # First convolution expands to 64 channels
    with tf.variable_scope('conv_layer1'):
        net = convolution2d(x, 64, 7, normalizer_fn=batch_norm,
                            activation_fn=activation)

    # Max pool
    net = tf.nn.max_pool(net, [1, 3, 3, 1], strides=[1, 2, 2, 1],
                         padding='SAME')

    # First chain of resnets
    with tf.variable_scope('conv_layer2'):
        net = convolution2d(net, groups[0].num_filters, 1, padding='VALID')

    # Create the bottleneck groups, each of which contains `num_blocks`
    # bottleneck groups.
    for group_i, group in enumerate(groups):
        for block_i in range(group.num_blocks):
            name = 'group_%d/block_%d' % (group_i, block_i)

            # 1x1 convolution responsible for reducing dimension
            with tf.variable_scope(name + '/conv_in'):
                conv = convolution2d(net, group.bottleneck_size, 1,
                                     padding='VALID',
                                     activation_fn=activation,
                                     normalizer_fn=batch_norm)

            with tf.variable_scope(name + '/conv_bottleneck'):
                conv = convolution2d(conv, group.bottleneck_size, 3,
                                     padding='SAME',
                                     activation_fn=activation,
                                     normalizer_fn=batch_norm)

            # 1x1 convolution responsible for restoring dimension
            with tf.variable_scope(name + '/conv_out'):
                input_dim = net.get_shape()[-1].value
                conv = convolution2d(conv, input_dim, 1, padding='VALID',
                                     activation_fn=activation,
                                     normalizer_fn=batch_norm)

            # shortcut connections that turn the network into its counterpart
            # residual function (identity shortcut)
            net = conv + net

        try:
            # upscale to the next group size
            next_group = groups[group_i + 1]
            with tf.variable_scope('block_%d/conv_upscale' % group_i):
                net = convolution2d(net, next_group.num_filters, 1,
                                    activation_fn=None,
                                    biases_initializer=None,
                                    padding='SAME')
        except IndexError:
            pass

    net_shape = net.get_shape().as_list()
    net = tf.nn.avg_pool(net,
                         ksize=[1, net_shape[1], net_shape[2], 1],
                         strides=[1, 1, 1, 1],
                         padding='VALID')

    net_shape = net.get_shape().as_list()
    net = tf.reshape(net, [-1, net_shape[1] * net_shape[2] * net_shape[3]])

    target = tf.one_hot(y, depth=10, dtype=tf.float32)
    logits = tf.contrib.layers.fully_connected(net, 10, activation_fn=None)
    loss = tf.losses.softmax_cross_entropy(target, logits)
    return tf.nn.softmax(logits), loss
def train_softmax(x, y, x_test, y_test, learning_rate=0.005,
                  max_iterations=1000000, regularization=1.,
                  w_diff_term_crit=0.0001, print_per_iteration=False):
    assert x.shape[1] == x_test.shape[1], \
        "train shape:" + str(x.shape) + " and test shape:" + str(x_test.shape) + \
        " do not match in dimensionality"
    assert x.shape[0] == y.shape[0], \
        "number of training samples:" + str(x.shape) + \
        " and number of labels:" + str(y.shape) + " do not match!"
    assert x_test.shape[0] == y_test.shape[0], \
        "number of testing samples:" + str(x_test.shape) + \
        " and number of labels:" + str(y_test.shape) + " do not match!"

    # set up constants
    num_input_dims = x.shape[1]
    num_label_dims = y.shape[1]
    reg_fact = tf.constant(regularization, name='regularization_factor')

    with tf.name_scope('input'):
        x_input = tf.placeholder(tf.float32, shape=[None, num_input_dims],
                                 name='input')
    with tf.name_scope('target'):
        y_ = tf.placeholder(tf.float32, shape=[None, num_label_dims],
                            name='target')

    # linear regression
    with tf.name_scope('linear_regression'):
        # init_vals = , name='truncated_normal_init_val_w')
        w = tf.Variable(tf.truncated_normal([num_input_dims, num_label_dims],
                                            stddev=1. / math.sqrt(2)),
                        name='w')
        b = tf.Variable(tf.zeros([num_label_dims]), name='b')
        logits = tf.matmul(x_input, w) + b
        output = tf.nn.softmax(logits)

    with tf.name_scope('regularization'):
        l2loss = tf.nn.l2_loss(w, name="l2_loss")
        regularization_penalty = tf.reduce_sum(
            tf.square(l2loss), name='regularization_penalty_sum')
        regularization_penalty *= reg_fact

    with tf.name_scope('loss'):
        # cross-entropy loss on the raw logits (+ regularization_penalty if desired)
        loss = tf.reduce_mean(
            tf.nn.softmax_cross_entropy_with_logits(labels=y_, logits=logits))
        # diff = y_ - output
        # sq_diff = tf.square(diff)
        # loss = tf.reduce_mean(sq_diff) + regularization_penalty
        # loss = tf.reduce_mean(sq_diff)

    with tf.name_scope('optimizer'):
        opt = tf.train.GradientDescentOptimizer(learning_rate)
        grads = opt.compute_gradients(loss)
        opt = opt.apply_gradients(grads)

    init = tf.initialize_all_variables()
    sess = tf.Session()
    sess.run(init)

    its = 0
    loss_train = -1.
    w_old = sess.run(w)[0][0]
    for i in xrange(0, max_iterations):
        w__, output__, loss__, _, regularization_penalty__ = sess.run(
            [w, output, loss, opt, regularization_penalty],
            feed_dict={x_input: x, y_: y})
        if print_per_iteration and i % 1 == 0:
            print "regularization_penalty:", regularization_penalty__
            print "iteration:", i
            print "weight:", w__
            print "loss:", loss__
        w_new = sess.run(w)[0][0]
        its += 1
        w_diff = np.sum(np.abs(w_new - w_old))
        # todo include termination criterion (weight change)
        if w_diff < w_diff_term_crit:
            print "reg_param:", regularization, "finished at iteration:", its, w_new
            # print "weights:", w_new
            # print "weight_difference:", w_diff
            break
        w_old = w_new

    loss_test = sess.run([loss], feed_dict={x_input: x_test, y_: y_test})
    sess.close()
    tf.reset_default_graph()
    return its, loss_test, loss_train