def ACM(x, blockname, groups=32):
    b, h, w, c = K.int_shape(x)
    # Channel-wise mean over the spatial dimensions
    mu = tf.reduce_mean(x, axis=[1, 2], name=blockname + '_mu')
    mu = tf.expand_dims(mu, axis=1)
    mu = tf.expand_dims(mu, axis=1)
    # Channel re-weighting branch: bottleneck -> expand, gated by a sigmoid
    P = Conv2D(c // 2, 1, padding='same', groups=groups, name=blockname + '_P1')(mu)
    P = relu(P)
    P = Conv2D(c, 1, padding='same', groups=groups, name=blockname + '_P2')(P)
    P = sigmoid(P)
    # Key/query branches operate on the mean-subtracted features
    x_mu = x - mu
    k = Conv2D(c, 1, padding='same', groups=groups, name=blockname + '_K')(x_mu)
    q = Conv2D(c, 1, padding='same', groups=groups, name=blockname + '_Q')(x_mu)
    k = softmax(k)
    q = softmax(q)
    # Attention-weighted spatial pooling for keys and queries
    k = x_mu * k
    q = x_mu * q
    k = K.sum(k, axis=[1, 2])
    q = K.sum(q, axis=[1, 2])
    k_q = k - q
    # Add the key-query contrast, then apply the channel gate
    y = x + k_q
    y = y * P
    return y

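# Minimal usage sketch for ACM above (an illustrative addition; it assumes
# `tf`, `K`, `Conv2D`, `relu`, `sigmoid`, and `softmax` are imported as used
# in the function). The channel count must be divisible by `groups`, and
# grouped Conv2D requires TF >= 2.3.
demo_x = tf.random.normal((2, 16, 16, 64))
demo_y = ACM(demo_x, blockname='acm_demo', groups=32)
print(demo_y.shape)  # ACM is shape-preserving: (2, 16, 16, 64)
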
def get_mixture_coef(self, out_tensor):
    """Parses the output tensor into mixture density coefficients."""
    # This uses eqns 18 -> 23 of http://arxiv.org/abs/1308.0850.
    # Pen states:
    z_pen_logits = out_tensor[:, :, 0:3]
    # Process outputs into MDN parameters
    M = self.hps['num_mixture']
    dist_params = [
        out_tensor[:, :, (3 + M * (n - 1)):(3 + M * n)] for n in range(1, 7)
    ]
    z_pi, z_mu1, z_mu2, z_sigma1, z_sigma2, z_corr = dist_params
    # Softmax all the pi's and pen states:
    z_pi = softmax(z_pi)
    z_pen = softmax(z_pen_logits)
    # Exponentiate the sigmas and squash corr into (-1, 1).
    z_sigma1 = K.exp(z_sigma1)
    z_sigma2 = K.exp(z_sigma2)
    z_corr = tanh(z_corr)
    r = [z_pi, z_mu1, z_mu2, z_sigma1, z_sigma2, z_corr, z_pen, z_pen_logits]
    return r

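# Layout sketch for `out_tensor` (an illustrative addition, derived from the
# slicing above): with M = hps['num_mixture'], the last axis packs 3 pen
# logits followed by six M-wide parameter groups (pi, mu1, mu2, sigma1,
# sigma2, corr), so the decoder output width must be 3 + 6 * M.
M_demo = 20                    # e.g. a typical sketch-rnn mixture count
output_width = 3 + 6 * M_demo  # = 123 values per timestep
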
def call(self, x, mask=None):
    # Get input tensor static shape
    _, N, C = x.get_shape().as_list()
    head_dim = C // self.num_heads
    x_qkv = self.qkv(x)
    x_qkv = tf.reshape(x_qkv, shape=(-1, N, 3, self.num_heads, head_dim))
    x_qkv = tf.transpose(x_qkv, perm=(2, 0, 3, 1, 4))
    q, k, v = x_qkv[0], x_qkv[1], x_qkv[2]

    # Query rescaling
    q = q * self.scale

    # Multi-headed self-attention
    k = tf.transpose(k, perm=(0, 1, 3, 2))
    attn = q @ k

    # Relative position bias (one bias per pair of positions within a window)
    num_window_elements = self.window_size[0] * self.window_size[1]
    relative_position_index_flat = tf.reshape(self.relative_position_index,
                                              shape=(-1,))
    relative_position_bias = tf.gather(self.relative_position_bias_table,
                                       relative_position_index_flat)
    relative_position_bias = tf.reshape(
        relative_position_bias,
        shape=(num_window_elements, num_window_elements, -1))
    relative_position_bias = tf.transpose(relative_position_bias, perm=(2, 0, 1))
    attn = attn + tf.expand_dims(relative_position_bias, axis=0)

    # Apply the shifted-window attention mask, if any, then normalize
    if mask is not None:
        nW = mask.get_shape()[0]
        mask_float = tf.cast(
            tf.expand_dims(tf.expand_dims(mask, axis=1), axis=0), tf.float32)
        attn = tf.reshape(attn, shape=(-1, nW, self.num_heads, N, N)) + mask_float
        attn = tf.reshape(attn, shape=(-1, self.num_heads, N, N))
    attn = softmax(attn, axis=-1)

    # Dropout after attention
    attn = self.attn_drop(attn)

    # Merge attention-weighted values across heads
    x_qkv = attn @ v
    x_qkv = tf.transpose(x_qkv, perm=(0, 2, 1, 3))
    x_qkv = tf.reshape(x_qkv, shape=(-1, N, C))

    # Linear projection
    x_qkv = self.proj(x_qkv)
    # Dropout after projection
    x_qkv = self.proj_drop(x_qkv)
    return x_qkv

def call(self, logits, mask=None):
    if mask is not None:
        m = tf.cast(mask[0], logits.dtype)
        # Build a pairwise mask and push masked logits toward -inf
        mm = tf.expand_dims(m, -2) * tf.expand_dims(m, -1)
        adder = (1.0 - mm) * self._small_negative
        return logits + adder, softmax(logits + adder)
    return logits, softmax(logits)

def __call__(self, inputs, training):
    x, mask = inputs
    g_x = iterative_call(self.inner_layers[0], x, training=training)
    theta_x = iterative_call(self.inner_layers[1], x, training=training)
    phi_x = iterative_call(self.inner_layers[2], x, training=training)
    # Pairwise affinity between spatial positions, softmax-normalized
    f_div_C = softmax(tf.linalg.matmul(theta_x, phi_x) / self.temperature,
                      axis=2)

    # Resize the mask to the feature resolution and binarize: valid positions
    # become 1, hole positions become 0
    inverted_mask = 1. - mask
    mask = tf.image.resize(mask, size=x.shape[1:3],
                           method=tf.image.ResizeMethod.BILINEAR)
    mask = tf.where(tf.greater(mask, 0.), tf.zeros_like(mask), mask)
    mask = 1. - mask
    inverted_mask = tf.image.resize(inverted_mask, size=x.shape[1:3],
                                    method=tf.image.ResizeMethod.BILINEAR)
    mask *= inverted_mask

    # Expand the mask to pairwise shape (batch, HW, HW)
    mask_expand = tf.reshape(mask, (x.shape[0], 1, -1))
    mask_expand = tf.repeat(mask_expand, x.shape[1] * x.shape[2], axis=1)
    if self.use_self:
        # Keep each position's self-affinity. Tensors do not support item
        # assignment, so set the diagonal explicitly.
        mask_expand = tf.linalg.set_diag(
            mask_expand, tf.ones((x.shape[0], x.shape[1] * x.shape[2])))
    f_div_C = mask_expand * f_div_C
    if self.re_norm:
        f_div_C = tf.keras.utils.normalize(f_div_C, axis=2, order=1)

    y = tf.reshape(tf.linalg.matmul(f_div_C, g_x),
                   (x.shape[0], *x.shape[1:3], self.inter_channels))
    W_y = iterative_call(self.inner_layers[3], y, training=training)
    W_y = iterative_call(self.inner_layers[4], W_y, training=training)

    # Blend attended features back into the valid regions
    assert self.mode.casefold() == 'combine'
    full_mask = tf.repeat(mask, self.inter_channels, axis=3)
    z = full_mask * x + (1 - full_mask) * W_y
    return z

def evaluate(adj, x, labels, idx_train, idx_val, idx_test, target,
             retrain_iters=2, norm_x='l1'):
    classification_margins = []
    class_distrs = []
    for i in range(retrain_iters):
        print(f"... {i + 1}/{retrain_iters}")
        model = GCN(adj, x, labels, device='GPU:0', seed=123 + i, norm_x=norm_x)
        model.build()
        his = model.train(idx_train, idx_val, verbose=0, epochs=100)
        logit = softmax(model.predict(target)).numpy().ravel()
        class_distrs.append(logit)
        # Margin between the true class and the strongest other class
        best_second_class_before = (
            logit - np.eye(data.n_classes)[labels[target]]).argmax()
        margin = logit[labels[target]] - logit[best_second_class_before]
        classification_margins.append(margin)
        model.close()
    class_distrs = np.asarray(class_distrs)
    print(classification_margins)
    return class_distrs

def generate(self, mask_in):
    paddings = [
        [0, 0],
        [self.padding, self.padding],
        [self.padding, self.padding],
        [0, 0],
    ]
    mask = tf.pad(mask_in, paddings)
    # NOTE(brendan): this unfolding is for convolution
    mask = tf.image.extract_patches(
        mask,
        sizes=(1, (2 * self.radius) + 1, (2 * self.radius) + 1, 1),
        strides=(1, 1, 1, 1),
        rates=(1, 1, 1, 1),
        padding="VALID",
    )
    mask = tf.image.resize(mask, size=self.shape_up, method="nearest")
    mask = mask[:, :-self.step + 1, :-self.step + 1, :]
    # NOTE(brendan): convolve Gaussian weights with mask (smoothness inductive bias)
    mask = self.weight * mask
    mask = mask * softmax(self.coldness * mask, axis=-1)
    mask = tf.reduce_sum(mask, axis=-1, keepdims=True)
    m = round(self.margin)
    if self.clamp:
        mask = tf.clip_by_value(mask, clip_value_min=0, clip_value_max=1)
    cropped = mask[:, m:m + self.shape[0], m:m + self.shape[1], :]
    return cropped, mask

def call(self, queries, keys, values, mask=None, training=False,
         attentions=False):
    """Scaled dot-product attention.

    Args:
        queries: (queries_size x timesteps x input_dim)
        keys: (memory_size x memory_timesteps x input_dim)
        values: (queries_size x timesteps x output_dim)

    Returns:
        output: context vector (queries_size x timesteps x out_dim)
        attn: attention weights, returned when `attentions` is True
    """
    attn = tf.matmul(queries, keys, transpose_b=True)
    # Scale by sqrt(d_k) to keep the logits in a well-conditioned range
    scale_dim = keys.shape[2]
    attn /= np.sqrt(scale_dim)
    if mask is not None:
        attn += mask
    attn = activations.softmax(attn)
    if training:
        attn = self.dropout(attn)
    output = tf.matmul(attn, values)
    if attentions:
        return output, attn
    return output

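# Standalone sketch of the same scaled dot-product math (an illustrative
# addition with made-up shapes; it bypasses the layer class, whose name is
# not shown here):
import numpy as np
import tensorflow as tf

q = tf.random.normal((4, 10, 32))   # (batch, query timesteps, dim)
k = tf.random.normal((4, 12, 32))   # (batch, key timesteps, dim)
v = tf.random.normal((4, 12, 64))   # (batch, key timesteps, out dim)
logits = tf.matmul(q, k, transpose_b=True) / np.sqrt(k.shape[2])
weights = tf.nn.softmax(logits)     # (4, 10, 12): each row sums to 1
context = tf.matmul(weights, v)     # (4, 10, 64)
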
def build(self, z):
    input_shape, _ = z
    self.q_net = Dense(self.dout, input_shape=input_shape, use_bias=False,
                       kernel_constraint=MaskWeights(self.mask), name="q_dense")
    self.k_net = Dense(self.dout, input_shape=input_shape, use_bias=False,
                       kernel_constraint=MaskWeights(self.mask), name="k_dense")
    self.v_net = Dense(self.dout, input_shape=input_shape, use_bias=False,
                       kernel_constraint=MaskWeights(self.mask), name="v_dense")
    self.out_net = Dense(self.dout, kernel_constraint=MaskWeights(self.mask),
                         name="head_combiner")
    # Scaled dot-product attention logits: q @ k^T / sqrt(dk)
    self.attention = Lambda(
        lambda x: activations.softmax(
            tf.matmul(x[0], x[1], transpose_b=True) * (self.dk ** -0.5)),
        name='scale_dot_attention')
    self.dot_product = Lambda(lambda x: tf.matmul(x[0], x[1]),
                              name='dot_product')
    self.head_split = Reshape((input_shape[1], self.h * self.nh, self.dk))
    self.head_swap = Lambda(lambda x: tf.transpose(x, [0, 2, 1, 3]))
    self.head_merge = Reshape((input_shape[1], self.dout))
    super(MaskedMultiHeadAttention, self).build(input_shape)

def call(self, inputs, mask=None, **kwargs):
    query, value = inputs
    embedding_size = query.shape[-1]
    step_size = value.shape[1]
    if self.attention_type == 'inner':
        attention_score = tf.squeeze(
            tf.matmul(value, query, transpose_b=True), axis=-1)
    else:
        # Concatenate query/value interactions and score them with an MLP
        querys = tf.tile(query, [1, step_size, 1])
        query_value = tf.concat(
            [querys, value, querys - value, querys * value], axis=-1)
        attention_score = self.out_kernel(self.mlp(query_value))
        attention_score = tf.reshape(attention_score, (-1, step_size))
    if mask is not None:
        if mask[0] is not None:
            raise ValueError('query should not support mask')
        if mask[1] is not None:
            # Replace masked positions with a large negative score
            min_value_matrix = tf.ones_like(attention_score) * (-2 ** 31)
            attention_score = tf.where(mask[1], attention_score,
                                       min_value_matrix)
    attention_score = tf.divide(attention_score,
                                tf.sqrt(embedding_size * 1.0))
    if self.norm:
        weighted_att_score = activations.softmax(attention_score)
    else:
        weighted_att_score = attention_score
    attention_vec = tf.squeeze(
        tf.matmul(tf.expand_dims(weighted_att_score, axis=1), value), axis=1)
    if not self.keepdims:
        out_shape = (-1, embedding_size)
    else:
        out_shape = (-1, 1, embedding_size)
    return tf.reshape(attention_vec, out_shape)

def random_sample(self, sample_epochs=20, disable=False):
    best_loss = -np.inf
    best_s = None
    # Strictly upper-triangular part of the continuous perturbation matrix
    s = (tf.linalg.band_part(self.adj_changes, 0, -1) -
         tf.linalg.band_part(self.adj_changes, 0, 0))
    for _ in tqdm(range(sample_epochs), desc='Random Sampling',
                  disable=disable):
        random_matrix = tf.random.uniform(shape=(self.n_nodes, self.n_nodes),
                                          minval=0., maxval=1.)
        sampled = tf.where(s > random_matrix, 1., 0.)
        # Reject samples that exceed the perturbation budget
        if tf.reduce_sum(sampled) > self.n_perturbations:
            continue
        with tf.device(self.device):
            self.adj_changes.assign(sampled)
            adj = self.get_perturbed_adj()
            adj_norm = normalize_adj_tensor(adj)
            logit = self.surrogate([self.tf_x, adj_norm, self.idx_attack])
            logit = softmax(logit)
            loss = self.compute_loss(logit)
        if best_loss < loss:
            best_loss = loss
            best_s = sampled
    return best_s.numpy()

def _read_inputs(self, inputs):
    """Applies transformations to `inputs` to get control for this module.

    Computes the elements of the interface vector 𝜉.
    """

    def linear(name, first_dim=None, second_dim=None):
        """Returns a linear transformation of `inputs`.

        If first_dim and second_dim are provided, reshape the resulting Tensor.
        """
        linear = self._layers[name](inputs)
        if first_dim and second_dim:
            linear = tf.reshape(linear, [-1, first_dim, second_dim])
        return linear

    # v_t^i - The vectors to write to memory, for each write head `i`.
    write_vectors = linear('write_vectors', self._num_writes, self._word_size)

    # e_t^i - Amount to erase the memory by before writing, for each write head.
    erase_vectors = linear('erase_vectors', self._num_writes, self._word_size)

    # f_t^j - Amount that the memory at the locations read from at the previous
    # time step can be declared unused, for each read head `j`.
    free_gate = linear('free_gate')

    # g_t^{a, i} - Interpolation between writing to unallocated memory and
    # content-based lookup, for each write head `i`. Note: `a` is simply used
    # to identify this gate with allocation vs writing (as defined below).
    allocation_gate = linear('allocation_gate')

    # g_t^{w, i} - Overall gating of write amount for each write head.
    write_gate = linear('write_gate')

    # 𝜋_t^j - Mixing between "backwards" and "forwards" positions (for each
    # write head), and content-based lookup, for each read head.
    num_read_modes = 1 + 2 * self._num_writes
    read_mode = linear('read_mode', self._num_reads, num_read_modes)
    read_mode = activations.softmax(read_mode)

    # Parameters for the (read / write) "weights by content matching" modules.
    write_keys = linear('write_keys', self._num_writes, self._word_size)
    write_strengths = linear('write_strengths')
    read_keys = linear('read_keys', self._num_reads, self._word_size)
    read_strengths = linear('read_strengths')

    result = dict(
        read_keys=read_keys,
        read_strengths=read_strengths,
        write_keys=write_keys,
        write_strengths=write_strengths,
        write_vectors=write_vectors,
        erase_vectors=erase_vectors,
        free_gate=free_gate,
        allocation_gate=allocation_gate,
        write_gate=write_gate,
        read_mode=read_mode,
    )
    return result

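# Size bookkeeping sketch (an addition following the DNC paper's interface
# layout, not this module's unseen layer configuration): with W write heads,
# R read heads, and word size N, the interface vector 𝜉_t has width
# 2*W*N (write + erase vectors) + R (free gates) + 2*W (allocation + write
# gates) + R*(1 + 2W) (read modes) + W*N + W (write keys/strengths) +
# R*N + R (read keys/strengths).
def interface_width(num_writes, num_reads, word_size):
    W, R, N = num_writes, num_reads, word_size
    return 3 * W * N + 3 * W + 2 * R + R * (1 + 2 * W) + R * N
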
def getAction(self, game):
    logits, _ = self.model(tf.convert_to_tensor(self.extractState(game)))
    probs = activations.softmax(logits)
    # Rejection-sample until the sampled index maps to a legal move
    action = np.random.choice(self.action_size, p=probs.numpy()[0])
    while action >= len(game.getMoves()):
        action = np.random.choice(self.action_size, p=probs.numpy()[0])
    return game.getMoves()[action]

def build_model(w, h, num_classes, dropout=.5, l2_reg=0., conv_type='ds'):
    X = Input(shape=(h, w, 3), name='X')
    conv1 = conv_block(X, conv_type, 64, 3, 1, name='conv1', num_conv_layers=2,
                       l2_reg=l2_reg, use_bn=True)
    conv2 = conv_block(conv1, conv_type, 128, 3, 1, name='conv2',
                       num_conv_layers=2, l2_reg=l2_reg, use_bn=True)
    conv3 = conv_block(conv2, conv_type, 256, 3, 1, name='conv3',
                       l2_reg=l2_reg, use_bn=True)
    conv4 = conv_block(conv3, conv_type, 512, 3, 1, name='conv4',
                       l2_reg=l2_reg, use_bn=True)
    conv5 = conv_block(conv4, conv_type, 512, 3, 1, name='conv5',
                       l2_reg=l2_reg, use_bn=True)
    fc1 = fc_block(conv5, conv_type, 4096, (7, 7), strides=(1, 1),
                   dropout=dropout, name='fc1', l2_reg=l2_reg,
                   use_dropout=True, use_bn=True)
    fc2 = fc_block(fc1, conv_type, 4096, (1, 1), strides=(1, 1),
                   dropout=dropout, name='fc2', l2_reg=l2_reg,
                   use_dropout=True, use_bn=True)
    score32 = score_block(fc2, conv_type, num_classes, name='score32', l2_reg=l2_reg)
    score16 = score_block(conv4, conv_type, num_classes, name='score16', l2_reg=l2_reg)
    score8 = score_block(conv3, conv_type, num_classes, name='score8', l2_reg=l2_reg)
    # FCN-8s skip fusion: upsample coarse scores and add finer-grained scores
    upscore32 = upsample_block(score32, num_classes, 4, 2, name='upscore32', l2_reg=l2_reg)
    upscore32c = crop(score16, name='upscore32c')(upscore32)
    fuse1 = Add(name='fuse1')([score16, upscore32c])
    upscore16 = upsample_block(fuse1, num_classes, 4, 2, name='upscore16', l2_reg=l2_reg)
    upscore16c = crop(score8, name='upscore16c')(upscore16)
    fuse2 = Add(name='fuse2')([upscore16c, score8])
    upscore8 = UpSampling2D((8, 8), name='upscore8')(fuse2)
    upscore8c = crop(X, name='upscore8c')(upscore8)
    classifier = Lambda(lambda x: softmax(x))(upscore8c)
    fcn8 = Model(inputs=X, outputs=classifier, name='FCN8')
    return fcn8

def call(self, inputs):
    X, A = inputs
    N = K.shape(A)[-1]
    # Check if the layer is operating in mixed or batch mode
    mode = ops.autodetect_mode(X, A)
    self.reduce_loss = mode in (modes.MIXED, modes.BATCH)

    # Get normalized adjacency
    if K.is_sparse(A):
        I_ = tf.sparse.eye(N, dtype=A.dtype)
        A_ = tf.sparse.add(A, I_)
    else:
        I_ = tf.eye(N, dtype=A.dtype)
        A_ = A + I_
    fltr = ops.normalize_A(A_)

    # Node embeddings
    Z = K.dot(X, self.kernel_emb)
    Z = ops.modal_dot(fltr, Z)
    if self.activation is not None:
        Z = self.activation(Z)

    # Compute cluster assignment matrix
    S = K.dot(X, self.kernel_pool)
    S = ops.modal_dot(fltr, S)
    S = activations.softmax(S, axis=-1)  # softmax applied row-wise

    # Link prediction loss
    S_gram = ops.modal_dot(S, S, transpose_b=True)
    if mode == modes.MIXED:
        A = tf.sparse.to_dense(A)[None, ...]
    if K.is_sparse(A):
        LP_loss = tf.sparse.add(A, -S_gram)
    else:
        LP_loss = A - S_gram
    LP_loss = tf.norm(LP_loss, axis=(-1, -2))
    if self.reduce_loss:
        LP_loss = K.mean(LP_loss)
    self.add_loss(LP_loss)

    # Entropy loss
    entr = tf.negative(
        tf.reduce_sum(tf.multiply(S, K.log(S + K.epsilon())), axis=-1))
    entr_loss = K.mean(entr, axis=-1)
    if self.reduce_loss:
        entr_loss = K.mean(entr_loss)
    self.add_loss(entr_loss)

    # Pooling
    X_pooled = ops.modal_dot(S, Z, transpose_a=True)
    A_pooled = ops.matmul_at_b_a(S, A)

    output = [X_pooled, A_pooled]
    if self.return_mask:
        output.append(S)
    return output

def convModel_APIHandler():
    """Flask API endpoint that serves a Keras image classifier.

    If the input data is valid, this function returns the model's top
    prediction for the image, along with the probabilities for all classes.
    """
    try:
        # The image is expected to arrive as a base64-encoded string, which
        # is decoded, converted to a tensor, and preprocessed.
        base64_image_string = request.form['image_data_buffer']
        base64_decoded = base64.b64decode(base64_image_string)
        image = Image.open(io.BytesIO(base64_decoded))
        preprocessed_img = prepare_image(image)
        raw_logits_tensor = model(preprocessed_img)
        prob_distribution = softmax(raw_logits_tensor)
        topPrediction = class_indices[prob_distribution.numpy().argmax()]
        allProbsDict = get_all_probs(prob_distribution)
        resp = jsonify(MostLikelyClass=topPrediction, allProbs=allProbsDict)
        resp.status_code = 200
        return resp
    except Exception:
        resp = jsonify({
            'message': 'There was an error processing your image by the '
                       'server. Try a different image?'
        })
        resp.status_code = 500
        return resp

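# Hypothetical client-side sketch for exercising the endpoint above (the
# route and port are assumptions; adjust them to however the Flask app
# registers this handler):
import base64
import requests

with open('example.jpg', 'rb') as f:
    payload = {'image_data_buffer': base64.b64encode(f.read())}
resp = requests.post('http://localhost:5000/predict', data=payload)
print(resp.json())  # {'MostLikelyClass': ..., 'allProbs': {...}}
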
def squeeze_net(shape):
    inp = Input(shape=shape)
    conv1 = Conv2D(filters=96, kernel_size=(7, 7), strides=(2, 2),
                   activation='relu', padding='same')(inp)
    pool1 = MaxPooling2D(pool_size=(3, 3), strides=(2, 2))(conv1)
    fire1 = fire_module(16, 64, 64)(pool1)
    fire2 = fire_module(16, 64, 64)(fire1)
    fire3 = fire_module(32, 128, 128)(fire2)
    pool2 = MaxPooling2D(pool_size=(3, 3), strides=(2, 2))(fire3)
    fire4 = fire_module(32, 128, 128)(pool2)
    fire5 = fire_module(48, 192, 192)(fire4)
    fire6 = fire_module(48, 192, 192)(fire5)
    fire7 = fire_module(64, 256, 256)(fire6)
    pool3 = MaxPooling2D(pool_size=(3, 3), strides=(2, 2))(fire7)
    fire8 = fire_module(64, 256, 256)(pool3)
    drop1 = Dropout(.5)(fire8)
    # conv2 should have the same number of filters as the desired number of outputs
    conv2 = Conv2D(filters=1000, kernel_size=(1, 1), strides=(1, 1),
                   activation='relu')(drop1)
    pool4 = GlobalAveragePooling2D()(conv2)
    # the activation function should be changed to switch to regression
    out = softmax(pool4)
    model = Model(inputs=inp, outputs=[out])
    return model

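# Build sketch for the SqueezeNet above (illustrative: the 227x227 input size
# is an assumption; any resolution that survives the stride-2 stem and the
# three stride-2 pools works, and `fire_module` must be defined as used above):
model = squeeze_net((227, 227, 3))
model.summary()  # ends in GlobalAveragePooling2D over 1000 class maps
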
def multiplicative_self_attention(units, n_hidden=None, n_output_features=None,
                                  activation=None):
    """Compute multiplicative self-attention for time series of vectors
    (with batch dimension) using the formula:
        score(h_i, h_j) = <W_1 h_i, W_2 h_j>
    where W_1 and W_2 are learnable matrices with dimensionality
    [n_hidden, n_input_features].

    Args:
        units: tf tensor with dimensionality [batch_size, time_steps, n_input_features]
        n_hidden: number of units in the hidden representation of the similarity measure
        n_output_features: number of features in the output dense layer
        activation: activation at the output

    Returns:
        output: self-attended tensor with dimensionality
            [batch_size, time_steps, n_output_features]
    """
    n_input_features = K.int_shape(units)[2]
    if n_hidden is None:
        n_hidden = n_input_features
    if n_output_features is None:
        n_output_features = n_input_features
    exp1 = Lambda(lambda x: expand_tile(x, axis=1))(units)
    exp2 = Lambda(lambda x: expand_tile(x, axis=2))(units)
    queries = Dense(n_hidden)(exp1)
    keys = Dense(n_hidden)(exp2)
    scores = Lambda(lambda x: K.sum(queries * x, axis=3, keepdims=True))(keys)
    attention = Lambda(lambda x: softmax(x, axis=2))(scores)
    mult = Multiply()([attention, exp1])
    attended_units = Lambda(lambda x: K.sum(x, axis=2))(mult)
    output = Dense(n_output_features, activation=activation)(attended_units)
    return output

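# Wiring sketch for the block above (illustrative; assumes `expand_tile` and
# the Keras imports used above are in scope, and the shapes are made up).
# The same wiring applies to `additive_self_attention` below.
from tensorflow.keras.layers import Input
from tensorflow.keras.models import Model

seq_in = Input(shape=(20, 64))  # (time_steps, n_input_features)
seq_out = multiplicative_self_attention(seq_in, n_hidden=32,
                                        n_output_features=64)
attn_model = Model(seq_in, seq_out)  # output: (batch, 20, 64)
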
def additive_self_attention(units, n_hidden=None, n_output_features=None,
                            activation=None):
    """Compute additive self-attention for time series of vectors
    (with batch dimension) using the formula:
        score(h_i, h_j) = <v, tanh(W_1 h_i + W_2 h_j)>
    where v is a learnable vector of n_hidden dimensionality, and W_1 and W_2
    are learnable [n_hidden, n_input_features] matrices.

    Args:
        units: tf tensor with dimensionality [batch_size, time_steps, n_input_features]
        n_hidden: number of units in the hidden representation of the similarity measure
        n_output_features: number of features in the output dense layer
        activation: activation at the output

    Returns:
        output: self-attended tensor with dimensionality
            [batch_size, time_steps, n_output_features]
    """
    n_input_features = K.int_shape(units)[2]
    if n_hidden is None:
        n_hidden = n_input_features
    if n_output_features is None:
        n_output_features = n_input_features
    exp1 = Lambda(lambda x: expand_tile(x, axis=1))(units)
    exp2 = Lambda(lambda x: expand_tile(x, axis=2))(units)
    units_pairs = Concatenate(axis=3)([exp1, exp2])
    query = Dense(n_hidden, activation="tanh")(units_pairs)
    attention = Dense(1, activation=lambda x: softmax(x, axis=2))(query)
    attended_units = Lambda(lambda x: K.sum(attention * x, axis=2))(exp1)
    output = Dense(n_output_features, activation=activation)(attended_units)
    return output

def get_default_model():
    inputs = Input(shape=(None, None, 10))
    x = Conv2D(128, 3, padding='same')(inputs)
    x = Activation('relu')(x)
    x = Conv2D(10, 1)(x)
    outputs = softmax(x, axis=3)
    return tf.keras.Model(inputs=inputs, outputs=outputs)

def call(self, inputs, **kwargs):
    features = inputs[0]
    relations = inputs[1]
    sx = inputs[2]
    sy = inputs[3]
    z, _ = self.kenn_layer_1(features, relations, sx, sy)
    return softmax(z)

def step(self, x, states):
    ytm, stm = states

    # repeat the hidden state to the length of the sequence
    _stm = K.repeat(stm, self.timesteps)

    # now multiply the weight matrix with the repeated hidden state
    _Wxstm = K.dot(_stm, self.W_a)

    # calculate the attention probabilities
    # this relates how much other timesteps contributed to this one.
    et = K.dot(activations.tanh(_Wxstm + self._uxpb), K.expand_dims(self.V_a))
    # exp / sum-normalization below is a manual softmax over the time axis
    at = K.exp(et)
    at_sum = K.sum(at, axis=1)
    at_sum_repeated = K.repeat(at_sum, self.timesteps)
    at /= at_sum_repeated  # vector of size (batchsize, timesteps, 1)

    # calculate the context vector
    context = K.squeeze(K.batch_dot(at, self.x_seq, axes=1), axis=1)

    # ~~~> calculate new hidden state
    # first calculate the "r" gate:
    rt = activations.sigmoid(
        K.dot(ytm, self.W_r) + K.dot(stm, self.U_r) +
        K.dot(context, self.C_r) + self.b_r)

    # now calculate the "z" gate
    zt = activations.sigmoid(
        K.dot(ytm, self.W_z) + K.dot(stm, self.U_z) +
        K.dot(context, self.C_z) + self.b_z)

    # calculate the proposal hidden state:
    s_tp = activations.tanh(
        K.dot(ytm, self.W_p) + K.dot((rt * stm), self.U_p) +
        K.dot(context, self.C_p) + self.b_p)

    # new hidden state:
    st = (1 - zt) * stm + zt * s_tp

    yt = activations.softmax(
        K.dot(ytm, self.W_o) + K.dot(stm, self.U_o) +
        K.dot(context, self.C_o) + self.b_o)

    if self.return_probabilities:
        return at, [yt, st]
    return yt, [yt, st]

def read(self, keys, scale=None):
    """Read from memory.

    Read the memory for the given keys. For each key we get one result as
    `r = sum_i M[i] a[i]`, where `M[i]` is the memory content at location i
    and `a[i]` is the attention weight for the key at location i. `a` is
    calculated as the softmax of a scaled similarity between the key and each
    memory content: `a[i] = exp(scale * sim[i]) / sum_j exp(scale * sim[j])`.

    Args:
        keys (Tensor): shape[-1] is dim. For a single-key read, the shape is
            (batch_size, dim). For a multi-key read, the shape is
            (batch_size, k, dim), where k is the number of keys.
        scale (None|float|Tensor): shape is () or keys.shape[:-1]. The cosine
            similarities are multiplied by `scale` before softmax is applied.
            If None, use the scale provided at the constructor.

    Returns:
        result Tensor: same shape as keys. result[..., i] is the read result
            for the corresponding key.
    """
    if not self._built:
        self.build(keys.shape[0])
    assert 2 <= len(keys.shape) <= 3
    assert keys.shape[0] == self._batch_size
    assert keys.shape[-1] == self.dim
    if scale is None:
        scale = self._scale
    elif not isinstance(scale, (int, float)):
        # assuming it's a Tensor
        scale = expand_dims_as(scale, keys)
    sim = layers.dot([keys, self._memory], axes=-1, normalize=self._normalize)
    sim = sim * scale
    attention = activations.softmax(sim)
    result = layers.dot([attention, self._memory], axes=(-1, 1))

    # Accumulate per-slot usage statistics
    if len(sim.shape) > 2:  # multiple read keys
        usage = tf.reduce_sum(attention, axis=tf.range(1, len(sim.shape) - 1))
    else:
        usage = attention
    if self._snapshot_only:
        self._usage.assign_add(usage)
    else:
        self._usage = self._usage + usage
    return result

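# Standalone sketch of the read equation `r = sum_i M[i] a[i]` (an
# illustrative addition with made-up shapes; it bypasses the memory class
# above and assumes `tf` is imported):
memory = tf.random.normal((2, 5, 8))     # (batch, slots, dim)
read_keys = tf.random.normal((2, 3, 8))  # (batch, k, dim): multi-key read
sim_demo = tf.matmul(read_keys, memory, transpose_b=True)  # (2, 3, 5)
attn_demo = tf.nn.softmax(sim_demo)      # one weight per slot, per key
reads = tf.matmul(attn_demo, memory)     # (2, 3, 8), same shape as read_keys
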
def squeeze_net_complex_skip(shape):
    inp = Input(shape=shape)
    conv1 = Conv2D(filters=96, kernel_size=(7, 7), strides=(2, 2),
                   activation='relu', padding='same')(inp)
    pool1 = MaxPooling2D(pool_size=(3, 3), strides=(2, 2))(conv1)
    skip_conv1 = Conv2D(filters=128, kernel_size=(1, 1), strides=(1, 1),
                        activation='relu')(pool1)
    fire1 = fire_module(16, 64, 64)(pool1)
    complex_add1 = Add()([fire1, skip_conv1])
    fire2 = fire_module(16, 64, 64)(complex_add1)
    add1 = Add()([complex_add1, fire2])
    fire3 = fire_module(32, 128, 128)(add1)
    skip_conv2 = Conv2D(filters=256, kernel_size=(1, 1), strides=(1, 1),
                        activation='relu')(add1)
    complex_add2 = Add()([skip_conv2, fire3])
    pool2 = MaxPooling2D(pool_size=(3, 3), strides=(2, 2))(complex_add2)
    fire4 = fire_module(32, 128, 128)(pool2)
    add2 = Add()([pool2, fire4])
    skip_conv3 = Conv2D(filters=384, kernel_size=(1, 1), strides=(1, 1),
                        activation='relu')(add2)
    fire5 = fire_module(48, 192, 192)(add2)
    complex_add3 = Add()([skip_conv3, fire5])
    fire6 = fire_module(48, 192, 192)(complex_add3)
    add3 = Add()([complex_add3, fire6])
    fire7 = fire_module(64, 256, 256)(add3)
    skip_conv4 = Conv2D(filters=512, kernel_size=(1, 1), strides=(1, 1),
                        activation='relu')(add3)
    complex_add4 = Add()([skip_conv4, fire7])
    pool3 = MaxPooling2D(pool_size=(3, 3), strides=(2, 2))(complex_add4)
    fire8 = fire_module(64, 256, 256)(pool3)
    add4 = Add()([pool3, fire8])
    drop1 = Dropout(.5)(add4)
    # conv2 should have the same number of filters as the desired number of outputs
    conv2 = Conv2D(filters=1000, kernel_size=(1, 1), strides=(1, 1),
                   activation='relu')(drop1)
    pool4 = GlobalAveragePooling2D()(conv2)
    # the activation function should be changed to switch to regression
    out = softmax(pool4)
    model = Model(inputs=inp, outputs=[out])
    return model

def call(self, hidden, timesteps):
    hidden_transformed = self.transform_hidden(hidden)
    hidden_repeated = K.repeat(hidden_transformed, timesteps)
    input_seq_transformed = self._input_seq_shaped
    alignment_score = self.calculate_alignment(hidden_repeated,
                                               input_seq_transformed)
    # Normalize the alignment scores over the time axis
    score_vector = softmax(alignment_score, 1)
    context_vector = K.sum(score_vector * self.input_seq, 1)
    return context_vector

def update_surrogate(self, trainable_variables, idx):
    with tf.GradientTape() as tape:
        adj = self.get_perturbed_adj()
        adj_norm = normalize_adj_tensor(adj)
        logit = self.surrogate([self.tf_x, adj_norm, idx])
        logit = softmax(logit)
        loss = self.compute_loss(logit)
    gradients = tape.gradient(loss, trainable_variables)
    self.optimizer.apply_gradients(zip(gradients, trainable_variables))

def head_no_fc(x):
    """No fully connected layers; gains roughly +2.5 in val_place_acc.
    Without the FC layers we curb overfitting a bit more.
    """
    x = block_no_activation(x, k=1, n_c=259, s=1, padding='same')
    x = GlobalAveragePooling2D()(x)
    x = Reshape((7, 37, 1))(x)
    return Lambda(lambda x: softmax(x, axis=-2))(x)

def call(self, encoder_outputs, decoder_outputs, mask=None):
    w1_e = self.W1(encoder_outputs)
    w2_d = self.W2(decoder_outputs)
    tanh_output = activations.tanh(w1_e + w2_d)
    v_dot_tanh = self.V(tanh_output)
    if mask is not None:
        # Push masked positions toward -inf before the softmax; the additive
        # constant must be large (-1e9), otherwise the mask has no effect
        v_dot_tanh += (mask * -1e9)
    attention_weights = activations.softmax(v_dot_tanh, axis=1)
    att_shape = tf.shape(attention_weights)
    return tf.reshape(attention_weights, (att_shape[0], att_shape[1]))

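# Why the mask constant must be large (an illustrative addition; assumes `tf`
# is imported): adding -1e9 drives a position's softmax weight to ~0, while
# a tiny constant like -1e-9 would leave it essentially unchanged.
demo_scores = tf.constant([[2.0, 1.0, 3.0]])
demo_mask = tf.constant([[0.0, 1.0, 0.0]])  # mask out the middle position
masked = demo_scores + demo_mask * -1e9
print(tf.nn.softmax(masked).numpy())  # middle weight ~ 0.0
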
def predict_step_on_batch(self, x, out_weight=None, return_logits=True,
                          device="CPU"):
    with tf.device(device):
        out = self(x, training=False)
        out = gather(out, out_weight)
        if not return_logits:
            out = softmax(out)
    return out

def compute_gradients(self, idx):
    with tf.GradientTape() as tape:
        tape.watch(self.adj_changes)
        adj = self.get_perturbed_adj()
        adj_norm = normalize_adj_tensor(adj)
        logit = self.surrogate([self.tf_x, adj_norm, idx])
        logit = softmax(logit)
        loss = self.compute_loss(logit)
    gradients = tape.gradient(loss, self.adj_changes)
    return gradients