# Common imports assumed by the snippets below; the originals may use the
# standalone `keras` backend instead of `tf.keras`.
import collections

import numpy as np
import tensorflow as tf
from tensorflow.keras import backend as K
from tensorflow.keras.layers import Lambda


def shift(shape, stride, anchors):
    """Produce shifted anchors based on shape of the map and stride size.

    Args:
        shape: Shape to shift the anchors over.
        stride: Stride to shift the anchors with over the shape.
        anchors: The anchors to apply at each location.

    Returns:
        Shifted anchors.
    """
    shift_x = (K.arange(0, shape[1], dtype=K.floatx()) +
               K.constant(0.5, dtype=K.floatx())) * stride
    shift_y = (K.arange(0, shape[0], dtype=K.floatx()) +
               K.constant(0.5, dtype=K.floatx())) * stride

    shift_x, shift_y = tf.meshgrid(shift_x, shift_y)
    shift_x = K.reshape(shift_x, [-1])
    shift_y = K.reshape(shift_y, [-1])

    shifts = K.stack([shift_x, shift_y, shift_x, shift_y], axis=0)
    shifts = K.transpose(shifts)

    number_of_anchors = K.shape(anchors)[0]
    k = K.shape(shifts)[0]  # number of base points = feat_h * feat_w

    shifts = K.cast(K.reshape(shifts, [k, 1, 4]), K.floatx())
    shifted_anchors = K.reshape(anchors, [1, number_of_anchors, 4]) + shifts
    shifted_anchors = K.reshape(shifted_anchors, [k * number_of_anchors, 4])

    return shifted_anchors
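# Usage sketch for shift() (assumptions: a stride-16 feature map of spatial
# size 38 x 50 and three centred base anchors; the values are illustrative
# only, not taken from any particular model).
def _example_shift_usage():
    base_anchors = K.constant([[-16., -16., 16., 16.],
                               [-32., -32., 32., 32.],
                               [-64., -64., 64., 64.]])
    # One box per base anchor per feature-map cell:
    # shape (38 * 50 * 3, 4) in (x1, y1, x2, y2) image coordinates.
    return shift((38, 50), stride=16, anchors=base_anchors)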
def call(self, inputs):
    input_shape = self.in_shape
    if self.data_format == 'channels_first':
        x = K.arange(0, input_shape[1], dtype=K.floatx())
        y = K.arange(0, input_shape[2], dtype=K.floatx())
    else:
        x = K.arange(0, input_shape[0], dtype=K.floatx())
        y = K.arange(0, input_shape[1], dtype=K.floatx())

    x = x / K.max(x)
    y = y / K.max(y)

    loc_x, loc_y = tf.meshgrid(x, y, indexing='ij')

    if self.data_format == 'channels_first':
        loc = K.stack([loc_x, loc_y], axis=0)
    else:
        loc = K.stack([loc_x, loc_y], axis=-1)

    location = K.expand_dims(loc, axis=0)
    if self.data_format == 'channels_first':
        location = K.permute_dimensions(location, pattern=[0, 2, 3, 1])

    location = tf.tile(location, [K.shape(inputs)[0], 1, 1, 1])

    if self.data_format == 'channels_first':
        location = K.permute_dimensions(location, pattern=[0, 3, 1, 2])

    return location
def yolo_head(feats, anchors, num_classes, input_shape, calc_loss=False):
    """Convert final layer features to bounding box parameters."""
    num_anchors = len(anchors)
    # Reshape to batch, height, width, num_anchors, box_params.
    anchors_tensor = K.reshape(K.constant(anchors), [1, 1, 1, num_anchors, 2])

    grid_shape = K.shape(feats)[1:3]  # height, width
    grid_y = K.tile(K.reshape(K.arange(0, stop=grid_shape[0]), [-1, 1, 1, 1]),
                    [1, grid_shape[1], 1, 1])
    grid_x = K.tile(K.reshape(K.arange(0, stop=grid_shape[1]), [1, -1, 1, 1]),
                    [grid_shape[0], 1, 1, 1])
    grid = K.concatenate([grid_x, grid_y])
    grid = K.cast(grid, K.dtype(feats))

    feats = K.reshape(
        feats, [-1, grid_shape[0], grid_shape[1], num_anchors, num_classes + 5])

    # Adjust predictions to each spatial grid point and anchor size.
    box_xy = (K.sigmoid(feats[..., :2]) + grid) / K.cast(
        grid_shape[::-1], K.dtype(feats))
    box_wh = K.exp(feats[..., 2:4]) * anchors_tensor / K.cast(
        input_shape[::-1], K.dtype(feats))
    box_confidence = K.sigmoid(feats[..., 4:5])
    box_class_probs = K.sigmoid(feats[..., 5:])

    if calc_loss:
        return grid, feats, box_xy, box_wh
    return box_xy, box_wh, box_confidence, box_class_probs
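# Usage sketch for the yolo_head() variant above (YOLOv3-style assumptions:
# three per-scale anchors in pixels, a 416 x 416 input image and 80 classes;
# `feats` would be the raw (batch, 13, 13, 3 * 85) output of one detection
# scale, and the anchor values are illustrative only).
def _example_yolo_head_v3(feats):
    anchors = np.array([[116., 90.], [156., 198.], [373., 326.]])
    input_shape = K.constant([416., 416.])
    # box_xy and box_wh come back normalised to [0, 1] relative to the image.
    return yolo_head(feats, anchors, num_classes=80, input_shape=input_shape)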
def positional_signal(hidden_size: int, length: int,
                      min_timescale: float = 1.0, max_timescale: float = 1e4):
    """Helper function constructing basic positional encoding.

    The code is partially based on the implementation from the Tensor2Tensor
    library:
    https://github.com/tensorflow/tensor2tensor/blob/master/tensor2tensor/layers/common_attention.py
    """
    if hidden_size % 2 != 0:
        raise ValueError(
            f"The hidden dimension of the model must be divisible by 2. "
            f"Currently it is {hidden_size}")
    position = K.arange(0, length, dtype=K.floatx())
    num_timescales = hidden_size // 2
    log_timescale_increment = K.constant(
        (np.log(float(max_timescale) / float(min_timescale)) /
         (num_timescales - 1)),
        dtype=K.floatx())
    inv_timescales = (min_timescale *
                      K.exp(K.arange(num_timescales, dtype=K.floatx()) *
                            -log_timescale_increment))
    scaled_time = K.expand_dims(position, 1) * K.expand_dims(inv_timescales, 0)
    signal = K.concatenate([K.sin(scaled_time), K.cos(scaled_time)], axis=1)
    return K.expand_dims(signal, axis=0)
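# Usage sketch for positional_signal() (hypothetical sizes): the returned
# (1, length, hidden_size) tensor broadcasts over the batch dimension, so it
# can be added directly to a (batch, length, hidden_size) embedding tensor.
def _example_positional_signal_usage(embeddings):
    # embeddings: (batch, 128, 256); signal: (1, 128, 256)
    signal = positional_signal(hidden_size=256, length=128)
    return embeddings + signal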
def call(self, inputs, mask=None, training=None):
    inputs, relatives, memories, bias_context, bias_relative = inputs
    full = K.concatenate([memories, inputs], axis=1)  # (batch, prev_len + seq_len, units)
    w_q = K.dot(inputs, self.kernel_q)                # (batch, seq_len, units)
    w_kv = K.dot(full, self.kernel_kv)                # (batch, prev_len + seq_len, units * 2)
    w_r = K.dot(relatives, self.kernel_r)             # (batch, prev_len + seq_len, units)
    if self.use_bias:
        w_q = K.bias_add(w_q, self.bias_q)
        w_kv = K.bias_add(w_kv, self.bias_kv)
        w_r = K.bias_add(w_r, self.bias_r)
    if self.activation is not None:
        w_q = self.activation(w_q)
        w_kv = self.activation(w_kv)
        w_r = self.activation(w_r)

    w_k = w_kv[:, :, :self.units]                     # (batch, prev_len + seq_len, units)
    w_v = w_kv[:, :, self.units:]                     # (batch, prev_len + seq_len, units)

    w_qc = K.bias_add(w_q, bias_context)
    w_qc = self._reshape_to_batches(w_qc)             # (batch * n_head, seq_len, units_head)
    w_k = self._reshape_to_batches(w_k)               # (batch * n_head, prev_len + seq_len, units_head)
    a_context = K.batch_dot(w_qc, w_k, axes=2)        # (batch * n_head, seq_len, prev_len + seq_len)

    w_qr = K.bias_add(w_q, bias_relative)
    w_qr = self._reshape_to_batches(w_qr)             # (batch * n_head, seq_len, units_head)
    w_r = self._reshape_to_batches(w_r)               # (batch * n_head, prev_len + seq_len, units_head)
    a_relative = K.batch_dot(w_qr, w_r, axes=2)       # (batch * n_head, seq_len, prev_len + seq_len)
    a_relative = self._relative_shift(a_relative)     # (batch * n_head, seq_len, prev_len + seq_len)

    att = (a_context + a_relative) / K.sqrt(K.constant(self.units_head, dtype=K.floatx()))
    exp = K.exp(att - K.max(att, axis=-1, keepdims=True))

    q_len, k_len = K.shape(w_q)[1], K.shape(w_k)[1]
    indices = K.expand_dims(K.arange(0, k_len), axis=0)
    upper = K.expand_dims(K.arange(k_len - q_len, k_len), axis=-1)
    exp *= K.expand_dims(K.cast(indices <= upper, K.floatx()), axis=0)
    if mask is not None and mask[0] is not None:
        mask = K.cast(mask[0], K.floatx())
        mask = K.concatenate([K.ones_like(memories[:, :, 0]), mask], axis=1)
        exp *= K.expand_dims(self._reshape_mask(mask), axis=1)

    att = exp / K.sum(exp, axis=-1, keepdims=True)
    if self.att_drop_layer is not None:
        att = self.att_drop_layer(att, training=training)
    w_v = self._reshape_to_batches(w_v)               # (batch * n_head, prev_len + seq_len, units_head)
    w_o = K.batch_dot(att, w_v)                       # (batch * n_head, seq_len, units_head)

    w_o = self._reshape_from_batches(w_o)             # (batch, seq_len, units)
    w_o = K.dot(w_o, self.kernel_o)                   # (batch, seq_len, units)
    if self.use_bias:
        w_o = K.bias_add(w_o, self.bias_o)
    if self.activation is not None:
        w_o = self.activation(w_o)

    # Add shape information to the tensor when using `tf.keras`.
    input_shape = K.int_shape(inputs)
    if input_shape[1] is not None:
        w_o = K.reshape(w_o, (-1,) + input_shape[1:])
    return w_o
def call(self, inputs, **kwargs):
    length = K.shape(inputs[0])[1] + K.shape(inputs[1])[1]
    inputs = K.tile(
        K.expand_dims(K.arange(length - 1, -1, -1, dtype=K.floatx()), axis=0),
        [K.shape(inputs[0])[0], 1],
    )
    if self.clamp_len is not None:
        inputs = K.clip(inputs, min_value=0, max_value=self.clamp_len)
    inputs = K.expand_dims(inputs, axis=-1)
    output_dim = K.cast(self.output_dim, K.floatx())
    ranges = K.expand_dims(K.arange(0.0, self.output_dim, 2.0), axis=0) / output_dim
    inverse = 1.0 / K.pow(10000.0, ranges)
    positions = inputs * inverse
    return K.concatenate([K.sin(positions), K.cos(positions)], axis=-1)
def yolo_parse_output(yolo_output=None, anchors=None, num_classes=7, input_shape=None):
    anchor_reshape = tf.reshape(anchors, shape=(1, 1, 1, tf.shape(anchors)[0], 2))
    anchor_reshape = tf.cast(anchor_reshape, dtype=K.dtype(yolo_output))

    output_shape = tf.shape(yolo_output)
    height_index = K.arange(0, stop=output_shape[1])
    width_index = K.arange(0, stop=output_shape[2])
    tmp1, tmp2 = tf.meshgrid(height_index, width_index)
    conv_index = tf.reshape(tf.concat([tmp1, tmp2], axis=0),
                            (2, output_shape[1], output_shape[2]))
    conv_index = tf.transpose(conv_index, (1, 2, 0))
    conv_index = K.expand_dims(K.expand_dims(conv_index, 0), -2)  # shape will be (1, 13, 13, 1, 2)
    conv_index = K.cast(conv_index, K.dtype(yolo_output))

    yolo_output = tf.reshape(yolo_output,
                             shape=(-1, output_shape[1], output_shape[2],
                                    tf.shape(anchors)[0], 5 + num_classes))

    box_xy = yolo_output[..., :2]
    box_wh = yolo_output[..., 2:4]
    box_confidence = yolo_output[..., 4:5]
    box_classes = yolo_output[..., 5:]

    box_xy_sig = tf.sigmoid(box_xy)
    box_wh_coord = box_wh
    box_xy = (box_xy_sig + conv_index) / tf.cast(output_shape[1:3],
                                                 dtype=K.dtype(yolo_output))
    box_wh = tf.exp(box_wh) * anchor_reshape / tf.cast(
        input_shape, dtype=K.dtype(yolo_output))
    box_confidence = tf.sigmoid(box_confidence)
    box_classes = tf.sigmoid(box_classes)
    box_coord = K.concatenate((box_xy_sig, box_wh_coord), axis=-1)

    return box_xy, box_wh, box_confidence, box_classes, box_coord
def call(self, x, **kwargs):
    if (self.size is None) or (self.mode == 'sum'):
        self.size = int(x.shape[-1])
    batch_size, seq_len = K.shape(x)[0], K.shape(x)[1]
    position_j = 1. / K.pow(
        10000., 2 * K.arange(self.size / 2, dtype='float32') / self.size)
    position_j = K.expand_dims(position_j, 0)
    # K.arange does not support variable length, so generate the position
    # indices with a cumulative sum instead.
    position_i = K.cumsum(K.ones_like(x[:, :, 0]), 1) - 1
    position_i = K.expand_dims(position_i, 2)
    position_ij = K.dot(position_i, position_j)
    position_ij = K.concatenate(
        [K.cos(position_ij), K.sin(position_ij)], 2)
    if self.mode == 'sum':
        return position_ij + x
    elif self.mode == 'concat':
        return K.concatenate([position_ij, x], 2)
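# Worked example for the sinusoidal encoding above (assumption: size = 4).
# The frequency vector is 1 / 10000 ** (2 * [0, 1] / 4) = [1.0, 0.01], so
# position p is encoded as [cos(p), cos(0.01 * p), sin(p), sin(0.01 * p)].
def _example_sinusoid_row(p):
    freqs = np.array([1.0, 0.01])
    return np.concatenate([np.cos(p * freqs), np.sin(p * freqs)])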
def soft_min_reg(cv, axis=None, min_disp=None, max_disp=None, labels=None):
    if axis == 1:
        cv = Lambda(lambda x: K.squeeze(x, axis=-1))(cv)
    disp_map = K.reshape(
        K.arange(min_disp, max_disp - 0.000001,
                 (max_disp - min_disp) / labels, dtype="float32"),
        (1, 1, labels, 1))
    if axis == 1:
        output = K.conv2d(cv, disp_map, strides=(1, 1), padding='valid',
                          data_format="channels_first")
        x = K.expand_dims(K.squeeze(output, axis=1), axis=-1)
    else:
        x = K.conv2d(cv, disp_map, strides=(1, 1), padding='valid')
    return x
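# Usage sketch for soft_min_reg() (hypothetical shapes): given a softmaxed
# cost volume of shape (batch, H, W, labels) with labels = 64 candidate
# disparities in [0, 64), the 1x1 convolution with the fixed disp_map kernel
# computes the probability-weighted average disparity per pixel, i.e. a
# soft-argmin style regression.
def _example_soft_min_reg(prob_volume):
    # prob_volume: (batch, H, W, 64) -> (batch, H, W, 1)
    return soft_min_reg(prob_volume, axis=None,
                        min_disp=0.0, max_disp=64.0, labels=64)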
def call(self, x, mask=None):
    if (self.size is None) or (self.mode == 'sum'):
        self.size = int(x.shape[-1])
    position_j = 1. / K.pow(
        10000., 2 * K.arange(self.size / 2, dtype='float32') / self.size)
    position_j = K.expand_dims(position_j, 0)
    position_i = tf.cumsum(K.ones_like(x[:, :, 0]), 1) - 1
    position_i = K.expand_dims(position_i, 2)
    position_ij = K.dot(position_i, position_j)
    outputs = K.concatenate([K.cos(position_ij), K.sin(position_ij)], 2)
    if self.mode == 'sum':
        if self.scale:
            outputs = outputs * self.size ** 0.5
        return x + outputs
    elif self.mode == 'concat':
        return K.concatenate([outputs, x], 2)
def _build_weights(self, input_shape):
    input_dim = input_shape[-1]
    d = collections.OrderedDict()
    d["input_kernel"] = self.add_weight(
        shape=(input_dim, self.units),
        name='input_kernel',
        initializer=self.kernel_initializer,
        regularizer=self.kernel_regularizer,
        constraint=self.kernel_constraint)
    d["attention_kernel"] = self.add_weight(
        shape=(self.units, self.units * 3),
        name='attention_kernel',
        initializer=self.attention_initializer,
        regularizer=self.attention_regularizer,
        constraint=self.attention_constraint)
    d["mlp_kernel"] = self.add_weight(
        shape=(self.units, self.units * 2),
        name='mlp_kernel',
        initializer=self.mlp_initializer,
        regularizer=self.mlp_regularizer,
        constraint=self.mlp_constraint)
    d["input_bias"] = self.add_weight(
        shape=(self.units,),
        name='input_bias',
        initializer=self.bias_initializer,
        regularizer=self.bias_regularizer,
        constraint=self.bias_constraint)
    d["attention_bias"] = self.add_weight(
        shape=(self.units * 3,),
        name='attention_bias',
        initializer=self.bias_initializer,
        regularizer=self.bias_regularizer,
        constraint=self.bias_constraint)
    d["mlp_bias"] = self.add_weight(
        shape=(self.units * 2,),
        name='mlp_bias',
        initializer=self.bias_initializer,
        regularizer=self.bias_regularizer,
        constraint=self.bias_constraint)
    d["layer_norm_gamma"] = self.add_weight(
        shape=(1, self.units * 2),
        name='layer_norm_gamma',
        initializer=self.kernel_initializer)
    d["layer_norm_beta"] = self.add_weight(
        shape=(self.units * 2,),
        name='layer_norm_beta',
        initializer=self.bias_initializer)
    if self.use_relative_position:
        d["rel_kernel"] = self.add_weight(
            shape=(self.units, self.units),
            name='rel_kernel',
            initializer=self.attention_initializer,
            regularizer=self.attention_regularizer,
            constraint=self.attention_constraint)
        i = tf.range(0, self.units, dtype=tf.float32)
        d2 = tf.floormod(i, 2)
        i2 = i - d2
        for j in range(2):
            i2 = K.expand_dims(i2, axis=0)
        i2 = tf.pow(1e+4, i2 / self.units)
        d["d2"] = d2
        d["i2"] = i2
        d["range"] = K.expand_dims(
            K.arange(0, self.num_memory_slots, dtype=tf.float32), axis=0)
    return d
def yolo_head(feats, anchors, num_classes):
    """Convert final layer features to bounding box parameters.

    Parameters
    ----------
    feats : tf.Tensor
        Final convolutional layer features.
    anchors : np.array, list
        Anchor box widths and heights.
    num_classes : int
        Number of target classes.

    Returns
    -------
    box_xy : tf.Tensor
        (x, y) box predictions adjusted by spatial location in conv layer.
    box_wh : tf.Tensor
        (w, h) box predictions adjusted by anchors and conv spatial resolution.
    box_conf : tf.Tensor
        Probability estimate for whether each box contains any object.
    box_class_pred : tf.Tensor
        Probability distribution estimate for each box over class labels.
    """
    num_anchors = len(anchors)
    # Reshape to batch, height, width, num_anchors, box_params.
    anchors_tensor = K.reshape(K.variable(anchors), [1, 1, 1, num_anchors, 2])

    # Static implementation for fixed models.
    # TODO: Remove or add option for static implementation.
    # _, conv_height, conv_width, _ = K.int_shape(feats)
    # conv_dims = K.variable([conv_width, conv_height])

    # Dynamic implementation of conv dims for fully convolutional model.
    conv_dims = K.shape(feats)[1:3]  # assuming channels last
    # In YOLO the height index is the inner most iteration.
    conv_height_index = K.arange(0, stop=conv_dims[0])
    conv_width_index = K.arange(0, stop=conv_dims[1])
    conv_height_index = K.tile(conv_height_index, [conv_dims[1]])

    # TODO: Repeat_elements and tf.split don't support dynamic splits.
    # conv_width_index = K.repeat_elements(conv_width_index, conv_dims[1], axis=0)
    conv_width_index = K.tile(K.expand_dims(conv_width_index, 0),
                              [conv_dims[0], 1])
    conv_width_index = K.flatten(K.transpose(conv_width_index))
    conv_index = K.transpose(K.stack([conv_height_index, conv_width_index]))
    conv_index = K.reshape(conv_index, [1, conv_dims[0], conv_dims[1], 1, 2])
    conv_index = K.cast(conv_index, K.dtype(feats))

    feats = K.reshape(
        feats, [-1, conv_dims[0], conv_dims[1], num_anchors, num_classes + 5])
    conv_dims = K.cast(K.reshape(conv_dims, [1, 1, 1, 1, 2]), K.dtype(feats))

    # Static generation of conv_index:
    # conv_index = np.array([_ for _ in np.ndindex(conv_width, conv_height)])
    # conv_index = conv_index[:, [1, 0]]  # swap columns for YOLO ordering.
    # conv_index = K.variable(
    #     conv_index.reshape(1, conv_height, conv_width, 1, 2))
    # feats = Reshape(
    #     (conv_dims[0], conv_dims[1], num_anchors, num_classes + 5))(feats)

    box_confidence = K.sigmoid(feats[..., 4:5])
    box_xy = K.sigmoid(feats[..., :2])
    box_wh = K.exp(feats[..., 2:4])
    box_class_probs = K.softmax(feats[..., 5:])

    # Adjust predictions to each spatial grid point and anchor size.
    # Note: YOLO iterates over height index before width index.
    box_xy = (box_xy + conv_index) / conv_dims
    box_wh = box_wh * anchors_tensor / conv_dims

    return box_confidence, box_xy, box_wh, box_class_probs
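# Usage sketch for this yolo_head() variant (YOLOv2-style assumptions: five
# anchors expressed in grid-cell units, 80 classes, and a network output of
# shape (batch, 13, 13, 5 * 85); the anchor values are illustrative only).
def _example_yolo_head_v2(model_output):
    anchors = np.array([[0.57, 0.68], [1.87, 2.06], [3.34, 5.47],
                        [7.88, 3.53], [9.77, 9.17]])
    # Returns confidence (..., 1), xy (..., 2), wh (..., 2) and class
    # probabilities (..., 80), each of shape (batch, 13, 13, 5, channels).
    return yolo_head(model_output, anchors, num_classes=80)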