def _calculate_features(self, xy, wh, objectiveness, classes, anchors):
    """Decode raw detection-head outputs into box centres, sizes and scores.

    Args:
        xy: Raw centre predictions; a sigmoid is applied and the per-cell
            grid index is added before normalising by the grid dimensions.
        wh: Raw size predictions; exponentiated, scaled by the anchors and
            divided by ``self.input_image_dims``.
        objectiveness: Raw objectness logits (sigmoid applied).
        classes: Raw class logits (sigmoid applied).
        anchors: Sequence of anchor pairs, reshaped to (1, 1, 1, len, 2).

    Returns:
        Tuple ``(box_xy, box_wh, obj_sig, class_sig)``.
    """
    # Dims 1 and 2 of ``xy`` (annotated "width, height" by the original author).
    grid_dims = K.shape(xy)[1:3]
    dim0, dim1 = grid_dims[0], grid_dims[1]
    num_anchors = len(anchors)

    centre_offsets = K.sigmoid(xy)

    # TODO rethink logic here, grid needs to be calculated just once after
    # model initialization
    col_idx = K.tile(K.arange(0, dim0), grid_dims[0:1])
    col_idx = K.reshape(col_idx, (-1, dim0))
    row_idx = K.tile(K.arange(0, dim1), grid_dims[1:2])
    row_idx = K.transpose(K.reshape(row_idx, (-1, dim1)))

    # Give both index maps a per-anchor axis, then pair them on the last axis.
    cell_shape = (dim0, dim1, 1, 1)
    col_idx = K.repeat_elements(
        K.reshape(col_idx, cell_shape), rep=num_anchors, axis=-2)
    row_idx = K.repeat_elements(
        K.reshape(row_idx, cell_shape), rep=num_anchors, axis=-2)
    cell_grid = K.concatenate((col_idx, row_idx), axis=-1)

    # TODO same thing for the anchors
    anchors_tensor = K.reshape(K.constant(anchors), [1, 1, 1, num_anchors, 2])

    box_xy = (
        centre_offsets + K.cast(cell_grid, K.dtype(centre_offsets))
    ) / (dim0, dim1)
    box_wh = (
        K.exp(wh) * anchors_tensor
        / K.cast(self.input_image_dims, K.dtype(wh))
    )
    return box_xy, box_wh, K.sigmoid(objectiveness), K.sigmoid(classes)
def compute_position_ids(self, inputs):
    """Bucket relative positions the T5 way (translated from the official
    T5 source code).

    Args:
        inputs: Pair ``(q, v)``; only the length of axis 1 of each is used.

    Returns:
        int32 tensor of bucket ids with one row per query position and one
        column per value position.
    """
    q, v = inputs

    # Relative offset of every value position from every query position.
    query_pos = K.expand_dims(K.arange(0, K.shape(q)[1], dtype='int32'), 1)
    value_pos = K.expand_dims(K.arange(0, K.shape(v)[1], dtype='int32'), 0)
    pos_ids = value_pos - query_pos

    # Post-processing: map offsets to bucket ids.
    num_buckets, max_distance = self.input_dim, self.max_distance
    bucket_base = 0
    distance = -pos_ids
    if self.bidirectional:
        # Half of the buckets is reserved for each direction.
        num_buckets //= 2
        bucket_base += K.cast(K.less(distance, 0), 'int32') * num_buckets
        distance = K.abs(distance)
    else:
        distance = K.maximum(distance, 0)
    # now distance is in the range [0, inf)

    # Small distances get one bucket each; larger ones share log-spaced buckets.
    max_exact = num_buckets // 2
    is_small = K.less(distance, max_exact)
    log_ratio = (
        K.log(K.cast(distance, K.floatx()) / max_exact)
        / np.log(max_distance / max_exact)
    )
    large_bucket = max_exact + K.cast(
        log_ratio * (num_buckets - max_exact), 'int32')
    large_bucket = K.minimum(large_bucket, num_buckets - 1)

    bucket_base += K.switch(is_small, distance, large_bucket)
    return bucket_base
def yolo_head(feats, anchors, num_classes, input_shape, calc_loss=False):
    """Turn raw head features into normalised box parameters and scores.

    Args:
        feats: Raw head output; reshaped here to
            (-1, grid_h, grid_w, num_anchors, num_classes + 5).
        anchors: Sequence of ``num_anchors`` anchor size pairs.
        num_classes: Number of class scores per anchor.
        input_shape: Network input size; reversed before scaling ``box_wh``.
        calc_loss: If equal to ``True``, return the loss-side tensors.

    Returns:
        ``(grid, feats, box_xy, box_wh)`` when ``calc_loss == True``,
        otherwise ``(box_xy, box_wh, box_confidence, box_class_probs)``.
    """
    num_anchors = len(anchors)
    anchors_tensor = K.reshape(K.constant(anchors), [1, 1, 1, num_anchors, 2])

    grid_shape = K.shape(feats)[1:3]
    grid_h, grid_w = grid_shape[0], grid_shape[1]

    # Per-cell (x, y) index grid of shape (grid_h, grid_w, 1, 2).
    row_ids = K.reshape(K.arange(0, stop=grid_h), [-1, 1, 1, 1])
    col_ids = K.reshape(K.arange(0, stop=grid_w), [1, -1, 1, 1])
    grid = K.concatenate([
        K.tile(col_ids, [grid_h, 1, 1, 1]),
        K.tile(row_ids, [1, grid_w, 1, 1]),
    ])
    grid = K.cast(grid, K.dtype(feats))

    feats = K.reshape(
        feats, [-1, grid_h, grid_w, num_anchors, num_classes + 5])
    feat_dtype = K.dtype(feats)

    box_xy = (K.sigmoid(feats[..., :2]) + grid) / K.cast(
        grid_shape[::-1], feat_dtype)
    box_wh = K.exp(feats[..., 2:4]) * anchors_tensor / K.cast(
        input_shape[::-1], feat_dtype)
    box_confidence = K.sigmoid(feats[..., 4:5])
    box_class_probs = K.sigmoid(feats[..., 5:])

    # Equality test kept on purpose: only values equal to True select
    # the loss outputs.
    if calc_loss == True:
        return grid, feats, box_xy, box_wh
    return box_xy, box_wh, box_confidence, box_class_probs
def shift(shape, stride, anchors):
    """Replicate a set of base anchors at every location of a feature map.

    Args:
        shape: Shape of the map to shift over; ``shape[0]`` rows and
            ``shape[1]`` columns are used.
        stride: Step between neighbouring locations; each centre sits at
            ``(index + 0.5) * stride``.
        anchors: Base anchors, reshaped here to (1, number_of_anchors, 4).

    Returns:
        Backend tensor of shape (locations * number_of_anchors, 4) holding
        every anchor offset by every location centre.
    """
    half_cell = K.constant(0.5, dtype=K.floatx())
    centres_x = (K.arange(0, shape[1], dtype=K.floatx()) + half_cell) * stride
    centres_y = (K.arange(0, shape[0], dtype=K.floatx()) + half_cell) * stride

    mesh_x, mesh_y = tf.meshgrid(centres_x, centres_y)
    flat_x = K.reshape(mesh_x, [-1])
    flat_y = K.reshape(mesh_y, [-1])

    # One (x, y, x, y) row per location so it can be added to 4-value anchors.
    offsets = K.transpose(K.stack([flat_x, flat_y, flat_x, flat_y], axis=0))

    num_anchors = K.shape(anchors)[0]
    num_locations = K.shape(offsets)[0]  # feat_h * feat_w base points

    offsets = K.cast(K.reshape(offsets, [num_locations, 1, 4]), K.floatx())
    shifted = K.reshape(anchors, [1, num_anchors, 4]) + offsets
    return K.reshape(shifted, [num_locations * num_anchors, 4])
def _compute_valid_seed_region(self):
    """Select ``self.ones`` inside the border margin and ``self.zeros`` outside.

    A position counts as inside when both its row and column index lie at
    least ``block_size // 2`` away from the start of the axis and strictly
    less than ``size - block_size // 2``.

    Returns:
        The selected map with a leading and a trailing singleton axis added.
    """
    height, width = self.height, self.width
    margin = self.block_size // 2

    # Row / column index of every position, each of shape (height, width).
    row_idx = K.tile(K.expand_dims(K.arange(height), axis=1), [1, width])
    col_idx = K.tile(K.expand_dims(K.arange(width), axis=0), [height, 1])

    inside = K.all(
        K.stack(
            [
                row_idx >= margin,
                col_idx >= margin,
                row_idx < height - margin,
                col_idx < width - margin,
            ],
            axis=-1,
        ),
        axis=-1,
    )
    region = K.switch(inside, self.ones, self.zeros)
    return K.expand_dims(K.expand_dims(region, axis=0), axis=-1)
def call(self, inputs, training=None, **kwargs):
    """Build the content and query attention masks.

    Args:
        inputs: Pair ``(inputs, memory)``; axes 0 and 1 of ``inputs`` give
            batch size and sequence length, and ``memory`` supplies the
            all-ones memory part of the masks.
        training: Forwarded to ``K.in_train_phase``.

    Returns:
        ``[content_mask, query_mask]``, each with its last two axes swapped.
    """
    inputs, memory = inputs
    input_dims = K.shape(inputs)
    batch_size, seq_len = input_dims[0], input_dims[1]

    # Memory positions are always visible.
    mem_mask = K.tile(
        K.ones_like(memory[:, :, :1], dtype=K.floatx()), [1, 1, seq_len])

    # Build content mask with random permutation
    positions = K.arange(0, seq_len)
    ranges = K.tile(K.expand_dims(positions, axis=-1), [1, batch_size])
    shuffle = random_shuffle(ranges) if self.enabled else ranges

    if self.directional:
        shuffled = K.in_train_phase(shuffle, ranges, training)
    elif self.enabled:
        shuffled = K.in_train_phase(shuffle, ranges + seq_len, training)
    else:
        shuffled = ranges + seq_len

    ranges_col = K.expand_dims(K.permute_dimensions(ranges, [1, 0]), axis=-1)
    shuffled_row = K.expand_dims(
        K.permute_dimensions(shuffled, [1, 0]), axis=1)
    content_mask = K.cast(ranges_col <= shuffled_row, dtype=K.floatx())

    # Build query mask based on content mask: same mask minus the diagonal.
    eye = K.equal(
        K.expand_dims(positions, axis=0), K.expand_dims(positions, axis=-1))
    eye = K.expand_dims(K.cast(eye, dtype=K.floatx()), axis=0)
    query_mask = content_mask * (1.0 - eye)

    content_mask = K.concatenate([mem_mask, content_mask], axis=1)
    query_mask = K.concatenate([mem_mask, query_mask], axis=1)
    return [
        K.permute_dimensions(content_mask, [0, 2, 1]),
        K.permute_dimensions(query_mask, [0, 2, 1]),
    ]
def yolo_head(feats, anchors, num_classes, input_shape, calc_loss=False):
    """Convert final predictions into bounding boxes.

    Args:
        feats: Raw head output; reshaped here to
            (-1, grid_h, grid_w, num_anchors, num_classes + 5).
        anchors: Sequence of ``num_anchors`` anchor size pairs.
        num_classes: Number of class scores per anchor.
        input_shape: Network input size; reversed before scaling ``box_wh``.
        calc_loss: When truthy, return the tensors needed by the loss.

    Returns:
        ``(grid, feats, box_xy, box_wh)`` if ``calc_loss`` is truthy,
        otherwise ``(box_xy, box_wh, box_confidence, box_class_probs)``.
    """
    num_anchors = len(anchors)
    # (batch, height, width, num_anchors, box_params)
    anchor_tensor = K.reshape(K.constant(anchors), [1, 1, 1, num_anchors, 2])

    grid_shape = K.shape(feats)[1:3]  # (height, width)
    grid_y = K.tile(
        K.reshape(K.arange(0, stop=grid_shape[0]), [-1, 1, 1, 1]),
        [1, grid_shape[1], 1, 1])
    grid_x = K.tile(
        K.reshape(K.arange(0, stop=grid_shape[1]), [1, -1, 1, 1]),
        [grid_shape[0], 1, 1, 1])
    grid = K.concatenate([grid_x, grid_y])
    grid = K.cast(grid, K.dtype(feats))

    # Reshape with the dynamic ``grid_shape``: the static ``grid.shape`` is
    # None for inputs whose spatial size is only known at run time.
    feats = K.reshape(
        feats,
        [-1, grid_shape[0], grid_shape[1], num_anchors, num_classes + 5])

    box_xy = (K.sigmoid(feats[..., :2]) + grid) / K.cast(
        grid_shape[::-1], K.dtype(feats))
    box_wh = K.exp(feats[..., 2:4]) * anchor_tensor / K.cast(
        input_shape[::-1], K.dtype(feats))
    box_confidence = K.sigmoid(feats[..., 4:5])
    box_class_probs = K.sigmoid(feats[..., 5:])

    if calc_loss:
        return grid, feats, box_xy, box_wh
    return box_xy, box_wh, box_confidence, box_class_probs
def call(self, inputs, mask=None, **kwargs):
    """Apply sequence self-attention to ``inputs``.

    Emission scores come from the additive or multiplicative helper,
    depending on ``self.attention_type``. They are optionally restricted to
    a window of ``self.attention_width`` positions and adjusted by ``mask``,
    then normalised with a softmax over the last axis.

    Args:
        inputs: Input sequence tensor; axis 1 is the sequence length.
        mask: Optional mask, cast to float before use.

    Returns:
        The attended sequence ``v``, or ``[v, a]`` when
        ``self.return_attention`` is set.
    """
    input_len = K.shape(inputs)[1]

    if self.attention_type == SeqSelfAttention.ATTENTION_TYPE_ADD:
        e = self._call_additive_emission(inputs)
    elif self.attention_type == SeqSelfAttention.ATTENTION_TYPE_MUL:
        e = self._call_multiplicative_emission(inputs)

    if self.attention_activation is not None:
        e = self.attention_activation(e)

    if self.attention_width is not None:
        # Window start per position: trailing window for history_only,
        # centred window otherwise.
        if self.history_only:
            lower = K.arange(0, input_len) - (self.attention_width - 1)
        else:
            lower = K.arange(0, input_len) - self.attention_width // 2
        lower = K.expand_dims(lower, axis=-1)
        upper = lower + self.attention_width
        indices = K.expand_dims(K.arange(0, input_len), axis=0)
        in_window = (
            K.cast(lower <= indices, K.floatx())
            * K.cast(indices < upper, K.floatx())
        )
        # Push scores outside the window far down before the softmax.
        e -= 10000.0 * (1.0 - in_window)

    if mask is not None:
        mask = K.expand_dims(K.cast(mask, K.floatx()), axis=-1)
        e -= 10000.0 * (
            (1.0 - mask) * (1.0 - K.permute_dimensions(mask, (0, 2, 1))))

    # a_{t} = \text{softmax}(e_t), shifted by the row maximum for stability.
    e = K.exp(e - K.max(e, axis=-1, keepdims=True))
    a = e / K.sum(e, axis=-1, keepdims=True)

    # l_t = \sum_{t'} a_{t, t'} x_{t'}
    v = K.batch_dot(a, inputs)

    if self.attention_regularizer_weight > 0.0:
        self.add_loss(self._attention_regularizer(a))

    if self.return_attention:
        return [v, a]
    return v
def positional_signal(hidden_size: int, length: int,
                      min_timescale: float = 1.0,
                      max_timescale: float = 1e4):
    """
    Helper function, constructing basic positional encoding.
    The code is partially based on implementation from Tensor2Tensor library
    https://github.com/tensorflow/tensor2tensor/blob/master/tensor2tensor/layers/common_attention.py

    Args:
        hidden_size: Requested encoding width. The signal holds
            ``hidden_size // 2`` sine and as many cosine channels, so for
            an odd ``hidden_size`` the last axis has ``hidden_size - 1``
            entries.
        length: Number of positions.
        min_timescale: Smallest timescale of the geometric progression.
        max_timescale: Largest timescale of the geometric progression.

    Returns:
        float32 tensor of shape (1, length, 2 * (hidden_size // 2)).
    """
    # NOTE: the original author deliberately disabled the "hidden_size must
    # be divisible by 2" check, so odd sizes are accepted here.
    position = K.arange(0, length, dtype=tf.float32)
    num_timescales = hidden_size // 2

    # Guard the denominator: with a single timescale (hidden_size 2 or 3)
    # ``num_timescales - 1`` is zero and the signal would become NaN.
    log_timescale_increment = tf.constant(
        (np.log(float(max_timescale) / float(min_timescale))
         / max(num_timescales - 1, 1)),
        dtype=tf.float32)
    inv_timescales = (
        min_timescale
        * tf.exp(K.arange(num_timescales, dtype=tf.float32)
                 * -log_timescale_increment))

    # Outer product position x inverse timescale -> (length, num_timescales).
    scaled_time = (
        tf.expand_dims(position, 1) * tf.expand_dims(inv_timescales, 0))
    signal = tf.concat([tf.sin(scaled_time), tf.cos(scaled_time)], axis=1)
    return tf.expand_dims(signal, axis=0)
def call(self, inputs: tensorflow.Tensor, mask: Optional[tensorflow.Tensor] = None, **kwargs) -> tensorflow.Tensor:
    """Scaled dot-product attention.

    Args:
        inputs: Either a single tensor used as query, key and value, or a
            list ``[query, key, value]``.
        mask: Optional mask; when a list is given its second entry is used.

    Returns:
        The attended values ``v``, or ``[v, a]`` when
        ``self.return_attention`` is set.
    """
    if isinstance(inputs, list):
        query, key, value = inputs
    else:
        query = key = value = inputs
    if isinstance(mask, list):
        mask = mask[1]

    feature_dim = K.shape(query)[-1]
    e = K.batch_dot(query, key, axes=2) / K.sqrt(
        K.cast(feature_dim, dtype=K.floatx()))
    # Unnormalised softmax weights, shifted by the row maximum for stability.
    e = K.exp(e - K.max(e, axis=-1, keepdims=True))

    if self.history_only:
        # Query position i may only attend to key positions <= i. The row
        # bound must follow the query length (it used to follow the key
        # length, which only works when both lengths are equal).
        query_len, key_len = K.shape(query)[1], K.shape(key)[1]
        indices = K.expand_dims(K.arange(0, key_len), axis=0)
        upper = K.expand_dims(K.arange(0, query_len), axis=-1)
        e *= K.expand_dims(K.cast(indices <= upper, K.floatx()), axis=0)

    if mask is not None:
        e *= K.cast(K.expand_dims(mask, axis=-2), K.floatx())

    # Epsilon keeps fully masked rows from dividing by zero.
    a = e / (K.sum(e, axis=-1, keepdims=True) + K.epsilon())
    v = K.batch_dot(a, value)
    if self.return_attention:
        return [v, a]
    return v
def positional_signal(hidden_size: int, length: int,
                      min_timescale: float = 1.0,
                      max_timescale: float = 1e4):
    """
    Helper function, constructing positional encodings as described in
    "Attention is All You Need" (https://arxiv.org/abs/1706.03762)

    The implementation was taken from https://github.com/kpot/keras-transformer

    Args:
        hidden_size: Width of the encoding; must be even.
        length: Number of positions.
        min_timescale: Smallest timescale of the geometric progression.
        max_timescale: Largest timescale of the geometric progression.

    Returns:
        Tensor of shape (1, length, hidden_size) in ``K.floatx()``.

    Raises:
        ValueError: If ``hidden_size`` is odd.
    """
    if hidden_size % 2 != 0:
        raise ValueError(
            f"The hidden dimension of the model must be divisible by 2. "
            f"Currently it is {hidden_size}")
    position = K.arange(0, length, dtype=K.floatx())
    num_timescales = hidden_size // 2

    # Guard the denominator: with hidden_size == 2 there is one timescale,
    # ``num_timescales - 1`` is zero and the signal would become NaN.
    log_timescale_increment = K.constant(
        (np.log(float(max_timescale) / float(min_timescale))
         / max(num_timescales - 1, 1)),
        dtype=K.floatx())
    inv_timescales = (
        min_timescale
        * K.exp(K.arange(num_timescales, dtype=K.floatx())
                * -log_timescale_increment))

    # Outer product position x inverse timescale -> (length, num_timescales).
    scaled_time = K.expand_dims(position, 1) * K.expand_dims(inv_timescales, 0)
    signal = K.concatenate([K.sin(scaled_time), K.cos(scaled_time)], axis=1)
    return K.expand_dims(signal, axis=0)
def yolo_head(feats, anchors, num_classes, input_shape, calc_loss=False):
    """Decode head features into box parameters using a meshgrid index.

    Args:
        feats: Raw head output; reshaped here to
            (-1, conv_h, conv_w, num_anchors, num_classes + 5).
        anchors: Sequence of ``num_anchors`` anchor size pairs.
        num_classes: Number of class scores per anchor.
        input_shape: Network input size; reversed before scaling ``box_wh``.
        calc_loss: If equal to ``True``, return the loss-side tensors.

    Returns:
        ``(grid, feats, box_xy, box_wh)`` when ``calc_loss == True``,
        otherwise ``(box_xy, box_wh, box_confidence, box_class_probs)``.
    """
    num_anchors = len(anchors)
    # Reshape to batch, height, width, num_anchors, box_params.
    anchors_tensor = K.reshape(K.constant(anchors), [1, 1, 1, num_anchors, 2])

    conv_dims = K.shape(feats)[1:3]
    conv_h, conv_w = conv_dims[0], conv_dims[1]
    height_index = K.arange(0, stop=conv_h)
    width_index = K.arange(0, stop=conv_w)

    # ``meshgrid`` is a helper defined elsewhere; its two outputs are joined
    # on the last axis to form the per-cell index.
    x_axis, y_axis = meshgrid(width_index, height_index)
    grid = K.cast(K.concatenate([x_axis, y_axis]), K.dtype(feats))

    feats = K.reshape(
        feats, [-1, conv_h, conv_w, num_anchors, num_classes + 5])
    feat_dtype = K.dtype(feats)

    # Adjust predictions to each spatial grid point and anchor size.
    box_xy = (K.sigmoid(feats[..., :2]) + grid) / K.cast(
        conv_dims[::-1], feat_dtype)
    box_wh = K.exp(feats[..., 2:4]) * anchors_tensor / K.cast(
        input_shape[::-1], feat_dtype)
    box_confidence = K.sigmoid(feats[..., 4:5])
    box_class_probs = K.sigmoid(feats[..., 5:])

    # Equality test kept on purpose: only values equal to True select
    # the loss outputs.
    if calc_loss == True:
        return grid, feats, box_xy, box_wh
    return box_xy, box_wh, box_confidence, box_class_probs
def yolo_head(feats, anchors, input_shape, calc_loss=False, att_map=None):
    """Convert final layer features to bounding box parameters.

    Args:
        feats: Raw head output; reshaped here to
            (-1, grid_h, grid_w, num_anchors, 5).
        anchors: Sequence of ``num_anchors`` anchor size pairs.
        input_shape: Network input size tensor; its last axis is reversed
            before scaling ``box_wh``.
        calc_loss: If equal to ``True``, return the loss-side tensors.
        att_map: Accepted for interface compatibility but currently unused:
            blending the attention score into the confidence
            (``confidence * .8 + seg_map * .2``) is disabled.

    Returns:
        ``(grid, feats, box_xy, box_wh)`` when ``calc_loss == True``,
        otherwise ``(box_xy, box_wh, box_confidence)``.
    """
    num_anchors = len(anchors)
    # Reshape to batch, height, width, num_anchors, box_params.
    anchors_tensor = K.reshape(K.constant(anchors), [1, 1, 1, num_anchors, 2])

    grid_shape = K.shape(feats)[1:3]  # height, width
    grid_y = K.tile(
        K.reshape(K.arange(0, stop=grid_shape[0]), [-1, 1, 1, 1]),
        [1, grid_shape[1], 1, 1])
    grid_x = K.tile(
        K.reshape(K.arange(0, stop=grid_shape[1]), [1, -1, 1, 1]),
        [grid_shape[0], 1, 1, 1])
    grid = K.concatenate([grid_x, grid_y])
    grid = K.cast(grid, K.dtype(feats))

    feats = K.reshape(
        feats, [-1, grid_shape[0], grid_shape[1], num_anchors, 5])

    # Adjust predictions to each spatial grid point and anchor size.
    box_xy = (K.sigmoid(feats[..., :2]) + grid) / K.cast(
        grid_shape[..., ::-1], K.dtype(feats))
    box_wh = K.exp(feats[..., 2:4]) * anchors_tensor / K.cast(
        input_shape[..., ::-1], K.dtype(feats))

    # Both the att_map and the no-att_map path used the plain sigmoid
    # confidence; the unused tiled attention map is no longer built.
    box_confidence = K.sigmoid(feats[..., 4:5])

    # Equality test kept on purpose: only values equal to True select
    # the loss outputs.
    if calc_loss == True:
        return grid, feats, box_xy, box_wh
    return box_xy, box_wh, box_confidence
def call(self, inputs):
    """Produce a two-channel map of normalised spatial coordinates.

    For every spatial position of ``inputs`` the output holds its index
    along the first and second spatial axis, each divided by the largest
    index on that axis (so values run from 0 to 1).

    Args:
        inputs: 4-D tensor laid out according to ``self.data_format``.

    Returns:
        Tensor with the batch and spatial dims of ``inputs`` and two
        channels, placed on axis 1 for ``'channels_first'`` and on the last
        axis otherwise.
    """
    input_shape = K.shape(inputs)
    channels_first = self.data_format == 'channels_first'
    if channels_first:
        size_x, size_y = input_shape[2], input_shape[3]
    else:
        size_x, size_y = input_shape[1], input_shape[2]

    x = K.arange(0, size_x, dtype=inputs.dtype)
    y = K.arange(0, size_y, dtype=inputs.dtype)
    # A size-1 axis has maximum index 0; clamp the divisor to 1 so the
    # coordinate becomes 0 instead of 0/0 = NaN. Larger axes are unaffected.
    x = x / K.maximum(K.max(x), 1)
    y = y / K.maximum(K.max(y), 1)

    loc_x, loc_y = tf.meshgrid(x, y, indexing='ij')
    # Build channels-last first: (1, size_x, size_y, 2), then repeat per batch.
    location = K.expand_dims(K.stack([loc_x, loc_y], axis=-1), axis=0)
    location = tf.tile(location, [input_shape[0], 1, 1, 1])
    if channels_first:
        location = K.permute_dimensions(location, pattern=[0, 3, 1, 2])
    return location
def cell_offset_table(scale_size, input_size=416):
    """Build the table of per-cell offsets for a square grid.

    Each entry is a pair of cell indices multiplied by
    ``input_size / scale_size``.

    Args:
        scale_size: Number of cells along each side of the grid.
        input_size: Extent the grid is scaled to. Defaults to 416, the
            value that used to be hard-coded.

    Returns:
        float32 tensor of shape (1, scale_size, scale_size, 1, 2).
    """
    # Dynamic implementation of conv dims for fully convolutional model.
    # In YOLO the height index is the inner most iteration.
    conv_height_index = K.arange(0, stop=scale_size)
    conv_width_index = K.arange(0, stop=scale_size)

    # tile repeats the whole sequence N times.
    # Result (for scale_size 13): 0~12, 0~12, ..., 0~12
    conv_height_index = K.tile(conv_height_index, [scale_size])

    # TODO: Repeat_elements and tf.split doesn't support dynamic splits.
    # Tiling with [n, m] produces a 2-D tensor.
    # Result: [0~12], [0~12], [0~12], ...
    conv_width_index = K.tile(
        K.expand_dims(conv_width_index, 0), [scale_size, 1])
    # Result: 0, 0, ..., 0 (13 of them), 1, 1, ..., 1 (13 of them), ...
    conv_width_index = K.flatten(K.transpose(conv_width_index))

    # Result: [0, 0], [1, 0], [2, 0], ..., [11, 12], [12, 12]
    conv_index = K.transpose(K.stack([conv_height_index, conv_width_index]))

    # One [1 x 2] index item per cell of the 1 x 13 x 13 grid, holding the
    # pairs listed above; this is a step towards building the mask.
    # Resulting shape: (1, 13, 13, 1, 2)
    conv_index = K.reshape(conv_index, [1, scale_size, scale_size, 1, 2])
    conv_index = K.cast(conv_index, tf.float32)

    # Same evaluation order as the former ``1 / scale_size * 416`` so the
    # default produces exactly the previous values.
    cell_size = 1 / scale_size * input_size
    return conv_index * cell_size
def _encoder(x):
    """Run ``x`` through the transformer stack and the embedding read-out.

    Two learned index embeddings (3 entries and 10 entries) are combined
    into 30 vectors. These are passed as the first argument of a multi-head
    attention over the transformed ``x``, followed by dropout, batch
    normalisation and an optional softmax.

    Relies on names from the enclosing scope: ``d_model``, ``n_heads``,
    ``dff``, ``rate``, ``n_layers``, ``softmax``, ``transformer_layer`` and
    ``multi_head_attention``.
    """
    def _index_embedding(count):
        # Embed the constant ids 0..count-1; no gradient flows to the ids.
        ids = K.expand_dims(K.arange(count), axis=0)
        ids = K.stop_gradient(ids)
        return tf.keras.layers.Embedding(count, d_model)(ids)

    # Two Embeddings (3 for classes, 10 for degrees)
    cls = K.expand_dims(_index_embedding(3), axis=2)      # (1, 3, 1, d_model)
    direct = K.expand_dims(_index_embedding(10), axis=1)  # (1, 1, 10, d_model)
    # Broadcast sum gives every (class, degree) pair its own vector.
    embedding = tf.keras.layers.Reshape((30, d_model))(cls + direct)

    # NOTE(review): the source had lost its indentation; the loop is assumed
    # to cover only the transformer layers - confirm against the original.
    for _ in range(n_layers):
        x = transformer_layer(d_model, n_heads, dff, rate)(x)

    x = multi_head_attention(
        d_model, n_heads, perm_and_reshape=False)(embedding, x, x)
    x = tf.keras.layers.Dropout(rate)(x)
    x = tf.keras.layers.BatchNormalization()(x)
    if softmax:
        x = tf.keras.layers.Softmax()(x)
    return x
def yolo_head(feats, anchors, num_classes, input_shape, calc_loss=False):
    """Convert the raw detection output into box parameters.

    Example: (batch_size, 13, 13, 255) -> (batch_size, 13, 13, 3, 85).

    Args:
        feats: Raw head output; reshaped here to
            (-1, grid_h, grid_w, num_anchors, num_classes + 5).
        anchors: Sequence of ``num_anchors`` anchor size pairs.
        num_classes: Number of class scores per anchor.
        input_shape: Network input size; reversed before scaling ``box_wh``.
        calc_loss: If equal to ``True``, return the loss-side tensors.

    Returns:
        ``(grid, feats, box_xy, box_wh)`` when ``calc_loss == True``,
        otherwise ``(box_xy, box_wh, box_confidence, box_class_probs)``.
    """
    num_anchors = len(anchors)
    anchors_tensor = K.reshape(K.constant(anchors), [1, 1, 1, num_anchors, 2])

    grid_shape = K.shape(feats)[1:3]  # feature-map height and width
    n_rows, n_cols = grid_shape[0], grid_shape[1]

    # Coordinates of every feature-map grid point, e.g. for a (13, 13) map:
    # [[(0,0)..(0,12)]..[(12,0)..(12,12)]]
    y_column = K.reshape(K.arange(0, stop=n_rows), [-1, 1, 1, 1])
    x_row = K.reshape(K.arange(0, stop=n_cols), [1, -1, 1, 1])
    grid_y = K.tile(y_column, [1, n_cols, 1, 1])
    grid_x = K.tile(x_row, [n_rows, 1, 1, 1])
    grid = K.cast(K.concatenate([grid_x, grid_y]), K.dtype(feats))

    feats = K.reshape(
        feats, [-1, n_rows, n_cols, num_anchors, num_classes + 5])

    # Grid point coordinates plus the predicted offsets.
    box_xy = (K.sigmoid(feats[..., :2]) + grid) / K.cast(
        grid_shape[::-1], K.dtype(feats))
    box_wh = K.exp(feats[..., 2:4]) * anchors_tensor / K.cast(
        input_shape[::-1], K.dtype(feats))

    # Equality test kept on purpose: only values equal to True select
    # the loss outputs.
    if calc_loss == True:
        return grid, feats, box_xy, box_wh

    box_confidence = K.sigmoid(feats[..., 4:5])
    # TODO: what is the purpose of applying the activation function here?
    box_class_probs = K.sigmoid(feats[..., 5:])
    return box_xy, box_wh, box_confidence, box_class_probs
def yolo3_head(feats, anchors, num_classes, input_shape, calc_loss=False):
    """Convert final layer features to bounding box parameters.

    Args:
        feats: Raw head output; reshaped here to
            (-1, grid_h, grid_w, num_anchors, num_classes + 5).
        anchors: Sequence of ``num_anchors`` anchor size pairs.
        num_classes: Number of class scores per anchor.
        input_shape: Network input size; reversed before scaling ``box_wh``.
        calc_loss: If equal to ``True``, return the loss-side tensors.

    Returns:
        ``(grid, feats, box_xy, box_wh)`` when ``calc_loss == True``,
        otherwise ``(box_xy, box_wh, box_confidence, box_class_probs)``.
    """
    num_anchors = len(anchors)
    # Reshape to batch, height, width, num_anchors, box_params.
    anchors_tensor = K.reshape(K.constant(anchors), [1, 1, 1, num_anchors, 2])

    grid_shape = K.shape(feats)[1:3]  # height, width
    height, width = grid_shape[0], grid_shape[1]

    ys = K.arange(0, stop=height)
    xs = K.arange(0, stop=width)
    grid_y = K.tile(K.reshape(ys, [-1, 1, 1, 1]), [1, width, 1, 1])
    grid_x = K.tile(K.reshape(xs, [1, -1, 1, 1]), [height, 1, 1, 1])
    grid = K.concatenate([grid_x, grid_y])
    grid = K.cast(grid, K.dtype(feats))

    feats = K.reshape(
        feats, [-1, height, width, num_anchors, num_classes + 5])

    raw_xy = feats[..., :2]
    raw_wh = feats[..., 2:4]
    raw_conf = feats[..., 4:5]
    raw_cls = feats[..., 5:]

    # Adjust predictions to each spatial grid point and anchor size.
    grid_wh = K.cast(grid_shape[::-1], K.dtype(feats))
    input_wh = K.cast(input_shape[::-1], K.dtype(feats))
    box_xy = (K.sigmoid(raw_xy) + grid) / grid_wh
    box_wh = K.exp(raw_wh) * anchors_tensor / input_wh
    box_confidence = K.sigmoid(raw_conf)
    box_class_probs = K.sigmoid(raw_cls)

    # Equality test kept on purpose: only values equal to True select
    # the loss outputs.
    if calc_loss == True:
        return grid, feats, box_xy, box_wh
    return box_xy, box_wh, box_confidence, box_class_probs
def call(self, inputs, mask=None, **kwargs):
    """Scaled dot-product attention that adds and forwards previous scores.

    Args:
        inputs: Either ``[query, key, value, prev]`` or ``[x, prev]``, where
            ``x`` serves as query, key and value. ``prev`` is added to the
            scaled scores.
        mask: Optional sequence of masks, one per input. The entry used is
            index 1 for the four-input form and index 0 otherwise. ``None``
            disables masking.

    Returns:
        ``[v, new_prev]``, with the attention weights appended when
        ``self.return_attention`` is set. Also stores the pre-softmax scores
        on ``self.intensity`` and the weights on ``self.attention``.
    """
    if len(inputs) == 4:
        query, key, value, prev = inputs
        mask_index = 1
    else:
        query = key = value = inputs[0]
        prev = inputs[1]
        mask_index = 0
    # Only index into the mask when one was supplied; the default
    # ``mask=None`` used to raise TypeError here.
    if mask is not None:
        mask = mask[mask_index]

    feature_dim = K.shape(query)[-1]
    e = K.batch_dot(query, key, axes=2) / K.sqrt(
        K.cast(feature_dim, dtype=K.floatx()))
    # The scores passed on to the next layer exclude the masking below.
    new_prev = e = e + prev

    if self.history_only:
        # Query position i may only attend to key positions <= i.
        query_len, key_len = K.shape(query)[1], K.shape(key)[1]
        indices = K.expand_dims(K.arange(0, key_len), axis=0)
        upper = K.expand_dims(K.arange(0, query_len), axis=-1)
        e -= 10000.0 * K.expand_dims(
            K.cast(indices > upper, K.floatx()), axis=0)

    if mask is not None:
        e -= 10000.0 * (
            1.0 - K.cast(K.expand_dims(mask, axis=-2), K.floatx()))

    self.intensity = e
    # Softmax over the last axis, shifted by the row maximum for stability.
    e = K.exp(e - K.max(e, axis=-1, keepdims=True))
    self.attention = e / K.sum(e, axis=-1, keepdims=True)

    v = K.batch_dot(self.attention, value)
    output = [v, new_prev]
    if self.return_attention:
        output.append(self.attention)
    return output
def yolo_head(feats, anchors, num_classes, input_shape, calc_loss=False):
    """Decode head features into box centres, sizes and scores.

    Args:
        feats: Raw head output; reshaped here to
            (-1, grid_h, grid_w, num_anchors, num_classes + 5).
        anchors: Sequence of ``num_anchors`` anchor size pairs.
        num_classes: Number of class scores per anchor.
        input_shape: Network input size tensor; its last axis is reversed
            before scaling ``box_wh``.
        calc_loss: If equal to ``True``, return the loss-side tensors.

    Returns:
        ``(grid, feats, box_xy, box_wh)`` when ``calc_loss == True``,
        otherwise ``(box_xy, box_wh, box_confidence, box_class_probs)``.
    """
    num_anchors = len(anchors)
    # [1, 1, 1, num_anchors, 2]
    anchors_tensor = K.reshape(K.constant(anchors), [1, 1, 1, num_anchors, 2])

    # Build the x, y grid, e.g. (13, 13, 1, 2).
    grid_shape = K.shape(feats)[1:3]  # height, width
    tile_rows = [grid_shape[0], 1, 1, 1]
    tile_cols = [1, grid_shape[1], 1, 1]
    grid_x = K.tile(
        K.reshape(K.arange(0, stop=grid_shape[1]), [1, -1, 1, 1]), tile_rows)
    grid_y = K.tile(
        K.reshape(K.arange(0, stop=grid_shape[0]), [-1, 1, 1, 1]), tile_cols)
    grid = K.cast(K.concatenate([grid_x, grid_y]), K.dtype(feats))

    # e.g. (batch_size, 13, 13, 3, 85)
    target_shape = [
        -1, grid_shape[0], grid_shape[1], num_anchors, num_classes + 5]
    feats = K.reshape(feats, target_shape)

    # Convert predictions into actual values:
    # box_xy is the box centre, box_wh the box width and height.
    xy_scale = K.cast(grid_shape[..., ::-1], K.dtype(feats))
    wh_scale = K.cast(input_shape[..., ::-1], K.dtype(feats))
    box_xy = (K.sigmoid(feats[..., :2]) + grid) / xy_scale
    box_wh = K.exp(feats[..., 2:4]) * anchors_tensor / wh_scale
    box_confidence = K.sigmoid(feats[..., 4:5])
    box_class_probs = K.sigmoid(feats[..., 5:])

    # When computing the loss, return the tensors below instead. Equality
    # test kept on purpose: only values equal to True select them.
    if calc_loss == True:
        return grid, feats, box_xy, box_wh
    return box_xy, box_wh, box_confidence, box_class_probs
def call(self, inputs):
    """Merge sinusoidal position embeddings into ``inputs``.

    If ``custom_position_ids`` is set, ``inputs`` is a pair whose second
    element holds the custom position ids.

    Returns:
        ``inputs + embeddings`` for merge mode ``'add'``,
        ``inputs * embeddings`` for ``'mul'``, otherwise the concatenation
        of ``inputs`` and the embeddings on the last axis.
    """
    if self.custom_position_ids:
        # Unpack before measuring: the length used to be read from the
        # [inputs, position_ids] list instead of from a tensor.
        inputs, position_ids = inputs
        seq_len = K.shape(position_ids)[1]
        if 'float' not in K.dtype(position_ids):
            position_ids = K.cast(position_ids, K.floatx())
    else:
        input_shape = K.shape(inputs)
        batch_size, seq_len = input_shape[0], input_shape[1]
        position_ids = K.arange(0, seq_len, dtype=K.floatx())[None]

    # Frequencies 10000^(-2i / output_dim) for i in [0, output_dim // 2).
    indices = K.arange(0, self.output_dim // 2, dtype=K.floatx())
    indices = K.pow(10000.0, -2 * indices / self.output_dim)
    embeddings = tf.einsum('bn,d->bnd', position_ids, indices)
    # Stacking on a new last axis and flattening interleaves sin and cos.
    embeddings = K.stack([K.sin(embeddings), K.cos(embeddings)], axis=-1)
    embeddings = K.reshape(embeddings, (-1, seq_len, self.output_dim))

    if self.merge_mode == 'add':
        return inputs + embeddings
    elif self.merge_mode == 'mul':
        return inputs * embeddings
    else:
        if not self.custom_position_ids:
            # Generated ids have batch size 1; repeat them per sample so
            # the concatenation shapes agree.
            embeddings = K.tile(embeddings, [batch_size, 1, 1])
        return K.concatenate([inputs, embeddings])
def call(self, inputs):
    """Scatter pooled values back to the positions in ``self.pooling_argmax``.

    The output has the spatial dims of ``inputs`` multiplied by
    ``self.stride[1]`` and ``self.stride[2]``. Every input value is written
    to the (batch, y, x, feature) location decoded from the flattened argmax
    index; the static shape of ``inputs`` is used throughout.

    Args:
        inputs: 4-D tensor of pooled values.

    Returns:
        Result of ``tf.scatter_nd`` with the enlarged output shape.
    """
    input_shape = inputs.shape
    output_shape = (
        input_shape[0],
        input_shape[1] * self.stride[1],
        input_shape[2] * self.stride[2],
        input_shape[3],
    )

    argmax = self.pooling_argmax
    ones = K.ones_like(argmax)

    # Batch index of every element.
    batch_ids = K.arange(start=0, stop=input_shape[0], dtype='int64')
    b = ones * K.reshape(batch_ids, shape=[input_shape[0], 1, 1, 1])

    # Decode the flattened argmax into output row and column.
    y = argmax // (output_shape[2] * output_shape[3])
    x = argmax % (output_shape[2] * output_shape[3]) // output_shape[3]

    # Feature index of every element.
    f = ones * K.arange(start=0, stop=output_shape[3], dtype='int64')

    # Transpose indices and reshape update values to one dimension.
    updates_size = tf.size(inputs)
    stacked = K.reshape(K.stack([b, y, x, f]), [4, updates_size])
    indices = K.transpose(stacked)
    values = K.reshape(inputs, [updates_size])
    return tf.scatter_nd(indices, values, output_shape)
def yolo_head(feats, anchors, num_classes, input_shape, calc_loss=False):
    """Decode head features with the Keras backend or with NumPy.

    With ``calc_loss`` truthy the decoding uses backend ops and returns the
    tensors needed by the loss. Otherwise ``feats`` is treated as an array
    and decoded with NumPy.

    :param feats: (N, 13, 13, 3 * (5+n_class)), ...
    :param anchors: (3, 2)
    :param num_classes: 15
    :param input_shape: (416, 416); a tuple, list or array is accepted
    :param calc_loss: select the backend/loss path when truthy
    :return: ``(grid, feats, box_xy, box_wh)`` if ``calc_loss`` else
        ``(box_xy, box_wh, box_confidence, box_class_probs)``
    """
    num_anchors = len(anchors)
    # Reshape to batch, height, width, num_anchors, box_params.
    if calc_loss:
        anchors_tensor = K.reshape(
            K.constant(anchors), [1, 1, 1, num_anchors, 2])
        grid_shape = K.shape(feats)[1:3]  # height, width
        grid_y = K.tile(
            K.reshape(K.arange(0, stop=grid_shape[0]), [-1, 1, 1, 1]),
            [1, grid_shape[1], 1, 1])
        grid_x = K.tile(
            K.reshape(K.arange(0, stop=grid_shape[1]), [1, -1, 1, 1]),
            [grid_shape[0], 1, 1, 1])
        grid = K.concatenate([grid_x, grid_y])
        # Match the dtype of feats (not the global floatx) so the addition
        # below cannot mix dtypes.
        grid = K.cast(grid, K.dtype(feats))

        feats = K.reshape(feats, [
            -1, grid_shape[0], grid_shape[1], num_anchors, num_classes + 5
        ])

        box_xy = (K.sigmoid(feats[..., :2]) + grid) / K.cast(
            grid_shape[::-1], K.dtype(feats))
        box_wh = K.exp(feats[..., 2:4]) * anchors_tensor / K.cast(
            input_shape[::-1], K.dtype(feats))
        return grid, feats, box_xy, box_wh

    anchors_tensor = np.reshape(
        np.array(anchors), [1, 1, 1, num_anchors, 2])
    grid_shape = np.asarray(feats.shape[1:3])  # height, width
    grid_y = np.tile(
        np.reshape(np.arange(0, stop=grid_shape[0]), [-1, 1, 1, 1]),
        [1, grid_shape[1], 1, 1])
    grid_x = np.tile(
        np.reshape(np.arange(0, stop=grid_shape[1]), [1, -1, 1, 1]),
        [grid_shape[0], 1, 1, 1])
    grid = np.concatenate([grid_x, grid_y], axis=-1)
    grid = grid.astype(feats.dtype)

    feats = np.reshape(feats, [
        -1, grid_shape[0], grid_shape[1], num_anchors, num_classes + 5
    ])

    # Coerce to an array first: a plain (416, 416) tuple supports neither
    # ``[..., ::-1]`` indexing nor ``.astype``.
    input_dims = np.asarray(input_shape)

    box_xy = (utils.sigmoid(feats[..., :2]) + grid) / grid_shape[
        ..., ::-1].astype(feats.dtype)
    box_wh = np.exp(feats[..., 2:4]) * anchors_tensor / input_dims[
        ..., ::-1].astype(feats.dtype)
    box_confidence = utils.sigmoid(feats[..., 4:5])
    box_class_probs = utils.sigmoid(feats[..., 5:])
    return box_xy, box_wh, box_confidence, box_class_probs
def yolo4_decode(feats, anchors, num_classes, input_shape, scale_x_y=None, calc_loss=False):
    """Decode final layer features to bounding box parameters.

    Args:
        feats: Raw head output; reshaped here to
            (-1, grid_h, grid_w, num_anchors, num_classes + 5).
        anchors: Sequence of ``num_anchors`` anchor size pairs.
        num_classes: Number of class scores per anchor.
        input_shape: Network input size tensor; its last axis is reversed
            before scaling ``box_wh``.
        scale_x_y: Optional factor for the YOLOv4 "eliminate grid
            sensitivity" trick; skipped when falsy.
        calc_loss: When truthy, return the loss-side tensors.

    Returns:
        ``(grid, feats, box_xy, box_wh)`` if ``calc_loss`` is truthy,
        otherwise ``(box_xy, box_wh, box_confidence, box_class_probs)``.
    """
    num_anchors = len(anchors)
    # Reshape to batch, height, width, num_anchors, box_params.
    anchors_tensor = K.reshape(K.constant(anchors), [1, 1, 1, num_anchors, 2])

    # ----------------------------------------------------------------------
    # Build the reference grid, e.g. (13, 13, 1, 2).
    grid_shape = K.shape(feats)[1:3]  # height, width
    grid_h, grid_w = grid_shape[0], grid_shape[1]
    grid_y = K.tile(
        K.reshape(K.arange(0, stop=grid_h), [-1, 1, 1, 1]),
        [1, grid_w, 1, 1])
    grid_x = K.tile(
        K.reshape(K.arange(0, stop=grid_w), [1, -1, 1, 1]),
        [grid_h, 1, 1, 1])
    grid = K.cast(K.concatenate([grid_x, grid_y]), K.dtype(feats))

    # Reshape to ([batch_size, height, width, num_anchors, (num_classes+5)])
    feats = K.reshape(
        feats, [-1, grid_h, grid_w, num_anchors, num_classes + 5])

    # Adjust predictions to each spatial grid point and anchor size.
    # box_xy is rescaled to the range [0, 1] (normalised).
    # box_wh is rescaled to the range [0, 1] (normalised); the input size is
    # the backbone's smallest feature-map size multiplied by the stride.
    # To be explicit: box_xy is a position relative to the grid (arguably to
    # the input as well), box_wh is relative to input_shape.
    # scale_x_y is a trick, see the links below.
    xy_offset = K.sigmoid(feats[..., :2])
    if scale_x_y:
        # Eliminate grid sensitivity trick involved in YOLOv4
        #
        # Reference Paper & code:
        #     "YOLOv4: Optimal Speed and Accuracy of Object Detection"
        #     https://arxiv.org/abs/2004.10934
        #     https://github.com/opencv/opencv/issues/17148
        #     https://zhuanlan.zhihu.com/p/139724869
        xy_offset = xy_offset * scale_x_y - (scale_x_y - 1) / 2
    box_xy = (xy_offset + grid) / K.cast(
        grid_shape[..., ::-1], K.dtype(feats))
    box_wh = K.exp(feats[..., 2:4]) * anchors_tensor / K.cast(
        input_shape[..., ::-1], K.dtype(feats))

    # Sigmoid objectness scores (confidence decoding).
    box_confidence = K.sigmoid(feats[..., 4:5])
    # Class probabilities (class decoding).
    box_class_probs = K.sigmoid(feats[..., 5:])

    # Loss computation needs grid, feats, box_xy, box_wh;
    # prediction needs box_xy, box_wh, box_confidence, box_class_probs.
    if calc_loss:
        return grid, feats, box_xy, box_wh
    return box_xy, box_wh, box_confidence, box_class_probs
def _attention_regularizer(self, attention):
    """Penalise the distance between ``A @ A^T`` and the identity matrix.

    Args:
        attention: Attention weights; axis 0 is the batch and the last axis
            the input length.

    Returns:
        ``attention_regularizer_weight`` times the summed squared difference
        between ``attention @ attention^T`` and the identity, divided by the
        batch size.
    """
    attention_shape = K.shape(attention)
    batch_size = K.cast(attention_shape[0], K.floatx())
    input_len = attention_shape[-1]

    # Identity matrix built by comparing a row of indices with a column.
    positions = K.arange(0, input_len)
    eye = K.cast(
        K.equal(
            K.expand_dims(positions, axis=0),
            K.expand_dims(positions, axis=-1),
        ),
        K.floatx(),
    )

    gram = K.batch_dot(attention, K.permute_dimensions(attention, (0, 2, 1)))
    penalty = K.sum(K.square(gram - eye))
    return self.attention_regularizer_weight * penalty / batch_size
def yolo_head(feats, anchors, num_classes):
    """Convert final layer features to bounding box parameters.

    Parameters
    ----------
    feats : tensor
        Final convolutional layer features.
    anchors : array-like
        Anchor box widths and heights.
    num_classes : int
        Number of target classes.

    Returns
    -------
    box_conf : tensor
        Probability estimate for whether each box contains any object.
    box_xy : tensor
        x, y box predictions adjusted by spatial location in conv layer.
    box_wh : tensor
        w, h box predictions adjusted by anchors and conv spatial resolution.
    box_class_pred : tensor
        Probability distribution estimate for each box over class labels.
    """
    num_anchors = len(anchors)
    # Reshape to batch, height, width, num_anchors, box_params.
    anchors_tensor = K.reshape(K.variable(anchors), [1, 1, 1, num_anchors, 2])

    # Dynamic implementation of conv dims for fully convolutional model.
    conv_dims = K.shape(feats)[1:3]  # assuming channels last: (height, width)
    conv_height, conv_width = conv_dims[0], conv_dims[1]

    # Per-cell (x, y) = (column, row) index. Built per axis so that it is
    # also correct when height != width; the former tile/transpose/flatten
    # construction produced the same values only for square grids.
    row_index = K.tile(
        K.reshape(K.arange(0, stop=conv_height), [-1, 1, 1, 1]),
        [1, conv_width, 1, 1])
    col_index = K.tile(
        K.reshape(K.arange(0, stop=conv_width), [1, -1, 1, 1]),
        [conv_height, 1, 1, 1])
    conv_index = K.concatenate([col_index, row_index])
    conv_index = K.reshape(
        conv_index, [1, conv_height, conv_width, 1, 2])
    conv_index = K.cast(conv_index, K.dtype(feats))

    feats = K.reshape(
        feats,
        [-1, conv_height, conv_width, num_anchors, num_classes + 5])

    # Normalise x by the grid width and y by the grid height, hence the
    # reversed (width, height) order. Identical to before for square grids.
    grid_wh = K.cast(
        K.reshape(conv_dims[::-1], [1, 1, 1, 1, 2]), K.dtype(feats))

    box_confidence = K.sigmoid(feats[..., 4:5])
    box_xy = K.sigmoid(feats[..., :2])
    box_wh = K.exp(feats[..., 2:4])
    box_class_probs = K.softmax(feats[..., 5:])

    # Adjust predictions to each spatial grid point and anchor size.
    box_xy = (box_xy + conv_index) / grid_wh
    box_wh = box_wh * anchors_tensor / grid_wh

    return box_confidence, box_xy, box_wh, box_class_probs
def yolo3_decode(feats, anchors, num_classes, input_shape, scale_x_y=None, calc_loss=False):
    """Decode final layer features to bounding box parameters.

    The raw xy and wh predictions are clamped to [-10, 10] and [-8, 8]
    before the sigmoid / exponential is applied.

    Args:
        feats: Raw head output; reshaped here to
            (-1, grid_h, grid_w, num_anchors, num_classes + 5).
        anchors: Sequence of ``num_anchors`` anchor size pairs.
        num_classes: Number of class scores per anchor.
        input_shape: Network input size tensor; its last axis is reversed
            before scaling ``box_wh``.
        scale_x_y: Optional factor for the YOLOv4 "eliminate grid
            sensitivity" trick; skipped when falsy.
        calc_loss: If equal to ``True``, return the loss-side tensors.

    Returns:
        ``(grid, feats, box_xy, box_wh)`` when ``calc_loss == True``,
        otherwise ``(box_xy, box_wh, box_confidence, box_class_probs)``.
    """
    num_anchors = len(anchors)
    # Reshape to batch, height, width, num_anchors, box_params.
    anchors_tensor = K.reshape(K.constant(anchors), [1, 1, 1, num_anchors, 2])

    grid_shape = K.shape(feats)[1:3]  # height, width
    grid_y = K.tile(
        K.reshape(K.arange(0, stop=grid_shape[0]), [-1, 1, 1, 1]),
        [1, grid_shape[1], 1, 1])
    grid_x = K.tile(
        K.reshape(K.arange(0, stop=grid_shape[1]), [1, -1, 1, 1]),
        [grid_shape[0], 1, 1, 1])
    grid = K.concatenate([grid_x, grid_y])
    grid = K.cast(grid, K.dtype(feats))

    feats = K.reshape(
        feats,
        [-1, grid_shape[0], grid_shape[1], num_anchors, num_classes + 5])

    # Clamp the raw predictions. These clamped values are now the ones that
    # get decoded; previously they were computed and then discarded because
    # the decoding re-read the unclamped ``feats`` slices.
    raw_xy = feats[..., :2]
    raw_wh = feats[..., 2:4]
    raw_xy = tf.where(raw_xy < -10.0, -10.0, raw_xy)
    raw_xy = tf.where(raw_xy > 10.0, 10.0, raw_xy)
    raw_wh = tf.where(raw_wh < -8.0, -8.0, raw_wh)
    raw_wh = tf.where(raw_wh > 8.0, 8.0, raw_wh)

    # Adjust predictions to each spatial grid point and anchor size.
    if scale_x_y:
        # Eliminate grid sensitivity trick involved in YOLOv4
        #
        # Reference Paper & code:
        #     "YOLOv4: Optimal Speed and Accuracy of Object Detection"
        #     https://arxiv.org/abs/2004.10934
        #     https://github.com/opencv/opencv/issues/17148
        #
        box_xy_tmp = K.sigmoid(raw_xy) * scale_x_y - (scale_x_y - 1) / 2
        box_xy = (box_xy_tmp + grid) / K.cast(
            grid_shape[..., ::-1], K.dtype(feats))
    else:
        box_xy = (K.sigmoid(raw_xy) + grid) / K.cast(
            grid_shape[..., ::-1], K.dtype(feats))
    box_wh = K.exp(raw_wh) * anchors_tensor / K.cast(
        input_shape[..., ::-1], K.dtype(feats))
    box_confidence = K.sigmoid(feats[..., 4:5])
    box_class_probs = K.sigmoid(feats[..., 5:])

    # Equality test kept on purpose: only values equal to True select
    # the loss outputs.
    if calc_loss == True:
        return grid, feats, box_xy, box_wh
    return box_xy, box_wh, box_confidence, box_class_probs
def build(self, input_shape):
    """Create the fixed sinusoidal positional-encoding table.

    The table has ``self.max_len`` rows and ``self.d_model`` columns. Even
    columns hold ``sin(position * div_term)`` and odd columns the matching
    cosine. It is stored in a non-trainable weight named ``pos_encoding``,
    and ``self.pos_encoding`` is set to that table after ``expand_dims`` on
    axis 0 and ``K.transpose``.

    Fixes over the previous version, which could not run:
      * the table is filled in NumPy, because item assignment on a backend
        variable is not supported;
      * the weight has ``max_len`` rows instead of ``input_shape[0]``;
      * the frequency exponent is negative, as in the standard formulation.

    Args:
        input_shape: Shape of the layer input (not used for the table).
    """
    d_model, max_len = self.d_model, self.max_len

    position = np.arange(max_len, dtype='float32')[:, None]  # (max_len, 1)
    div_term = np.exp(
        np.arange(0, d_model, 2, dtype='float32')
        * -(np.log(10000.0) / d_model))
    angles = position * div_term

    table = np.zeros((max_len, d_model), dtype='float32')
    table[:, 0::2] = np.sin(angles)
    # For an odd d_model there is one cosine column fewer than sine columns.
    table[:, 1::2] = np.cos(angles)[:, :d_model // 2]

    weight = self.add_weight(shape=(max_len, d_model),
                             initializer=tf.keras.initializers.Zeros(),
                             name='pos_encoding',
                             trainable=False)
    K.set_value(weight, table)

    # Kept as attributes because the previous version exposed them too.
    self.position = K.constant(position)
    self.div_term = K.constant(div_term)

    # NOTE(review): K.transpose reverses all axes, so this is
    # (d_model, max_len, 1) - confirm that this layout is what call() expects.
    self.pos_encoding = K.transpose(K.expand_dims(weight, 0))
def call(self, x, **kwargs):
    """Build a sinusoidal position embedding for the sequence axis of ``x``.

    Positions run from 1 to the sequence length (position 0 is computed and
    then dropped). Sine and cosine values are stacked on a new last axis and
    flattened, which interleaves them along the embedding axis.

    Args:
        x: Input tensor; only the length of axis 1 is used.

    Returns:
        float32 tensor reshaped to (-1, seq_len, self.embedding_size).
    """
    seq_len = K.shape(x)[1]

    # Column of positions 0..seq_len and row of even channel indices.
    steps = K.expand_dims(
        K.cast(K.arange(start=0, stop=seq_len + 1), 'float32'), axis=-1)
    bins = K.expand_dims(
        K.cast(K.arange(self.embedding_size // 2) * 2, 'float32'), axis=0)

    angles = K.dot(steps, 1.0 / K.pow(10000.0, bins / self.embedding_size))
    angles_copy = tf.identity(angles)

    # Drop position 0 so the embedding starts at position 1.
    sines = K.sin(angles)[1:, :]
    cosines = K.cos(angles_copy)[1:, :]

    interleaved = K.stack([sines, cosines], axis=2)
    return K.reshape(interleaved, (-1, seq_len, self.embedding_size))
def construct_grid(rows, cols):
    """Build a (rows, cols, 1, 2) grid of per-cell (x, y) indices.

    Args:
        rows: Number of grid rows.
        cols: Number of grid columns.

    Returns:
        Tensor whose last axis holds the column index followed by the row
        index of each cell.
    """
    x_line = K.reshape(K.arange(0, stop=cols), [1, -1, 1, 1])
    y_line = K.reshape(K.arange(0, stop=rows), [-1, 1, 1, 1])

    # Repeat the column indices down the rows and the row indices across
    # the columns, then pair them on the last axis.
    x_plane = K.tile(x_line, [rows, 1, 1, 1])
    y_plane = K.tile(y_line, [1, cols, 1, 1])
    return K.concatenate([x_plane, y_plane])