def build_network(self, input, trainable):
    """Assemble the stem, the recursive hourglass and the averaged side outputs.

    Args:
        input: Tensor fed to the first ``Conv2D`` layer.
        trainable: Forwarded unchanged to ``is_trainable`` (defined elsewhere).

    Returns:
        A ``(hg_out, merged_layer)`` tuple: the tensor returned by the
        outermost ``self.hourglass_module`` call, and the ``Average`` of all
        collected intermediate outputs after they were resized to the stem's
        ``shape[1]``.
    """
    is_trainable(trainable)
    side_outputs = []

    # Stem: strided 3x3 convolution -> batch norm -> ReLU -> bottleneck -> pool.
    stem = layers.Conv2D(
        out_channel_ratio(64),
        kernel_size=[3, 3],
        strides=(2, 2),
        activation=None,
        padding='same',
    )(input)
    stem = layers.BatchNormalization()(stem)
    stem = layers.ReLU()(stem)
    stem = inverted_bottleneck(stem, 1, out_channel_ratio(16), 1, 3)
    stem = max_pool(stem, 2, 2, 2, 2, name='max_pool_0')
    target_size = int(stem.shape[1])

    # Build the network recursively; hourglass_module is expected to append
    # its per-level outputs to side_outputs.
    hg_out = self.hourglass_module(stem, STAGE_NUM, side_outputs)

    # Upsample every collected output whose shape[1] differs from the stem's,
    # using the integer ratio between the two sizes as the scale factor.
    resized = []
    for index, heatmap in enumerate(side_outputs):
        size = int(heatmap.shape[1])
        if size != target_size:
            heatmap = upsample(
                heatmap,
                target_size // size,
                name="upsample_for_loss_%d" % index,
            )
        resized.append(heatmap)

    merged_layer = tf.keras.layers.Average()(resized)
    return hg_out, merged_layer
def hourglass_module(self, inp, stage_nums, intermediate_heatmap_layers):
    """Recursively build one hourglass level on top of ``inp``.

    Args:
        inp: Input tensor of this level.
        stage_nums: Remaining recursion depth. A value that is not greater
            than zero produces only the innermost bottleneck.
        intermediate_heatmap_layers: List mutated in place. Every level with
            ``stage_nums > 0`` appends its output, inner levels first.

    Returns:
        For ``stage_nums > 0`` the sum (``layers.Add``) of the upsampled inner
        branch and the jump branch; otherwise the single
        ``inverted_bottleneck`` applied to ``inp``.
    """
    if stage_nums <= 0:
        # Innermost level: one bottleneck, no further recursion.
        return inverted_bottleneck(
            inp, up_channel_ratio(6), out_channel_ratio(24), 0, 3,
            scope="hourglass_mid_%d" % stage_nums)

    pooled = max_pool(inp, 2, 2, 2, 2,
                      name="hourglass_downsample_%d" % stage_nums)

    # Front block: five identical bottlenecks applied to the pooled input.
    block_front = pooled
    for _ in range(5):
        block_front = inverted_bottleneck(
            block_front, up_channel_ratio(6), out_channel_ratio(24), 0, 3)

    # The back and upsample layers are named with the decremented stage index,
    # the same value that is handed to the recursive call.
    inner_stage = stage_nums - 1
    block_mid = self.hourglass_module(
        block_front, inner_stage, intermediate_heatmap_layers)
    block_back = inverted_bottleneck(
        block_mid, up_channel_ratio(6), N_KPOINTS, 0, 3,
        scope="hourglass_back_%d" % inner_stage)
    upsampled = upsample(block_back, 2, "hourglass_upsample_%d" % inner_stage)

    # Jump layer: four bottlenecks on the un-pooled input, then a fifth that
    # receives N_KPOINTS in place of out_channel_ratio(24).
    skip = inp
    for _ in range(4):
        skip = inverted_bottleneck(
            skip, up_channel_ratio(6), out_channel_ratio(24), 0, 3)
    branch_jump = inverted_bottleneck(
        skip, up_channel_ratio(6), N_KPOINTS, 0, 3)

    curr_hg_out = layers.Add()([upsampled, branch_jump])

    # Mid supervision: record this level's output for the caller.
    intermediate_heatmap_layers.append(curr_hg_out)
    return curr_hg_out
def hourglass_module(inp, stage_nums):
    """Recursively build one hourglass level with ``slim.stack`` blocks.

    Args:
        inp: Input tensor of this level.
        stage_nums: Remaining recursion depth. A value that is not greater
            than zero produces only the innermost bottleneck.

    Returns:
        For ``stage_nums > 0`` the ``tf.add`` of the upsampled inner branch
        and the jump branch; otherwise the single ``inverted_bottleneck``
        applied to ``inp``.

    Note:
        Each level with ``stage_nums > 0`` appends its output to ``l2s``,
        which is not defined in this function and must exist in the
        enclosing scope.
    """
    if stage_nums <= 0:
        # Innermost level: one bottleneck, no further recursion.
        return inverted_bottleneck(
            inp, up_channel_ratio(6), out_channel_ratio(24), 0, 3,
            scope="hourglass_mid_%d" % stage_nums)

    pooled = max_pool(inp, 2, 2, 2, 2,
                      name="hourglass_downsample_%d" % stage_nums)

    # Front block: five identical bottleneck specs, scoped with the
    # current (not yet decremented) stage index.
    front_specs = [
        (up_channel_ratio(6), out_channel_ratio(24), 0, 3) for _ in range(5)
    ]
    block_front = slim.stack(pooled, inverted_bottleneck, front_specs,
                             scope="hourglass_front_%d" % stage_nums)

    # Everything below is named with the decremented stage index, the same
    # value that is handed to the recursive call.
    inner_stage = stage_nums - 1
    block_mid = hourglass_module(block_front, inner_stage)
    block_back = inverted_bottleneck(
        block_mid, up_channel_ratio(6), N_KPOINTS, 0, 3,
        scope="hourglass_back_%d" % inner_stage)
    upsampled = upsample(block_back, 2, "hourglass_upsample_%d" % inner_stage)

    # Jump layer: four regular specs followed by one that receives N_KPOINTS
    # in place of out_channel_ratio(24).
    jump_specs = [
        (up_channel_ratio(6), out_channel_ratio(24), 0, 3) for _ in range(4)
    ]
    jump_specs.append((up_channel_ratio(6), N_KPOINTS, 0, 3))
    branch_jump = slim.stack(inp, inverted_bottleneck, jump_specs,
                             scope="hourglass_branch_jump_%d" % inner_stage)

    curr_hg_out = tf.add(upsampled, branch_jump,
                         name="hourglass_out_%d" % inner_stage)

    # Mid supervision: record this level's output.
    l2s.append(curr_hg_out)
    return curr_hg_out
def build_network(input, trainable):
    """Build five stacked MobilenetV2 branches followed by ``STAGE_NUM`` refinement stages.

    Args:
        input: Tensor fed to the first ``convb`` call.
        trainable: Forwarded unchanged to ``is_trainable`` (defined elsewhere).

    Returns:
        A ``(cpm_out, l2s)`` tuple: the upsampled output of the last stage and
        the list of upsampled outputs of every stage, in stage order.
    """
    is_trainable(trainable)

    net = convb(input, 3, 3, out_channel_ratio(32), 2, name="Conv2d_0")

    with tf.variable_scope('MobilenetV2'):
        # 128, 112
        mv2_branch_0 = slim.stack(
            net, inverted_bottleneck,
            [(1, out_channel_ratio(16), 0, 3) for _ in range(2)],
            scope="MobilenetV2_part_0")

        # Parts 1-4 share one layout: a first bottleneck whose third argument
        # is 1, then four with 0. Only the width passed to out_channel_ratio
        # changes. The trailing comments are the original author's size notes
        # (presumably feature-map sizes for two input resolutions - TODO confirm).
        part_widths = (
            (1, 24),  # 64, 56
            (2, 32),  # 32, 28
            (3, 64),  # 16, 14
            (4, 96),  # 8, 7
        )
        branches = [mv2_branch_0]
        for part, width in part_widths:
            specs = [(up_channel_ratio(6), out_channel_ratio(width), 1, 3)]
            specs.extend(
                (up_channel_ratio(6), out_channel_ratio(width), 0, 3)
                for _ in range(4)
            )
            branches.append(
                slim.stack(branches[-1], inverted_bottleneck, specs,
                           scope="MobilenetV2_part_%d" % part))
        (mv2_branch_0, mv2_branch_1, mv2_branch_2,
         mv2_branch_3, mv2_branch_4) = branches

        # Pool the two earlier branches and upsample the two later ones, then
        # concatenate all five with branch 2 along axis 3.
        cancat_mv2 = tf.concat(
            [
                max_pool(mv2_branch_0, 4, 4, 4, 4, name="mv2_0_max_pool"),
                max_pool(mv2_branch_1, 2, 2, 2, 2, name="mv2_1_max_pool"),
                mv2_branch_2,
                upsample(mv2_branch_3, 2, name="mv2_3_upsample"),
                upsample(mv2_branch_4, 4, name="mv2_4_upsample"),
            ],
            axis=3)

    with tf.variable_scope("Convolutional_Pose_Machine"):
        l2s = []
        prev = None
        for stage_number in range(STAGE_NUM):
            # Every stage after the first also sees the previous stage output.
            if prev is None:
                inputs = cancat_mv2
            else:
                inputs = tf.concat([cancat_mv2, prev], axis=3)

            # The first stage uses a smaller kernel and a wider last layer.
            first_stage = stage_number == 0
            kernel_size = 3 if first_stage else 7
            lastest_channel_size = 512 if first_stage else 128

            stage = slim.stack(
                inputs, inverted_bottleneck,
                [
                    (2, out_channel_cpm(32), 0, kernel_size),
                    (up_channel_ratio(4), out_channel_cpm(32), 0, kernel_size),
                    (up_channel_ratio(4), out_channel_cpm(32), 0, kernel_size),
                ],
                scope="stage_%d_mv2" % stage_number)

            stage = slim.stack(
                stage, separable_conv,
                [
                    (out_channel_ratio(lastest_channel_size), 1, 1),
                    (N_KPOINTS, 1, 1),
                ],
                scope="stage_%d_mv1" % stage_number)

            # The un-upsampled tensor feeds the next stage; the upsampled one
            # is collected for the caller.
            prev = stage
            cpm_out = upsample(stage, 4, "stage_%d_out" % stage_number)
            l2s.append(cpm_out)

    return cpm_out, l2s