def _get_NN_prediction(self, image):
    image = image / 255.0
    print(CHANNEL)
    dummy_channels = 16 - CHANNEL
    # Pad the input with zero channels up to 16.
    image = tf.concat(
        [image, tf.zeros((tf.shape(image)[0], 84, 84, dummy_channels))], axis=3)
    with argscope(Conv2D, nl=tf.nn.relu):
        input_sum = tf.reduce_sum(tf.abs(image))
        # i = tf.Print(image, [image], message='input image: ', summarize=30)
        # i2 = tf.Print(i, [input_sum], message='input abs sum: ')
        l = Conv2D('conv0', image, out_channel=32, kernel_shape=5,
                   use_bias=False, padding='VALID')
        # l = tf.Print(l, [l], message='conv0: ', summarize=30)
        l = MaxPooling('pool0', l, 2)
        l = Conv2D('conv1', l, out_channel=32, kernel_shape=5,
                   use_bias=False, padding='VALID')
        # l = tf.Print(l, [l], message='conv1: ', summarize=30)
        l = MaxPooling('pool1', l, 2)
        l = Conv2D('conv2', l, out_channel=64, kernel_shape=5,
                   use_bias=False, padding='VALID')
        # l = tf.Print(l, [l], message='conv2: ', summarize=30)
        l = MaxPooling('pool2', l, 2)
        l = Conv2D('conv3', l, out_channel=64, kernel_shape=3,
                   use_bias=False, padding='VALID')
        # l = tf.Print(l, [l], message='conv3: ', summarize=30)

    l = FullyConnected('fc0', l, 512, nl=tf.identity)
    if args.artificial_slowdown != 0.0:
        l2 = tf_sleep(l, sleep=args.artificial_slowdown)
        l = tf.reshape(l2, tf.shape(l))
    # l = tf.Print(l, [l], message='fc0: ', summarize=15)
    l = PReLU('prelu', l)
    policy = FullyConnected('fc-pi', l, out_dim=NUM_ACTIONS, nl=tf.identity)
    value = FullyConnected('fc-v', l, 1, nl=tf.identity)
    return policy, value
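# Note: `tf_sleep` and `args.artificial_slowdown` come from outside this
# excerpt. A minimal sketch of what such a helper could look like, assuming
# the intent is to stall the session thread for a fixed number of seconds
# (`_sleep_identity` is an illustrative name, not from the source):
import time

import tensorflow as tf

def tf_sleep(x, sleep=0.0):
    """Identity op that blocks the session thread for `sleep` seconds."""
    def _sleep_identity(v):
        time.sleep(sleep)
        return v
    out = tf.py_func(_sleep_identity, [x], x.dtype)
    # tf.py_func drops static shape information, which is presumably why the
    # caller above reshapes the result back to tf.shape(l).
    out.set_shape(x.get_shape())
    return out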
def _get_DQN_prediction(self, image):
    """image: [0,255]"""
    # image = image / 255.0
    with argscope(Conv2D, nl=PReLU.f, use_bias=True):
        l = Conv2D('conv0', image, out_channel=32, kernel_shape=5)
        l = MaxPooling('pool0', l, 2)
        l = Conv2D('conv1', l, out_channel=32, kernel_shape=5)
        l = MaxPooling('pool1', l, 2)
        l = Conv2D('conv2', l, out_channel=64, kernel_shape=4)
        l = MaxPooling('pool2', l, 2)
        l = Conv2D('conv3', l, out_channel=64, kernel_shape=3)
    l = FullyConnected('fc0', l, 512,
                       nl=lambda x, name: LeakyReLU.f(x, 0.01, name))
    # the original arch
    # .Conv2D('conv0', image, out_channel=32, kernel_shape=8, stride=4)
    # .Conv2D('conv1', out_channel=64, kernel_shape=4, stride=2)
    # .Conv2D('conv2', out_channel=64, kernel_shape=3)

    if not DUELING:
        Q = FullyConnected('fct', l, NUM_ACTIONS, nl=tf.identity)
    else:
        # Dueling DQN: combine the state value V and the advantages A into Q.
        V = FullyConnected('fctV', l, 1, nl=tf.identity)
        As = FullyConnected('fctA', l, NUM_ACTIONS, nl=tf.identity)
        Q = tf.add(As, V - tf.reduce_mean(As, 1, keep_dims=True))
    return tf.identity(Q, name='Qvalue')
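# For reference, the dueling head above implements
# Q(s, a) = V(s) + A(s, a) - mean_a A(s, a). A small NumPy check (names
# illustrative, not from the source) that the mean subtraction makes Q
# invariant to a constant shift of the advantages, which is what makes
# the V/A split identifiable:
import numpy as np

def dueling_q(v, a):
    # v: (batch, 1) state values, a: (batch, num_actions) advantages
    return v + a - a.mean(axis=1, keepdims=True)

v = np.array([[2.0]])
a = np.array([[1.0, 3.0, 5.0]])
print(dueling_q(v, a))         # [[0. 2. 4.]]
print(dueling_q(v, a + 10.0))  # identical output; the shift cancels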
def shufflenet_unit_no_shortcut(l, out_channel, group, stride):
    in_shape = l.get_shape().as_list()
    in_channel = in_shape[1]
    # We do not apply group convolution on the first pointwise layer
    # because the number of input channels is relatively small.
    first_split = group if in_channel != 24 else 1
    l = Conv2D('conv1', l, out_channel // 4, kernel_shape=1,
               split=first_split, nl=c_BNReLU)
    l = channel_shuffle(l, group)
    l = DepthConv('dconv', l, out_channel // 4, kernel_shape=3,
                  nl=c_BN, stride=stride)
    l = Conv2D('conv2', l, out_channel, kernel_shape=1,
               split=first_split, nl=c_BN)
    output = tf.nn.relu(l)
    return output
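# `channel_shuffle` is used throughout but not defined in this excerpt. A
# minimal sketch for NCHW tensors, mirroring the standard ShuffleNet shuffle
# from the tensorpack example (reshape channels into (C // group, group),
# swap the two axes, flatten back) rather than the exact helper used here:
import tensorflow as tf

def channel_shuffle(l, group):
    in_shape = l.get_shape().as_list()
    in_channel = in_shape[1]
    assert in_channel % group == 0, in_channel
    # (N, C, H, W) -> (N, C // group, group, H, W)
    l = tf.reshape(l, [-1, in_channel // group, group] + in_shape[-2:])
    # Swap the group and per-group axes so channels interleave across groups
    # and the next group convolution sees inputs from every group.
    l = tf.transpose(l, [0, 2, 1, 3, 4])
    l = tf.reshape(l, [-1, in_channel] + in_shape[-2:])
    return l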
def _get_NN_prediction(self, image):
    self._create_unnary_variables_with_summary(
        image[:, 0, :, 0], (10, 10, 6, 6, 6),
        ("rewards", "levels", "lives0", "lives1", "lives2"))
    image = image / 255.0
    with argscope(Conv2D, nl=tf.nn.relu):
        lc0 = Conv2D('conv0', image, out_channel=32, kernel_shape=5)
        lc0 = MaxPooling('pool0', lc0, 2)
        lc1 = Conv2D('conv1', lc0, out_channel=32, kernel_shape=5)
        lc1 = MaxPooling('pool1', lc1, 2)
        lc2 = Conv2D('conv2', lc1, out_channel=64, kernel_shape=4)
        lc2 = MaxPooling('pool2', lc2, 2)
        lc3 = Conv2D('conv3', lc2, out_channel=64, kernel_shape=3)

    lfc0 = FullyConnected('fc0', lc3, 512, nl=tf.identity)
    lfc0 = PReLU('prelu', lfc0)
    policy = FullyConnected('fc-pi', lfc0,
                            out_dim=self.number_of_actions, nl=tf.identity)
    value = FullyConnected('fc-v', lfc0, 1, nl=tf.identity)

    # if DEBUGING_INFO:
    #     summary.add_activation_summary(lc0, "conv_0")
    #     summary.add_activation_summary(lc1, "conv_1")
    #     summary.add_activation_summary(lc2, "conv_2")
    #     summary.add_activation_summary(lc3, "conv_3")
    #     summary.add_activation_summary(lfc0, "fc0")
    #     summary.add_activation_summary(policy, "policy")
    #     summary.add_activation_summary(value, "fc-v")
    return policy, value
def shufflenet_unit_no_shortcut(l, out_channel, group, stride):
    in_shape = l.get_shape().as_list()
    in_channel = in_shape[1]
    first_split = group if in_channel != 24 else 1
    l = Conv2D('conv1', l, out_channel // 4, kernel_shape=1,
               split=first_split, nl=BNReLU)
    l = channel_shuffle(l, group)
    l = DepthConv('dconv', l, out_channel // 4, kernel_shape=3,
                  nl=BN, stride=stride)
    l = Conv2D('conv2', l, out_channel, kernel_shape=1,
               split=first_split, nl=BN)
    output = tf.nn.relu(l)
    return output
def _get_NN_prediction(self, image):
    self._create_unnary_variables_with_summary(
        image[:, 0, :, 0], (10, 10, 6, 6, 6),
        ("rewards", "levels", "lives0", "lives1", "lives2"))
    NUMBER_OF_REWARD_EVENTS = 10
    rewards_events = []
    for x in xrange(NUMBER_OF_REWARD_EVENTS):
        rewards_events.append(tf.reshape(image[:, 0, x, 0], (-1, 1)))
    image = image / 255.0
    with argscope(Conv2D, nl=tf.nn.relu):
        lc0 = Conv2D('conv0', image, out_channel=32, kernel_shape=5)
        lc0 = MaxPooling('pool0', lc0, 2)
        lc1 = Conv2D('conv1', lc0, out_channel=32, kernel_shape=5)
        lc1 = MaxPooling('pool1', lc1, 2)
        lc2 = Conv2D('conv2', lc1, out_channel=64, kernel_shape=4)
        lc2 = MaxPooling('pool2', lc2, 2)
        lc3 = Conv2D('conv3', lc2, out_channel=64, kernel_shape=3)

    # One policy/value head per reward event; the heads are then mixed
    # using the per-event weights extracted above.
    policies = []
    values = []
    for x in xrange(NUMBER_OF_REWARD_EVENTS):
        lfc0 = FullyConnected('fc0{}'.format(x), lc3, 512, nl=tf.identity)
        lfc0 = PReLU('prelu{}'.format(x), lfc0)
        policy = FullyConnected('fc-pi{}'.format(x), lfc0,
                                out_dim=self.number_of_actions, nl=tf.identity)
        value = FullyConnected('fc-v{}'.format(x), lfc0, 1, nl=tf.identity)
        policies.append(policy)
        values.append(value)

    weighted_policies = []
    weighted_values = []
    for weight, policy, value in zip(rewards_events, policies, values):
        weighted_policies.append(tf.multiply(weight, policy))
        weighted_values.append(tf.multiply(weight, value))
    policy = tf.add_n(weighted_policies)
    value = tf.add_n(weighted_values)

    # if DEBUGING_INFO:
    #     summary.add_activation_summary(lc0, "conv_0")
    #     summary.add_activation_summary(lc1, "conv_1")
    #     summary.add_activation_summary(lc2, "conv_2")
    #     summary.add_activation_summary(lc3, "conv_3")
    #     summary.add_activation_summary(lfc0, "fc0")
    #     summary.add_activation_summary(policy, "policy")
    #     summary.add_activation_summary(value, "fc-v")
    return policy, value
def _get_NN_prediction(self, image):
    image = tf.cast(image, tf.float32) / 255.0
    with argscope(Conv2D, activation=tf.nn.relu):
        l = Conv2D('conv0', image, 32, 5)
        l = MaxPooling('pool0', l, 2)
        l = Conv2D('conv1', l, 32, 5)
        l = MaxPooling('pool1', l, 2)
        l = Conv2D('conv2', l, 64, 4)
        l = MaxPooling('pool2', l, 2)
        l = Conv2D('conv3', l, 64, 3)

    l = FullyConnected('fc0', l, 512)
    l = PReLU('prelu', l)
    logits = FullyConnected('fc-pi', l, self.num_actions)  # unnormalized policy
    value = FullyConnected('fc-v', l, 1)
    return logits, value
def _get_NN_prediction(self, image):
    image = image / 255.0
    with argscope(Conv2D, nl=tf.nn.relu):
        if NETWORK_ARCH == '1':
            l = Conv2D('conv0', image, out_channel=32, kernel_shape=5)
            l = MaxPooling('pool0', l, 2)
            l = Conv2D('conv1', l, out_channel=32, kernel_shape=5)
            l = MaxPooling('pool1', l, 2)
            l = Conv2D('conv2', l, out_channel=64, kernel_shape=4)
            l = MaxPooling('pool2', l, 2)
            l = Conv2D('conv3', l, out_channel=64, kernel_shape=3)
            # conv3 output: [None, 10, 10, 64]
        elif NETWORK_ARCH == 'nature':
            l = Conv2D('conv0', image, out_channel=32, kernel_shape=8, stride=4)
            l = Conv2D('conv1', l, out_channel=64, kernel_shape=4, stride=2)
            l = Conv2D('conv2', l, out_channel=64, kernel_shape=3)
            # conv2 output: [None, 11, 11, 64]

    conv2 = tf.identity(l, name='convolutional-2')
    l = FullyConnected('fc0', l, 512, nl=tf.identity)
    l = PReLU('prelu', l)
    fc = tf.identity(l, name='fully-connected')
    policy = FullyConnected('fc-pi', l, out_dim=NUM_ACTIONS, nl=tf.identity)
    value = FullyConnected('fc-v', l, 1, nl=tf.identity)
    return policy, value
def _get_NN_prediction(self, image):
    image = image / 255.0
    with argscope(Conv2D, nl=tf.nn.relu):
        l = Conv2D('conv0', image, out_channel=32, kernel_shape=5)
        l = MaxPooling('pool0', l, 2)
        l = Conv2D('conv1', l, out_channel=32, kernel_shape=5)
        l = MaxPooling('pool1', l, 2)
        l = Conv2D('conv2', l, out_channel=64, kernel_shape=4)
        l = MaxPooling('pool2', l, 2)
        l = Conv2D('conv3', l, out_channel=64, kernel_shape=3)

    l = FullyConnected('fc0', l, 512, nl=tf.identity)
    l = PReLU('prelu', l)
    policy = FullyConnected('fc-pi', l, out_dim=NUM_ACTIONS, nl=tf.identity)
    return policy
def _get_DQN_prediction(self, image):
    # TODO: Do we need to add other pre-processing? e.g., subtract mean
    image = image / 255.0
    # TODO: The network structure can be improved?
    with argscope(Conv2D, nl=tf.nn.relu, use_bias=True):  # activation for each layer
        l = Conv2D('conv0', image, out_channel=32, kernel_shape=5)
        l = MaxPooling('pool0', l, 2)
        l = Conv2D('conv1', l, out_channel=32, kernel_shape=5)
        l = MaxPooling('pool1', l, 2)
        l = Conv2D('conv2', l, out_channel=64, kernel_shape=4)
        l = MaxPooling('pool2', l, 2)
        l = Conv2D('conv3', l, out_channel=64, kernel_shape=3)
        # the original arch
        # .Conv2D('conv0', image, out_channel=32, kernel_shape=8, stride=4)
        # .Conv2D('conv1', out_channel=64, kernel_shape=4, stride=2)
        # .Conv2D('conv2', out_channel=64, kernel_shape=3)
    l = FullyConnected('fc0', l, 512,
                       nl=lambda x, name: LeakyReLU.f(x, 0.01, name))
    l = FullyConnected('fct', l, NUM_ACTIONS, nl=tf.identity)
    return l
def shufflenet_unit_supervisor(l, out_channel, group, stride):
    in_shape = l.get_shape().as_list()
    in_channel = in_shape[1]
    shortcut = l
    # We do not apply group convolution on the first pointwise layer
    # because the number of input channels is relatively small.
    first_split = group if in_channel != 16 else 1
    l = Conv2D('conv1', l, out_channel // 4, kernel_shape=1,
               split=first_split, nl=BNReLU)
    l = channel_shuffle(l, group)
    l = DepthConv('dconv', l, out_channel // 4, kernel_shape=3,
                  nl=BN, stride=stride)
    l = Conv2D('conv2', l,
               out_channel if stride == 1 else out_channel - in_channel,
               kernel_shape=1, split=first_split, nl=BN)
    if stride == 1:     # unit (b)
        output = tf.nn.relu(shortcut + l)
    else:               # unit (c)
        shortcut = AvgPooling('avgpool', shortcut, 3, 2, padding='SAME')
        output = tf.concat([shortcut, tf.nn.relu(l)], axis=1)
    return output
def shufflenet_unit(l, out_channel, group, stride):
    in_shape = l.get_shape().as_list()
    in_channel = in_shape[1]
    shortcut = l
    first_split = group if in_channel != 24 else 1
    l = Conv2D('conv1', l, out_channel // 4, kernel_shape=1,
               split=first_split, nl=BNReLU)
    l = channel_shuffle(l, group)
    l = DepthConv('dconv', l, out_channel // 4, kernel_shape=3,
                  nl=BN, stride=stride)
    l = Conv2D('conv2', l,
               out_channel if stride == 1 else out_channel - in_channel,
               kernel_shape=1, split=group, nl=BN)
    if stride == 1:
        output = tf.nn.relu(shortcut + l)
    else:
        shortcut = AvgPooling('avgpool', shortcut, 3, 2, padding='SAME')
        output = tf.concat([shortcut, tf.nn.relu(l)], axis=1)
    return output
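# In the stride-2 branch above, 'conv2' emits out_channel - in_channel
# channels so that concatenating with the in_channel-wide pooled shortcut
# yields exactly out_channel. A quick check with the widths this file uses
# (24 channels entering the first 240-channel stage):
in_channel, out_channel = 24, 240
residual_channels = out_channel - in_channel  # 216 from conv2
shortcut_channels = in_channel                # 24 from the avg-pooled shortcut
assert residual_channels + shortcut_channels == out_channel  # 240 after concat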
def _get_NN_prediction_wrapped(self, image):
    with tf.device('/cpu:0'):
        with tf.variable_scope(tf.get_variable_scope(), reuse=None):
            image = image / 255.0
            relus = []
            self.layer_output_means = {}
            print(CHANNEL)
            dummy_channels = TARGET_CHANNELS - CHANNEL
            image = tf.concat(
                [image,
                 tf.zeros((tf.shape(image)[0], 84, 84, dummy_channels))], 3)
            with argscope(Conv2D, nl=tf.nn.relu):
                input_sum = tf.reduce_sum(tf.abs(image))
                l = Conv2D('conv0', image, out_channel=32, kernel_shape=5,
                           use_bias=False, padding='VALID',
                           parameter_device=self.device_function,
                           conv_init=args.conv_init)
                with tf.variable_scope('conv0', reuse=True):
                    self.vars_for_save['conv0/W'] = tf.get_variable('W')
                self.layer_output_means['conv0_out'] = tf.reduce_mean(l)
                relus.append(l)
                # l = tf.layers.batch_normalization(l)
                l = MaxPooling('pool0', l, 2)
                l = Conv2D('conv1', l, out_channel=32, kernel_shape=5,
                           use_bias=False, padding='VALID',
                           parameter_device=self.device_function,
                           conv_init=args.conv_init)
                with tf.variable_scope('conv1', reuse=True):
                    self.vars_for_save['conv1/W'] = tf.get_variable('W')
                self.layer_output_means['conv1_out'] = tf.reduce_mean(l)
                relus.append(l)
                # l = tf.Print(l, [l], message='conv1: ', summarize=30)
                l = MaxPooling('pool1', l, 2)
                l = Conv2D('conv2', l, out_channel=64, kernel_shape=5,
                           use_bias=False, padding='VALID',
                           parameter_device=self.device_function,
                           conv_init=args.conv_init)
                with tf.variable_scope('conv2', reuse=True):
                    self.vars_for_save['conv2/W'] = tf.get_variable('W')
                self.layer_output_means['conv2_out'] = tf.reduce_mean(l)
                relus.append(l)
                l = MaxPooling('pool2', l, 2)
                l = Conv2D('conv3', l, out_channel=64, kernel_shape=3,
                           use_bias=False, padding='VALID',
                           parameter_device=self.device_function,
                           conv_init=args.conv_init)
                with tf.variable_scope('conv3', reuse=True):
                    self.vars_for_save['conv3/W'] = tf.get_variable('W')
                self.layer_output_means['conv3_out'] = tf.reduce_mean(l)
                relus.append(l)

            # One fully connected layer, split into several tensors to spread
            # its parameters more evenly across the parameter servers (PS's).
            if args.replace_with_conv:
                fc_splits = []
                neurons = args.fc_neurons / args.fc_splits
                for i in range(args.fc_splits):
                    fc = Conv2D('fc1_{}'.format(i), l, out_channel=neurons,
                                kernel_shape=5, nl=tf.identity, padding='VALID',
                                parameter_device=self.device_function,
                                conv_init='uniform2', use_bias=False)
                    with tf.variable_scope('fc1_{}'.format(i), reuse=True):
                        self.vars_for_save['fc1_{}/W'.format(i)] = \
                            tf.get_variable('W')
                    fc = tf.reshape(tensor=fc, shape=[-1, neurons])
                    self.layer_output_means['fc1_{}_out'.format(i)] = \
                        tf.reduce_mean(fc)
                    fc_splits.append(fc)
                l = tf.concat(fc_splits, axis=1)
            else:
                fc = []
                for i in range(args.ps):
                    fc_part = FullyConnected('fc1_{}'.format(i), l,
                                             args.fc_neurons / args.ps,
                                             nl=tf.identity,
                                             parameter_device=self.device_function,
                                             fc_init=args.fc_init)
                    with tf.variable_scope('fc1_{}'.format(i), reuse=True):
                        self.vars_for_save['fc1_{}/W'.format(i)] = \
                            tf.get_variable('W')
                        self.vars_for_save['fc1_{}/b'.format(i)] = \
                            tf.get_variable('b')
                    self.layer_output_means['fc1_{}_out'.format(i)] = \
                        tf.reduce_mean(fc_part)
                    fc_part = tf.nn.relu(fc_part, 'relu1')
                    fc.append(fc_part)
                    relus.append(fc_part)
                l = tf.concat(fc, axis=1)

            with tf.variable_scope('fc1_0', reuse=True):
                val = tf.gather(tf.get_variable('W'), 0)
                val = tf.reshape(val, [-1])
                self.fc_fc0 = tf.gather(val, 0)

            policy = FullyConnected('fc-pi', l, out_dim=NUM_ACTIONS,
                                    nl=tf.identity,
                                    parameter_device=self.device_function,
                                    fc_init=args.fc_init)
            with tf.variable_scope('fc-pi', reuse=True):
                self.vars_for_save['fc-pi/W'] = tf.get_variable('W')
                self.vars_for_save['fc-pi/b'] = tf.get_variable('b')
            self.layer_output_means['fc_pi_out'] = tf.reduce_mean(policy)

            value = FullyConnected('fc-v', l, out_dim=1, nl=tf.identity,
                                   parameter_device=self.device_function,
                                   fc_init=args.fc_init)
            with tf.variable_scope('fc-v', reuse=True):
                self.vars_for_save['fc-v/W'] = tf.get_variable('W')
                self.vars_for_save['fc-v/b'] = tf.get_variable('b')
            self.layer_output_means['fc_v_out'] = tf.reduce_mean(value)
            with tf.variable_scope('fc-v', reuse=True):
                val = tf.gather(tf.get_variable('W'), 0)
                self.fc_value = tf.gather(val, 0)

            # number of ReLU gates that are not 0
            self.active_relus = tf.add_n(
                [tf.count_nonzero(r) for r in relus])
            return policy, value
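# Splitting 'fc1' into shards and concatenating, as above, is equivalent to
# one wide fully connected layer whose weight matrix is the column-wise
# concatenation of the shards (ReLU is elementwise, so it commutes with the
# concat). A small NumPy check (all names illustrative):
import numpy as np

x = np.random.randn(2, 8)                            # batch of inputs
w_parts = [np.random.randn(8, 4) for _ in range(3)]  # three weight shards

sharded = np.concatenate([x.dot(w) for w in w_parts], axis=1)  # per-shard FC, then concat
fused = x.dot(np.concatenate(w_parts, axis=1))                 # one wide FC
assert np.allclose(sharded, fused)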
def _add_forward_graph(self, student=0.5):
    """NN architecture."""
    self.image_input, self.input_mask, self.box_delta_input, \
        self.box_input, self.labels, self.mimic_mask, self.mimic_mask2 = \
        self.batch_data_queue.dequeue()

    def shufflenet_unit_supervisor(l, out_channel, group, stride):
        in_shape = l.get_shape().as_list()
        in_channel = in_shape[1]
        shortcut = l
        # We do not apply group convolution on the first pointwise layer
        # because the number of input channels is relatively small.
        first_split = group if in_channel != 16 else 1
        l = Conv2D('conv1', l, out_channel // 4, kernel_shape=1,
                   split=first_split, nl=BNReLU)
        l = channel_shuffle(l, group)
        l = DepthConv('dconv', l, out_channel // 4, kernel_shape=3,
                      nl=BN, stride=stride)
        l = Conv2D('conv2', l,
                   out_channel if stride == 1 else out_channel - in_channel,
                   kernel_shape=1, split=first_split, nl=BN)
        if stride == 1:     # unit (b)
            output = tf.nn.relu(shortcut + l)
        else:               # unit (c)
            shortcut = AvgPooling('avgpool', shortcut, 3, 2, padding='SAME')
            output = tf.concat([shortcut, tf.nn.relu(l)], axis=1)
        return output

    def shufflenet_unit_add_supervisor(l, out_channel, group, stride):
        in_shape = l.get_shape().as_list()
        in_channel = in_shape[1]
        shortcut = l
        # We do not apply group convolution on the first pointwise layer
        # because the number of input channels is relatively small.
        first_split = group if in_channel != 24 else 1
        l = Conv2D('conv1', l, out_channel // 4, kernel_shape=1,
                   split=first_split, nl=BNReLU)
        l = channel_shuffle(l, group)
        l = DepthConv('dconv', l, out_channel // 4, kernel_shape=3,
                      nl=BN, stride=stride)
        l = Conv2D('conv2', l, out_channel, kernel_shape=1,
                   split=first_split, nl=BN)
        output = tf.nn.relu(shortcut + l)
        return output

    def shufflenet_unit_no_shortcut_supervisor(l, out_channel, group, stride):
        in_shape = l.get_shape().as_list()
        in_channel = in_shape[1]
        # We do not apply group convolution on the first pointwise layer
        # because the number of input channels is relatively small.
        first_split = group if in_channel != 24 else 1
        l = Conv2D('conv1', l, out_channel // 4, kernel_shape=1,
                   split=first_split, nl=BNReLU)
        l = channel_shuffle(l, group)
        l = DepthConv('dconv', l, out_channel // 4, kernel_shape=3,
                      nl=BN, stride=stride)
        l = Conv2D('conv2', l, out_channel, kernel_shape=1,
                   split=first_split, nl=BN)
        output = tf.nn.relu(l)
        return output

    def shufflenet_unit(l, out_channel, group, stride):
        in_shape = l.get_shape().as_list()
        in_channel = in_shape[1]
        shortcut = l
        # We do not apply group convolution on the first pointwise layer
        # because the number of input channels is relatively small.
        first_split = group if in_channel != 24 else 1
        l = Conv2D('conv1', l, out_channel // 4, kernel_shape=1,
                   split=first_split, nl=c_BNReLU)
        l = channel_shuffle(l, group)
        l = DepthConv('dconv', l, out_channel // 4, kernel_shape=3,
                      nl=c_BN, stride=stride)
        l = Conv2D('conv2', l,
                   out_channel if stride == 1 else out_channel - in_channel,
                   kernel_shape=1, split=group, nl=c_BN)
        if stride == 1:     # unit (b)
            output = tf.nn.relu(shortcut + l)
        else:               # unit (c)
            shortcut = AvgPooling('avgpool', shortcut, 3, 2, padding='SAME')
            output = tf.concat([shortcut, tf.nn.relu(l)], axis=1)
        return output

    def shufflenet_unit_add(l, out_channel, group, stride):
        in_shape = l.get_shape().as_list()
        in_channel = in_shape[1]
        shortcut = l
        # We do not apply group convolution on the first pointwise layer
        # because the number of input channels is relatively small.
        first_split = group if in_channel != 24 else 1
        l = Conv2D('conv1', l, out_channel // 4, kernel_shape=1,
                   split=first_split, nl=c_BNReLU)
        l = channel_shuffle(l, group)
        l = DepthConv('dconv', l, out_channel // 4, kernel_shape=3,
                      nl=c_BN, stride=stride)
        l = Conv2D('conv2', l, out_channel, kernel_shape=1,
                   split=first_split, nl=c_BN)
        output = tf.nn.relu(shortcut + l)
        return output

    def shufflenet_unit_no_shortcut(l, out_channel, group, stride):
        in_shape = l.get_shape().as_list()
        in_channel = in_shape[1]
        # We do not apply group convolution on the first pointwise layer
        # because the number of input channels is relatively small.
        first_split = group if in_channel != 24 else 1
        l = Conv2D('conv1', l, out_channel // 4, kernel_shape=1,
                   split=first_split, nl=c_BNReLU)
        l = channel_shuffle(l, group)
        l = DepthConv('dconv', l, out_channel // 4, kernel_shape=3,
                      nl=c_BN, stride=stride)
        l = Conv2D('conv2', l, out_channel, kernel_shape=1,
                   split=first_split, nl=c_BN)
        output = tf.nn.relu(l)
        return output

    mc = self.mc
    # if mc.LOAD_PRETRAINED_MODEL:
    #     assert tf.gfile.Exists(mc.PRETRAINED_MODEL_PATH), \
    #         'Cannot find pretrained model at the given path:' \
    #         ' {}'.format(mc.PRETRAINED_MODEL_PATH)

    with argscope([Conv2D, MaxPooling, AvgPooling, GlobalAvgPooling, BatchNorm],
                  data_format='NCHW'), \
            argscope(Conv2D, use_bias=False):
        # Supervisor (teacher) network: full-width channels, inference only.
        with TowerContext(tf.get_default_graph().get_name_scope(),
                          is_training=False):
            with tf.variable_scope('shuffleDet_supervisor'):
                group = 3
                channels = [240, 480, 960]
                l = tf.transpose(self.image_input, [0, 3, 1, 2])
                l = Conv2D('conv1', l, 16, 3, stride=1, nl=BNReLU)
                l = MaxPooling('pool1', l, 3, 2, padding='SAME')
                with tf.variable_scope('group1'):
                    for i in range(4):
                        with tf.variable_scope('block{}'.format(i)):
                            l = shufflenet_unit_supervisor(
                                l, channels[0], group, 2 if i == 0 else 1)
                with tf.variable_scope('group2'):
                    for i in range(6):
                        with tf.variable_scope('block{}'.format(i)):
                            l = shufflenet_unit_supervisor(
                                l, channels[1], group, 2 if i == 0 else 1)
                with tf.variable_scope('group3'):
                    for i in range(4):
                        with tf.variable_scope('block{}'.format(i)):
                            l = shufflenet_unit_supervisor(
                                l, channels[2], group, 2 if i == 0 else 1)
                with tf.variable_scope('added3'):
                    with tf.variable_scope('block{}'.format(0)):
                        l = shufflenet_unit_add_supervisor(l, 960, 3, 1)
                    with tf.variable_scope('block{}'.format(1)):
                        l = shufflenet_unit_no_shortcut_supervisor(
                            l, 768, 3, 1)
                supervisor_last_feature = tf.transpose(l, [0, 2, 3, 1])
                self.inspect_last_feature = supervisor_last_feature

        # Student network, scaled by the `student` width multiplier.
        # The tower index is taken from the last character of the name scope.
        with argscope(
                c_batch_norm,
                is_main_training_tower=int(
                    tf.get_default_graph().get_name_scope()[-1]) == 0,
                data_format='NCHW'):
            with TowerContext(
                    tf.get_default_graph().get_name_scope(),
                    is_training=mc.IS_TRAINING,
                    index=int(tf.get_default_graph().get_name_scope()[-1])):
                # with TowerContext(tf.get_default_graph().get_name_scope(), is_training=mc.IS_TRAINING):
                group = 3
                # channels = [120, 240, 480]
                channels = [
                    int(240 * student), int(480 * student), int(960 * student)
                ]
                l = tf.transpose(self.image_input, [0, 3, 1, 2])
                l = Conv2D('conv1', l, 24, 3, stride=1, nl=c_BNReLU)
                l = MaxPooling('pool1', l, 3, 2, padding='SAME')
                with tf.variable_scope('group1'):
                    for i in range(4):
                        with tf.variable_scope('block{}'.format(i)):
                            l = shufflenet_unit(l, channels[0], group,
                                                2 if i == 0 else 1)
                with tf.variable_scope('group2'):
                    for i in range(6):
                        with tf.variable_scope('block{}'.format(i)):
                            l = shufflenet_unit(l, channels[1], group,
                                                2 if i == 0 else 1)
                with tf.variable_scope('group3'):
                    for i in range(4):
                        with tf.variable_scope('block{}'.format(i)):
                            l = shufflenet_unit(l, channels[2], group,
                                                2 if i == 0 else 1)
                with tf.variable_scope('added3'):
                    with tf.variable_scope('block{}'.format(0)):
                        l = shufflenet_unit_add(l, int(960 * student), 3, 1)
                    with tf.variable_scope('block{}'.format(1)):
                        l = shufflenet_unit_no_shortcut(
                            l, int(768 * student), 3, 1)  # 768, 384, 192
                l = tf.transpose(l, [0, 2, 3, 1])

                with tf.variable_scope('adaptation'):
                    # Project student features up to the supervisor's 768 channels.
                    student_adap = self._conv_layer_no_pretrain(
                        'conv', l, filters=768, size=3, stride=1,
                        padding='SAME', xavier=False, relu=True, stddev=0.0001)
                    # student_adap = Conv2D('conv', l, 768, 3, data_format='channels_last', nl=RELU)

                # add for mimic
                with tf.variable_scope('mimic_loss'):
                    mimic_mask = tf.cast(
                        tf.expand_dims(self.mimic_mask, axis=-1), tf.float32)
                    # this normalization is maybe too harsh
                    # mask mimic
                    if student == 0.5:
                        normalization = tf.reduce_sum(mimic_mask) * 2.
                    else:
                        normalization = tf.reduce_sum(mimic_mask) * 4.
                    self.mimic_loss = tf.div(
                        tf.reduce_sum(
                            tf.square(supervisor_last_feature - student_adap) *
                            mimic_mask),
                        normalization)
                    if self.without_imitation:
                        self.mimic_loss = self.mimic_loss * 0.
                    tf.add_to_collection('losses', self.mimic_loss)

                dropout11 = tf.nn.dropout(l, self.keep_prob, name='drop11')
                num_output = mc.ANCHOR_PER_GRID * (mc.CLASSES + 1 + 4)
                self.preds = self._conv_layer_no_pretrain(
                    'conv12', dropout11, filters=num_output, size=3, stride=1,
                    padding='SAME', xavier=False, relu=False, stddev=0.0001)
def _add_forward_graph(self, student=0.5):
    """NN architecture."""

    def shufflenet_unit(l, out_channel, group, stride):
        in_shape = l.get_shape().as_list()
        in_channel = in_shape[1]
        shortcut = l
        first_split = group if in_channel != 24 else 1
        l = Conv2D('conv1', l, out_channel // 4, kernel_shape=1,
                   split=first_split, nl=BNReLU)
        l = channel_shuffle(l, group)
        l = DepthConv('dconv', l, out_channel // 4, kernel_shape=3,
                      nl=BN, stride=stride)
        l = Conv2D('conv2', l,
                   out_channel if stride == 1 else out_channel - in_channel,
                   kernel_shape=1, split=group, nl=BN)
        if stride == 1:
            output = tf.nn.relu(shortcut + l)
        else:
            shortcut = AvgPooling('avgpool', shortcut, 3, 2, padding='SAME')
            output = tf.concat([shortcut, tf.nn.relu(l)], axis=1)
        return output

    def shufflenet_unit_add(l, out_channel, group, stride):
        in_shape = l.get_shape().as_list()
        in_channel = in_shape[1]
        shortcut = l
        first_split = group if in_channel != 24 else 1
        l = Conv2D('conv1', l, out_channel // 4, kernel_shape=1,
                   split=first_split, nl=BNReLU)
        l = channel_shuffle(l, group)
        l = DepthConv('dconv', l, out_channel // 4, kernel_shape=3,
                      nl=BN, stride=stride)
        l = Conv2D('conv2', l, out_channel, kernel_shape=1,
                   split=first_split, nl=BN)
        output = tf.nn.relu(shortcut + l)
        return output

    def shufflenet_unit_no_shortcut(l, out_channel, group, stride):
        in_shape = l.get_shape().as_list()
        in_channel = in_shape[1]
        first_split = group if in_channel != 24 else 1
        l = Conv2D('conv1', l, out_channel // 4, kernel_shape=1,
                   split=first_split, nl=BNReLU)
        l = channel_shuffle(l, group)
        l = DepthConv('dconv', l, out_channel // 4, kernel_shape=3,
                      nl=BN, stride=stride)
        l = Conv2D('conv2', l, out_channel, kernel_shape=1,
                   split=first_split, nl=BN)
        output = tf.nn.relu(l)
        return output

    mc = self.mc
    with argscope([Conv2D, MaxPooling, AvgPooling, GlobalAvgPooling, BatchNorm],
                  data_format='NCHW'), \
            argscope(Conv2D, use_bias=False):
        with TowerContext('', is_training=mc.IS_TRAINING):
            group = 3
            channels = [
                int(240 * student), int(480 * student), int(960 * student)
            ]
            l = tf.transpose(self.image_input, [0, 3, 1, 2])
            l = Conv2D('conv1', l, 24, 3, stride=1, nl=BNReLU)
            l = MaxPooling('pool1', l, 3, 2, padding='SAME')
            with tf.variable_scope('group1'):
                for i in range(4):
                    with tf.variable_scope('block{}'.format(i)):
                        l = shufflenet_unit(l, channels[0], group,
                                            2 if i == 0 else 1)
            with tf.variable_scope('group2'):
                for i in range(6):
                    with tf.variable_scope('block{}'.format(i)):
                        l = shufflenet_unit(l, channels[1], group,
                                            2 if i == 0 else 1)
            with tf.variable_scope('group3'):
                for i in range(4):
                    with tf.variable_scope('block{}'.format(i)):
                        l = shufflenet_unit(l, channels[2], group,
                                            2 if i == 0 else 1)
            with tf.variable_scope('added3'):
                with tf.variable_scope('block{}'.format(0)):
                    l = shufflenet_unit_add(l, int(960 * student), 3, 1)
                with tf.variable_scope('block{}'.format(1)):
                    l = shufflenet_unit_no_shortcut(
                        l, int(768 * student), 3, 1)  # 768, 384, 192
            l = tf.transpose(l, [0, 2, 3, 1])
            dropout11 = tf.nn.dropout(l, self.keep_prob, name='drop11')
            num_output = mc.ANCHOR_PER_GRID * (mc.CLASSES + 1 + 4)
            self.preds = self._conv_layer_no_pretrain(
                'conv12', dropout11, filters=num_output, size=3, stride=1,
                padding='SAME', xavier=False, relu=False, stddev=0.0001)
def _add_forward_graph(self):
    """NN architecture."""

    def shufflenet_unit_supervisor(l, out_channel, group, stride):
        in_shape = l.get_shape().as_list()
        in_channel = in_shape[1]
        shortcut = l
        # We do not apply group convolution on the first pointwise layer
        # because the number of input channels is relatively small.
        first_split = group if in_channel != 16 else 1
        l = Conv2D('conv1', l, out_channel // 4, kernel_shape=1,
                   split=first_split, nl=BNReLU)
        l = channel_shuffle(l, group)
        l = DepthConv('dconv', l, out_channel // 4, kernel_shape=3,
                      nl=BN, stride=stride)
        l = Conv2D('conv2', l,
                   out_channel if stride == 1 else out_channel - in_channel,
                   kernel_shape=1, split=first_split, nl=BN)
        if stride == 1:     # unit (b)
            output = tf.nn.relu(shortcut + l)
        else:               # unit (c)
            shortcut = AvgPooling('avgpool', shortcut, 3, 2, padding='SAME')
            output = tf.concat([shortcut, tf.nn.relu(l)], axis=1)
        return output

    def shufflenet_unit_add_supervisor(l, out_channel, group, stride):
        in_shape = l.get_shape().as_list()
        in_channel = in_shape[1]
        shortcut = l
        # We do not apply group convolution on the first pointwise layer
        # because the number of input channels is relatively small.
        first_split = group if in_channel != 24 else 1
        l = Conv2D('conv1', l, out_channel // 4, kernel_shape=1,
                   split=first_split, nl=BNReLU)
        l = channel_shuffle(l, group)
        l = DepthConv('dconv', l, out_channel // 4, kernel_shape=3,
                      nl=BN, stride=stride)
        l = Conv2D('conv2', l, out_channel, kernel_shape=1,
                   split=first_split, nl=BN)
        output = tf.nn.relu(shortcut + l)
        return output

    def shufflenet_unit_no_shortcut_supervisor(l, out_channel, group, stride):
        in_shape = l.get_shape().as_list()
        in_channel = in_shape[1]
        # We do not apply group convolution on the first pointwise layer
        # because the number of input channels is relatively small.
        first_split = group if in_channel != 24 else 1
        l = Conv2D('conv1', l, out_channel // 4, kernel_shape=1,
                   split=first_split, nl=BNReLU)
        l = channel_shuffle(l, group)
        l = DepthConv('dconv', l, out_channel // 4, kernel_shape=3,
                      nl=BN, stride=stride)
        l = Conv2D('conv2', l, out_channel, kernel_shape=1,
                   split=first_split, nl=BN)
        output = tf.nn.relu(l)
        return output

    def shufflenet_unit(l, out_channel, group, stride):
        in_shape = l.get_shape().as_list()
        in_channel = in_shape[1]
        shortcut = l
        # We do not apply group convolution on the first pointwise layer
        # because the number of input channels is relatively small.
        first_split = group if in_channel != 24 else 1
        l = Conv2D('conv1', l, out_channel // 4, kernel_shape=1,
                   split=first_split, nl=c_BNReLU)
        l = channel_shuffle(l, group)
        l = DepthConv('dconv', l, out_channel // 4, kernel_shape=3,
                      nl=c_BN, stride=stride)
        l = Conv2D('conv2', l,
                   out_channel if stride == 1 else out_channel - in_channel,
                   kernel_shape=1, split=group, nl=c_BN)
        if stride == 1:     # unit (b)
            output = tf.nn.relu(shortcut + l)
        else:               # unit (c)
            shortcut = AvgPooling('avgpool', shortcut, 3, 2, padding='SAME')
            output = tf.concat([shortcut, tf.nn.relu(l)], axis=1)
        return output

    def shufflenet_unit_add(l, out_channel, group, stride):
        in_shape = l.get_shape().as_list()
        in_channel = in_shape[1]
        shortcut = l
        # We do not apply group convolution on the first pointwise layer
        # because the number of input channels is relatively small.
        first_split = group if in_channel != 24 else 1
        l = Conv2D('conv1', l, out_channel // 4, kernel_shape=1,
                   split=first_split, nl=c_BNReLU)
        l = channel_shuffle(l, group)
        l = DepthConv('dconv', l, out_channel // 4, kernel_shape=3,
                      nl=c_BN, stride=stride)
        l = Conv2D('conv2', l, out_channel, kernel_shape=1,
                   split=first_split, nl=c_BN)
        output = tf.nn.relu(shortcut + l)
        return output

    def shufflenet_unit_no_shortcut(l, out_channel, group, stride):
        in_shape = l.get_shape().as_list()
        in_channel = in_shape[1]
        # We do not apply group convolution on the first pointwise layer
        # because the number of input channels is relatively small.
        first_split = group if in_channel != 24 else 1
        l = Conv2D('conv1', l, out_channel // 4, kernel_shape=1,
                   split=first_split, nl=c_BNReLU)
        l = channel_shuffle(l, group)
        l = DepthConv('dconv', l, out_channel // 4, kernel_shape=3,
                      nl=c_BN, stride=stride)
        l = Conv2D('conv2', l, out_channel, kernel_shape=1,
                   split=first_split, nl=c_BN)
        output = tf.nn.relu(l)
        return output

    mc = self.mc
    with argscope([Conv2D, MaxPooling, AvgPooling, GlobalAvgPooling, BatchNorm],
                  data_format='NCHW'), \
            argscope(Conv2D, use_bias=False):
        with TowerContext('', is_training=mc.IS_TRAINING):
            group = 3
            channels = [240, 480, 960]
            l = tf.transpose(self.image_input, [0, 3, 1, 2])
            l = Conv2D('conv1', l, 16, 3, stride=1, nl=BNReLU)
            l = MaxPooling('pool1', l, 3, 2, padding='SAME')
            with tf.variable_scope('group1'):
                for i in range(4):
                    with tf.variable_scope('block{}'.format(i)):
                        l = shufflenet_unit_supervisor(
                            l, channels[0], group, 2 if i == 0 else 1)
            with tf.variable_scope('group2'):
                for i in range(6):
                    with tf.variable_scope('block{}'.format(i)):
                        l = shufflenet_unit_supervisor(
                            l, channels[1], group, 2 if i == 0 else 1)
            with tf.variable_scope('group3'):
                for i in range(4):
                    with tf.variable_scope('block{}'.format(i)):
                        l = shufflenet_unit_supervisor(
                            l, channels[2], group, 2 if i == 0 else 1)
            with tf.variable_scope('added3'):
                with tf.variable_scope('block{}'.format(0)):
                    l = shufflenet_unit_add_supervisor(l, 960, 3, 1)
                with tf.variable_scope('block{}'.format(1)):
                    l = shufflenet_unit_no_shortcut_supervisor(l, 768, 3, 1)
            l = tf.transpose(l, [0, 2, 3, 1])
            dropout11 = tf.nn.dropout(l, self.keep_prob, name='drop11')
            num_output = mc.ANCHOR_PER_GRID * (mc.CLASSES + 1 + 4)
            # modify for shuffleunit det head
            self.preds = self._conv_layer_no_pretrain(
                'conv12', dropout11, filters=num_output, size=3, stride=1,
                padding='SAME', xavier=False, relu=False, stddev=0.0001)