def build_model(self, dev):
    with tf.variable_scope(self.name), tf.device(dev):
        ## inputs of networks
        self.screen = tf.placeholder(
            tf.float32, [None, PP.screen_channel(), self.ssize, self.ssize], name='screen')

        ## build networks
        net = build_net(self.screen)
        self.spatial_action, self.value = net

        ## targets & masks
        self.valid_spatial_action = tf.placeholder(
            tf.float32, [None], name='valid_spatial_action')
        self.spatial_action_selected = tf.placeholder(
            tf.float32, [None, self.ssize**2], name='spatial_action_selected')
        self.value_target = tf.placeholder(tf.float32, [None], name='value_target')

        ## compute log probability of the selected spatial action
        spatial_action_prob = tf.reduce_sum(self.spatial_action * self.spatial_action_selected, axis=1)
        spatial_action_log_prob = tf.log(tf.clip_by_value(spatial_action_prob, 1e-10, 1.))

        ## policy loss & value loss
        action_log_prob = self.valid_spatial_action * spatial_action_log_prob
        advantage = tf.stop_gradient(self.value_target - self.value)
        policy_loss = -tf.reduce_mean(action_log_prob * advantage)
        value_loss = -tf.reduce_mean(self.value * advantage)
        self.summary.append(tf.summary.scalar('policy_loss', policy_loss))
        self.summary.append(tf.summary.scalar('value_loss', value_loss))
        loss = policy_loss + value_loss

        ## RMSProp optimizer with per-variable gradient clipping
        self.learning_rate = tf.placeholder(tf.float32, None, name='learning_rate')
        opt = tf.train.RMSPropOptimizer(self.learning_rate, decay=0.99, epsilon=1e-10)
        grads = opt.compute_gradients(loss)
        clipped_grad = []
        for grad, var in grads:
            self.summary.append(tf.summary.histogram(var.op.name, var))
            self.summary.append(tf.summary.histogram(var.op.name + '/grad', grad))
            grad = tf.clip_by_norm(grad, 10.0)
            clipped_grad.append([grad, var])
        self.train_op = opt.apply_gradients(clipped_grad)
        self.summary_op = tf.summary.merge(self.summary)

        self.saver = tf.train.Saver(max_to_keep=100)
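# Example (hedged sketch, not part of the model code above): feeding one update step into
# this graph. `agent` is an object on which build_model has been run; `sess`, `screen_obs`,
# `x`, `y`, `R`, `lr`, `step`, and `summary_writer` are illustrative assumptions.
import numpy as np

spatial_one_hot = np.zeros((1, agent.ssize ** 2), dtype=np.float32)
spatial_one_hot[0, y * agent.ssize + x] = 1.0  # chosen screen pixel, row-major (ordering assumed)

feed = {
    agent.screen: screen_obs[np.newaxis],                 # [1, C, ssize, ssize] float32
    agent.valid_spatial_action: np.ones(1, np.float32),   # this step used a spatial argument
    agent.spatial_action_selected: spatial_one_hot,
    agent.value_target: np.array([R], np.float32),        # discounted return for this step
    agent.learning_rate: lr,
}
_, summary = sess.run([agent.train_op, agent.summary_op], feed_dict=feed)
summary_writer.add_summary(summary, step)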
import numpy as np
import torch
import torch.optim as optim
import torch.utils.data as data


def detection_collate(batch):
    # Custom collate_fn: stack the image tensors, but keep the per-image annotation
    # arrays in a list, since their first dimension (number of boxes) differs per image.
    imgs = []
    targets = []
    for sample in batch:
        for tup in sample:
            if torch.is_tensor(tup):
                imgs.append(tup)
            elif isinstance(tup, type(np.empty(0))):
                annos = torch.from_numpy(tup).float()
                targets.append(annos)
    return (torch.stack(imgs, 0), targets)


USE_PRE_TRAIN = False
pre_train_path = ''
IS_TRAIN = True
MAX_ITER = 15000
BATCH = 3

ANCHORS = gen_anchors()
load_data = VOC_load()
net = build_net('train', 304)
Loss_function = MultiBoxLoss()

if USE_PRE_TRAIN:
    net.base.load_state_dict(torch.load(pre_train_path))

op = optim.SGD(net.parameters(), lr=4e-3, momentum=0.9, weight_decay=5e-4)

# or DATA.tensordata(x=, y=)
# collate_fn: the function that assembles a batch. The default collate could almost be used
# here, but the first dimension of each target (number of boxes per image) is not fixed,
# so it raises an error unless collate_fn is redefined.
train_data = data.DataLoader(load_data, BATCH, True, collate_fn=detection_collate)

for i in range(len(load_data)):
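# Example (hedged sketch, separate from the unfinished loop above): how one pass over
# train_data is typically wired to the SGD optimizer. The MultiBoxLoss call signature,
# the shape of net's output, and the loss composition are assumptions, not taken from
# this code.
for imgs, targets in train_data:
    predictions = net(imgs)                               # forward pass; output format assumed
    loss_l, loss_c = Loss_function(predictions, targets)  # assumed (localization, confidence) pair
    loss = loss_l + loss_c
    op.zero_grad()
    loss.backward()
    op.step()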
def build_model(self, reuse, dev, ntype):
    with tf.variable_scope(self.name), tf.device(dev):
        if reuse:
            tf.get_variable_scope().reuse_variables()
            assert tf.get_variable_scope().reuse

        # Set inputs of networks
        # minimap: feature minimap in observation
        # screen : feature screen in observation
        # info   : available actions for the current observation
        self.minimap = tf.placeholder(
            tf.float32, [None, U.minimap_channel(), self.msize, self.msize], name='minimap')
        self.screen = tf.placeholder(
            tf.float32, [None, U.screen_channel(), self.ssize, self.ssize], name='screen')
        self.info = tf.placeholder(tf.float32, [None, self.isize], name='info')

        # Build networks
        net = build_net(self.minimap, self.screen, self.info, self.msize, self.ssize, self.isize, ntype)
        # Net outputs
        self.spatial_action, self.non_spatial_action, self.value = net

        # Set targets and masks
        self.valid_spatial_action = tf.placeholder(
            tf.float32, [None], name='valid_spatial_action')
        self.spatial_action_selected = tf.placeholder(
            tf.float32, [None, self.ssize**2], name='spatial_action_selected')
        self.valid_non_spatial_action = tf.placeholder(
            tf.float32, [None, len(actions.FUNCTIONS)], name='valid_non_spatial_action')
        self.non_spatial_action_selected = tf.placeholder(
            tf.float32, [None, len(actions.FUNCTIONS)], name='non_spatial_action_selected')
        self.value_target = tf.placeholder(tf.float32, [None], name='value_target')

        # Compute log probabilities of the selected actions
        spatial_action_prob = tf.reduce_sum(self.spatial_action * self.spatial_action_selected, axis=1)
        spatial_action_log_prob = tf.log(tf.clip_by_value(spatial_action_prob, 1e-10, 1.))
        non_spatial_action_prob = tf.reduce_sum(
            self.non_spatial_action * self.non_spatial_action_selected, axis=1)
        valid_non_spatial_action_prob = tf.reduce_sum(
            self.non_spatial_action * self.valid_non_spatial_action, axis=1)
        valid_non_spatial_action_prob = tf.clip_by_value(valid_non_spatial_action_prob, 1e-10, 1.)
        non_spatial_action_prob = non_spatial_action_prob / valid_non_spatial_action_prob
        non_spatial_action_log_prob = tf.log(tf.clip_by_value(non_spatial_action_prob, 1e-10, 1.))
        self.summary.append(tf.summary.histogram('spatial_action_prob', spatial_action_prob))
        self.summary.append(tf.summary.histogram('non_spatial_action_prob', non_spatial_action_prob))

        # Compute losses, more details in https://arxiv.org/abs/1602.01783
        # Policy loss and value loss
        action_log_prob = self.valid_spatial_action * spatial_action_log_prob + non_spatial_action_log_prob
        advantage = tf.stop_gradient(self.value_target - self.value)
        policy_loss = -tf.reduce_mean(action_log_prob * advantage)
        value_loss = -tf.reduce_mean(self.value * advantage)
        self.summary.append(tf.summary.scalar('policy_loss', policy_loss))
        self.summary.append(tf.summary.scalar('value_loss', value_loss))

        # TODO: policy penalty
        loss = policy_loss + value_loss

        # Build the optimizer with per-variable gradient clipping
        self.learning_rate = tf.placeholder(tf.float32, None, name='learning_rate')
        opt = tf.train.RMSPropOptimizer(self.learning_rate, decay=0.99, epsilon=1e-10)
        grads = opt.compute_gradients(loss)
        clipped_grad = []
        for grad, var in grads:
            self.summary.append(tf.summary.histogram(var.op.name, var))
            self.summary.append(tf.summary.histogram(var.op.name + '/grad', grad))
            grad = tf.clip_by_norm(grad, 10.0)
            clipped_grad.append([grad, var])
        self.train_op = opt.apply_gradients(clipped_grad)
        self.summary_op = tf.summary.merge(self.summary)

        self.saver = tf.train.Saver(max_to_keep=100)
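# Example (hedged sketch, not part of the model code above): filling the valid/selected
# masks defined by the placeholders for one recorded step. `agent`, `act_id`, `x`, `y`,
# and `available_ids` are illustrative assumptions.
import numpy as np
from pysc2.lib import actions

n_funcs = len(actions.FUNCTIONS)
valid_non_spatial = np.zeros((1, n_funcs), dtype=np.float32)
valid_non_spatial[0, available_ids] = 1.0       # mask of actions available in this observation

non_spatial_selected = np.zeros((1, n_funcs), dtype=np.float32)
non_spatial_selected[0, act_id] = 1.0           # one-hot of the executed function id

spatial_selected = np.zeros((1, agent.ssize ** 2), dtype=np.float32)
spatial_selected[0, y * agent.ssize + x] = 1.0  # chosen screen pixel, row-major (ordering assumed)
valid_spatial = np.ones(1, dtype=np.float32)    # 1.0 only if the action took a screen argument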