# Assumed imports for this module (hedged): beyond the standard libraries
# below, the class relies on module-local helpers as in baselines.ddpg
# (RunningMeanStd, normalize, denormalize, get_target_updates,
# get_perturbed_actor_updates, reduce_std, logger, tf_util as U) and an
# MpiAdam-style `Adam` optimizer that is built from a var_list and updated
# with a flat gradient and a stepsize.
from copy import copy
from functools import reduce

import numpy as np
import tensorflow as tf
import tensorflow.contrib as tc


class DDPG(object):
    def __init__(self, actor, critic, memory, observation_shape, action_shape, param_noise=None, action_noise=None,
                 gamma=0.99, tau=0.001, normalize_returns=False, enable_popart=False, normalize_observations=True,
                 batch_size=128, observation_range=(-5., 5.), action_range=(-1., 1.), return_range=(-np.inf, np.inf),
                 critic_l2_reg=0., actor_lr=1e-4, critic_lr=1e-3, clip_norm=None, reward_scale=1.):
        # Inputs.
        self.obs0 = tf.placeholder(tf.float32, shape=(None,) + observation_shape, name='obs0')
        self.obs1 = tf.placeholder(tf.float32, shape=(None,) + observation_shape, name='obs1')
        self.terminals1 = tf.placeholder(tf.float32, shape=(None, 1), name='terminals1')
        self.rewards = tf.placeholder(tf.float32, shape=(None, 1), name='rewards')
        self.actions = tf.placeholder(tf.float32, shape=(None,) + action_shape, name='actions')
        self.critic_target = tf.placeholder(tf.float32, shape=(None, 1), name='critic_target')
        self.param_noise_stddev = tf.placeholder(tf.float32, shape=(), name='param_noise_stddev')

        # Parameters.
        self.gamma = gamma
        self.tau = tau
        self.memory = memory
        self.normalize_observations = normalize_observations
        self.normalize_returns = normalize_returns
        self.action_noise = action_noise
        self.param_noise = param_noise
        self.action_range = action_range
        self.return_range = return_range
        self.observation_range = observation_range
        self.critic = critic
        self.actor = actor
        self.actor_lr = actor_lr
        self.critic_lr = critic_lr
        self.clip_norm = clip_norm
        self.enable_popart = enable_popart
        self.reward_scale = reward_scale
        self.batch_size = batch_size
        self.stats_sample = None
        self.critic_l2_reg = critic_l2_reg

        # Observation normalization.
        if self.normalize_observations:
            with tf.variable_scope('obs_rms'):
                self.obs_rms = RunningMeanStd(shape=observation_shape)
        else:
            self.obs_rms = None
        normalized_obs0 = tf.clip_by_value(normalize(self.obs0, self.obs_rms),
                                           self.observation_range[0], self.observation_range[1])
        normalized_obs1 = tf.clip_by_value(normalize(self.obs1, self.obs_rms),
                                           self.observation_range[0], self.observation_range[1])

        # Return normalization.
        if self.normalize_returns:
            with tf.variable_scope('ret_rms'):
                self.ret_rms = RunningMeanStd()
        else:
            self.ret_rms = None

        # Create target networks.
        target_actor = copy(actor)
        target_actor.name = 'target_actor'
        self.target_actor = target_actor
        target_critic = copy(critic)
        target_critic.name = 'target_critic'
        self.target_critic = target_critic

        # Create networks and core TF parts that are shared across setup parts.
        self.actor_tf = actor(normalized_obs0)
        self.normalized_critic_tf = critic(normalized_obs0, self.actions)
        self.critic_tf = denormalize(
            tf.clip_by_value(self.normalized_critic_tf, self.return_range[0], self.return_range[1]), self.ret_rms)
        self.normalized_critic_with_actor_tf = critic(normalized_obs0, self.actor_tf, reuse=True)
        self.critic_with_actor_tf = denormalize(
            tf.clip_by_value(self.normalized_critic_with_actor_tf, self.return_range[0], self.return_range[1]),
            self.ret_rms)
        Q_obs1 = denormalize(target_critic(normalized_obs1, target_actor(normalized_obs1)), self.ret_rms)
        self.target_Q = self.rewards + (1. - self.terminals1) * gamma * Q_obs1

        # Set up parts.
        if self.param_noise is not None:
            self.setup_param_noise(normalized_obs0)
        self.setup_actor_optimizer()
        self.setup_critic_optimizer()
        if self.normalize_returns and self.enable_popart:
            self.setup_popart()
        self.setup_stats()
        self.setup_target_network_updates()

    def setup_target_network_updates(self):
        actor_init_updates, actor_soft_updates = get_target_updates(self.actor.vars, self.target_actor.vars, self.tau)
        critic_init_updates, critic_soft_updates = get_target_updates(self.critic.vars, self.target_critic.vars, self.tau)
        self.target_init_updates = [actor_init_updates, critic_init_updates]
        self.target_soft_updates = [actor_soft_updates, critic_soft_updates]

    def setup_param_noise(self, normalized_obs0):
        assert self.param_noise is not None

        # Configure perturbed actor.
        param_noise_actor = copy(self.actor)
        param_noise_actor.name = 'param_noise_actor'
        self.perturbed_actor_tf = param_noise_actor(normalized_obs0)
        logger.info('setting up param noise')
        self.perturb_policy_ops = get_perturbed_actor_updates(self.actor, param_noise_actor, self.param_noise_stddev)

        # Configure separate copy for stddev adaptation.
        adaptive_param_noise_actor = copy(self.actor)
        adaptive_param_noise_actor.name = 'adaptive_param_noise_actor'
        adaptive_actor_tf = adaptive_param_noise_actor(normalized_obs0)
        self.perturb_adaptive_policy_ops = get_perturbed_actor_updates(self.actor, adaptive_param_noise_actor, self.param_noise_stddev)
        self.adaptive_policy_distance = tf.sqrt(tf.reduce_mean(tf.square(self.actor_tf - adaptive_actor_tf)))

    def setup_actor_optimizer(self):
        logger.info('setting up actor optimizer')
        self.actor_loss = -tf.reduce_mean(self.critic_with_actor_tf)
        actor_shapes = [var.get_shape().as_list() for var in self.actor.trainable_vars]
        actor_nb_params = sum([reduce(lambda x, y: x * y, shape) for shape in actor_shapes])
        logger.info('  actor shapes: {}'.format(actor_shapes))
        logger.info('  actor params: {}'.format(actor_nb_params))
        self.actor_grads = U.flatgrad(self.actor_loss, self.actor.trainable_vars, clip_norm=self.clip_norm)
        self.actor_optimizer = Adam(var_list=self.actor.trainable_vars, beta1=0.9, beta2=0.999, epsilon=1e-08)

    def setup_critic_optimizer(self):
        logger.info('setting up critic optimizer')
        normalized_critic_target_tf = tf.clip_by_value(normalize(self.critic_target, self.ret_rms),
                                                       self.return_range[0], self.return_range[1])
        self.critic_loss = tf.reduce_mean(tf.square(self.normalized_critic_tf - normalized_critic_target_tf))
        if self.critic_l2_reg > 0.:
            critic_reg_vars = [var for var in self.critic.trainable_vars if 'kernel' in var.name and 'output' not in var.name]
            for var in critic_reg_vars:
                logger.info('  regularizing: {}'.format(var.name))
            logger.info('  applying l2 regularization with {}'.format(self.critic_l2_reg))
            critic_reg = tc.layers.apply_regularization(
                tc.layers.l2_regularizer(self.critic_l2_reg),
                weights_list=critic_reg_vars
            )
            self.critic_loss += critic_reg
        critic_shapes = [var.get_shape().as_list() for var in self.critic.trainable_vars]
        critic_nb_params = sum([reduce(lambda x, y: x * y, shape) for shape in critic_shapes])
        logger.info('  critic shapes: {}'.format(critic_shapes))
        logger.info('  critic params: {}'.format(critic_nb_params))
        self.critic_grads = U.flatgrad(self.critic_loss, self.critic.trainable_vars, clip_norm=self.clip_norm)
        self.critic_optimizer = Adam(var_list=self.critic.trainable_vars, beta1=0.9, beta2=0.999, epsilon=1e-08)

    def setup_popart(self):
        # See https://arxiv.org/pdf/1602.07714.pdf for details.
        self.old_std = tf.placeholder(tf.float32, shape=[1], name='old_std')
        new_std = self.ret_rms.std
        self.old_mean = tf.placeholder(tf.float32, shape=[1], name='old_mean')
        new_mean = self.ret_rms.mean

        self.renormalize_Q_outputs_op = []
        for vs in [self.critic.output_vars, self.target_critic.output_vars]:
            assert len(vs) == 2
            M, b = vs
            assert 'kernel' in M.name
            assert 'bias' in b.name
            assert M.get_shape()[-1] == 1
            assert b.get_shape()[-1] == 1
            self.renormalize_Q_outputs_op += [M.assign(M * self.old_std / new_std)]
            self.renormalize_Q_outputs_op += [b.assign((b * self.old_std + self.old_mean - new_mean) / new_std)]

    def setup_stats(self):
        ops = []
        names = []

        if self.normalize_returns:
            ops += [self.ret_rms.mean, self.ret_rms.std]
            names += ['ret_rms_mean', 'ret_rms_std']

        if self.normalize_observations:
            ops += [tf.reduce_mean(self.obs_rms.mean), tf.reduce_mean(self.obs_rms.std)]
            names += ['obs_rms_mean', 'obs_rms_std']

        ops += [tf.reduce_mean(self.critic_tf)]
        names += ['reference_Q_mean']
        ops += [reduce_std(self.critic_tf)]
        names += ['reference_Q_std']

        ops += [tf.reduce_mean(self.critic_with_actor_tf)]
        names += ['reference_actor_Q_mean']
        ops += [reduce_std(self.critic_with_actor_tf)]
        names += ['reference_actor_Q_std']

        ops += [tf.reduce_mean(self.actor_tf)]
        names += ['reference_action_mean']
        ops += [reduce_std(self.actor_tf)]
        names += ['reference_action_std']

        if self.param_noise:
            ops += [tf.reduce_mean(self.perturbed_actor_tf)]
            names += ['reference_perturbed_action_mean']
            ops += [reduce_std(self.perturbed_actor_tf)]
            names += ['reference_perturbed_action_std']

        self.stats_ops = ops
        self.stats_names = names

    def pi(self, obs, apply_noise=True, compute_Q=True):
        if self.param_noise is not None and apply_noise:
            actor_tf = self.perturbed_actor_tf
        else:
            actor_tf = self.actor_tf
        feed_dict = {self.obs0: [obs]}
        if compute_Q:
            action, q = self.sess.run([actor_tf, self.critic_with_actor_tf], feed_dict=feed_dict)
        else:
            action = self.sess.run(actor_tf, feed_dict=feed_dict)
            q = None
        action = action.flatten()
        if self.action_noise is not None and apply_noise:
            noise = self.action_noise()
            assert noise.shape == action.shape
            action += noise
        action = np.clip(action, self.action_range[0], self.action_range[1])
        return action, q

    def store_transition(self, obs0, action, reward, obs1, terminal1):
        reward *= self.reward_scale
        self.memory.append(obs0, action, reward, obs1, terminal1)
        if self.normalize_observations:
            self.obs_rms.update(np.array([obs0]))

    def train(self):
        # Get a batch.
        batch = self.memory.sample(batch_size=self.batch_size)

        if self.normalize_returns and self.enable_popart:
            old_mean, old_std, target_Q = self.sess.run([self.ret_rms.mean, self.ret_rms.std, self.target_Q], feed_dict={
                self.obs1: batch['obs1'],
                self.rewards: batch['rewards'],
                self.terminals1: batch['terminals1'].astype('float32'),
            })
            self.ret_rms.update(target_Q.flatten())
            self.sess.run(self.renormalize_Q_outputs_op, feed_dict={
                self.old_std: np.array([old_std]),
                self.old_mean: np.array([old_mean]),
            })
        else:
            target_Q = self.sess.run(self.target_Q, feed_dict={
                self.obs1: batch['obs1'],
                self.rewards: batch['rewards'],
                self.terminals1: batch['terminals1'].astype('float32'),
            })

        # Get all gradients and perform a synced update.
        ops = [self.actor_grads, self.actor_loss, self.critic_grads, self.critic_loss]
        actor_grads, actor_loss, critic_grads, critic_loss = self.sess.run(ops, feed_dict={
            self.obs0: batch['obs0'],
            self.actions: batch['actions'],
            self.critic_target: target_Q,
        })
        self.actor_optimizer.update(actor_grads, stepsize=self.actor_lr)
        self.critic_optimizer.update(critic_grads, stepsize=self.critic_lr)

        return critic_loss, actor_loss

    def initialize(self, sess):
        self.sess = sess
        self.sess.run(tf.global_variables_initializer())
        self.sess.run(self.target_init_updates)

    def update_target_net(self):
        self.sess.run(self.target_soft_updates)

    def get_stats(self):
        if self.stats_sample is None:
            # Get a sample and keep that fixed for all further computations.
            # This allows us to estimate the change in value for the same set of inputs.
            self.stats_sample = self.memory.sample(batch_size=self.batch_size)
        values = self.sess.run(self.stats_ops, feed_dict={
            self.obs0: self.stats_sample['obs0'],
            self.actions: self.stats_sample['actions'],
        })

        names = self.stats_names[:]
        assert len(names) == len(values)
        stats = dict(zip(names, values))

        if self.param_noise is not None:
            stats = {**stats, **self.param_noise.get_stats()}

        return stats

    def adapt_param_noise(self):
        if self.param_noise is None:
            return 0.

        # Perturb a separate copy of the policy to adjust the scale for the next "real" perturbation.
        batch = self.memory.sample(batch_size=self.batch_size)
        self.sess.run(self.perturb_adaptive_policy_ops, feed_dict={
            self.param_noise_stddev: self.param_noise.current_stddev,
        })
        distance = self.sess.run(self.adaptive_policy_distance, feed_dict={
            self.obs0: batch['obs0'],
            self.param_noise_stddev: self.param_noise.current_stddev,
        })

        self.param_noise.adapt(distance)
        return distance

    def reset(self):
        # Reset internal state after an episode is complete.
        if self.action_noise is not None:
            self.action_noise.reset()
        if self.param_noise is not None:
            self.sess.run(self.perturb_policy_ops, feed_dict={
                self.param_noise_stddev: self.param_noise.current_stddev,
            })
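
# Hedged usage sketch (not part of the original source): example_training_loop
# is a hypothetical helper showing how the DDPG methods above are typically
# driven. It assumes a Gym-style `env` (reset/step) plus pre-built actor,
# critic, memory, and noise objects of the kinds this class expects.
def example_training_loop(env, actor, critic, memory, action_noise=None, param_noise=None, nb_steps=10000):
    agent = DDPG(actor, critic, memory, env.observation_space.shape, env.action_space.shape,
                 action_noise=action_noise, param_noise=param_noise)
    with tf.Session() as sess:
        agent.initialize(sess)  # also syncs the target networks via target_init_updates
        agent.reset()
        obs = env.reset()
        for t in range(nb_steps):
            # Act with exploration noise and step the environment.
            action, _ = agent.pi(obs, apply_noise=True, compute_Q=False)
            new_obs, reward, done, _ = env.step(action)
            agent.store_transition(obs, action, reward, new_obs, done)
            obs = new_obs
            # Once the memory holds enough samples: one gradient step on actor
            # and critic, then a soft update of the target networks.
            if t >= agent.batch_size:
                critic_loss, actor_loss = agent.train()
                agent.update_target_net()
            if done:
                agent.reset()
                obs = env.reset()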
# Assumed imports for this script (hedged): GNN, Adam, get_train, get_test,
# shuffle_split, get_feature, and avg_acc are project-local helpers that the
# excerpt relies on but does not define here.
import argparse

import matplotlib.pyplot as plt
import numpy as np


def main():
    parser = argparse.ArgumentParser()
    parser.add_argument('--D', '-d', type=int, default=8, help='Dimension of feature vector')
    parser.add_argument('--T', '-t', type=int, default=2, help='Max step of aggregation')
    parser.add_argument('--epoch', '-e', type=int, default=100, help='Number of training epochs')
    parser.add_argument('--batch', '-b', type=int, default=256, help='Batch size')
    parser.add_argument('--flag', '-f', action='store_true', help='Make prediction file')
    args = parser.parse_args()

    train_H, train_y, train_node_size = get_train()
    seed = 1996
    train_H, train_y, val_H, val_y, train_node_size, val_node_size = shuffle_split(
        train_H, train_y, train_node_size, split_size=0.7, seed=seed)

    # Feature dimension.
    D = args.D
    # Aggregation steps.
    T = args.T
    # Learning rate.
    alpha = 0.015
    # Epoch count.
    max_epoch = args.epoch
    # Batch size.
    batch_size = args.batch
    # Iterations per epoch (round up when the last batch is partial).
    train_size = len(train_H)
    iter_per_epoch = train_size // batch_size if (train_size % batch_size) == 0 else (train_size // batch_size) + 1
    make_pred = args.flag

    # Make feature vectors (train and validation).
    train_x = get_feature(D, train_H, train_node_size)
    val_x = get_feature(D, val_H, val_node_size)

    model = GNN(D, T)
    optimizer = Adam(alpha=alpha, beta1=0.9, beta2=0.999, eps=1e-8)

    train_loss_list = []
    train_acc_list = []
    val_loss_list = []
    val_acc_list = []
    for epoch in range(max_epoch):
        # Reshuffle the training set every epoch.
        np.random.seed(int(epoch * 1234))
        shuffle_idx = np.random.permutation(train_H.shape[0])
        train_H = train_H[shuffle_idx]
        train_x = train_x[shuffle_idx]
        train_y = train_y[shuffle_idx]

        for num in range(iter_per_epoch):
            if train_size > (num + 1) * batch_size:
                batch_H = train_H[num * batch_size:(num + 1) * batch_size]
                batch_x = train_x[num * batch_size:(num + 1) * batch_size]
                batch_y = train_y[num * batch_size:(num + 1) * batch_size]
            else:
                batch_H = train_H[num * batch_size:]
                batch_x = train_x[num * batch_size:]
                batch_y = train_y[num * batch_size:]

            # Average per-sample gradients over the batch, then update parameters.
            batch_grads = None
            for idx in range(len(batch_H)):
                grad = model.get_gradient(batch_x[idx], batch_H[idx], batch_y[idx])
                if batch_grads is None:
                    batch_grads = {}
                    for key, val in grad.items():
                        batch_grads[key] = np.zeros_like(val)
                for key in grad.keys():
                    batch_grads[key] += grad[key] / len(batch_H)
            optimizer.update(model.params, batch_grads)

        # Training loss and average accuracy.
        loss = 0
        train_pred = np.zeros((len(train_y), 1))
        for idx in range(len(train_H)):
            loss += model.loss(train_x[idx], train_H[idx], train_y[idx]) / len(train_H)
            predict = 0 if model.predict(train_x[idx], train_H[idx]) < 1 / 2 else 1
            train_pred[idx] = predict
        train_score = avg_acc(train_y, train_pred)

        # Validation loss and average accuracy.
        val_loss = 0
        val_pred = np.zeros((len(val_y), 1))
        for idx in range(len(val_H)):
            val_loss += model.loss(val_x[idx], val_H[idx], val_y[idx]) / len(val_H)
            predict = 0 if model.predict(val_x[idx], val_H[idx]) < 1 / 2 else 1
            val_pred[idx] = predict
        val_score = avg_acc(val_y, val_pred)

        print('epoch:{} loss:{:.5f} val_loss:{:.5f} avg_acc:{:.5f} val_avg_acc:{:.5f}'.format(
            epoch + 1, loss, val_loss, train_score, val_score))
        train_loss_list.append(loss)
        val_loss_list.append(val_loss)
        train_acc_list.append(train_score)
        val_acc_list.append(val_score)

    # Plot the loss and average-accuracy curves.
    fig, (ax1, ax2) = plt.subplots(ncols=2, figsize=(10, 4))
    x = np.arange(len(train_loss_list))
    ax1.plot(x, train_loss_list, label='train')
    x = np.arange(len(val_loss_list))
    ax1.plot(x, val_loss_list, label='validation')
    ax1.legend()
    ax1.set_xlabel('epoch')
    ax1.set_ylabel('loss')
    x = np.arange(len(train_acc_list))
    ax2.plot(x, train_acc_list, label='train')
    x = np.arange(len(val_acc_list))
    ax2.plot(x, val_acc_list, label='validation')
    ax2.legend()
    ax2.set_xlabel('epoch')
    ax2.set_ylabel('average accuracy')
    fig.savefig('src/graph/GNN_Adam.png')
    plt.close()

    if make_pred:
        # Predict on the test data.
        test_H, test_node_size = get_test()
        test_x = get_feature(D, test_H, test_node_size)
        with open('prediction.txt', mode='w') as f:
            for idx in range(len(test_node_size)):
                predict = 0 if model.predict(test_x[idx], test_H[idx]) < 1 / 2 else 1
                f.write('{}\n'.format(predict))
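
# Standard entry-point guard (an assumption: the original excerpt ends without
# one), so the training script runs only when this file is executed directly.
if __name__ == '__main__':
    main()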
# Assumes the surrounding script defines trainer, network, optimizer, and the
# MNIST-style arrays (x_train, t_train, x_test, t_test) referenced below.
trainer.train()

"""
# Earlier manual training loop, left commented out (superseded by trainer.train() above).
epoch_num = 0
start = time.time()
for i in range(train_num):
    batch_mask = np.random.choice(x_train.shape[0], batch_size)
    x_batch = x_train[batch_mask]
    t_batch = t_train[batch_mask]
    batch_mask_test = np.random.choice(x_test.shape[0], batch_size)
    x_test_batch = x_test[batch_mask_test]
    t_test_batch = t_test[batch_mask_test]

    grad = network.gradient(x_batch, t_batch)

    # Update the parameters.
    optimizer.update(network.params, grad)
    # for key in grad.keys():
    #     network.params[key] -= learning_rate * grad[key]

    # Compute the loss.
    loss = network.loss(x_batch, t_batch)
    train_loss_list.append(loss)

    train_acc = network.accuracy(x_batch, t_batch)
    test_acc = network.accuracy(x_test_batch, t_test_batch)
    train_acc_list.append(train_acc)
    test_acc_list.append(test_acc)
    print(".", end="", flush=True)
"""