def thread_memory(config, memory_queue, batch_queue, update_p_queue, priority_environment_queue):
    memory = ReplayMemory(config, memory_queue, batch_queue, update_p_queue, priority_environment_queue)
    memory.loop()
    return
resize_shape = (1, 30, 90)  # size frames are scaled to for training
FPS = 10  # controls the frame rate of game screenshots

# Instantiate a game environment
env = DinoGame(reshape=resize_shape)
# Image input shape and action dimension
obs_dim = env.observation_space.shape[0]
action_dim = env.action_space.n

# Create the policy model and the target model; the target model is not trained
policyQ = Model(obs_dim, action_dim)
targetQ = Model(obs_dim, action_dim)
targetQ.eval()

# Experience replay buffer
rpm = ReplayMemory(memory_size)

# Optimizer
optimizer = paddle.optimizer.Adam(parameters=policyQ.parameters(),
                                  learning_rate=learning_rate)


# Evaluate the model
def evaluate():
    total_reward = 0
    obs = env.reset()
    last_time = time.time()
    while True:
        obs = np.expand_dims(obs, axis=0)
        obs = paddle.to_tensor(obs, dtype='float32')
        action = targetQ(obs)
        action = paddle.argmax(action).numpy()[0]
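# The snippet above calls ReplayMemory(memory_size) without showing the class.
# A minimal sketch of a uniform-sampling buffer with that one-argument
# constructor follows; the append/sample method names and the NumPy batch
# format are assumptions for illustration, not the project's actual API.
import random
from collections import deque

import numpy as np


class UniformReplayMemory:
    """Fixed-capacity FIFO buffer with uniform random sampling (sketch)."""

    def __init__(self, memory_size):
        self.buffer = deque(maxlen=memory_size)

    def append(self, obs, action, reward, next_obs, terminal):
        self.buffer.append((obs, action, reward, next_obs, terminal))

    def sample(self, batch_size):
        batch = random.sample(self.buffer, batch_size)
        obs, act, rew, next_obs, term = map(np.array, zip(*batch))
        return obs, act, rew, next_obs, term

    def __len__(self):
        return len(self.buffer)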
def __init__(self, dimO, dimA, num_layer, num_nodes):
    self.dimA = dimA[0]
    dimA = list(dimA)
    dimO = list(dimO)

    if num_layer == 2:
        if num_nodes == 1:
            import ddpg_nets_dm_conv2_1
            nets = ddpg_nets_dm_conv2_1
        elif num_nodes == 2:
            import ddpg_nets_dm_conv2_2
            nets = ddpg_nets_dm_conv2_2
    elif num_layer == 3:
        if num_nodes == 1:
            import ddpg_nets_dm_conv3_1
            nets = ddpg_nets_dm_conv3_1
        elif num_nodes == 2:
            import ddpg_nets_dm_conv3_2
            nets = ddpg_nets_dm_conv3_2

    tau = FLAGS.tau
    discount = FLAGS.discount
    pl2norm = FLAGS.pl2norm
    l2norm = FLAGS.l2norm
    plearning_rate = FLAGS.prate
    learning_rate = FLAGS.rate
    outheta = FLAGS.outheta
    ousigma = FLAGS.ousigma

    # init replay memory
    self.rm = ReplayMemory(FLAGS.rmsize, dimO, dimA)

    # start tf session
    self.sess = tf.Session(config=tf.ConfigProto(
        inter_op_parallelism_threads=FLAGS.thread,
        log_device_placement=False,
        allow_soft_placement=True,
        gpu_options=tf.GPUOptions(allow_growth=True)))

    # create tf computational graph
    # (theta_p must be built here; it is used by the policy and target updates below)
    self.theta_p = nets.theta_p(dimO, dimA, FLAGS.l1size, FLAGS.l2size)
    self.theta_q = nets.theta_q(dimO, dimA, FLAGS.l1size, FLAGS.l2size)
    # self.thetaq_cvx_ = [v for v in self.theta_q
    #                     if 'conv' in v.name]
    # self.makeCvx = [v.assign(-tf.abs(v)) for v in self.thetaq_cvx_]
    # self.proj = [v.assign(tf.minimum(v, 0)) for v in self.thetaq_cvx_]
    self.theta_pt, update_pt = exponential_moving_averages(self.theta_p, tau)
    self.theta_qt, update_qt = exponential_moving_averages(self.theta_q, tau)

    obs = tf.placeholder(tf.float32, [None] + dimO, "obs")
    act_test = nets.policy(obs, self.theta_p)

    # explore
    self.epsilon = 1
    self.noise = np.zeros(self.dimA)
    self.noise -= FLAGS.outheta * self.noise - FLAGS.ousigma * npr.randn(self.dimA)
    act_expl = act_test + self.epsilon * self.noise
    # self.epsilon -= 1/5000000

    # test
    q = nets.qfunction(obs, act_test, self.theta_q)

    # training
    # q optimization
    act_train = tf.placeholder(tf.float32, [FLAGS.bsize] + dimA, "act_train")
    rew = tf.placeholder(tf.float32, [FLAGS.bsize], "rew")
    obs2 = tf.placeholder(tf.float32, [FLAGS.bsize] + dimO, "obs2")
    term2 = tf.placeholder(tf.bool, [FLAGS.bsize], "term2")

    # policy loss
    act_train_policy = nets.policy(obs, self.theta_p)
    q_train_policy = nets.qfunction(obs, act_train_policy, self.theta_q)
    meanq = tf.reduce_mean(q_train_policy, 0)
    wd_p = tf.add_n([pl2norm * tf.nn.l2_loss(var) for var in self.theta_p])  # weight decay
    loss_p = -meanq + wd_p

    # policy optimization
    optim_p = tf.train.AdamOptimizer(learning_rate=plearning_rate, epsilon=1e-4)
    grads_and_vars_p = optim_p.compute_gradients(loss_p, var_list=self.theta_p)
    optimize_p = optim_p.apply_gradients(grads_and_vars_p)
    with tf.control_dependencies([optimize_p]):
        train_p = tf.group(update_pt)

    # q
    q_train = nets.qfunction(obs, act_train, self.theta_q)

    # q targets
    act2 = nets.policy(obs2, theta=self.theta_pt)
    q2 = nets.qfunction(obs2, act2, theta=self.theta_qt)
    q_target = tf.stop_gradient(tf.where(term2, rew, rew + discount * q2))
    # q_target = tf.stop_gradient(rew + discount * q2)

    # q loss
    td_error = q_train - q_target
    ms_td_error = tf.reduce_mean(tf.square(td_error), 0)
    wd_q = tf.add_n([l2norm * tf.nn.l2_loss(var) for var in self.theta_q])  # weight decay
    loss_q = ms_td_error + wd_q

    # q optimization
    optim_q = tf.train.AdamOptimizer(learning_rate=learning_rate, epsilon=1e-4)
    grads_and_vars_q = optim_q.compute_gradients(loss_q, var_list=self.theta_q)
    optimize_q = optim_q.apply_gradients(grads_and_vars_q)
    with tf.control_dependencies([optimize_q]):
        train_q = tf.group(update_qt)

    summary_writer = tf.summary.FileWriter(os.path.join(FLAGS.outdir, 'board'),
                                           self.sess.graph)
    summary_list = []
    summary_list.append(tf.summary.scalar('Qvalue', tf.reduce_mean(q_train)))
    summary_list.append(tf.summary.scalar('loss', ms_td_error))
    summary_list.append(tf.summary.scalar('reward', tf.reduce_mean(rew)))

    # tf functions
    with self.sess.as_default():
        self._act_test = Fun(obs, act_test)
        self._act_expl = Fun(obs, act_expl)
        # self._reset = Fun([], self.ou_reset)
        self._train = Fun([obs, act_train, rew, obs2, term2],
                          [train_p, train_q, loss_q],
                          summary_list, summary_writer)

    # initialize tf variables
    self.saver = tf.train.Saver(max_to_keep=1)
    ckpt = tf.train.latest_checkpoint(FLAGS.outdir + "/tf")
    if ckpt:
        self.saver.restore(self.sess, ckpt)
    else:
        self.sess.run(tf.initialize_all_variables())
        # self.sess.run(self.makeCvx)

    self.sess.graph.finalize()

    self.t = 0  # global training time (number of observations)
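# The DDPG graph above relies on a helper named exponential_moving_averages
# to build Polyak-averaged target parameters, theta_t <- (1 - tau) * theta_t
# + tau * theta. A plausible TF1 implementation under that assumption (the
# helper's actual body is not shown in this snippet):
import tensorflow as tf


def exponential_moving_averages(theta, tau):
    # ExponentialMovingAverage with decay = 1 - tau maintains
    # shadow <- (1 - tau) * shadow + tau * var, i.e. a soft target update.
    ema = tf.train.ExponentialMovingAverage(decay=1.0 - tau)
    update = ema.apply(theta)  # op that refreshes all shadow variables
    averages = [ema.average(v) for v in theta]  # the target parameter list
    return averages, update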
def train(args, net, env):
    # Begin tf session
    with tf.Session() as sess:
        # Initialize variables
        tf.global_variables_initializer().run()
        saver = tf.train.Saver(tf.global_variables(), max_to_keep=5)

        # load from previous save
        if len(args.ckpt_name) > 0:
            saver.restore(sess, os.path.join(args.save_dir, args.ckpt_name))

        # Load data
        shift = sess.run(net.shift)
        scale = sess.run(net.scale)
        shift_u = sess.run(net.shift_u)
        scale_u = sess.run(net.scale_u)
        replay_memory = ReplayMemory(args, shift, scale, shift_u, scale_u, env, net, sess)

        # Store normalization parameters
        sess.run(tf.assign(net.shift, replay_memory.shift_x))
        sess.run(tf.assign(net.scale, replay_memory.scale_x))
        sess.run(tf.assign(net.shift_u, replay_memory.shift_u))
        sess.run(tf.assign(net.scale_u, replay_memory.scale_u))

        # Function to evaluate loss on validation set
        def val_loss(kl_weight):
            replay_memory.reset_batchptr_val()
            loss = 0.0
            for b in range(replay_memory.n_batches_val):
                # Get inputs
                batch_dict = replay_memory.next_batch_val()
                x = batch_dict["states"]
                u = batch_dict['inputs']

                # Construct inputs for network
                feed_in = {}
                feed_in[net.x] = np.reshape(x, (2 * args.batch_size * args.seq_length, args.state_dim))
                feed_in[net.u] = u
                if args.kl_weight > 0.0:
                    feed_in[net.kl_weight] = kl_weight
                else:
                    feed_in[net.kl_weight] = 1.0

                # Find loss
                feed_out = net.cost
                cost = sess.run(feed_out, feed_in)
                loss += cost
            return loss / replay_memory.n_batches_val

        # Initialize variable to track validation score over time
        old_score = 1e9
        count_decay = 0
        decay_epochs = []

        # Define temperature for annealing kl_weight
        T = args.anneal_time * replay_memory.n_batches_train
        count = 0

        # Loop over epochs
        for e in range(args.num_epochs):
            visualize_predictions(args, sess, net, replay_memory, env, e)

            # Initialize loss
            loss = 0.0
            rec_loss = 0.0
            kl_loss = 0.0
            loss_count = 0
            replay_memory.reset_batchptr_train()

            # Loop over batches
            for b in range(replay_memory.n_batches_train):
                start = time.time()
                count += 1

                # Update kl_weight
                if e < args.start_kl:
                    kl_weight = 1e-3
                else:
                    count += 1
                    kl_weight = min(args.kl_weight, 1e-3 + args.kl_weight * count / float(T))

                # Get inputs
                batch_dict = replay_memory.next_batch_train()
                x = batch_dict["states"]
                u = batch_dict['inputs']

                # Construct inputs for network
                feed_in = {}
                feed_in[net.x] = np.reshape(x, (2 * args.batch_size * args.seq_length, args.state_dim))
                feed_in[net.u] = u
                feed_in[net.kl_weight] = kl_weight

                # Find loss and perform training operation
                feed_out = [net.cost, net.loss_reconstruction, net.kl_loss, net.train]
                out = sess.run(feed_out, feed_in)

                # Update and display cumulative losses
                loss += out[0]
                rec_loss += out[1]
                kl_loss += out[2]
                loss_count += 1
                end = time.time()

                # Print loss
                if (e * replay_memory.n_batches_train + b) % 100 == 0 and b > 0:
                    print("{}/{} (epoch {}), train_loss = {:.3f}, time/batch = {:.3f}"
                          .format(e * replay_memory.n_batches_train + b,
                                  args.num_epochs * replay_memory.n_batches_train,
                                  e, loss / loss_count, end - start))
                    print("{}/{} (epoch {}), rec_loss = {:.3f}, time/batch = {:.3f}"
                          .format(e * replay_memory.n_batches_train + b,
                                  args.num_epochs * replay_memory.n_batches_train,
                                  e, rec_loss / loss_count, end - start))
                    print("{}/{} (epoch {}), kl_loss = {:.3f}, time/batch = {:.3f}"
                          .format(e * replay_memory.n_batches_train + b,
                                  args.num_epochs * replay_memory.n_batches_train,
                                  e, kl_loss / loss_count, end - start))
                    print('')
                    loss = 0.0
                    rec_loss = 0.0
                    kl_loss = 0.0
                    loss_count = 0

            # Evaluate loss on validation set
            score = val_loss(args.kl_weight * (e >= args.start_kl))
            print('Validation Loss: {0:f}'.format(score))

            # Set learning rate
            if (old_score - score) < 0.01 and e != args.start_kl:
                count_decay += 1
                decay_epochs.append(e)
                if len(decay_epochs) >= 3 and np.sum(np.diff(decay_epochs)[-2:]) == 2:
                    break
                print('setting learning rate to ', args.learning_rate * (args.decay_rate ** count_decay))
                sess.run(tf.assign(net.learning_rate,
                                   args.learning_rate * (args.decay_rate ** count_decay)))
                if args.learning_rate * (args.decay_rate ** count_decay) < 1e-5:
                    break
            print('learning rate is set to ', args.learning_rate * (args.decay_rate ** count_decay))
            old_score = score

            # Save model every epoch
            checkpoint_path = os.path.join(args.save_dir, args.save_name + '.ckpt')
            saver.save(sess, checkpoint_path, global_step=e)
            print("model saved to {}".format(checkpoint_path))
def main():
    parser = argparse.ArgumentParser()
    parser.add_argument('--num-envs', type=int, default=32)
    parser.add_argument('--t-max', type=int, default=1)
    parser.add_argument('--learning-rate', type=float, default=0.0002)
    parser.add_argument('--seed', type=int, default=0)
    parser.add_argument('--steps-per-epoch', type=int, default=100000)
    parser.add_argument('--testing', type=int, default=0)
    parser.add_argument('--continue-training', type=int, default=4)
    parser.add_argument('--epoch-num', type=int, default=20)
    parser.add_argument('--start-epoch', type=int, default=20)
    parser.add_argument('--testing-epoch', type=int, default=0)
    parser.add_argument('--save-log', type=str, default='log')
    parser.add_argument('--signal-num', type=int, default=4)
    parser.add_argument('--toxin', type=int, default=0)
    parser.add_argument('--a1-AC-folder', type=str, default='basic/a1_Qnet')
    parser.add_argument('--a2-AC-folder', type=str, default='basic/a2_Qnet')
    parser.add_argument('--eps-start', type=float, default=1.0)
    parser.add_argument('--replay-start-size', type=int, default=50000)
    parser.add_argument('--decay-rate', type=int, default=50000)
    parser.add_argument('--replay-memory-size', type=int, default=1000000)
    parser.add_argument('--eps-min', type=float, default=0.1)
    args = parser.parse_args()

    config = Config(args)
    t_max = args.t_max
    q_ctx = config.ctx
    steps_per_epoch = args.steps_per_epoch
    np.random.seed(args.seed)
    start_epoch = args.start_epoch
    testing_epoch = args.testing_epoch
    save_log = args.save_log
    epoch_num = args.epoch_num
    epoch_range = range(epoch_num)
    signal_num = args.signal_num
    toxin = args.toxin
    a1_Qnet_folder = args.a1_AC_folder
    a2_Qnet_folder = args.a2_AC_folder

    freeze_interval = 10000
    update_interval = 5
    replay_memory_size = args.replay_memory_size
    discount = 0.99
    replay_start_size = args.replay_start_size
    history_length = 1
    eps_start = args.eps_start
    eps_min = args.eps_min
    eps_decay = (eps_start - eps_min) / args.decay_rate
    eps_curr = eps_start
    freeze_interval //= update_interval
    minibatch_size = 32

    testing = args.testing
    testing = True if testing == 1 else False
    continue_training = args.continue_training
    continue_training = True if continue_training == 1 else False

    rewards = {
        "positive": 1.0,
        "negative": -1.0,
        "tick": -0.002,
        "loss": -2.0,
        "win": 2.0
    }

    game = HunterWorld(width=256, height=256, num_preys=10, draw=False,
                       num_hunters=2, num_toxins=toxin)
    env = PLE(game, fps=30, force_fps=True, display_screen=False,
              reward_values=rewards, resized_rows=80, resized_cols=80,
              num_steps=3)

    action_set = env.get_action_set()
    action_map1 = []
    for action in action_set[0].values():
        action_map1.append(action)
    action_map2 = []
    for action in action_set[1].values():
        action_map2.append(action)
    action_num = len(action_map1)

    replay_memory1 = ReplayMemory(state_dim=(74,),
                                  history_length=history_length,
                                  memory_size=replay_memory_size,
                                  replay_start_size=replay_start_size,
                                  state_dtype='float32')

    a1_target1 = Qnetwork(actions_num=action_num, q_ctx=q_ctx, isTrain=False,
                          batch_size=1, dir=dir, folder=a1_Qnet_folder)
    a1_target32 = Qnetwork(actions_num=action_num, q_ctx=q_ctx, isTrain=False,
                           batch_size=32, dir=dir, folder=a1_Qnet_folder)
    Qnet1 = Qnetwork(actions_num=action_num, q_ctx=q_ctx, isTrain=True,
                     batch_size=32, dir=dir, folder=a1_Qnet_folder)
    a2_target1 = Qnetwork(actions_num=action_num, q_ctx=q_ctx, isTrain=False,
                          batch_size=1, dir=dir, folder=a2_Qnet_folder)
    a2_target32 = Qnetwork(actions_num=action_num, q_ctx=q_ctx, isTrain=False,
                           batch_size=32, dir=dir, folder=a2_Qnet_folder)
    Qnet2 = Qnetwork(actions_num=action_num, q_ctx=q_ctx, isTrain=True,
                     batch_size=32, dir=dir, folder=a2_Qnet_folder)

    training_steps = 0
    total_steps = 0

    if testing:
        env.force_fps = False
        env.game.draw = True
        env.display_screen = True
        Qnet1.load_params(testing_epoch)
        Qnet2.load_params(testing_epoch)
    elif continue_training:
        epoch_range = range(start_epoch, epoch_num + start_epoch)
        Qnet1.load_params(start_epoch - 1)
        Qnet2.load_params(start_epoch - 1)
        logging_config(logging, dir, save_log, file_name)
    else:
        logging_config(logging, dir, save_log, file_name)

    copyTargetQNetwork(Qnet1.model, a1_target1.model)
    copyTargetQNetwork(Qnet1.model, a1_target32.model)
    copyTargetQNetwork(Qnet2.model, a2_target1.model)
    copyTargetQNetwork(Qnet2.model, a2_target32.model)

    logging.info('args=%s' % args)
    logging.info('config=%s' % config.__dict__)
    print_params(logging, Qnet1.model)
    print_params(logging, Qnet2.model)

    for epoch in epoch_range:
        steps_left = steps_per_epoch
        episode = 0
        epoch_reward = 0
        start = time.time()
        env.reset_game()
        while steps_left > 0:
            episode += 1
            episode_loss = 0.0
            episode_q_value = 0.0
            episode_update_step = 0
            episode_action_step = 0
            episode_reward = 0
            episode_step = 0
            collisions = 0.0
            time_episode_start = time.time()
            env.reset_game()
            next_ob = env.get_states()
            while not env.game_over():
                if replay_memory1.size >= history_length and replay_memory1.size > replay_start_size:
                    do_exploration = (np.random.rand() < eps_curr)
                    eps_curr = max(eps_curr - eps_decay, eps_min)
                    if do_exploration:
                        action1 = np.random.randint(action_num)
                        action2 = np.random.randint(action_num)
                    else:
                        current_state1 = next_ob[0].reshape(1, 74)
                        current_state2 = next_ob[1].reshape(1, 74)
                        state1 = nd.array(current_state1.reshape((1,) + current_state1.shape), ctx=q_ctx)
                        state2 = nd.array(current_state2.reshape((1,) + current_state2.shape), ctx=q_ctx)
                        a1_target1.model.forward(mx.io.DataBatch([state1], []))
                        a2_target1.model.forward(mx.io.DataBatch([state2], []))
                        q_value1 = a1_target1.model.get_outputs()[0].asnumpy()[0]
                        q_value2 = a2_target1.model.get_outputs()[0].asnumpy()[0]
                        action1 = numpy.argmax(q_value1)
                        action2 = numpy.argmax(q_value2)
                        episode_q_value += q_value1[action1]
                        episode_q_value += q_value2[action2]
                        episode_action_step += 1
                else:
                    action1 = np.random.randint(action_num)
                    action2 = np.random.randint(action_num)

                next_ob, reward, terminal_flag = env.act([action_map1[action1], action_map2[action2]])
                replay_memory1.append(next_ob[0], action1, reward[0], terminal_flag)
                total_steps += 1
                sum_reward = sum(reward)
                episode_reward += sum_reward
                if sum_reward < 0:
                    collisions += 1
                episode_step += 1

                if total_steps % update_interval == 0 and replay_memory1.size > replay_start_size:
                    training_steps += 1
                    state_batch1, actions1, rewards1, nextstate_batch1, terminate_flags1 = \
                        replay_memory1.sample(batch_size=minibatch_size)
                    state_batch2, actions2, rewards2, nextstate_batch2, terminate_flags2 = \
                        replay_memory1.sample(batch_size=minibatch_size)
                    state_batch1 = nd.array(state_batch1, ctx=q_ctx)
                    actions_batch1 = nd.array(actions1, ctx=q_ctx)
                    reward_batch1 = nd.array(rewards1, ctx=q_ctx)
                    terminate_flags1 = nd.array(terminate_flags1, ctx=q_ctx)
                    state_batch2 = nd.array(state_batch2, ctx=q_ctx)
                    actions_batch2 = nd.array(actions2, ctx=q_ctx)
                    reward_batch2 = nd.array(rewards2, ctx=q_ctx)
                    terminate_flags2 = nd.array(terminate_flags2, ctx=q_ctx)

                    a1_target32.model.forward(mx.io.DataBatch([nd.array(nextstate_batch1, ctx=q_ctx)], []))
                    Qvalue1 = a1_target32.model.get_outputs()[0]
                    y_batch1 = reward_batch1 + nd.choose_element_0index(
                        Qvalue1, nd.argmax_channel(Qvalue1)) * (1.0 - terminate_flags1) * discount
                    Qnet1.model.forward(mx.io.DataBatch([state_batch1, actions_batch1, y_batch1], []),
                                        is_train=True)
                    Qnet1.model.backward()
                    Qnet1.model.update()

                    a2_target32.model.forward(mx.io.DataBatch([nd.array(nextstate_batch2, ctx=q_ctx)], []))
                    Qvalue2 = a2_target32.model.get_outputs()[0]
                    y_batch2 = reward_batch2 + nd.choose_element_0index(
                        Qvalue2, nd.argmax_channel(Qvalue2)) * (1.0 - terminate_flags2) * discount
                    Qnet2.model.forward(mx.io.DataBatch([state_batch2, actions_batch2, y_batch2], []),
                                        is_train=True)
                    Qnet2.model.backward()
                    Qnet2.model.update()

                    if training_steps % 10 == 0:
                        loss1 = 0.5 * nd.square(
                            nd.choose_element_0index(Qnet1.model.get_outputs()[0], actions_batch1) - y_batch1)
                        loss2 = 0.5 * nd.square(
                            nd.choose_element_0index(Qnet2.model.get_outputs()[0], actions_batch2) - y_batch2)
                        episode_loss += nd.sum(loss1).asnumpy()
                        episode_loss += nd.sum(loss2).asnumpy()
                        episode_update_step += 1

                    if training_steps % freeze_interval == 0:
                        copyTargetQNetwork(Qnet1.model, a1_target1.model)
                        copyTargetQNetwork(Qnet1.model, a1_target32.model)
                        copyTargetQNetwork(Qnet2.model, a2_target1.model)
                        copyTargetQNetwork(Qnet2.model, a2_target32.model)

            steps_left -= episode_step
            time_episode_end = time.time()
            epoch_reward += episode_reward
            info_str = "Epoch:%d, Episode:%d, Steps Left:%d/%d/%d, Reward:%f, fps:%f, Exploration:%f" \
                % (epoch, episode, steps_left, episode_step, steps_per_epoch, episode_reward,
                   episode_step / (time_episode_end - time_episode_start), eps_curr)
            info_str += ", Collision:%f/%d " % (collisions / episode_step, collisions)
            if episode_update_step > 0:
                info_str += ", Avg Loss:%f/%d" % (episode_loss / episode_update_step,
                                                  episode_update_step * 10)
            if episode_action_step > 0:
                info_str += ", Avg Q Value:%f/%d " % (episode_q_value / episode_action_step,
                                                      episode_action_step)
            if episode % 1 == 0:
                logging.info(info_str)
                print(info_str)

        end = time.time()
        fps = steps_per_epoch / (end - start)
        Qnet1.save_params(epoch)
        Qnet2.save_params(epoch)
        logging.info("Epoch:%d, FPS:%f, Avg Reward: %f/%d" %
                     (epoch, fps, epoch_reward / float(episode), episode))
if args.random_seed:
    random.seed(args.random_seed)

# instantiate classes
if args.environment == 'ale':
    env = ALEEnvironment(args.game, args)
    logger.info("Using ALE Environment")
elif args.environment == 'gym':
    logger.handlers.pop()
    env = GymEnvironment(args.game, args)
    logger.info("Using Gym Environment")
else:
    assert False, "Unknown environment " + args.environment

mem = ReplayMemory(args.replay_size, args)
net = DeepQNetwork(env.numActions(), args)
agent = Agent(env, mem, net, args)
stats = Statistics(agent, net, mem, env, args)

if args.load_weights:
    logger.info("Loading weights from %s" % args.load_weights)
    net.load_weights(args.load_weights)

if args.play_games:
    logger.info("Playing for %d game(s)" % args.play_games)
    # Set env mode to test so that loss of life is not considered terminal
    env.setMode('test')
    stats.reset()
    agent.play(args.play_games)
    stats.write(0, "play")
parser.add_argument('--train_freq', type=int, default=1)
parser.add_argument('--save_freq', type=int, default=10000)
parser.add_argument('--log_freq', type=int, default=1000)
args = parser.parse_args()

# create environment and add standard wrappers
env = gym.make(args.env_name)

# create main model and target model
model = create_model(env.observation_space, env.action_space, args)
target_model = create_model(env.observation_space, env.action_space, args)
# copy main model weights to target
update_target(model, target_model)

# create replay memory
replay_memory = ReplayMemory(args.replay_size, env.observation_space.shape)

# statistics
loss = 0
qmean = 0
rewards = []
lengths = []
episode_num = 0
episode_reward = 0
episode_length = 0
num_iterations = 0

# reset the environment
obs = env.reset()

# loop for args.num_timesteps steps
for t in range(args.num_timesteps):
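    # The loop body is truncated above. A hypothetical continuation, written to
    # be consistent with the flags already parsed (train_freq, log_freq,
    # save_freq); the helpers select_action/train_step/save_weights and the
    # flags batch_size and target_update_freq are assumptions for illustration,
    # not this script's actual API.
    action = select_action(model, obs, epsilon)                 # epsilon-greedy
    next_obs, reward, done, _ = env.step(action)
    replay_memory.add(obs, action, reward, next_obs, float(done))
    obs = env.reset() if done else next_obs

    if t % args.train_freq == 0 and t >= args.batch_size:
        loss = train_step(model, target_model, replay_memory, args)
    if t % args.target_update_freq == 0:
        update_target(model, target_model)                      # sync target net
    if t > 0 and t % args.log_freq == 0:
        print('step {}: loss {:.4f}'.format(t, loss))
    if t > 0 and t % args.save_freq == 0:
        model.save_weights('model_{}.h5'.format(t))             # path is illustrative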
from agent import Agent
from model_utils import saveModel, loadModel

# Environment Settings
env = gym.make('LunarLander-v2')
env.seed(0)
state_space = env.observation_space.shape[0]
action_space = env.action_space.n
print('State shape: ', state_space)
print('Number of actions: ', action_space)

# ReplayMemory Settings
capacity = 5000
batch_size = 64
replayMemory = ReplayMemory(capacity, batch_size)

# Strategy Settings
strategy = EpsilonGreedyStrategy(1, 0, 1E-3)

# Deep Learning Model
model = Model(state_space, action_space)

path = './checkpoint.pth.tar'  # To save/load the model
pathExist = os.path.isfile(path)
if pathExist:
    input('The file {} will be replaced, do you wish to continue? '
          '(if not, press ctrl+c)'.format(path))

# if you want to load a saved model, uncomment the lines below
if not pathExist:
    print('{} not found'.format(path))
model, _, _, _ = loadModel(
def policy_var(policy, mcts, n=200):
    states, actions, qvalues = mcts.memory.sample(n)
    states = torch.FloatTensor(states).to(device)
    actions, _, mean = policy.sample(states)
    return (actions - mean).norm(2) / n


policy = GaussianPolicy(state_dim, action_dim).to(device=device)
if args.double_Q:
    critic = DoubleQNetwork(state_dim, action_dim, args.hidden_size).to(device=device)
else:
    critic = QNetwork(state_dim, action_dim, args.hidden_size).to(device=device)

memory = ReplayMemory(20000, args.seed, state_dim, action_dim)
traj_memory = TrajReplayMemory(200000, args.seed, state_dim, action_dim)
alg = ActorCriticMCTS(policy, critic, env, memory, traj_memory, args)

__ = 0
while len(memory) < 300:
    state = env.reset()
    ps, pa, pr, _ = alg.Interaction(state, done=False, steps=0,
                                    max_steps=args.max_interact_steps,
                                    rand_act=True)
    parse_path(ps, pa, pr, memory)
    parse_path_to_traj(ps, pa, pr, traj_memory,
total_samples = 0
# Put the network in evaluation mode so that losses aren't accumulated, saving memory
self.eval()
poi_vals = env.set_poi_values()
if not os.path.exists(args.save_foldername):
    os.makedirs(args.save_foldername)

if args.algo == "NAF":
    agent = NAF(args.gamma, args.tau, args.num_hnodes,
                args.autoencoder_output_length, env.action_space, args)
else:
    agent = DDPG(args.gamma, args.tau, args.num_hnodes,
                 args.autoencoder_output_length, env.action_space, args)

memory = ReplayMemory(args.buffer_size)
ounoise = OUNoise(env.action_space.shape[0])

model1 = AutoEncoder(40)
model1.load_state_dict(torch.load('DDDPG_4_10_20_LS_20.pth'))
model1 = model1.to(device)
model1.eval()

episode_rewards_list = []
rover_path_list = []
poi_pos_list = []
poi_status_list = []

for i_episode in range(args.num_episodes):
    joint_state = utils.to_tensor(np.array(env.reset()))  # reset the environment
policy_net = DQN(H, W, n_actions).to(device)
target_net = DQN(H, W, n_actions).to(device)
target_net.load_state_dict(policy_net.state_dict())
target_net.eval()

writer = SummaryWriter()
Transition = namedtuple('Transition', ('state', 'action', 'next_state', 'reward'))

optimizer = optim.RMSprop(policy_net.parameters(), lr=LEARNING_RATE)
# optimizer = optim.Adam(policy.parameters(), lr=LEARNING_RATE)

MEMORY_CAPACITY = 100000
memory = ReplayMemory(MEMORY_CAPACITY)


def get_screen():
    screen = env.render(render_scale=1, complete=False, store=False)
    H, W = screen.shape
    # screen = np.reshape(screen, (1, H, W))
    # print(screen.shape)
    screen = np.ascontiguousarray(screen, dtype=np.float32)
    screen = torch.from_numpy(screen)
    screen = transform(screen.unsqueeze(0))
    # pdb.set_trace()
    return screen.to(device)


def preprocess_frame(frame):
def setUp(self):
    self.heap = BinaryHeap()
    self.replayMemory = ReplayMemory(10, 32, 4, 84, 84)
def __init__(self, env):
    self.env = env
    tf.reset_default_graph()
    self.sess = tf.Session()

    # A few starter hyperparameters
    self.gamma = 0.99
    self.h1 = 64
    self.h2 = 64
    self.h3 = 64
    self.l2_reg = 1e-6
    self.max_episode_step = 1000
    self.update_slow_target_every = 100
    self.batch_size = 1024
    self.eps_start = 1.0
    self.epsilon_end = 0.05
    self.epsilon_decay_length = 1e5
    self.epsilon_decay_exp = 0.97
    self.num_episodes = 0
    self.num_steps = 0
    self.epsilon_linear_step = (self.eps_start - self.epsilon_end) / self.epsilon_decay_length

    # memory
    self.replay_memory = ReplayMemory(1e6)
    # Perhaps you want to have some samples in the memory before starting to train?
    self.min_replay_size = 2000

    # define your training operations here...
    self.observation_input = tf.placeholder(
        tf.float32, shape=[None] + list(self.env.observation_space.shape))
    self.target_input = tf.placeholder(
        dtype=tf.float32,
        shape=[None] + list(self.env.observation_space.shape))  # input to slow target network

    with tf.variable_scope('q_network') as scope:
        self.q_values = self.build_model(self.observation_input)
    with tf.variable_scope('target_network') as scope:
        self.target_q_values = self.build_model(self.observation_input, False)

    self.q_network_vars = tf.get_collection(tf.GraphKeys.TRAINABLE_VARIABLES,
                                            scope='q_network')
    self.q_target_network_vars = tf.get_collection(tf.GraphKeys.TRAINABLE_VARIABLES,
                                                   scope='target_network')

    # update values for the slowly-changing target network to match the current critic network
    update_slow_target_ops = []
    for i, slow_target_var in enumerate(self.q_target_network_vars):
        update_slow_target_op = slow_target_var.assign(self.q_network_vars[i])
        update_slow_target_ops.append(update_slow_target_op)
    self.update_slow_target_op = tf.group(*update_slow_target_ops,
                                          name='update_slow_target')

    # define your update operations here...
    self.saver = tf.train.Saver(tf.trainable_variables())

    self.target = tf.placeholder(tf.float32, shape=[None])
    self.actions = tf.placeholder(shape=[None], dtype=tf.int32)

    # Selecting the Q-value of the taken action is adapted from
    # https://github.com/dennybritz/reinforcement-learning/tree/master/DQN
    gather_indices = tf.range(self.batch_size) * tf.shape(self.q_values)[1] + self.actions
    self.action_predictions = tf.gather(tf.reshape(self.q_values, [-1]), gather_indices)

    self.loss = tf.losses.huber_loss(self.target, self.action_predictions)
    # tf.squared_difference(self.target, self.action_predictions)

    # Add a regularization term for the weights
    for var in self.q_network_vars:
        if 'bias' not in var.name:
            self.loss += self.l2_reg * 0.5 * tf.nn.l2_loss(var)
    # self.loss = (self.target - self.action_predictions)**2
    # self.losses = tf.reduce_mean(self.loss)
    self.minimizer = tf.train.AdamOptimizer(learning_rate=1e-6).minimize(self.loss)
    # tf.train.GradientDescentOptimizer(1e-5).minimize(self.losses)

    self.sess.run(tf.global_variables_initializer())
    self.writer = tf.summary.FileWriter(LOGDIR)
    self.writer.add_graph(self.sess.graph)
    self.count = 0

    # Summaries for Tensorboard
    tf.summary.scalar("loss", self.loss)
    # tf.summary.scalar("loss_hist", self.losses)
    tf.summary.histogram("q_values_hist", self.q_values)
    tf.summary.scalar("max_q_value", tf.reduce_max(self.q_values))
    self.summ = tf.summary.merge_all()
def __init__(self, simulator, gamma=0.99, mem_size=int(1e5), lr=9e-4,
             batch_size=32, ode_tol=1e-3, ode_dim=20,
             enc_hidden_to_latent_dim=20, latent_dim=10, eps_decay=1e-4,
             weight_decay=1e-3, model=None, timer_type='',
             latent_policy=False, obs_normal=False, exp_id=0,
             trained_model_path='', ckpt_path='', traj_data_path='',
             logger=None):
    self.device = torch.device("cuda" if torch.cuda.is_available() else "cpu")
    self.exp_id = exp_id
    self.simulator = simulator
    self.batch_size = batch_size
    self.memory_traj_train = ReplayMemory(mem_size, Trajectory)
    self.memory_traj_test = ReplayMemory(mem_size // 10, Trajectory)
    self.input_dim = self.simulator.num_states + self.simulator.num_actions
    self.output_dim = self.simulator.num_states
    self.latent_dim = latent_dim
    self.ckpt_path = ckpt_path
    self.logger = logger
    self.rms = RunningStats(dim=self.simulator.num_states,
                            device=self.device) if obs_normal else None

    # policy and replay buffer
    assert not (model == 'free' and latent_policy)
    if ('HalfCheetah' in repr(simulator) or 'Swimmer' in repr(simulator)
            or 'Hopper' in repr(simulator)):
        self.policy = PolicyDDPG(state_dim=self.simulator.num_states,
                                 action_dim=self.simulator.num_actions,
                                 device=self.device, gamma=gamma,
                                 latent=latent_policy)
        self.memory_trans = ReplayMemory(mem_size, Transition)
    else:
        state_dim = (self.simulator.num_states + latent_dim
                     if latent_policy else self.simulator.num_states)
        self.policy = PolicyDQN(state_dim=state_dim,
                                action_dim=self.simulator.num_actions,
                                device=self.device, gamma=gamma,
                                latent=latent_policy)
        self.memory_trans = PrioritizedReplayMemory(mem_size, Transition)

    # model
    min_t, max_t, max_time_length, is_cont = simulator.get_time_info()
    timer_choice = Timer if timer_type == 'fool' else MLPTimer
    timer = timer_choice(input_dim=self.input_dim + self.latent_dim,
                         output_dim=1 if is_cont else max_t - min_t + 1,
                         min_t=min_t, max_t=max_t,
                         max_time_length=max_time_length,
                         device=self.device).to(self.device)

    # ode network
    if 'ode' in model:
        gen_ode_func = ODEFunc(
            ode_func_net=utils.create_net(latent_dim, latent_dim, n_layers=2,
                                          n_units=ode_dim,
                                          nonlinear=nn.Tanh)).to(self.device)
        diffq_solver = DiffeqSolver(gen_ode_func, 'dopri5',
                                    odeint_rtol=ode_tol,
                                    odeint_atol=ode_tol / 10)

    # encoder
    if model == 'vae-rnn' or model == 'latent-ode':
        encoder = Encoder_z0_RNN(latent_dim, self.input_dim,
                                 hidden_to_z0_units=enc_hidden_to_latent_dim,
                                 device=self.device).to(self.device)
        z0_prior = Normal(torch.tensor([0.]).to(self.device),
                          torch.tensor([1.]).to(self.device))

    # decoder
    decoder = Decoder(latent_dim, self.output_dim, n_layers=0).to(self.device)

    if model == 'free' or model == 'rnn':
        self.model = VanillaGRU(input_dim=self.input_dim,
                                latent_dim=latent_dim,
                                eps_decay=eps_decay,
                                decoder=decoder,
                                timer=timer,
                                device=self.device).to(self.device)
    elif model == 'deltaT-rnn':
        self.model = DeltaTGRU(input_dim=self.input_dim,
                               latent_dim=latent_dim,
                               eps_decay=eps_decay,
                               decoder=decoder,
                               timer=timer,
                               device=self.device).to(self.device)
    elif model == 'decay-rnn':
        self.model = ExpDecayGRU(input_dim=self.input_dim,
                                 latent_dim=latent_dim,
                                 eps_decay=eps_decay,
                                 decoder=decoder,
                                 timer=timer,
                                 device=self.device).to(self.device)
    elif model == 'ode-rnn':
        self.model = ODEGRU(input_dim=self.input_dim,
                            latent_dim=latent_dim,
                            eps_decay=eps_decay,
                            decoder=decoder,
                            diffeq_solver=diffq_solver,
                            timer=timer,
                            device=self.device).to(self.device)
    elif model == 'vae-rnn':
        self.model = VAEGRU(input_dim=self.input_dim,
                            latent_dim=latent_dim,
                            eps_decay=eps_decay,
                            encoder_z0=encoder,
                            decoder=decoder,
                            z0_prior=z0_prior,
                            timer=timer,
                            device=self.device).to(self.device)
    elif model == 'latent-ode':
        self.model = LatentODE(input_dim=self.input_dim,
                               latent_dim=latent_dim,
                               eps_decay=eps_decay,
                               encoder_z0=encoder,
                               decoder=decoder,
                               diffeq_solver=diffq_solver,
                               z0_prior=z0_prior,
                               timer=timer,
                               device=self.device).to(self.device)
    else:
        raise NotImplementedError

    if trained_model_path:
        self.model.load_state_dict(
            torch.load(trained_model_path,
                       map_location=self.device)['model_state_dict'])
    if traj_data_path:
        self.load_traj_buffer(traj_data_path)

    self.optimizer = optim.Adam(self.model.parameters(), lr=lr,
                                weight_decay=weight_decay)
def main():
    sess = tf.Session()
    K.set_session(sess)
    env = gym.make("MountainCarContinuous-v0")

    # Parameters
    memory_size = 100000
    batch_size = 32
    tau = 0.001
    lr_actor = 0.0001
    lr_critic = 0.001
    discount_factor = 0.99
    episodes = 1001
    time_steps = 501
    collect_experience = 50000
    save_frequency = 250
    ep_reward = []
    training = False

    # Noise object
    noise = OUNoise(env.action_space)

    # Initialize actor and critic objects
    actor = Actor(env, sess, lr_actor, tau)
    # Uncomment the following line to save the actor model architecture as a json file.
    # It needs to be saved once only.
    # actor.save_model_architecture("Actor_model_architecture.json")
    critic = Critic(env, sess, lr_critic, tau, discount_factor)

    # Initialize replay memory of size defined by memory_size
    replay_memory = ReplayMemory(memory_size)

    # Toggle between true and false for debugging purposes. For training it is always true
    run = True
    if run:
        # Loop over the number of episodes. At each new episode reset the environment,
        # reset the noise state, and set the total episode reward to 0
        for episode in range(episodes):
            state = env.reset()
            noise.reset()
            episode_reward = 0
            # Loop over the number of steps in an episode
            for time in range(time_steps):
                # Uncomment the following line if you want to visualize the mountain car
                # during training. It can also be trained without visualization when
                # positions and velocities are used as the state variables.
                # env.render()

                # Predict an action from the actor model using the current state
                action = actor.predict_action(state.reshape((1, 2)))[0]
                # Add Ornstein-Uhlenbeck noise to the predicted action to encourage
                # exploration of the environment
                exploratory_action = noise.get_action(action, time)
                # Take the noisy action to enter the next state
                next_state, reward, done, _ = env.step(exploratory_action)
                # Predict the action to be taken in next_state. This next-state action
                # is predicted with the actor's target model
                next_action = actor.predict_next_action(next_state.reshape((1, 2)))[0]
                # Append this experience sample to the replay memory
                replay_memory.append(state, exploratory_action, reward,
                                     next_state, next_action, done)

                # Only start training when a minimum number of experience samples
                # is available in memory
                if replay_memory.count() == collect_experience:
                    training = True
                    print('Start training')

                # When training:
                if training:
                    # 1) first draw a random batch of samples from the replay memory
                    batch = replay_memory.sample(batch_size)
                    # 2) using this sample, calculate dQ/dA from the critic model
                    grads = critic.calc_grads(batch)
                    # 3) calculate dA/dTheta from the actor using the same batch
                    # 4) multiply dA/dTheta by negative dQ/dA to get dJ/dTheta
                    # 5) update actor weights such that dJ/dTheta is maximized
                    # 6) the above is easily performed by minimizing the value obtained in (4)
                    t_grads = actor.train(batch, grads)
                    # Update critic weights by minimizing the Bellman loss. Use the actor
                    # target to compute the next action in the next state (already computed
                    # and stored in replay memory) in order to compute the TD target
                    critic.train(batch)
                    # After each weight update of the actor and critic online models,
                    # perform soft updates of their targets so that they smoothly and
                    # slowly track the online models' weights
                    actor.update_target()
                    critic.update_target()

                # Add each step reward to the episode reward
                episode_reward += reward
                # Set current state as next state
                state = next_state
                # If the target is reached before the max allowed time steps, break the inner loop
                if done:
                    break

            # Store episode reward
            ep_reward.append([episode, episode_reward])
            # Print info for each episode to track training progress
            print("Completed in {} steps.... episode: {}/{}, episode reward: {} "
                  .format(time, episode, episodes, episode_reward))
            # Save the model's weights and episode rewards every save_frequency episodes
            if training and (episode % save_frequency) == 0:
                print('Data saved at episode:', episode)
                actor.save_weights('./Model/DDPG_actor_model_{}.h5'.format(episode))
                pickle.dump(ep_reward,
                            open('./Rewards/rewards_{}.dump'.format(episode), 'wb'))

    # Close the mountain car environment
    env.close()
Q.load_state_dict(weights['Q'])
Q_targ.load_state_dict(weights['Q_targ'])

# Learn params
gamma = 0.99

# Hyperparams
frame_count = 40000
eps_decay_time = 0.5
eps_start = 1
eps_end = 0.05
l = -math.log(eps_end) / (frame_count * eps_decay_time)
replay_mem = ReplayMemory(max_size=500000, alpha=0.5, eps=0.0) \
    if prev_state is None else prev_state['replay_mem']
episode_depth = 10000
batch_size = 32
Q_targ_update_freq = 300
save_freq = frame_count / 100

# Episode loop
curr_eps = eps_start if prev_state is None else prev_state['end_eps']
episode_num = 0 if prev_state is None else (prev_state['end_episode'] + 1)
curr_frame_count = 0 if prev_state is None else prev_state[
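# The alpha/eps arguments above suggest proportional prioritized replay.
# A sketch of the sampling distribution they usually parameterize (the
# class's internals are not shown, so this is an assumption):
import numpy as np


def proportional_probs(td_errors, alpha=0.5, eps=0.0):
    # P(i) = (|delta_i| + eps)^alpha / sum_j (|delta_j| + eps)^alpha;
    # alpha=0 recovers uniform sampling, eps keeps zero-error transitions sampleable.
    p = (np.abs(td_errors) + eps) ** alpha
    return p / p.sum()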
def __init__(self, dqn, num_actions, gamma=0.99, learning_rate=0.00025,
             replay_start_size=50000, epsilon_start=1.0, epsilon_end=0.01,
             epsilon_steps=1000000, update_freq=4, target_copy_freq=30000,
             replay_memory_size=1000000, frame_history=4, batch_size=32,
             error_clip=1, restore_network_file=None, double=True):
    self.dqn = dqn
    config = tf.ConfigProto()
    config.gpu_options.allow_growth = True
    self.sess = tf.Session(config=config)

    self.inp_actions = tf.placeholder(tf.float32, [None, num_actions])
    inp_shape = [None] + list(self.dqn.get_input_shape()) + [frame_history]
    inp_dtype = self.dqn.get_input_dtype()
    assert type(inp_dtype) is str
    self.inp_frames = tf.placeholder(inp_dtype, inp_shape)
    self.inp_sp_frames = tf.placeholder(inp_dtype, inp_shape)
    self.inp_terminated = tf.placeholder(tf.bool, [None])
    self.inp_reward = tf.placeholder(tf.float32, [None])
    self.inp_mask = tf.placeholder(inp_dtype, [None, frame_history])
    self.inp_sp_mask = tf.placeholder(inp_dtype, [None, frame_history])
    self.gamma = gamma

    with tf.variable_scope('online'):
        mask_shape = [-1] + [1] * len(self.dqn.get_input_shape()) + [frame_history]
        mask = tf.reshape(self.inp_mask, mask_shape)
        masked_input = self.inp_frames * mask
        self.q_online = self.dqn.construct_q_network(masked_input)
    with tf.variable_scope('target'):
        mask_shape = [-1] + [1] * len(self.dqn.get_input_shape()) + [frame_history]
        sp_mask = tf.reshape(self.inp_sp_mask, mask_shape)
        masked_sp_input = self.inp_sp_frames * sp_mask
        self.q_target = self.dqn.construct_q_network(masked_sp_input)

    if double:
        with tf.variable_scope('online', reuse=True):
            self.q_online_prime = self.dqn.construct_q_network(masked_sp_input)
        # Double DQN: select next actions with the online network, evaluate them
        # with the target network (note that the batch size of 32 is hardcoded here)
        self.maxQ = tf.gather_nd(
            self.q_target,
            tf.transpose([tf.range(0, 32, dtype=tf.int32),
                          tf.cast(tf.argmax(self.q_online_prime, axis=1), tf.int32)],
                         [1, 0]))
    else:
        self.maxQ = tf.reduce_max(self.q_target, reduction_indices=1)

    self.r = tf.sign(self.inp_reward)
    use_backup = tf.cast(tf.logical_not(self.inp_terminated), dtype=tf.float32)
    self.y = self.r + use_backup * gamma * self.maxQ
    self.delta = tf.reduce_sum(self.inp_actions * self.q_online,
                               reduction_indices=1) - self.y
    self.error = tf.where(tf.abs(self.delta) < error_clip,
                          0.5 * tf.square(self.delta),
                          error_clip * tf.abs(self.delta))
    self.loss = tf.reduce_sum(self.error)
    self.g = tf.gradients(self.loss, self.q_online)
    optimizer = tf.train.RMSPropOptimizer(learning_rate=learning_rate, decay=0.95,
                                          centered=True, epsilon=0.01)
    self.train_op = optimizer.minimize(self.loss, var_list=th.get_vars('online'))
    self.copy_op = th.make_copy_op('online', 'target')
    self.saver = tf.train.Saver(var_list=th.get_vars('online'))

    self.replay_buffer = ReplayMemory(self.dqn.get_input_shape(),
                                      self.dqn.get_input_dtype(),
                                      replay_memory_size, frame_history)
    self.frame_history = frame_history
    self.replay_start_size = replay_start_size
    self.epsilon = epsilon_start
    self.epsilon_min = epsilon_end
    self.epsilon_steps = epsilon_steps
    self.epsilon_delta = (self.epsilon - self.epsilon_min) / self.epsilon_steps
    self.update_freq = update_freq
    self.target_copy_freq = target_copy_freq
    self.action_ticker = 1
    self.num_actions = num_actions
    self.batch_size = batch_size

    self.sess.run(tf.initialize_all_variables())
    if restore_network_file is not None:
        self.saver.restore(self.sess, restore_network_file)
        print('Restored network from file')
    self.sess.run(self.copy_op)
                      args.sample_size, sample_ratio=0.5, is_valid=True,
                      need_feat=args.history)
train = BatchProvider(train_iter, train_lst, True, args.sample_size,
                      sample_ratio=0.5, need_feat=args.history)
N = args.num_id
cmcs, ap, cmcn, vscores, vturns = [[], [], [], []], [], [1, 5, 10, 20], [], []
iterations = args.num_examples
memory = ReplayMemory(replay_size=args.memory_size, alpha=args.pr_alpha)
epsilon = 1.0
final_epsilon = args.final_epsilon
rand_ep, fix_ep = 0, int(args.num_epoches * args.exp_ratio)
epsilon_shr = (epsilon - final_epsilon) / (fix_ep - rand_ep) / iterations
max_penalty = 1
frf = open(('figurelog/%s' % args.mode), 'w')
for e in range(args.num_epoches):
    if args.verbose:
        print('Epoch', e)
    for batch in range(iterations):
        if args.verbose:
            print('Epoch', e, 'batch', batch)
def start():
    torch.cuda.empty_cache()
    rospy.init_node('deepracer_controller_mpc', anonymous=True)

    pose_sub2 = rospy.Subscriber("/gazebo/model_states_drop", ModelStates, get_vehicle_state)
    # x_sub1 = rospy.Subscriber("/move_base_simple/goal", PoseStamped, get_clicked_point)
    lidar_sub2 = rospy.Subscriber("/scan", LaserScan, get_lidar_data)

    pose_sub = message_filters.Subscriber("/gazebo/model_states_drop", ModelStates)
    lidar_sub = message_filters.Subscriber("/scan", LaserScan)
    ts = message_filters.ApproximateTimeSynchronizer([pose_sub, lidar_sub], 10, 0.1,
                                                     allow_headerless=True)
    ts.registerCallback(filtered_data)

    target_point = [10, 8.5]
    env = DeepracerGym(target_point)

    # while not rospy.is_shutdown():
    #     time.sleep(1)
    #     print('---------------------------', check_env(env))

    # max_time_step = 3000
    # max_eposide = 1
    # e = 0
    # while not rospy.is_shutdown():
    #     time.sleep(1)  # Do not remove this
    #     state = env.reset()
    #     env.stop_car()
    #     time.sleep(1)
    #     while e < max_eposide:
    #         e += 1
    #         # state = env.reset()
    #         for _ in range(max_time_step):
    #             action = np.array([0.1, -1])
    #             n_state, reward, done, info = env.step(action)
    #             # display(n_state[2])
    #             time.sleep(0.01)
    #             print(n_state[2], end='\r')
    #             if done:
    #                 state = env.reset()
    #                 break
    #     return True
    # rospy.spin()

    while not rospy.is_shutdown():
        # Training Script
        rospy.sleep(1)  # Do not remove this
        state = env.reset()  # Do not remove this

        torch.manual_seed(args.seed)
        np.random.seed(args.seed)
        agent = SAC(env.observation_space.shape[0], env.action_space, args)

        # Pretrained Agent
        # actor_path = "models/sac_actor_<DeepracerGym instance>_"
        # critic_path = "models/sac_critic_<DeepracerGym instance>_"
        # agent.load_model(actor_path, critic_path)

        # Memory
        memory = ReplayMemory(args.replay_size, args.seed)

        # Tensorboard
        writer = SummaryWriter('runs/{}_SAC_{}_{}_{}'.format(
            datetime.datetime.now().strftime("%Y-%m-%d_%H-%M-%S"),
            'DeepracerGym', args.policy,
            "autotune" if args.automatic_entropy_tuning else ""))

        total_numsteps = 0
        updates = 0
        num_goal_reached = 0

        for i_episode in itertools.count(1):
            # print("New episode")
            episode_reward = 0
            episode_steps = 0
            done = False
            state = env.reset()
            while not done:
                start_time = time.time()
                if args.start_steps > total_numsteps:
                    action = env.action_space.sample()  # Sample random action
                else:
                    action = agent.select_action(state)  # Sample action from policy
                rospy.sleep(0.02)
                next_state, reward, done, _ = env.step(action)  # Step
                if (reward > 9) and (episode_steps > 1):
                    # Count the number of times the goal is reached
                    num_goal_reached += 1
                episode_steps += 1
                total_numsteps += 1
                episode_reward += reward
                if episode_steps > args.max_episode_length:
                    done = True
                # Ignore the "done" signal if it comes from hitting the time horizon.
                # (https://github.com/openai/spinningup/blob/master/spinup/algos/sac/sac.py)
                mask = 1 if episode_steps == args.max_episode_length else float(not done)
                # mask = float(not done)
                memory.push(state, action, reward, next_state, mask)  # Append transition to memory
                state = next_state
                print(done)

            # if i_episode % UPDATE_EVERY == 0:
            if len(memory) > args.batch_size:
                # Number of updates per step in environment
                for i in range(args.updates_per_step * args.max_episode_length):
                    # Update parameters of all the networks
                    critic_1_loss, critic_2_loss, policy_loss, ent_loss, alpha = \
                        agent.update_parameters(memory, args.batch_size, updates)
                    writer.add_scalar('loss/critic_1', critic_1_loss, updates)
                    writer.add_scalar('loss/critic_2', critic_2_loss, updates)
                    writer.add_scalar('loss/policy', policy_loss, updates)
                    writer.add_scalar('loss/entropy_loss', ent_loss, updates)
                    writer.add_scalar('entropy_temprature/alpha', alpha, updates)
                    updates += 1

            if total_numsteps > args.num_steps:
                break

            if episode_steps > 1:
                writer.add_scalar('reward/train', episode_reward, i_episode)
                writer.add_scalar('reward/episode_length', episode_steps, i_episode)
                writer.add_scalar('reward/num_goal_reached', num_goal_reached, i_episode)

            print("Episode: {}, total numsteps: {}, episode steps: {}, reward: {}".format(
                i_episode, total_numsteps, episode_steps, round(episode_reward, 2)))
            print("Number of Goals Reached: ", num_goal_reached)

        print('----------------------Training Ending----------------------')
        env.stop_car()
        agent.save_model("corridor_straight", suffix="1")
        return True

    rospy.spin()
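# The mask pushed above is 0 on true terminations and 1 on time-limit
# truncations, so the bootstrap term survives truncation. A self-contained
# sketch of the soft Q target that would consume it inside update_parameters
# (tensor names and the alpha default are assumptions, not the repo's API):
import torch


def soft_q_target(reward, mask, next_q_min, next_log_prob, gamma=0.99, alpha=0.2):
    # y = r + mask * gamma * (min_i Q_i(s', a') - alpha * log pi(a'|s'))
    return reward + mask * gamma * (next_q_min - alpha * next_log_prob)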
if __name__ == '__main__':
    parser = argparse.ArgumentParser(description='Multi-agent DDPG')
    # add arguments
    parser.add_argument('--grid_size', default=100, type=int,
                        help='the size of the grid world')
    parser.add_argument('--n_actions', default=7, type=int,
                        help='total number of actions an agent can take')
    parser.add_argument('--filename', default='../data/pr.txt', type=str,
                        help='pick-up probability file')
    parser.add_argument('--n_agents', default=4, type=int,
                        help='the number of agents playing in the environment')
    parser.add_argument('--runs', default=1, type=int,
                        help='the number of times to run the game')
    parser.add_argument('--aggre', default=False,
                        help='the number of times to run the game')
    # parse args
    args = parser.parse_args()

    env = GridWorld(args=args, terminal_time=1000, reward_stay=-0.1,
                    reward_hitwall=-1, reward_move=-0.1, reward_pick=10)

    # Create memory
    memory = ReplayMemory(buffer=50000, batchSize=500)
    # Create a network
    dqn = DQN(memory=memory)

    # Evaluating......
    print('\nCollecting experience...')
    text_file = open("../results/output_dqn.txt", "w")
    for i_episode in range(4000):
        s, done = env.reset()
        a = torch.LongTensor(args.n_agents)
        ep_r = 0
        while True:
            for i in range(args.n_agents):
                (x, y) = s[i]['loc']
                one_hot_state = torch.Tensor(100 * 100)
def __init__(self, memory_entry_size):
    self.discount = .99
    self.double_q = True
    self.memory_entry_size = memory_entry_size
    self.memory = ReplayMemory(self.memory_entry_size)
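# With double_q enabled as above, the target typically selects actions with
# the online network and evaluates them with the target network. A minimal
# sketch under that assumption (array names are illustrative, not this
# class's actual members):
import numpy as np


def double_dqn_target(q_online_next, q_target_next, reward, terminal, discount=0.99):
    # a* = argmax_a Q_online(s', a);  y = r + (1 - terminal) * gamma * Q_target(s', a*)
    best_actions = np.argmax(q_online_next, axis=1)
    bootstrap = q_target_next[np.arange(len(best_actions)), best_actions]
    return reward + (1.0 - terminal) * discount * bootstrap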
def train(config_filepath, save_dir, device, visualize_interval):
    conf = load_toml_config(config_filepath)
    data_dir, log_dir = create_save_dir(save_dir)
    # Save config file
    shutil.copyfile(config_filepath,
                    os.path.join(save_dir, os.path.basename(config_filepath)))
    device = torch.device(device)

    # Set up log metrics
    metrics = {
        'episode': [],
        'episodic_step': [],
        'collected_total_samples': [],
        'reward': [],
        'q_loss': [],
        'policy_loss': [],
        'alpha_loss': [],
        'alpha': [],
        'policy_switch_epoch': [],
        'policy_switch_sample': [],
        'test_episode': [],
        'test_reward': [],
    }
    policy_switch_samples = conf.policy_switch_samples if hasattr(
        conf, "policy_switch_samples") else None
    total_collected_samples = 0

    # Create environment
    env = make_env(conf.environment, render=False)

    # Instantiate modules
    # memory = ReplayBuffer(int(conf.replay_buffer_capacity), env.observation_space.shape, env.action_space.shape)
    memory = ReplayMemory(conf.replay_buffer_capacity)
    agent = getattr(agents, conf.agent_type)(env.observation_space,
                                             env.action_space,
                                             device=device, **conf.agent)

    # Load checkpoint if specified in config
    if conf.checkpoint != '':
        ckpt = torch.load(conf.checkpoint, map_location=device)
        metrics = ckpt['metrics']
        agent.load_state_dict(ckpt['agent'])
        memory.load_state_dict(ckpt['memory'])
        policy_switch_samples = ckpt['policy_switch_samples']
        total_collected_samples = ckpt['total_collected_samples']

    def save_checkpoint():
        # Save checkpoint
        ckpt = {
            'metrics': metrics,
            'agent': agent.state_dict(),
            'memory': memory.state_dict(),
            'policy_switch_samples': policy_switch_samples,
            'total_collected_samples': total_collected_samples
        }
        path = os.path.join(data_dir, 'checkpoint.pth')
        torch.save(ckpt, path)

        # Save agent model only
        model_ckpt = {'agent': agent.state_dict()}
        model_path = os.path.join(data_dir, 'model.pth')
        torch.save(model_ckpt, model_path)

        # Save metrics only
        metrics_ckpt = {'metrics': metrics}
        metrics_path = os.path.join(data_dir, 'metrics.pth')
        torch.save(metrics_ckpt, metrics_path)

    # Train agent
    init_episode = 0 if len(metrics['episode']) == 0 else metrics['episode'][-1] + 1
    pbar = tqdm.tqdm(range(init_episode, conf.episodes))
    reward_moving_avg = None
    agent_update_count = 0
    for episode in pbar:
        episodic_reward = 0
        o = env.reset()
        q1_loss, q2_loss, policy_loss, alpha_loss, alpha = None, None, None, None, None
        for t in range(conf.horizon):
            if total_collected_samples <= conf.random_sample_num:
                # Select random actions at the beginning of training.
                h = env.action_space.sample()
            elif memory.step <= conf.random_sample_num:
                # Select actions from a random latent variable soon after inserting a new subpolicy.
                h = agent.select_action(o, random=True)
            else:
                h = agent.select_action(o)
            a = agent.post_process_action(o, h)  # Convert abstract action h to actual action a

            o_next, r, done, _ = env.step(a)
            total_collected_samples += 1
            episodic_reward += r
            memory.push(o, h, r, o_next, done)
            o = o_next

            if memory.step > conf.random_sample_num:
                # Update agent
                batch_data = memory.sample(conf.agent_update_batch_size)
                q1_loss, q2_loss, policy_loss, alpha_loss, alpha = \
                    agent.update_parameters(batch_data, agent_update_count)
                agent_update_count += 1

            if done:
                break

        # Describe and save episodic metrics
        reward_moving_avg = ((1. - MOVING_AVG_COEF) * reward_moving_avg +
                             MOVING_AVG_COEF * episodic_reward) if reward_moving_avg else episodic_reward
        pbar.set_description(
            "EPISODE {} (total samples {}, subpolicy samples {}) --- Step {}, Reward {:.1f} (avg {:.1f})"
            .format(episode, total_collected_samples, memory.step, t,
                    episodic_reward, reward_moving_avg))
        metrics['episode'].append(episode)
        metrics['reward'].append(episodic_reward)
        metrics['episodic_step'].append(t)
        metrics['collected_total_samples'].append(total_collected_samples)
        if episode % visualize_interval == 0:
            # Visualize metrics
            lineplot(metrics['episode'][-len(metrics['reward']):],
                     metrics['reward'], 'REWARD', log_dir)
            reward_avg = np.array(metrics['reward']) / np.array(metrics['episodic_step'])
            lineplot(metrics['episode'][-len(reward_avg):], reward_avg,
                     'AVG_REWARD', log_dir)
            lineplot(metrics['collected_total_samples'][-len(metrics['reward']):],
                     metrics['reward'], 'SAMPLE-REWARD', log_dir, xaxis='sample')

        # Save metrics for agent update
        if q1_loss is not None:
            metrics['q_loss'].append(np.mean([q1_loss, q2_loss]))
            metrics['policy_loss'].append(policy_loss)
            metrics['alpha_loss'].append(alpha_loss)
            metrics['alpha'].append(alpha)
            if episode % visualize_interval == 0:
                lineplot(metrics['episode'][-len(metrics['q_loss']):],
                         metrics['q_loss'], 'Q_LOSS', log_dir)
                lineplot(metrics['episode'][-len(metrics['policy_loss']):],
                         metrics['policy_loss'], 'POLICY_LOSS', log_dir)
                lineplot(metrics['episode'][-len(metrics['alpha_loss']):],
                         metrics['alpha_loss'], 'ALPHA_LOSS', log_dir)
                lineplot(metrics['episode'][-len(metrics['alpha']):],
                         metrics['alpha'], 'ALPHA', log_dir)

        # Insert a new subpolicy layer and reset memory once a specified number of samples is collected
        if (policy_switch_samples and len(policy_switch_samples) > 0
                and total_collected_samples >= policy_switch_samples[0]):
            print("----------------------\nInsert new policy\n----------------------")
            agent.insert_subpolicy()
            memory.reset()
            metrics['policy_switch_epoch'].append(episode)
            metrics['policy_switch_sample'].append(total_collected_samples)
            policy_switch_samples = policy_switch_samples[1:]

        # Test the policy
        if episode % conf.test_interval == 0:
            test_rewards = []
            for _ in range(conf.test_times):
                episodic_reward = 0
                obs = env.reset()
                for t in range(conf.horizon):
                    h = agent.select_action(obs, eval=True)
                    a = agent.post_process_action(obs, h)
                    obs_next, r, done, _ = env.step(a)
                    episodic_reward += r
                    obs = obs_next
                    if done:
                        break
                test_rewards.append(episodic_reward)
            test_reward_avg, test_reward_std = np.mean(test_rewards), np.std(test_rewards)
            print(" TEST --- ({} episodes) Reward {:.1f} (pm {:.1f})".format(
                conf.test_times, test_reward_avg, test_reward_std))
            metrics['test_episode'].append(episode)
            metrics['test_reward'].append(test_rewards)
            lineplot(metrics['test_episode'][-len(metrics['test_reward']):],
                     metrics['test_reward'], "TEST_REWARD", log_dir)

        # Save checkpoint
        if episode % conf.checkpoint_interval == 0:
            save_checkpoint()

    # Save the final model
    torch.save({'agent': agent.state_dict()}, os.path.join(data_dir, 'final_model.pth'))
def main():
    agent = SAC(state_dim, env.action_space, device, hidden_size, lr, gamma, tau, alpha)
    replay_buffer = ReplayMemory(args.capacity, args.seed)

    if args.train:
        print("Train True")
    if args.load:
        print("Load True")
        # agent.load_model(actor_path="./models_hard1/actor.pth", critic_path="./models_hard1/critic.pth")
        agent.load_model()

    updates = 0
    avg_reward = 0.
    total_steps = 0
    count_1500 = 0
    time_start = time.time()
    scores_deque = deque(maxlen=100)
    avg_scores_array = []

    for i in range(args.iteration):
        ep_r = 0
        ep_s = 0
        done = False
        state = env.reset()
        while not done:
            action = []
            if total_steps < start_steps and not args.load:
                action = env.action_space.sample()
            else:
                use_eval = False
                if args.render:
                    use_eval = True
                else:
                    if i % (test_ep * 2) >= test_ep:
                        use_eval = True
                action = agent.select_action(state, use_eval)

            next_state, reward, done, info = env.step(action)
            reward = reward * reward_scale
            ep_r += reward
            ep_s += 1
            total_steps += 1
            if args.render and i >= args.render_interval:
                env.render()
            mask = 1 if ep_s == 1600 else float(not done)
            if args.train:
                replay_buffer.push(state, action, reward, next_state, mask)
            state = next_state

        if i % (test_ep * 2) >= test_ep:
            avg_reward += ep_r
            writer.add_scalar('reward/test', ep_r, i)
        if i % (test_ep * 2) == test_ep * 2 - 1:
            avg_reward /= test_ep
            writer.add_scalar('reward/test_avg', avg_reward, i / 2)
            avg_reward = 0.

        if args.train:
            for upi in range(ep_s):
                if args.load:
                    if len(replay_buffer) >= 10000:
                        agent.update_parameters(replay_buffer, batch_size, updates, writer)
                        updates += 1
                if not args.load and len(replay_buffer) >= update_start_steps:
                    agent.update_parameters(replay_buffer, batch_size, updates, writer)
                    updates += 1
            writer.add_scalar('reward/train', ep_r, i)

        s = int(time.time() - time_start)
        print("Ep.: {}, Total Steps: {}, Ep.Steps: {}, Score: {:.2f}, Time: {:02}:{:02}:{:02}"
              .format(i, total_steps, ep_s, ep_r, s // 3600, s % 3600 // 60, s % 60))

        if ep_r >= 1500:
            count_1500 += 1
            if count_1500 == 200:
                agent.save_model()
                break
        if args.train:
            if ep_r > 1400:
                agent.save_model()
            if i % 20 == 0:
                agent.save_model()

    env.close()
def __init__(self, args):
    '''Constructor'''
    self.WARM_UP = 0
    self.QUALIFYING = 1
    self.RACE = 2
    self.UNKNOWN = 3
    self.stage = args.stage

    self.parser = msgParser.MsgParser()
    self.state = carState.CarState()
    self.control = carControl.CarControl()

    self.steers = [-1.0, -0.8, -0.6, -0.5, -0.4, -0.3, -0.2, -0.15,
                   -0.1, -0.05, 0.0, 0.05, 0.1, 0.15, 0.2, 0.3, 0.4,
                   0.5, 0.6, 0.8, 1.0]
    self.speeds = [-1.0, -0.5, 0.0, 0.5, 1.0]
    self.num_inputs = 19
    self.num_steers = len(self.steers)
    self.num_speeds = len(self.speeds)
    self.num_actions = self.num_steers + self.num_speeds

    self.net = DeepQNetwork(self.num_inputs, self.num_steers, self.num_speeds, args)
    self.mem = ReplayMemory(args.replay_size, self.num_inputs, args)
    self.minibatch_size = args.batch_size

    if args.load_weights:
        self.net.load_weights(args.load_weights)
    self.save_weights_prefix = args.save_weights_prefix
    self.pretrained_network = args.pretrained_network

    self.steer_lock = 0.785398
    self.max_speed = 100

    self.algorithm = args.algorithm
    self.device = args.device
    self.mode = args.mode
    self.maxwheelsteps = args.maxwheelsteps

    self.enable_training = args.enable_training
    self.enable_exploration = args.enable_exploration

    self.total_train_steps = 0
    self.exploration_decay_steps = args.exploration_decay_steps
    self.exploration_rate_start = args.exploration_rate_start
    self.exploration_rate_end = args.exploration_rate_end

    self.show_sensors = args.show_sensors
    self.show_qvalues = args.show_qvalues

    self.episode = 0
    self.onRestart()

    if self.show_sensors:
        from sensorstats import Stats
        self.stats = Stats(inevery=8)
    if self.show_qvalues:
        from plotq import PlotQ
        self.plotq = PlotQ(self.num_steers, self.num_speeds)
    if self.device == 'wheel':
        from wheel import Wheel
        self.wheel = Wheel(args.joystick_nr, args.autocenter, args.gain,
                           args.min_force, args.max_force)
np.random.seed(args.seed)

state_dim = env.observation_space.shape[0]
action_dim = env.action_space.shape[0]
max_action = int(env.action_space.high[0])

# Initialize and load policy
actor_path = "models/DDPG_actor_{}_{}.pkl".format(args.env_name, args.buffer_type)
critic_path = "models/DDPG_critic_{}_{}.pkl".format(args.env_name, args.buffer_type)
policy = ddpg.DDPG(state_dim, action_dim, max_action)
policy.load(actor_path, critic_path)

# Initialize buffer
memory = ReplayMemory(args.replay_size)

evaluations = []
total_timesteps = 0
episode_num = 0
done = True

while total_timesteps < args.replay_size:
    if done:
        if total_timesteps != 0:
            print("Total T: %d Episode Num: %d Episode T: %d Reward: %f" %
                  (total_timesteps, episode_num, episode_timesteps, episode_reward))
def empty_replay(self):
    return ReplayMemory(30, [1, 1, 1], 5, 200, np.random.RandomState(456))
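# A hedged note on the seeded RandomState above: passing an explicit RNG into
# the memory makes its sampling reproducible across test runs. Minimal
# illustration using numpy only.
import numpy as np

rng_a = np.random.RandomState(456)
rng_b = np.random.RandomState(456)
# Two RNGs built from the same seed draw identical sample indices.
assert (rng_a.randint(0, 30, size=5) == rng_b.randint(0, 30, size=5)).all()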
torch.manual_seed(args.seed)
np.random.seed(args.seed)

# Agent
agent = SAC(env.observation_space.shape[0], env.action_space, args)
# path = 'models/sac_CHANGE_LineFollower-v0_normal'
# agent.load_model(path.replace('CHANGE', 'actor'),
#                  path.replace('CHANGE', 'critic'))

# Tensorboard
writer = SummaryWriter('runs/{}_SAC_{}_{}_{}'.format(
    datetime.datetime.now().strftime("%Y-%m-%d_%H-%M-%S"), args.env_name,
    args.policy, "autotune" if args.automatic_entropy_tuning else ""))

# Memory
memory = ReplayMemory(args.replay_size, args.seed)

# Training Loop
total_numsteps = 0
updates = 0
did_it = False

for i_episode in itertools.count(1):
    episode_reward = 0
    episode_steps = 0
    done = False
    episode = []
    state = env.reset()
    if did_it:
        did_it = False
    while not done:
        if args.start_steps > total_numsteps:
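# The snippet above cuts off at the start_steps branch; in the reference SAC
# training loop this branch usually samples uniform random actions until
# enough warm-up steps have been taken. A hedged sketch of that pattern, with
# _Space and _Agent as stand-in stubs rather than the original classes:
import random

class _Space:
    def sample(self):
        return random.uniform(-1, 1)

class _Agent:
    def select_action(self, state):
        return 0.0

def choose_action(agent, action_space, state, total_numsteps, start_steps):
    # Warm-up phase: uniform random exploration until start_steps is reached.
    if start_steps > total_numsteps:
        return action_space.sample()
    return agent.select_action(state)

print(choose_action(_Agent(), _Space(), 0.0, total_numsteps=10, start_steps=100))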
def __init__(self,
             scenario_tag=None,
             model_savefile=None,
             run_id_string=None,
             network_class="DQNNet",
             write_summaries=True,
             tf_logdir="tensorboard_logs",
             epochs=100,
             train_steps_per_epoch=1000000,
             test_episodes_per_epoch=100,
             run_tests=True,
             initial_epsilon=1.0,
             final_epsilon=0.0000,
             epsilon_decay_steps=10e07,
             epsilon_decay_start_step=2e05,
             frozen_steps=5000,
             batchsize=32,
             memory_capacity=10000,
             update_pattern=(4, 4),
             prioritized_memory=False,
             enable_progress_bar=True,
             save_interval=1,
             writer_max_queue=10,
             writer_flush_secs=120,
             dynamic_frameskips=None,
             **settings):
    if prioritized_memory:
        # TODO maybe some day ...
        raise NotImplementedError("Prioritized memory not implemented. Maybe some day.")

    if dynamic_frameskips:
        if isinstance(dynamic_frameskips, (list, tuple)):
            self.frameskips = list(dynamic_frameskips)
        elif isinstance(dynamic_frameskips, int):
            self.frameskips = list(range(1, dynamic_frameskips + 1))
    else:
        self.frameskips = [None]

    self.update_pattern = update_pattern
    self.write_summaries = write_summaries
    self._settings = settings
    self.run_id_string = run_id_string
    self.train_steps_per_epoch = train_steps_per_epoch
    self._run_tests = test_episodes_per_epoch > 0 and run_tests
    self.test_episodes_per_epoch = test_episodes_per_epoch
    self._epochs = np.float32(epochs)

    self.doom_wrapper = VizdoomWrapper(**settings)
    misc_len = self.doom_wrapper.misc_len
    img_shape = self.doom_wrapper.img_shape
    self.use_misc = self.doom_wrapper.use_misc
    self.actions_num = self.doom_wrapper.actions_num
    self.replay_memory = ReplayMemory(img_shape, misc_len,
                                      batch_size=batchsize,
                                      capacity=memory_capacity)
    self.network = getattr(networks, network_class)(
        actions_num=self.actions_num * len(self.frameskips),
        img_shape=img_shape,
        misc_len=misc_len,
        **settings)

    self.batchsize = batchsize
    self.frozen_steps = frozen_steps
    self.save_interval = save_interval
    self._model_savefile = model_savefile

    # TODO move summaries somewhere so they are consistent between dqn and asyncs
    if self.write_summaries:
        assert tf_logdir is not None
        create_directory(tf_logdir)
        self.scores_placeholder, summaries = setup_vector_summaries(scenario_tag + "/scores")
        self._summaries = tf.summary.merge(summaries)
        self._train_writer = tf.summary.FileWriter(
            "{}/{}/{}".format(tf_logdir, self.run_id_string, "train"),
            flush_secs=writer_flush_secs, max_queue=writer_max_queue)
        self._test_writer = tf.summary.FileWriter(
            "{}/{}/{}".format(tf_logdir, self.run_id_string, "test"),
            flush_secs=writer_flush_secs, max_queue=writer_max_queue)
    else:
        self._train_writer = None
        self._test_writer = None
        self._summaries = None

    self.steps = 0
    # TODO epoch as tf variable?
    self._epoch = 1

    # Epsilon: linear decay after a warm-up period
    self.epsilon_decay_rate = (initial_epsilon - final_epsilon) / epsilon_decay_steps
    self.epsilon_decay_start_step = epsilon_decay_start_step
    self.initial_epsilon = initial_epsilon
    self.final_epsilon = final_epsilon

    self.enable_progress_bar = enable_progress_bar
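# A minimal sketch of the linear epsilon schedule implied by the fields above
# (initial_epsilon, final_epsilon, epsilon_decay_rate, epsilon_decay_start_step).
# The exact method name in the original class is unknown; `epsilon_at` is
# illustrative only.
def epsilon_at(step, initial=1.0, final=0.0, decay_steps=10e07, start=2e05):
    rate = (initial - final) / decay_steps
    if step < start:
        return initial  # hold epsilon constant during warm-up
    return max(final, initial - rate * (step - start))

assert epsilon_at(0) == 1.0
assert abs(epsilon_at(2e05 + 10e07)) < 1e-9  # fully decayed to final_epsilon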
def __init__(self, dimO, dimA):
    dimA = list(dimA)
    dimO = list(dimO)

    nets = nets_dm

    # init replay memory
    self.rm = ReplayMemory(rm_size, dimO, dimA, dtype=np.__dict__[rm_dtype])
    self.rrrm = ReplayMemory(rm_size, dimO, dimA, dtype=np.__dict__[rm_dtype])

    # start tf session
    self.sess = tf.Session(
        config=tf.ConfigProto(inter_op_parallelism_threads=threads,
                              log_device_placement=False,
                              allow_soft_placement=True))

    # create tf computational graph
    self.theta_p = nets.theta_p(dimO, dimA)
    self.theta_q = nets.theta_q(dimO, dimA)
    self.theta_pt, update_pt = exponential_moving_averages(self.theta_p, tau)
    self.theta_qt, update_qt = exponential_moving_averages(self.theta_q, tau)

    obs = tf.placeholder(tf.float32, [None] + dimO, "obs")
    act_test, sum_p = nets.policy(obs, self.theta_p)

    # explore: Ornstein-Uhlenbeck noise added to the deterministic policy
    noise_init = tf.zeros([1] + dimA)
    noise_var = tf.Variable(noise_init)
    self.ou_reset = noise_var.assign(noise_init)
    noise = noise_var.assign_sub(
        (FLAGS.ou_theta) * noise_var -
        tf.random_normal(dimA, stddev=FLAGS.ou_sigma))
    act_expl = act_test + noise

    # test
    q, sum_q = nets.qfunction(obs, act_test, self.theta_q)

    # training
    # policy loss
    meanq = tf.reduce_mean(q, 0)
    wd_p = tf.add_n([pl2 * tf.nn.l2_loss(var) for var in self.theta_p])  # weight decay
    loss_p = -meanq + wd_p
    # policy optimization
    optim_p = tf.train.AdamOptimizer(learning_rate=lrp)
    grads_and_vars_p = optim_p.compute_gradients(loss_p, var_list=self.theta_p)
    optimize_p = optim_p.apply_gradients(grads_and_vars_p)
    with tf.control_dependencies([optimize_p]):
        train_p = tf.group(update_pt)

    # q optimization
    act_train = tf.placeholder(tf.float32, [FLAGS.bsize] + dimA, "act_train")
    rew = tf.placeholder(tf.float32, [FLAGS.bsize], "rew")
    term = tf.placeholder(tf.bool, [FLAGS.bsize], "term")
    obs2 = tf.placeholder(tf.float32, [FLAGS.bsize] + dimO, "obs2")
    # q
    q_train, sum_qq = nets.qfunction(obs, act_train, self.theta_q)
    # q targets
    act2, sum_p2 = nets.policy(obs2, theta=self.theta_pt)
    q2, sum_q2 = nets.qfunction(obs2, act2, theta=self.theta_qt)
    q_target = tf.stop_gradient(tf.select(term, rew, rew + discount * q2))
    # q_target = tf.stop_gradient(rew + discount * q2)
    # q loss
    td_error = q_train - q_target
    ms_td_error = tf.reduce_mean(tf.square(td_error), 0)
    wd_q = tf.add_n([ql2 * tf.nn.l2_loss(var) for var in self.theta_q])  # weight decay
    loss_q = ms_td_error + wd_q
    # q optimization
    optim_q = tf.train.AdamOptimizer(learning_rate=lrq)
    grads_and_vars_q = optim_q.compute_gradients(loss_q, var_list=self.theta_q)
    optimize_q = optim_q.apply_gradients(grads_and_vars_q)
    with tf.control_dependencies([optimize_q]):
        train_q = tf.group(update_qt)

    # logging
    log_obs = [] if dimO[0] > 20 else [
        tf.histogram_summary("obs/" + str(i), obs[:, i]) for i in range(dimO[0])
    ]
    log_act = [] if dimA[0] > 20 else [
        tf.histogram_summary("act/inf" + str(i), act_test[:, i]) for i in range(dimA[0])
    ]
    log_act2 = [] if dimA[0] > 20 else [
        tf.histogram_summary("act/train" + str(i), act_train[:, i]) for i in range(dimA[0])
    ]
    log_misc = [sum_p, sum_qq, tf.histogram_summary("td_error", td_error)]
    log_grad = [grad_histograms(grads_and_vars_p), grad_histograms(grads_and_vars_q)]
    log_train = log_obs + log_act + log_act2 + log_misc + log_grad

    # initialize tf log writer
    self.writer = tf.train.SummaryWriter(FLAGS.outdir + "/tf", self.sess.graph, flush_secs=20)

    # init replay memory for recording episodes
    max_ep_length = 10000
    self.rm_log = ReplayMemory(max_ep_length, dimO, dimA, rm_dtype)

    # tf functions
    with self.sess.as_default():
        self._act_test = Fun(obs, act_test)
        self._act_expl = Fun(obs, act_expl)
        self._reset = Fun([], self.ou_reset)
        self._train_q = Fun([obs, act_train, rew, term, obs2], [train_q], log_train, self.writer)
        self._train_p = Fun([obs], [train_p], log_train, self.writer)
        self._train = Fun([obs, act_train, rew, term, obs2], [train_p, train_q], log_train, self.writer)

    # initialize tf variables
    self.saver = tf.train.Saver(max_to_keep=1)
    ckpt = tf.train.latest_checkpoint(FLAGS.outdir + "/tf")
    if ckpt:
        self.saver.restore(self.sess, ckpt)
    else:
        self.sess.run(tf.initialize_all_variables())

    self.sess.graph.finalize()

    self.t = 0  # global training time (number of observations)
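# A hedged sketch of the exponential_moving_averages target-network update used
# above: the target parameters track the online parameters at rate tau,
# theta_t <- (1 - tau) * theta_t + tau * theta. `ema_update` is an illustrative
# numpy rendition, not the original TF implementation.
import numpy as np

def ema_update(theta_target, theta, tau):
    # Blend each target tensor toward its online counterpart.
    return [(1.0 - tau) * t + tau * p for t, p in zip(theta_target, theta)]

online = [np.ones(3)]
target = [np.zeros(3)]
target = ema_update(target, online, tau=0.001)
print(target[0])  # approximately [0.001, 0.001, 0.001]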
def train(active_mv):
    senv = ShapeNetEnv(FLAGS)
    replay_mem = ReplayMemory(FLAGS)

    #### for debug
    #a = np.array([[1,0,1],[0,0,0]])
    #b = np.array([[1,0,1],[0,1,0]])
    #print('IoU: {}'.format(replay_mem.calu_IoU(a, b)))
    #sys.exit()
    #### for debug

    log_string('====== Starting burning in memories ======')
    burn_in(senv, replay_mem)
    log_string('====== Done. {} trajectories burnt in ======'.format(FLAGS.burn_in_length))

    #epsilon = FLAGS.init_eps
    K_single = np.asarray([[420.0, 0.0, 112.0], [0.0, 420.0, 112.0], [0.0, 0.0, 1]])
    K_list = np.tile(K_single[None, None, ...], (1, FLAGS.max_episode_length, 1, 1))

    ### burn in (pretrain) for MVnet
    if FLAGS.burn_in_iter > 0:
        for i in xrange(FLAGS.burn_in_iter):
            mvnet_input = replay_mem.get_batch_list(FLAGS.batch_size)
            tic = time.time()
            out_stuff = active_mv.run_step(mvnet_input, mode='burnin', is_training=True)
            burnin_log(i, out_stuff, time.time() - tic)

    rollout_obj = Rollout(active_mv, senv, replay_mem, FLAGS)

    for i_idx in xrange(FLAGS.max_iter):
        t0 = time.time()
        rollout_obj.go(i_idx, verbose=True, add_to_mem=True, mode='random')
        t1 = time.time()
        replay_mem.enable_gbl()
        mvnet_input = replay_mem.get_batch_list(FLAGS.batch_size)
        t2 = time.time()
        out_stuff = active_mv.run_step(mvnet_input, mode='train_mv', is_training=True)
        replay_mem.disable_gbl()
        t3 = time.time()
        train_log(i_idx, out_stuff, (t0, t1, t2, t3))
        active_mv.train_writer.add_summary(out_stuff.merged_train, i_idx)

        if i_idx % FLAGS.save_every_step == 0 and i_idx > 0:
            save(active_mv, i_idx, i_idx, i_idx)

        if i_idx % FLAGS.test_every_step == 0 and i_idx > 0:
            evaluate(active_mv, FLAGS.test_episode_num, replay_mem, i_idx, rollout_obj)
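# A minimal sketch of the burn-in pattern above: pre-fill the replay memory
# with trajectories from a scripted or random policy before any gradient
# steps are taken. `collect_trajectory` and the deque-backed memory are
# illustrative stand-ins, not the original ReplayMemory or burn_in.
from collections import deque
import random

def collect_trajectory(length=5):
    # One toy trajectory of (observation, action) pairs.
    return [(random.random(), random.randrange(3)) for _ in range(length)]

def burn_in(memory, num_trajectories):
    for _ in range(num_trajectories):
        memory.extend(collect_trajectory())

memory = deque(maxlen=1000)
burn_in(memory, num_trajectories=10)
print(len(memory))  # 50 transitions available before training starts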