def main():
    parser = argparse.ArgumentParser()
    parser.add_argument("--environment", type=str, default='Pendulum-v0')
    parser.add_argument("--time-steps", type=int, default=30000)
    parser.add_argument("--replay-mem-size", type=int, default=1000000)
    parser.add_argument("--batch-size", type=int, default=64)
    parser.add_argument("--learning-rate", type=float, default=.9)
    args = parser.parse_args()

    env = gym.make(args.environment)

    unroll = 20
    state_dim = env.observation_space.shape[0]
    action_dim = env.action_space.shape[0]
    action_bound_high = env.action_space.high
    action_bound_low = env.action_space.low

    agent = direct_policy_search(state_dim, action_dim, action_bound_high,
                                 action_bound_low, unroll, .9, 5,
                                 'direct_policy_search')

    # Replay memory
    memory = Memory(args.replay_mem_size)

    with tf.Session() as sess:
        sess.run(tf.global_variables_initializer())
        state = env.reset()
        total_rewards = 0.0
        epoch = 1
        for time_steps in range(args.time_steps):
            #env.render()
            action = agent.act(sess, state)
            next_state, reward, done, _ = env.step(action)
            total_rewards += float(reward)

            # Store tuple in replay memory
            memory.add([np.atleast_2d(state), np.atleast_2d(action), reward,
                        np.atleast_2d(next_state), done])

            # Training step
            batch = np.array(memory.sample(args.batch_size))
            assert len(batch) > 0
            states = np.concatenate(batch[:, 0], axis=0)

            # Train the agent
            agent.train(sess, states)

            # s <- s'
            state = np.copy(next_state)

            if done:
                print('time steps', time_steps, 'epoch', epoch,
                      'total rewards', total_rewards, 'unroll', unroll)
                epoch += 1
                total_rewards = 0.
                state = env.reset()
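# NOTE: the Memory class used above is not included in this file. Below is a
# minimal sketch of a compatible uniform replay buffer; the attribute names
# (.mem, .max_size) follow how other snippets in this file reference it, and
# this is an assumption rather than the original implementation (at least one
# snippet appears to use a prioritized variant with a different interface).
import random

class Memory:
    def __init__(self, max_size):
        self.max_size = max_size
        self.mem = []

    def add(self, experience):
        # Evict the oldest transition once the buffer is full (FIFO)
        if len(self.mem) >= self.max_size:
            self.mem.pop(0)
        self.mem.append(experience)

    def sample(self, batch_size):
        # Uniform random sample; returns fewer items while the buffer is still filling
        return random.sample(self.mem, min(batch_size, len(self.mem)))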
class DDPG:
    def __init__(self, env, batch_size=32, gamma=0.99, hidden_units=32,
                 maxlen=10000, tau=0.1, actor_lr=0.001, critic_lr=0.001):
        self.env = env
        self.batch_size = batch_size
        self.gamma = gamma
        self.maxlen = maxlen
        self.sess = tf.Session()
        self.actor = Actor(env, self.sess, hidden_units, tau, actor_lr)
        self.critic = Critic(env, self.sess, hidden_units, tau, critic_lr)
        self.memory = Memory(maxlen)
        self.sess.run(tf.global_variables_initializer())
        self.step = 0

    def store(self, exp):
        self.memory.add(exp)

    def update(self):
        # Wait until the buffer holds enough transitions before training
        if len(self.memory.buffer) < 1000:  # self.batch_size
            return
        self.step += 1
        data = self.memory.sample(self.batch_size)
        s = np.array([d[0] for d in data])
        a = np.array([d[1] for d in data])
        r = np.array([d[2] for d in data])
        s_ = np.array([d[3] for d in data])

        # Critic target: y = r + gamma * Q'(s', pi'(s'))
        a_ = self.actor.target_model.predict(s_)
        target_q = self.critic.target_model.predict([s_, a_])
        #y = np.array([d[2] for d in data])
        #for i in range(self.batch_size):
        #    y[i] += self.gamma * target_q[i]
        y = r[:, np.newaxis] + self.gamma * target_q
        self.critic.model.train_on_batch([s, a], y)

        # Actor update via the deterministic policy gradient
        action = self.actor.model.predict(s)
        grads = self.critic.get_grads(s, action)
        self.actor.train(s, grads)

        # Periodically update the target networks
        if self.step % 10 == 0:
            self.actor.update_weights()
            self.critic.update_weights()

    def get_action(self, s):
        return self.actor.get_action(s)
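# The Actor/Critic classes used by the DDPG wrapper above are not shown in this
# file. One plausible sketch of the update_weights() target update they expose,
# assuming Keras models stored as .model/.target_model and a .tau attribute
# (all assumptions inferred from the usage above, not the original code):
def update_weights(self):
    weights = self.model.get_weights()
    target_weights = self.target_model.get_weights()
    # Soft update: target <- tau * online + (1 - tau) * target
    mixed = [self.tau * w + (1.0 - self.tau) * tw
             for w, tw in zip(weights, target_weights)]
    self.target_model.set_weights(mixed)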
def main(): parser = argparse.ArgumentParser() parser.add_argument("--environment", type=str, default='Pendulum-v0') parser.add_argument("--action-dim", type=int, default=1) parser.add_argument("--state-dim", type=int, default=1) #parser.add_argument("--epochs", type=int, default=30000) parser.add_argument("--time-steps", type=int, default=30000) parser.add_argument('--tau', type=float, help='soft target update parameter', default=0.01) parser.add_argument("--action-bound", type=float, default=1.) parser.add_argument("--replay-mem-size", type=int, default=1000000) parser.add_argument("--batch-size", type=int, default=64) parser.add_argument("--learning-rate", type=float, default=.9) parser.add_argument("--latent-size", type=int, default=4, help='Size of vector for Z') parser.add_argument("--model", type=str, default='gan') parser.add_argument("--mode", type=str, default='none') args = parser.parse_args() assert args.mode in ['none', 'test', 'transfer'] assert args.model in [ 'mlp', 'gan', 'gated', 'dmlac_mlp', 'dmlac_gan', 'dmlac_gated', 'ddpg_unrolled_pg_mlp', 'dmlac_gp', 'dmlac_truth', 'mpc' ] if args.model == 'dmlac_truth': assert args.environment == 'Pendulum-v0' # Initialize environment env = gym.make(args.environment) args.state_dim = env.observation_space.shape[0] args.action_dim = env.action_space.shape[0] #assert args.action_dim == 1 args.action_bound_high = env.action_space.high args.action_bound_low = env.action_space.low assert len(args.action_bound_high) == len(args.action_bound_low) for i in range(len(args.action_bound_high)): assert args.action_bound_high[i] == -args.action_bound_low[i] print(args) jointddpg, update_target_actor, update_target_critic, copy_target_actor, copy_target_critic = init_model( [None, args.state_dim], args.action_dim, args.latent_size, args.learning_rate, args.action_bound_low, args.action_bound_high, args.tau, args.model) # Replay memory memory = Memory(args.replay_mem_size) # Actor noise exploration_strategy = OUStrategy(jointddpg, env) with tf.Session() as sess: sess.run(tf.global_variables_initializer()) #sess.run(copy_target_critic) #sess.run(copy_target_actor) if args.mode in ['test', 'transfer']: env.seed(1) state = env.reset() total_rewards = 0.0 epoch = 1 for time_steps in range(args.time_steps): env.render() # Choose an action exploration = (float(args.time_steps - time_steps) / float(args.time_steps))**4 action = exploration_strategy.action(sess, state[np.newaxis, ...], exploration) # Execute action state1, reward, done, _ = env.step(action) total_rewards += float(reward) # Store tuple in replay memory memory.add([ state[np.newaxis, ...], action[np.newaxis, ...], reward, state1[np.newaxis, ...], done ]) # Training step batch_B = np.array(memory.sample(args.batch_size)) assert len(batch_B) > 0 states_B = np.concatenate(batch_B[:, 0], axis=0) actions_B = np.concatenate(batch_B[:, 1], axis=0) rewards_B = batch_B[:, 2] states1_B = np.concatenate(batch_B[:, 3], axis=0) dones_B = batch_B[:, 4] #Get another batch batch_M = np.array(memory.sample(args.batch_size)) assert len(batch_M) > 0 states_M = np.vstack(batch_M[:, 0]) actions_M = np.concatenate(batch_M[:, 1], axis=0) if args.model == 'dmlac_gp': jointddpg.update_hist(memory) jointddpg.train(sess, states_B, actions_B, rewards_B, states1_B, dones_B, states_M, actions_M, len(batch_M), args.latent_size) # Update target networks #jointddpg.update(self, sess, update_target_critic, update_target_actor) #sess.run(update_target_critic) #sess.run(update_target_actor) state = np.copy(state1) if done == 
True: print('time steps', time_steps, 'epoch', epoch, 'total rewards', total_rewards) epoch += 1 total_rewards = 0. if args.mode == 'transfer': if time_steps >= args.time_steps / 3: env.seed(0) else: env.seed(1) elif args.mode == 'test': env.seed(1) state = env.reset() if args.mode == 'transfer': if time_steps == args.time_steps / 3: memory = Memory(args.replay_mem_size)
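# OUStrategy is used for exploration in several of the training loops in this
# file but is not defined here. A rough sketch with the same call signature
# (.action(sess, state, exploration)); the policy's .act(sess, state) method,
# the noise parameters, and the clipping are assumptions:
class OUStrategy:
    def __init__(self, policy, env, mu=0.0, theta=0.15, sigma=0.2):
        self.policy = policy
        self.low = env.action_space.low
        self.high = env.action_space.high
        self.mu, self.theta, self.sigma = mu, theta, sigma
        self.noise_state = np.ones(env.action_space.shape[0]) * mu

    def evolve_state(self):
        # One Euler step of the Ornstein-Uhlenbeck process
        dx = self.theta * (self.mu - self.noise_state) + \
             self.sigma * np.random.randn(len(self.noise_state))
        self.noise_state += dx
        return self.noise_state

    def action(self, sess, state, exploration):
        # Deterministic policy action plus exploration-scaled OU noise, clipped to bounds
        a = self.policy.act(sess, state)
        return np.clip(a + exploration * self.evolve_state(), self.low, self.high)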
class DDPGagent:
    def __init__(self, hidden_size, env):
        self.num_states = env.observation_space.shape[0]
        self.num_actions = env.action_space.shape[0]

        self.Actor = Actor(input_size=self.num_states, hidden_size=hidden_size,
                           output_size=self.num_actions).cuda()
        self.Actor_target = Actor(input_size=self.num_states, hidden_size=hidden_size,
                                  output_size=self.num_actions).cuda()
        self.Critic = Critic(input_size=self.num_states, hidden_size=hidden_size,
                             output_size=self.num_actions).cuda()
        self.Critic_target = Critic(input_size=self.num_states, hidden_size=hidden_size,
                                    output_size=self.num_actions).cuda()

        # Hard-copy the initial weights into the target networks
        for target_param, param in zip(self.Actor_target.parameters(), self.Actor.parameters()):
            target_param.data.copy_(param.data)
        for target_param, param in zip(self.Critic_target.parameters(), self.Critic.parameters()):
            target_param.data.copy_(param.data)

        self.Memory = Memory(30000)
        self.criterion = nn.MSELoss().cuda()
        self.actor_optimizer = torch.optim.Adam(self.Actor.parameters(), lr=1e-2)
        self.critic_optimizer = torch.optim.Adam(self.Critic.parameters(), lr=1e-1)

    def get_action(self, state):
        state = torch.from_numpy(state).float().unsqueeze(0).cuda()
        action = self.Actor.forward(state)
        action = action.detach().cpu().numpy()
        return action

    def update(self, batch_size):
        states, actions, rewards, next_states, _ = self.Memory.sample(batch_size)
        states = torch.tensor(states).cuda()
        actions = torch.tensor(actions).cuda()
        rewards = torch.tensor(rewards).cuda()
        next_states = torch.tensor(next_states).cuda()

        # Critic target: Q' = r + gamma * Q_target(s', pi_target(s'))
        Q_Value = self.Critic.forward(states, action=actions)
        next_actions = self.Actor_target(next_states)
        next_Q = self.Critic_target.forward(next_states, next_actions.detach())
        Q_prime = rewards + 0.99 * next_Q
        critic_loss = self.criterion(Q_Value, Q_prime)

        # Actor loss: maximize Q(s, pi(s))
        policy_loss = -self.Critic.forward(states, self.Actor.forward(states)).mean()

        self.actor_optimizer.zero_grad()
        policy_loss.backward()
        self.actor_optimizer.step()

        self.critic_optimizer.zero_grad()
        critic_loss.backward()
        self.critic_optimizer.step()

        # Soft-update the target networks (tau = 1e-2)
        for target_param, param in zip(self.Actor_target.parameters(), self.Actor.parameters()):
            target_param.data.copy_(param.data * 1e-2 + target_param.data * (1.0 - 1e-2))
        for target_param, param in zip(self.Critic_target.parameters(), self.Critic.parameters()):
            target_param.data.copy_(param.data * 1e-2 + target_param.data * (1.0 - 1e-2))
def main(): parser = argparse.ArgumentParser() parser.add_argument("--env-interface", type=str, default='gym!atari') parser.add_argument("--environment", type=str, default='CartPole-v0') parser.add_argument("--action-size", type=int, default=2) parser.add_argument("--input-shape", type=list, default=[None, 4]) parser.add_argument("--target-update-freq", type=int, default=200) parser.add_argument("--epsilon-max", type=float, default=1.) parser.add_argument("--epsilon-min", type=float, default=.01) parser.add_argument("--epsilon-decay", type=float, default=.001) parser.add_argument("--learning-rate", type=float, default=.99) parser.add_argument("--batch-size", type=int, default=64) parser.add_argument("--epochs", type=int, default=30000) parser.add_argument("--replay-mem-size", type=int, default=1000000) parser.add_argument("--K", type=int, default=1, help='The number of steps to train the environment') parser.add_argument( "--L", type=int, default=1, help='The number of Q-learning steps for hypothetical rollouts') parser.add_argument("--latent-size", type=int, default=4, help='Size of vector for Z') args = parser.parse_args() env = env_interface(args.env_interface, args.environment, pixel_feature=False, render=True) #args.action_size = env.action_space.n args.action_size = env.action_size args.input_shape = [None] + list(env.obs_space_shape) print args # Other parameters epsilon = args.epsilon_max # Replay memory memory = Memory(args.replay_mem_size) # Time step time_step = 0. # Initialize the GANs cgan_state = CGAN(input_shape=args.input_shape, action_size=args.action_size, latent_size=args.latent_size, gen_input_shape=args.input_shape) cgan_reward = CGAN(input_shape=args.input_shape, action_size=args.action_size, latent_size=args.latent_size, gen_input_shape=[None, 1]) qnet = qnetwork(input_shape=args.input_shape, action_size=args.action_size, scope='qnet') target_qnet = qnetwork(input_shape=args.input_shape, action_size=args.action_size, scope='target_qnet') update_ops = update_target_graph('qnet', 'target_qnet') rand_no = np.random.rand() #env = gym.wrappers.Monitor(env, '/tmp/cartpole-experiment-' + str(rand_no), force=True, video_callable=False) init = tf.initialize_all_variables() with tf.Session() as sess: sess.run(init) for epoch in range(args.epochs): total_reward = 0 observation = env.reset() for t in range(1000000): #env.render() action = qnet.get_action(sess, observation) if np.random.rand() < epsilon: #action = env.action_space.sample() action = np.random.randint(args.action_size) observation1, reward, done, info = env.step(action) total_reward += reward # Add to memory memory.add([observation, action, reward, observation1, done]) # Reduce epsilon time_step += 1. 
epsilon = args.epsilon_min + ( args.epsilon_max - args.epsilon_min) * np.exp( -args.epsilon_decay * time_step) # Training step batch = np.array(memory.sample(args.batch_size)) qnet.train(sess, batch, args.learning_rate, target_qnet) # Training step: environment model for k in range(args.K): batch = np.array(memory.sample(args.batch_size)) states = np.vstack(batch[:, 0]) actions = np.array(batch[:, 1]) rewards = batch[:, 2] states1 = np.vstack(batch[:, 3]) _, D_loss_state = sess.run( [cgan_state.D_solver, cgan_state.D_loss], feed_dict={ cgan_state.states: states, cgan_state.actions: actions, cgan_state.Z: sample_z(len(batch), args.latent_size), cgan_state.X: states1 }) _, G_loss_state = sess.run( [cgan_state.G_solver, cgan_state.G_loss], feed_dict={ cgan_state.states: states, cgan_state.actions: actions, cgan_state.Z: sample_z(len(batch), args.latent_size) }) _, D_loss_reward = sess.run( [cgan_reward.D_solver, cgan_reward.D_loss], feed_dict={ cgan_reward.states: states, cgan_reward.actions: actions, cgan_reward.Z: sample_z(len(batch), args.latent_size), cgan_reward.X: rewards[..., np.newaxis] }) _, G_loss_reward = sess.run( [cgan_reward.G_solver, cgan_reward.G_loss], feed_dict={ cgan_reward.states: states, cgan_reward.actions: actions, cgan_reward.Z: sample_z(len(batch), args.latent_size) }) #print D_loss_state, G_loss_state, D_loss_reward, G_loss_state # Training step: imagination rollouts if time_step == 0.: print "time_step 0 here" if time_step >= 0.: for l in range(args.L): batch = np.array(memory.sample(args.batch_size)) assert len(batch) > 0 states1 = np.vstack(batch[:, 3]) actions = np.random.randint(args.action_size, size=len(batch)) dones = np.array([False] * len(batch)) G_sample_state = sess.run(cgan_state.G_sample, feed_dict={ cgan_state.states: states1, cgan_state.actions: actions, cgan_state.Z: sample_z( len(batch), args.latent_size) }) G_sample_reward = sess.run(cgan_reward.G_sample, feed_dict={ cgan_reward.states: states1, cgan_reward.actions: actions, cgan_reward.Z: sample_z( len(batch), args.latent_size) }) qnet.train(sess, None, args.learning_rate, target_qnet, states1, actions, G_sample_reward, G_sample_state, dones) # Set observation observation = observation1 # Update? if int(time_step) % args.target_update_freq == 0: #print "Updating target..." sess.run(update_ops) if done: print "Episode finished after {} timesteps".format( t + 1), 'epoch', epoch, 'total_rewards', total_reward break
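# sample_z draws the latent noise fed to the conditional GANs above but is not
# defined in this file. A minimal sketch, assuming uniform noise in [-1, 1]
# (a common choice; the original distribution may differ):
def sample_z(batch_size, latent_size):
    return np.random.uniform(-1., 1., size=[batch_size, latent_size])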
def main():
    parser = argparse.ArgumentParser()
    parser.add_argument("--environment", type=str, default='MountainCarContinuous-v0')
    parser.add_argument("--unroll-steps", type=int, default=20)
    parser.add_argument("--time-steps", type=int, default=30000)
    parser.add_argument("--replay-mem-size", type=int, default=1000000)
    parser.add_argument("--batch-size", type=int, default=64)
    parser.add_argument("--discount-factor", type=float, default=1.)
    parser.add_argument("--goal-position", type=float, default=.45)
    args = parser.parse_args()

    env = gym.make(args.environment)
    # Note: the goal-position value is reused as the environment seed here.
    env.seed(seed=args.goal_position)

    state_dim = env.observation_space.shape[0]
    action_dim = env.action_space.shape[0]
    action_bound_high = env.action_space.high
    action_bound_low = env.action_space.low

    agent = direct_policy_search(state_dim, action_dim, action_bound_high,
                                 action_bound_low, args.unroll_steps,
                                 args.discount_factor, 1, 'direct_policy_search')

    # Replay memory
    memory = Memory(args.replay_mem_size)

    with tf.Session() as sess:
        sess.run(tf.global_variables_initializer())
        #weights = pickle.load(open('../custom_environments/weights/pendulum_reward.p', 'rb'))
        #weights = pickle.load(open('../custom_environments/weights/mountain_car_continuous_reward'+str(args.goal_position)+'.p', 'rb'))
        #sess.run(agent.assign_ops0, feed_dict=dict(zip(agent.placeholders_reward, weights)))
        weights = pickle.load(
            open('../custom_environments/weights/mountain_car_continuous_next_state.p', 'rb'))
        sess.run(agent.assign_ops1,
                 feed_dict=dict(zip(agent.placeholders_state, weights)))

        state = env.reset()
        total_rewards = 0.0
        epoch = 1
        for time_steps in range(args.time_steps):
            env.render()
            action = agent.act(sess, state)
            next_state, reward, done, _ = env.step(action)
            total_rewards += float(reward)

            # Store tuple in replay memory
            memory.add([np.atleast_2d(state), np.atleast_2d(action), reward,
                        np.atleast_2d(next_state), done])

            # Training step
            batch = np.array(memory.sample(args.batch_size))
            assert len(batch) > 0
            states = np.concatenate(batch[:, 0], axis=0)

            # Train the agent
            agent.train(sess, states)

            # s <- s'
            state = np.copy(next_state)

            if done:
                print('time steps', time_steps, 'epoch', epoch, 'total rewards',
                      total_rewards, 'unroll', args.unroll_steps)
                epoch += 1
                total_rewards = 0.
                state = env.reset()
def main(): parser = argparse.ArgumentParser() parser.add_argument("--environment", type=str, default='Pendulum-v0') parser.add_argument("--action-dim", type=int, default=1) parser.add_argument("--state-dim", type=int, default=1) parser.add_argument("--epochs", type=int, default=30000) parser.add_argument("--time-steps", type=int, default=30000) parser.add_argument('--tau', type=float, help='soft target update parameter', default=0.01) parser.add_argument("--action-bound", type=float, default=1.) parser.add_argument("--replay-mem-size", type=int, default=1000000) parser.add_argument("--batch-size", type=int, default=64) parser.add_argument("--learning-rate", type=float, default=.9) parser.add_argument("--mode", type=str, default='none') args = parser.parse_args() assert args.mode in ['none', 'test', 'transfer'] # Initialize environment env = gym.make(args.environment) args.state_dim = env.observation_space.shape[0] args.action_dim = env.action_space.shape[0] #assert args.action_dim == 1 args.action_bound_high = env.action_space.high args.action_bound_low = env.action_space.low assert len(args.action_bound_high) == len(args.action_bound_low) for i in range(len(args.action_bound_high)): assert args.action_bound_high[i] == -args.action_bound_low[i] print(args) print(sys.argv) # Networks ddpg = actorcritic(state_shape=[None, args.state_dim], action_shape=[None, args.action_dim], output_bound_low=args.action_bound_low, output_bound_high=args.action_bound_high, learning_rate=args.learning_rate, tau=args.tau) # Replay memory memory = Memory(args.replay_mem_size) # Actor noise exploration_strategy = OUStrategy(ddpg, env) with tf.Session() as sess: sess.run(tf.global_variables_initializer()) ddpg.copy_target(sess) time_steps = 0. for epoch in range(args.epochs): state = env.reset() total_rewards = 0.0 ts = 0 while True: #env.render() # Choose an action time_steps += 1. ts += 1 if time_steps >= args.time_steps: exploration = 0. else: exploration = (float(args.time_steps - time_steps) / float(args.time_steps))**4 action = exploration_strategy.action(sess, state[np.newaxis, ...], exploration) # Execute action state1, reward, done, _ = env.step(action) total_rewards += float(reward) # Store tuple in replay memory memory.add([ state[np.newaxis, ...], action[np.newaxis, ...], reward, state1[np.newaxis, ...], done ]) # Training step batch = np.array(memory.sample(args.batch_size)) assert len(batch) > 0 states = np.concatenate(batch[:, 0], axis=0) actions = np.concatenate(batch[:, 1], axis=0) rewards = batch[:, 2] states1 = np.concatenate(batch[:, 3], axis=0) dones = batch[:, 4] ddpg.train(sess, states, actions, rewards, states1, dones) # Update target networks ddpg.update_target(sess) state = state1.copy() if done == True: print 'time steps', time_steps, 'epoch', epoch, 'total rewards', total_rewards, 'epoch ts:', ts break
class Agent(): def __init__(self, state_size, action_size, random_seed): """ Args: ====== state_size (int): state dim action_size (int): action dim random_seed (int): random seed """ self.state_size = state_size self.action_size = action_size self.seed = random.seed(random_seed) # actor net initialization self.actor_local = Actor(state_size, action_size, random_seed).to(device) self.actor_target = Actor(state_size, action_size, random_seed).to(device) self.actor_optimizer = optim.Adam(self.actor_local.parameters(), lr=LR_ACTOR) # critic net initialization self.critic_local = Critic(state_size, action_size, random_seed).to(device) self.critic_target = Critic(state_size, action_size, random_seed).to(device) self.critic_optimizer = optim.Adam(self.critic_local.parameters(), lr=LR_CRITIC, weight_decay=WEIGHT_DECAY) # Ornstein-Uhlenbeck Exploration Noise Process self.noise = OUNoise(action_space=action_size, seed=random_seed) # Replay memory init self.memory = Memory(action_size, BUFFER_SIZE, BATCH_SIZE, random_seed) def step(self, states, actions, rewards, next_states, dones, is_learning_step, saving_wrong_step_prob = 0.9): """Save experience in replay memory, and use random sample from buffer to learn.""" # Save experience / reward for state, action, reward, next_state, done in zip(states, actions, rewards, next_states, dones): if reward> 0 or random.uniform(0,1) <= saving_wrong_step_prob: self.memory.add(state, action, reward, next_state, done) # Learn, if enough samples are available in memory if len(self.memory) > BATCH_SIZE and is_learning_step: for _ in range(10): experiences = self.memory.sample() self.learn(experiences, GAMMA) def act(self, state, add_noise=True): """map action to state""" state = torch.from_numpy(state).float().to(device) self.actor_local.eval() with torch.no_grad(): action = self.actor_local(state).cpu().data.numpy() self.actor_local.train() if add_noise: action += self.noise.evolve_state() return np.clip(action, -1, 1) def act_on_all_agents(self, states): """map action to state to all agents""" vectorized_act = np.vectorize(self.act, excluded='self', signature='(n),()->(k)') return vectorized_act(states, True) def reset(self): self.noise.reset() def learn(self, experiences, gamma): """Update actor and critic nets parameters Args: ====== experiences (Tuple[torch.Tensor]): experience tuples gamma (float): bellman discount factor """ states, actions, rewards, next_states, dones = experiences # Get predicted next-state actions and Q values from target models actions_next = self.actor_target(next_states) Q_targets_next = self.critic_target(next_states, actions_next) # Compute Q targets for current states (y_i) Q_targets = rewards + (gamma * Q_targets_next * (1 - dones)) # Compute critic loss Q_expected = self.critic_local(states, actions) critic_loss = F.mse_loss(Q_expected, Q_targets) # Minimize the loss self.critic_optimizer.zero_grad() critic_loss.backward() self.critic_optimizer.step() # Compute actor loss actions_pred = self.actor_local(states) actor_loss = -self.critic_local(states, actions_pred).mean() # Minimize the loss self.actor_optimizer.zero_grad() actor_loss.backward() self.actor_optimizer.step() self.soft_update(self.critic_local, self.critic_target, TAU) self.soft_update(self.actor_local, self.actor_target, TAU) def soft_update(self, local_model, target_model, tau): #Soft update model parameters for target_param, local_param in zip(target_model.parameters(), local_model.parameters()): target_param.data.copy_(tau*local_param.data + 
(1.0-tau)*target_param.data)
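# OUNoise used by the Agent above is not defined here. A minimal sketch exposing
# the referenced interface (.reset() and .evolve_state()); the parameter values
# and the seeding scheme are assumptions:
class OUNoise:
    def __init__(self, action_space, seed, mu=0.0, theta=0.15, sigma=0.2):
        np.random.seed(seed)
        self.mu = mu * np.ones(action_space)
        self.theta = theta
        self.sigma = sigma
        self.reset()

    def reset(self):
        # Reset the internal noise state back to the mean
        self.state = np.copy(self.mu)

    def evolve_state(self):
        # One step of the OU process: dx = theta * (mu - x) + sigma * N(0, 1)
        dx = self.theta * (self.mu - self.state) + \
             self.sigma * np.random.standard_normal(len(self.mu))
        self.state += dx
        return self.state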
def DQN(): parser = argparse.ArgumentParser() parser.add_argument("--environment", type=str, default='CartPole-v0') parser.add_argument("--action-size", type=int, default=2) parser.add_argument("--input-shape", type=list, default=[None, 4]) parser.add_argument("--target-update-freq", type=int, default=200) parser.add_argument("--epsilon-max", type=float, default=1.) parser.add_argument("--epsilon-min", type=float, default=.01) parser.add_argument("--epsilon-decay", type=float, default=.001) parser.add_argument("--discount-factor", type=float, default=.99) parser.add_argument("--batch-size", type=int, default=64) parser.add_argument("--epochs", type=int, default=1000) parser.add_argument("--replay-mem-size", type=int, default=1000000) args = parser.parse_args() env = Environment() args.action_size = env.nActions args.input_shape = [None, env.stateShape] print args # Epsilon parameter epsilon = 0.1 # args.epsilon_max # Replay memory memory = Memory(args.replay_mem_size) # Time step time_step = 0. # Initialize the agent qnet = qnetwork(input_shape=args.input_shape, action_size=args.action_size, scope='qnet') tnet = qnetwork(input_shape=args.input_shape, action_size=args.action_size, scope='tnet') update_ops = update_target_graph('qnet', 'tnet') rewardHistory = np.zeros(args.epochs) env.render() with tf.Session() as sess: sess.run(tf.global_variables_initializer()) for epoch in range(args.epochs): total_reward = 0 state = env.reset() while (True): #env.render() if np.random.rand() < epsilon: action = np.random.randint(args.action_size) else: action = qnet.act(sess, state) [next_state, reward, done] = env.step(action) total_reward += reward rewardHistory[epoch] += reward # Add to memory memory.add([state, action, reward, next_state, done]) # Reduce epsilon time_step += 1. #epsilon = args.epsilon_min + (args.epsilon_max - args.epsilon_min) * np.exp(-args.epsilon_decay * time_step) # Training step batch = np.array(memory.sample(args.batch_size)) qnet.train(sess, batch, args.discount_factor, tnet) # s <- s' state = np.copy(next_state) # Update target network if int(time_step) % args.target_update_freq == 0: sess.run(update_ops) if done: print 'epoch:', epoch, 'total_rewards:', total_reward break ''' np.set_printoptions(threshold=np.nan) for v in range(-5, 5): policy = np.zeros((env.W, env.W), dtype='int') for x in range(env.W): for y in range(env.W): policy[x,y] = qnet.act(sess, np.array([x,y,1,v])) print(policy) ''' plt.xlabel('episode #') plt.ylabel('reward') plt.plot(rewardHistory) plt.savefig("DQN") plt.show() for epoch in range(10): total_reward = 0 state = env.reset() while (True): env.render() action = qnet.act(sess, state) [next_state, reward, done] = env.step(action) total_reward += reward rewardHistory[epoch] += reward # Reduce epsilon time_step += 1. # s <- s' state = np.copy(next_state) if done: print 'epoch:', epoch, 'total_rewards:', total_reward break
class SACagent: def __init__(self, state_dim, action_dim=2, hidden_dim=256, lr=1e-3, gamma=0.99, tau=1e-2, max_memory_size=1000000, action_scales=[.22, .5], maxTemp=0.5, minTemp=0.1, tempTimeScale=500000.0): # use cuda? use_cuda = torch.cuda.is_available() device = torch.device("cuda" if use_cuda else "cpu") self.device = device # Params self.state_dim = state_dim self.action_dim = action_dim self.hidden_dim = hidden_dim self.gamma = gamma self.tau = tau self.lr = lr self.maxTemp = maxTemp self.minTemp = minTemp self.tempTimeScale = tempTimeScale self.action_scales = action_scales # Memory replay buffer self.memory = Memory(max_memory_size) # Initialize all networks self.value_net = ValueNetwork(self.state_dim, self.hidden_dim).to(device) self.target_value_net = ValueNetwork(self.state_dim, self.hidden_dim).to(device) self.soft_q_net1 = SoftQNetwork(self.state_dim, self.action_dim, self.hidden_dim).to(device) self.soft_q_net2 = SoftQNetwork(self.state_dim, self.action_dim, self.hidden_dim).to(device) self.policy_net = PolicyNetwork(self.state_dim, self.action_dim, self.hidden_dim, self.action_scales, device).to(device) # Copy initial parameters from value net to target value net for target_param, param in zip(self.target_value_net.parameters(), self.value_net.parameters()): target_param.data.copy_(param.data) self.value_criterion = nn.MSELoss() self.soft_q_criterion1 = nn.MSELoss() self.soft_q_criterion2 = nn.MSELoss() value_lr = self.lr #3e-4 soft_q_lr = self.lr #3e-4 policy_lr = self.lr #3e-4 self.value_optimizer = optim.Adam(self.value_net.parameters(), lr=value_lr) self.soft_q_optimizer1 = optim.Adam(self.soft_q_net1.parameters(), lr=soft_q_lr) self.soft_q_optimizer2 = optim.Adam(self.soft_q_net2.parameters(), lr=soft_q_lr) self.policy_optimizer = optim.Adam(self.policy_net.parameters(), lr=policy_lr) def get_action(self, state): action = self.policy_net.get_action(state) action = action.detach().numpy() return action def update(self, batch_size, t): print("STARTING NETWORK UPDATES") #state, action, reward, next_state, done = replay_buffer.sample(batch_size) state, action, reward, next_state = self.memory.sample(batch_size) state = torch.FloatTensor(state).to(self.device) next_state = torch.FloatTensor(next_state).to(self.device) action = torch.FloatTensor(action).to(self.device) reward = torch.FloatTensor(reward).to(self.device) #done = torch.FloatTensor(np.float32(done)).unsqueeze(1).to(device) predicted_q_value1 = self.soft_q_net1(state, action) predicted_q_value2 = self.soft_q_net2(state, action) predicted_value = self.value_net(state) new_action, log_prob, epsilon, mean, log_std = self.policy_net.evaluate( state) log_prob_sum = torch.sum(log_prob, dim=1) joint_entropy = log_prob_sum.unsqueeze(1) # Training Q Function target_value = self.target_value_net(next_state) target_q_value = reward + self.gamma * target_value q_value_loss1 = self.soft_q_criterion1(predicted_q_value1, target_q_value.detach()) q_value_loss2 = self.soft_q_criterion2(predicted_q_value2, target_q_value.detach()) print("Q1 LOSS = " + str(q_value_loss1) + " Q2 LOSS = " + str(q_value_loss2)) self.soft_q_optimizer1.zero_grad() q_value_loss1.backward() self.soft_q_optimizer1.step() self.soft_q_optimizer2.zero_grad() q_value_loss2.backward() self.soft_q_optimizer2.step() # Training Value Function predicted_new_q_value = torch.min(self.soft_q_net1(state, new_action), self.soft_q_net2(state, new_action)) #target_value_func = predicted_new_q_value - log_prob alpha = max(self.minTemp, (self.maxTemp - self.maxTemp * (t / 
self.tempTimeScale))) print("ALPHA = " + str(alpha) + " min = " + str(self.minTemp) + " max = " + str(self.maxTemp)) target_value_func = predicted_new_q_value - alpha * joint_entropy value_loss = self.value_criterion(predicted_value, target_value_func.detach()) print("VALUE LOSS = " + str(value_loss)) self.value_optimizer.zero_grad() value_loss.backward() self.value_optimizer.step() # Training Policy Function #policy_loss = (log_prob - predicted_new_q_value).mean() policy_loss = (alpha * joint_entropy - predicted_new_q_value).mean() print("POLICY LOSS = " + str(policy_loss)) self.policy_optimizer.zero_grad() policy_loss.backward() self.policy_optimizer.step() for target_param, param in zip(self.target_value_net.parameters(), self.value_net.parameters()): target_param.data.copy_(target_param.data * (1.0 - self.tau) + param.data * self.tau) print("DONE WITH NETWORK UPDATES")
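# The entropy temperature schedule used inline in SACagent.update() above,
# restated as a standalone helper for clarity (not called by the code above):
# a linear decay of alpha from maxTemp toward zero over tempTimeScale steps,
# floored at minTemp.
def temperature(t, max_temp=0.5, min_temp=0.1, time_scale=500000.0):
    return max(min_temp, max_temp - max_temp * (t / time_scale))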
def main(): parser = argparse.ArgumentParser() parser.add_argument("--environment", type=str, default='Pendulum-v0') parser.add_argument("--action-dim", type=int, default=1) parser.add_argument("--state-dim", type=int, default=1) #parser.add_argument("--epochs", type=int, default=30000) parser.add_argument("--time-steps", type=int, default=30000) parser.add_argument('--tau', type=float, help='soft target update parameter', default=0.01) parser.add_argument("--action-bound", type=float, default=1.) parser.add_argument("--replay-mem-size", type=int, default=1000000) parser.add_argument("--batch-size", type=int, default=64) parser.add_argument("--learning-rate", type=float, default=.9) parser.add_argument("--mode", type=str, default='none') args = parser.parse_args() assert args.mode in ['none', 'test', 'transfer'] # Initialize environment env = gym.make(args.environment) args.state_dim = env.observation_space.shape[0] args.action_dim = env.action_space.shape[0] #assert args.action_dim == 1 args.action_bound_high = env.action_space.high args.action_bound_low = env.action_space.low assert len(args.action_bound_high) == len(args.action_bound_low) for i in range(len(args.action_bound_high)): assert args.action_bound_high[i] == -args.action_bound_low[i] print(args) # Networks ddpg = actorcritic(state_shape=[None, args.state_dim], action_shape=[None, args.action_dim], output_bound_low=args.action_bound_low, output_bound_high=args.action_bound_high, learning_rate=args.learning_rate, tau=args.tau) # Allocate the Gaussian process model_been_trained = False smodel = gp_model([None, args.state_dim], [None, args.action_dim], [None, args.state_dim], epochs=100) rmodel = gp_model([None, args.state_dim], [None, args.action_dim], [None, 1], epochs=100) Bold = Memory(500) B = Memory(500) ell = 1#Unroll depth I = 5#Number of updates per timestep memory_fictional = Memory(args.replay_mem_size) # Replay memory memory = Memory(args.replay_mem_size) # Actor noise exploration_strategy = OUStrategy(ddpg, env) with tf.Session() as sess: sess.run(tf.global_variables_initializer()) ddpg.copy_target(sess) if args.mode in ['test', 'transfer']: env.seed(1) state = env.reset() total_rewards = 0.0 epoch = 1 for time_steps in range(args.time_steps): #env.render() # Choose an action exploration = (float(args.time_steps - time_steps) / float(args.time_steps)) ** 4 action = exploration_strategy.action(sess, state[np.newaxis, ...], exploration) # Execute action state1, reward, done, _ = env.step(action) total_rewards += float(reward) # Store tuple in replay memory memory.add([state[np.newaxis, ...], action[np.newaxis, ...], reward, state1[np.newaxis, ...], done]) B.add([state[np.newaxis, ...], action[np.newaxis, ...], reward, state1[np.newaxis, ...], done]) if time_steps % args.batch_size == 0 and time_steps != 0 and model_been_trained and ell > 0: #if time_steps >= 3 and model_been_trained: batch = np.array(memory.sample(args.batch_size)) assert len(batch) > 0 next_states = np.concatenate([ele[3] for ele in batch], axis=0) for _ in range(ell): states = np.copy(next_states) actions = np.random.uniform(low=args.action_bound_low, high=args.action_bound_high, size=[states.shape[0], args.action_dim]) rewards = rmodel.predict(sess, states, actions) next_states = smodel.predict(sess, states, actions) for state, action, reward, next_state in zip(list(states), list(actions), list(rewards), list(next_states)): memory_fictional.add([state[np.newaxis, ...], action[np.newaxis, ...], reward, next_state[np.newaxis, ...], False]) for _ in 
range(I): # Training step batch = np.array(memory.sample(args.batch_size)) assert len(batch) > 0 states = np.concatenate(batch[:, 0], axis=0) actions = np.concatenate(batch[:, 1], axis=0) rewards = batch[:, 2] states1 = np.concatenate(batch[:, 3], axis=0) dones = batch[:, 4] ddpg.train(sess, states, actions, rewards, states1, dones) ddpg.update_target(sess) for _ in range(ell): # Training step for fictional experience batch = np.array(memory_fictional.sample(args.batch_size)) if len(batch) > 0: states = np.concatenate(batch[:, 0], axis=0) actions = np.concatenate(batch[:, 1], axis=0) rewards = batch[:, 2] states1 = np.concatenate(batch[:, 3], axis=0) dones = batch[:, 4] ddpg.train(sess, states, actions, rewards, states1, dones) ddpg.update_target(sess) if len(B.mem) == B.max_size and ell > 0: import copy Bold = copy.deepcopy(B) B.mem = [] states = np.concatenate([ele[0] for ele in Bold.mem], axis=0) actions = np.concatenate([ele[1] for ele in Bold.mem], axis=0) rewards = np.array([ele[2] for ele in Bold.mem]) next_states = np.concatenate([ele[3] for ele in Bold.mem], axis=0) rmodel.train(sess, states, actions, rewards[..., np.newaxis]) smodel.train(sess, states, actions, next_states) model_been_trained = True state = np.copy(state1) if done == True: print 'time steps', time_steps, 'epoch', epoch, 'total rewards', total_rewards epoch += 1 total_rewards = 0. if args.mode == 'transfer': if time_steps >= args.time_steps / 3: env.seed(0) else: env.seed(1) elif args.mode == 'test': env.seed(1) state = env.reset() if args.mode == 'transfer': if time_steps == args.time_steps / 3: memory = Memory(args.replay_mem_size)
def main(): parser = argparse.ArgumentParser() parser.add_argument("--environment", type=str, default='CartPole-v0') parser.add_argument("--action-size", type=int, default=2) parser.add_argument("--input-shape", type=list, default=[None, 4]) parser.add_argument("--target-update-freq", type=int, default=200) parser.add_argument("--epsilon-max", type=float, default=1.) parser.add_argument("--epsilon-min", type=float, default=.01) parser.add_argument("--epsilon-decay", type=float, default=.001) parser.add_argument("--learning-rate", type=float, default=.99) parser.add_argument("--batch-size", type=int, default=64) parser.add_argument("--epochs", type=int, default=30000) parser.add_argument("--replay-mem-size", type=int, default=1000000) parser.add_argument("--latent-size", type=int, default=4, help='Size of vector for Z') parser.add_argument("--model", type=str, default='gan') args = parser.parse_args() assert args.model in ['gan', 'gated', 'gated_reg'] env = gym.make(args.environment) args.action_size = env.action_space.n args.input_shape = [None, env.observation_space.shape[0]] print args # Other parameters epsilon = args.epsilon_max # Replay memory memory = Memory(args.replay_mem_size) # Time step time_step = 0. # Initialize the model jqnet, update_ops = init_model(args.input_shape, args.action_size, args.latent_size, args.learning_rate, args.model) with tf.Session() as sess: sess.run(tf.global_variables_initializer()) for epoch in range(args.epochs): total_reward = 0 observation = env.reset() for t in range(1000000): #env.render() action = jqnet.get_action(sess, observation) if np.random.rand() < epsilon: action = env.action_space.sample() observation1, reward, done, info = env.step(action) total_reward += reward # Add to memory memory.add([observation, action, reward, observation1, done]) # Reduce epsilon time_step += 1. epsilon = args.epsilon_min + ( args.epsilon_max - args.epsilon_min) * np.exp( -args.epsilon_decay * time_step) # Training step batch = np.array(memory.sample(args.batch_size)) assert len(batch) > 0 states = np.vstack(batch[:, 0]) actions = np.array(batch[:, 1]) rewards = batch[:, 2] states1 = np.vstack(batch[:, 3]) dones = batch[:, 4].astype(np.float32) #Get another batch batch2 = np.array(memory.sample(args.batch_size)) assert len(batch2) > 0 states2 = np.vstack(batch2[:, 0]) actions2 = np.array(batch2[:, 1]) # Update Q jqnet.updateQ(sess, states, actions, rewards, states1, dones, states2, actions2, len(batch), args.latent_size) # Update state model jqnet.updateS(sess, states, actions, states1, states2, actions2, len(batch), args.latent_size) # Update reward model jqnet.updateR(sess, states, actions, rewards, states2, actions2, len(batch), args.latent_size) # Set observation observation = observation1 # Update? if int(time_step) % args.target_update_freq == 0: #print "Updating target..." sess.run(update_ops) if done: print "Episode finished after {} timesteps".format( t + 1), 'epoch', epoch, 'total_reward', total_reward break env.close() gym.upload('/tmp/cartpole-experiment-' + str(rand_no), api_key='sk_AlBXbTIgR4yaxPlvDpm61g')
else: # Stack the frame of the next_state next_state, stacked_frames = stack_frames( stacked_frames, next_state, False) # Add experience to memory experience = state, action, reward, next_state, done memory.store(experience) # st+1 is now our current state state = next_state ### LEARNING PART # Obtain random mini-batch from memory tree_idx, batch, ISWeights_mb = memory.sample(batch_size) # batch = memory.sample(batch_size) states_mb = np.array([each[0][0] for each in batch], ndmin=3) # print_var("states_mb", states_mb.shape) actions_mb = np.array([each[0][1] for each in batch]) # print_var("actions_mb", actions_mb.shape) # print_var("actions_mb", actions_mb) rewards_mb = np.array([each[0][2] for each in batch]) # print_var("rewards_mb", rewards_mb.shape) # print_var("rewards_mb", rewards_mb) next_states_mb = np.array([each[0][3] for each in batch], ndmin=3) # print_var("next_states_mb", next_states_mb.shape)
class Agent: def __init__(self, level_name): self.level_name = level_name # setup environment self.env = gym_super_mario_bros.make(level_name) self.env = JoypadSpace(self.env, SIMPLE_MOVEMENT) # one hot encoded version of our actions self.possible_actions = np.array(np.identity(self.env.action_space.n, dtype=int).tolist()) # resest graph tf.reset_default_graph() # instantiate the DQNetwork self.DQNetwork = DQNetwork(state_size, action_size, learning_rate) # instantiate memory self.memory = Memory(max_size=memory_size) # initialize deque with zero images self.stacked_frames = deque([np.zeros((100, 128), dtype=np.int) for i in range(stack_size)], maxlen=4) for i in range(pretrain_length): # If it's the first step if i == 0: state = self.env.reset() state, self.stacked_frames = stack_frames(self.stacked_frames, state, True) # Get next state, the rewards, done by taking a random action choice = random.randint(1, len(self.possible_actions)) - 1 action = self.possible_actions[choice] next_state, reward, done, _ = self.env.step(choice) # stack the frames next_state, self.stacked_frames = stack_frames(self.stacked_frames, next_state, False) # if the episode is finished (we're dead) if done: # we inished the episode next_state = np.zeros(state.shape) # add experience to memory self.memory.add((state, action, reward, next_state, done)) # start a new episode state = self.env.reset() state, self.stacked_frames = stack_frames(self.stacked_frames, state, True) else: # add experience to memory self.memory.add((state, action, reward, next_state, done)) # our new state is now the next_state state = next_state # saver will help us save our model self.saver = tf.train.Saver() # setup tensorboard writer self.writer = tf.summary.FileWriter("logs/") # losses tf.summary.scalar("Loss", self.DQNetwork.loss) self.write_op = tf.summary.merge_all() def predict_action(self, sess, explore_start, explore_stop, decay_rate, decay_step, state, actions): # first we randomize a number exp_exp_tradeoff = np.random.rand() explore_probability = explore_stop + (explore_start - explore_stop) * np.exp(-decay_rate * decay_step) if explore_probability > exp_exp_tradeoff: # make a random action choice = random.randint(1, len(self.possible_actions)) - 1 action = self.possible_actions[choice] else: # estimate the Qs values state Qs = sess.run(self.DQNetwork.output, feed_dict={self.DQNetwork.inputs_: state.reshape((1, *state.shape))}) # take the biggest Q value (= best action) choice = np.argmax(Qs) action = self.possible_actions[choice] return action, choice, explore_probability def play_notebook(self): import matplotlib.pyplot as plt # imports to render env to gif from JSAnimation.IPython_display import display_animation from matplotlib import animation from IPython.display import display # http://mckinziebrandon.me/TensorflowNotebooks/2016/12/21/openai.html def display_frames_as_gif(frames): """ Displays a list of frames as a gif, with controls """ #plt.figure(figsize=(frames[0].shape[1] / 72.0, frames[0].shape[0] / 72.0), dpi = 72) patch = plt.imshow(frames[0]) plt.axis('off') def animate(i): patch.set_data(frames[i]) anim = animation.FuncAnimation(plt.gcf(), animate, frames = len(frames), interval=50) display(display_animation(anim, default_mode='loop')) frames = [] with tf.Session() as sess: total_test_rewards = [] # Load the model self.saver.restore(sess, "models/{0}.cpkt".format(self.level_name)) for episode in range(1): total_rewards = 0 state = self.env.reset() state, self.stacked_frames = stack_frames(self.stacked_frames, 
state, True) print("****************************************************") print("EPISODE ", episode) while True: # Reshape the state state = state.reshape((1, *state_size)) # Get action from Q-network # Estimate the Qs values state Qs = sess.run(self.DQNetwork.output, feed_dict = {self.DQNetwork.inputs_: state}) # Take the biggest Q value (= the best action) choice = np.argmax(Qs) #Perform the action and get the next_state, reward, and done information next_state, reward, done, _ = self.env.step(choice) frames.append(self.env.render(mode = 'rgb_array')) total_rewards += reward if done: print ("Score", total_rewards) total_test_rewards.append(total_rewards) break next_state, self.stacked_frames = stack_frames(self.stacked_frames, next_state, False) state = next_state self.env.close() display_frames_as_gif(frames) def play(self): with tf.Session() as sess: total_test_rewards = [] # Load the model self.saver.restore(sess, "models/{0}.cpkt".format(self.level_name)) #self.env = wrap_env(self.env) for episode in range(1): total_rewards = 0 state = self.env.reset() state, self.stacked_frames = stack_frames(self.stacked_frames, state, True) print("****************************************************") print("EPISODE ", episode) while True: # Reshape the state state = state.reshape((1, *state_size)) # Get action from Q-network # Estimate the Qs values state Qs = sess.run(self.DQNetwork.output, feed_dict = {self.DQNetwork.inputs_: state}) # Take the biggest Q value (= the best action) choice = np.argmax(Qs) #Perform the action and get the next_state, reward, and done information next_state, reward, done, _ = self.env.step(choice) self.env.render() total_rewards += reward if done: print ("Score", total_rewards) total_test_rewards.append(total_rewards) break next_state, self.stacked_frames = stack_frames(self.stacked_frames, next_state, False) state = next_state self.env.close() def train(self): with tf.Session() as sess: # initialize the variables sess.run(tf.global_variables_initializer()) # initialize decay rate (that will be used to reduce epsilon) decay_step = 0 for episode in range(total_episodes): # set step to 0 step = 0 # initialize rewards of episode episode_rewards = [] # make a new episode and opserve the first state state = self.env.reset() # remember that stack frame function state, self.stacked_frames = stack_frames(self.stacked_frames, state, True) print("Episode:", episode) while step < max_steps: step += 1 #print("step:", step) # increase decay_step decay_step += 1 # predict an action action, choice, explore_probability = self.predict_action(sess, explore_start, explore_stop, decay_rate, decay_step, state, self.possible_actions) # perform the action and get the next_state, reward, and done information next_state, reward, done, _ = self.env.step(choice) if episode_render: self.env.render() # add the reward to total reward episode_rewards.append(reward) # the game is finished if done: print("done") # the episode ends so no next state next_state = np.zeros((110, 84), dtype=np.int) next_state, self.stacked_frames = stack_frames(self.stacked_frames, next_state, False) # set step = max_steps to end episode step = max_steps # get total reward of the episode total_reward = np.sum(episode_rewards) print("Episode:", episode, "Total reward:", total_reward, "Explore P:", explore_probability, "Training Loss:", loss) #rewards_list.append((episode, total_reward)) # store transition <s_i, a, r_{i+1}, s_{i+1}> in memory self.memory.add((state, action, reward, next_state, done)) else: # stack frame 
of the next state next_state, self.stacked_frames = stack_frames(self.stacked_frames, next_state, False) # store transition <s_i, a, r_{i+1}, s_{i+1}> in memory self.memory.add((state, action, reward, next_state, done)) # s_{i} := s_{i+1} state = next_state ### Learning part # obtain random mini-batch from memory batch = self.memory.sample(batch_size) states_mb = np.array([each[0] for each in batch], ndmin=3) actions_mb = np.array([each[1] for each in batch]) rewards_mb = np.array([each[2] for each in batch]) next_states_mb = np.array([each[3] for each in batch], ndmin=3) dones_mb = np.array([each[4] for each in batch]) target_Qs_batch = [] # get Q values for next_state Qs_next_state = sess.run(self.DQNetwork.output, feed_dict={self.DQNetwork.inputs_: next_states_mb}) # set Q_target = r if episode ends with s+1 for i in range(len(batch)): terminal = dones_mb[i] # if we are in a terminal state, only equals reward if terminal: target_Qs_batch.append(rewards_mb[i]) else: target = rewards_mb[i] + gamma * np.max(Qs_next_state[i]) target_Qs_batch.append(target) targets_mb = np.array([each for each in target_Qs_batch]) loss, _ = sess.run([self.DQNetwork.loss, self.DQNetwork.optimizer], feed_dict={self.DQNetwork.inputs_: states_mb, self.DQNetwork.target_Q: targets_mb, self.DQNetwork.actions_: actions_mb}) # write tf summaries summary = sess.run(self.write_op, feed_dict={self.DQNetwork.inputs_: states_mb, self.DQNetwork.target_Q: targets_mb, self.DQNetwork.actions_: actions_mb}) self.writer.add_summary(summary, episode) self.writer.flush() # save model every 5 episodes if episode % 5 == 0: self.saver.save(sess, "models/{0}.cpkt".format(self.level_name)) print("Model Saved")
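# stack_frames is used throughout the DQN/Mario snippets above but is not
# defined in this file. A common sketch, where preprocess() is a hypothetical
# helper that grayscales/resizes a raw frame (the snippets above use differing
# frame shapes, so sizing is left to that helper):
def stack_frames(stacked_frames, raw_frame, is_new_episode, stack_size=4):
    frame = preprocess(raw_frame)  # hypothetical preprocessing step
    if is_new_episode:
        # Start a fresh stack by repeating the first frame
        stacked_frames = deque([frame for _ in range(stack_size)], maxlen=stack_size)
    else:
        stacked_frames.append(frame)
    # Stack along the channel axis to form the network input
    stacked_state = np.stack(stacked_frames, axis=2)
    return stacked_state, stacked_frames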
def main2():
    import gym
    import copy
    from utils import Memory
    from utils import process_frame2

    env = gym.make('BreakoutDeterministic-v4')
    gc = gated_convolution2(shape=[None, 84, 84, 4], nummap=128, numfactors=128,
                            learning_rate=.001, w=8, s=1, a_size=env.action_space.n)
    mem = Memory(50000)
    batch_size = 4
    steps = 1
    length = 4
    action_space = env.action_space.n

    with tf.Session() as sess:
        sess.run(tf.global_variables_initializer())
        while True:
            s = env.reset()
            s = process_frame2(s)
            state = [s[..., np.newaxis]] * length
            state_ = [s[..., np.newaxis]] * length
            action = [-1] * length
            done = False
            while not done:
                #env.render()
                a = np.random.randint(env.action_space.n)
                s_, r, done, _ = env.step(a)
                s_ = process_frame2(s_)

                state_.pop(0)
                action.pop(0)
                state_.append(s_[..., np.newaxis])
                action.append(a)

                mem.add([np.concatenate(state, axis=-1)[np.newaxis, ...],
                         np.array(action)[np.newaxis, ...],
                         np.concatenate(state_, axis=-1)[np.newaxis, ...]])

                if len(mem.mem) >= batch_size:
                    batch = mem.sample(batch_size)
                    #Do stuff
                    states = []
                    actions = []
                    states_ = []
                    for i in range(len(batch)):
                        states.append(batch[i][0])
                        actions.append(batch[i][1])
                        states_.append(batch[i][2])
                    states = np.concatenate(states, axis=0).astype(np.float64) / 255.
                    actions = np.concatenate(actions, axis=0)
                    states_ = np.concatenate(states_, axis=0).astype(np.float64) / 255.

                    _, recon_loss, recon_x, recon_y, recon_action_loss = gc.run2(
                        sess, states, actions, states_)
                    print('steps:', steps, 'recon_loss:', recon_loss,
                          'recon_action_loss', recon_action_loss, 'main2')

                steps += 1
                if done:
                    break
def main():
    parser = argparse.ArgumentParser()
    parser.add_argument("--environment", type=str, default='CartPole-v0')
    parser.add_argument("--action-size", type=int, default=2)
    parser.add_argument("--input-shape", type=list, default=[None, 4])
    parser.add_argument("--target-update-freq", type=int, default=200)
    parser.add_argument("--epsilon-max", type=float, default=1.)
    parser.add_argument("--epsilon-min", type=float, default=.01)
    parser.add_argument("--epsilon-decay", type=float, default=.001)
    parser.add_argument("--learning-rate", type=float, default=.99)
    parser.add_argument("--batch-size", type=int, default=64)
    parser.add_argument("--epochs", type=int, default=300)
    parser.add_argument("--replay-mem-size", type=int, default=1000000)
    args = parser.parse_args()

    env = gym.make(args.environment)
    args.action_size = env.action_space.n
    args.input_shape = [None] + list(env.observation_space.shape)
    print(args)

    # Epsilon parameter
    epsilon = args.epsilon_max

    # Replay memory
    memory = Memory(args.replay_mem_size)

    # Time step
    time_step = 0.

    # Initialize the agent
    qnet = qnetwork(input_shape=args.input_shape, action_size=args.action_size, scope='qnet')
    tnet = qnetwork(input_shape=args.input_shape, action_size=args.action_size, scope='tnet')
    update_ops = update_target_graph('qnet', 'tnet')

    with tf.Session() as sess:
        sess.run(tf.global_variables_initializer())
        for epoch in range(args.epochs):
            total_reward = 0
            state = env.reset()
            while True:
                #env.render()
                if np.random.rand() < epsilon:
                    action = np.random.randint(args.action_size)
                else:
                    action = qnet.act(sess, state)
                next_state, reward, done, _ = env.step(action)
                total_reward += reward

                # Add to memory
                memory.add([state, action, reward, next_state, done])

                # Reduce epsilon
                time_step += 1.
                epsilon = args.epsilon_min + (args.epsilon_max - args.epsilon_min) * np.exp(
                    -args.epsilon_decay * time_step)

                # Training step
                batch = np.array(memory.sample(args.batch_size))
                qnet.train(sess, batch, args.learning_rate, tnet)

                # s <- s'
                state = np.copy(next_state)

                # Update target network
                if int(time_step) % args.target_update_freq == 0:
                    sess.run(update_ops)

                if done:
                    print('epoch:', epoch, 'total_rewards:', total_reward)
                    break
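# update_target_graph (hard copy between variable scopes) and the soft-update
# variant update_target_graph2 used further below are not defined in this file.
# Standard TF1-style sketches consistent with how they are called here:
def update_target_graph(from_scope, to_scope):
    from_vars = tf.get_collection(tf.GraphKeys.TRAINABLE_VARIABLES, from_scope)
    to_vars = tf.get_collection(tf.GraphKeys.TRAINABLE_VARIABLES, to_scope)
    return [to_var.assign(from_var) for from_var, to_var in zip(from_vars, to_vars)]

def update_target_graph2(from_scope, to_scope, tau):
    from_vars = tf.get_collection(tf.GraphKeys.TRAINABLE_VARIABLES, from_scope)
    to_vars = tf.get_collection(tf.GraphKeys.TRAINABLE_VARIABLES, to_scope)
    return [to_var.assign(tau * from_var + (1. - tau) * to_var)
            for from_var, to_var in zip(from_vars, to_vars)]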
def main(): parser = argparse.ArgumentParser() parser.add_argument("--environment", type=str, default='Pendulum-v0') parser.add_argument("--action-dim", type=int, default=1) parser.add_argument("--state-dim", type=int, default=1) parser.add_argument("--input-shape", type=list, default=[None, 1]) parser.add_argument("--epochs", type=int, default=30000) parser.add_argument('--tau', help='soft target update parameter', default=0.001) parser.add_argument("--action-bound", type=float, default=1.) parser.add_argument("--replay-mem-size", type=int, default=1000000) parser.add_argument("--batch-size", type=int, default=64) parser.add_argument("--gamma", type=float, default=.99) parser.add_argument("--K", type=int, default=1, help='The number of steps to train the environment') parser.add_argument( "--L", type=int, default=1, help='The number of Q-learning steps for hypothetical rollouts') parser.add_argument("--latent-size", type=int, default=4, help='Size of vector for Z') args = parser.parse_args() # Initialize environment env = gym.make(args.environment) args.state_dim = env.observation_space.shape[0] args.input_shape = [None, args.state_dim] args.action_dim = env.action_space.shape[0] #assert args.action_dim == 1 args.action_bound = env.action_space.high print(args) # Networks actor_source = actor(state_shape=[None, args.state_dim],\ action_shape=[None, args.action_dim],\ output_bound=args.action_bound[0],\ scope='actor_source') critic_source = critic(state_shape=[None, args.state_dim],\ action_shape=[None, args.action_dim],\ scope='critic_source') actor_target = actor(state_shape=[None, args.state_dim],\ action_shape=[None, args.action_dim],\ output_bound=args.action_bound[0],\ scope='actor_target') critic_target = critic(state_shape=[None, args.state_dim],\ action_shape=[None, args.action_dim],\ scope='critic_target') # Initialize the GANs cgan_state = CGAN(input_shape=args.input_shape,\ action_size=args.action_dim,\ latent_size=args.latent_size,\ gen_input_shape=args.input_shape,\ continuous_action=True) cgan_reward = CGAN(input_shape=args.input_shape,\ action_size=args.action_dim,\ latent_size=args.latent_size,\ gen_input_shape=[None, 1],\ continuous_action=True) # Update and copy operators update_target_actor = update_target_graph2('actor_source', 'actor_target', args.tau) update_target_critic = update_target_graph2('critic_source', 'critic_target', args.tau) copy_target_actor = update_target_graph2('actor_source', 'actor_target', 1.) copy_target_critic = update_target_graph2('critic_source', 'critic_target', 1.) # Replay memory memory = Memory(args.replay_mem_size) # Actor noise actor_noise = OrnsteinUhlenbeckActionNoise(mu=np.zeros(args.action_dim)) with tf.Session() as sess: sess.run(tf.global_variables_initializer()) sess.run(copy_target_critic) sess.run(copy_target_actor) for epoch in range(args.epochs): state = env.reset() total_rewards = 0.0 while True: #env.render() # Choose an action action = sess.run( actor_source.action, feed_dict={actor_source.states: state[np.newaxis, ...] 
})[0] + actor_noise() # Execute action state1, reward, done, _ = env.step(action) total_rewards += float(reward) # Store tuple in replay memory memory.add([state[np.newaxis, ...],\ action[np.newaxis, ...],\ reward,\ state1[np.newaxis, ...],\ done]) # Training step: update actor critic using real experience batch = np.array(memory.sample(args.batch_size)) assert len(batch) > 0 states = np.concatenate(batch[:, 0], axis=0) actions = np.concatenate(batch[:, 1], axis=0) rewards = batch[:, 2] states1 = np.concatenate(batch[:, 3], axis=0) dones = batch[:, 4] # Update the critic actions1 = sess.run(actor_target.action,\ feed_dict={actor_target.states:states1}) targetQ = np.squeeze(sess.run(critic_target.Q,\ feed_dict={critic_target.states:states1,\ critic_target.actions:actions1}), axis=-1) targetQ = rewards + ( 1. - dones.astype(np.float32)) * args.gamma * targetQ targetQ = targetQ[..., np.newaxis] _, critic_loss = sess.run([critic_source.critic_solver,\ critic_source.loss],\ feed_dict={critic_source.states:states,\ critic_source.actions:actions,\ critic_source.targetQ:targetQ}) # Update the actor critic_grads = sess.run(critic_source.grads,\ feed_dict={critic_source.states:states,\ critic_source.actions:actions})[0]# Grab gradients from critic _ = sess.run(actor_source.opt,\ feed_dict={actor_source.states:states,\ actor_source.dQ_by_da:critic_grads}) # Update target networks sess.run(update_target_critic) sess.run(update_target_actor) # Training step: update the environment model using real experience (i.e., update the conditional GANs) for k in range(args.K): batch = np.array(memory.sample(args.batch_size)) states = np.concatenate(batch[:, 0], axis=0) actions = np.concatenate(batch[:, 1], axis=0) rewards = batch[:, 2] states1 = np.concatenate(batch[:, 3], axis=0) _, D_loss_state = sess.run([cgan_state.D_solver, cgan_state.D_loss],\ feed_dict={cgan_state.states:states,\ cgan_state.actions:actions,\ cgan_state.Z:sample_z(len(batch),\ args.latent_size),\ cgan_state.X:states1}) _, G_loss_state = sess.run([cgan_state.G_solver,\ cgan_state.G_loss],\ feed_dict={cgan_state.states:states,\ cgan_state.actions:actions,\ cgan_state.Z:sample_z(len(batch),\ args.latent_size)}) _, D_loss_reward = sess.run([cgan_reward.D_solver,\ cgan_reward.D_loss],\ feed_dict={cgan_reward.states:states,\ cgan_reward.actions:actions,\ cgan_reward.Z:sample_z(len(batch),\ args.latent_size),\ cgan_reward.X:rewards[..., np.newaxis]}) _, G_loss_reward = sess.run([cgan_reward.G_solver,\ cgan_reward.G_loss],\ feed_dict={cgan_reward.states:states,\ cgan_reward.actions:actions,\ cgan_reward.Z:sample_z(len(batch),\ args.latent_size)}) #print D_loss_state, G_loss_state, D_loss_reward, G_loss_state # Training step: update actor critic using imagination rollouts for l in range(args.L): batch = np.array(memory.sample(args.batch_size)) states_ = np.concatenate(batch[:, 3], axis=0) actions = np.random.uniform(env.action_space.low[0],\ env.action_space.high[0],\ size=(len(batch),\ env.action_space.shape[0])) dones = np.array([False] * len(batch)) G_sample_state = sess.run(cgan_state.G_sample,\ feed_dict={cgan_state.states:states_,\ cgan_state.actions:actions,\ cgan_state.Z:sample_z(len(batch),\ args.latent_size)}) G_sample_reward = sess.run(cgan_reward.G_sample,\ feed_dict={cgan_reward.states:states_,\ cgan_reward.actions:actions,\ cgan_reward.Z:sample_z(len(batch),\ args.latent_size)}) G_sample_reward = np.squeeze(G_sample_reward, axis=-1) # Update the critic actions1 = sess.run(actor_target.action,\ 
feed_dict={actor_target.states:G_sample_state}) targetQ = np.squeeze(sess.run(critic_target.Q,\ feed_dict={critic_target.states:G_sample_state,\ critic_target.actions:actions1}), axis=-1) targetQ = G_sample_reward + ( 1. - dones.astype(np.float32)) * args.gamma * targetQ targetQ = targetQ[..., np.newaxis] _, critic_loss = sess.run([critic_source.critic_solver,\ critic_source.loss],\ feed_dict={critic_source.states:states_,\ critic_source.actions:actions,\ critic_source.targetQ:targetQ}) # Update the actor critic_grads = sess.run(critic_source.grads,\ feed_dict={critic_source.states:states_,\ critic_source.actions:actions})[0]# Grab gradients from critic _ = sess.run(actor_source.opt,\ feed_dict={actor_source.states:states_,\ actor_source.dQ_by_da:critic_grads}) # Update target networks sess.run(update_target_critic) sess.run(update_target_actor) state = np.copy(state1) if done == True: print 'epoch', epoch, 'total rewards', total_rewards break
def main():
    import gym
    import sys
    import copy
    sys.path.append('../..')
    from utils import Memory

    #env = gym.make('LunarLander-v2')
    env = gym.make('Pendulum-v0')
    #env = gym.make('CartPole-v0')
    mem = Memory(1000000)
    batch_size = 32

    try:
        a_size = env.action_space.n
        a_type = 'discrete'
    except:
        try:
            a_size = env.action_space.shape[0]
            a_type = 'continuous'
        except:
            raise ValueError('Cannot find action size.')

    emg = gated_env_modeler(s_shape=[None, env.observation_space.shape[0]], a_size=a_size,
                            out_shape=[None, env.observation_space.shape[0]], a_type=a_type,
                            numfactors=256)
    #emg = gated_env_modeler(s_shape=[None, env.observation_space.shape[0]], a_size=a_size, out_shape=[None, 1], a_type=a_type, numfactors=256)

    with tf.Session() as sess:
        sess.run(tf.global_variables_initializer())
        while True:
            s = env.reset()
            done = False
            while not done:
                #env.render()
                #a = np.random.randint(a_size)
                a = random_action(a_size, a_type)
                s_, r, done, _ = env.step(a)
                mem.add([s, a, r, s_, done])

                batch = mem.sample(batch_size)
                if len(batch) == batch_size:
                    states = []
                    actions = []
                    rewards = []
                    states_ = []
                    for i in range(batch_size):
                        states.append(batch[i][0])
                        actions.append(batch[i][1])
                        rewards.append(batch[i][2])
                        states_.append(batch[i][3])
                    states = np.stack(states, axis=0)
                    actions = np.stack(actions, axis=0)
                    rewards = np.stack(rewards, axis=0)
                    states_ = np.stack(states_, axis=0)

                    #_, loss_s, loss_a, loss_s_, loss = sess.run([emg.update_model, emg.loss_s, emg.loss_a, emg.loss_s_, emg.loss], feed_dict={emg.states:states, emg.states_:rewards[..., np.newaxis], emg.actions_placeholder:actions})
                    _, loss_s, loss_a, loss_s_, loss = sess.run(
                        [emg.update_model, emg.loss_s, emg.loss_a, emg.loss_s_, emg.loss],
                        feed_dict={emg.states: states,
                                   emg.states_: states_,
                                   emg.actions_placeholder: actions})
                    print('loss_s', loss_s, 'loss_a', loss_a, 'loss_s_', loss_s_, 'loss', loss)

                s = copy.deepcopy(s_)
                if done:
                    break
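# random_action used in the loop above is not defined in this file. A minimal
# sketch matching the discrete/continuous split made when a_type is set; the
# uniform [-1, 1] range for the continuous case is an assumption (true action
# bounds are env-specific):
def random_action(a_size, a_type):
    if a_type == 'discrete':
        return np.random.randint(a_size)
    return np.random.uniform(-1., 1., size=a_size)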
def main():
    parser = argparse.ArgumentParser()
    parser.add_argument("--no-samples", type=int, default=50)
    parser.add_argument("--unroll-steps", type=int, default=20)
    parser.add_argument("--replay-mem-size", type=int, default=200)
    parser.add_argument("--batch-size", type=int, default=64)
    parser.add_argument("--pretrain-epochs", type=int, default=100)
    args = parser.parse_args()
    print(args)

    env = gym.make('Pendulum-v0')

    # Initialize the agent
    psb = policy_search_bayesian(
        state_dim=env.observation_space.shape[0],
        action_dim=env.action_space.shape[0],
        observation_space_low=env.observation_space.low,
        observation_space_high=env.observation_space.high,
        no_basis=(6**4) + 1,
        action_bound_low=env.action_space.low,
        action_bound_high=env.action_space.high,
        unroll_steps=args.unroll_steps,
        no_samples=args.no_samples,
        discount_factor=.9)

    # Initialize the memory
    memory = Memory(args.replay_mem_size)

    with tf.Session() as sess:
        sess.run(tf.global_variables_initializer())
        #psb.pretrain(sess, args.pretrain_epochs)
        state = env.reset()
        total_rewards = 0.0
        epoch = 1
        #batch = []
        for time_steps in range(30000):
            #env.render()

            # Get action and step in environment
            action = psb.act(sess, state, epoch)
            next_state, reward, done, _ = env.step(action)
            total_rewards += float(reward)

            # Append to the batch
            memory.add([np.atleast_2d(state), np.atleast_2d(action), reward,
                        np.atleast_2d(next_state), done])
            #batch.append([state, action, reward, next_state, done])

            # Training step
            batch = memory.sample(args.batch_size)
            states = np.concatenate([b[0] for b in batch], axis=0)
            #psb.train2(sess, states)
            psb.train_policy(sess, states, epoch)

            # s <- s'
            state = np.copy(next_state)

            if done:
                print('time steps', time_steps, 'epoch', epoch, 'total rewards', total_rewards)
                epoch += 1
                total_rewards = 0.
                '''
                B = batch
                states = np.stack([b[0] for b in B], axis=0)
                actions = np.stack([b[1] for b in B], axis=0)
                rewards = np.array([b[2] for b in B])
                next_states = np.stack([b[3] for b in B], axis=0)
                dones = np.array([float(b[4]) for b in B])
                psb.train_dynamics(sess, states, actions, next_states)
                psb.visualize_trajectories2(sess)
                psb.visualize_trajectories(sess)
                #psb.train_policy(sess, states, epoch)
                batch = []
                '''
                state = env.reset()