def main(args):
    env = gym.make(args['env_name'])
    device = torch.device('cuda:0' if torch.cuda.is_available() else 'cpu')
    action_dim = env.action_space.shape[0]
    max_action = env.action_space.high[0]
    state_dim = env.observation_space.shape[0]

    ddpg = DDPG(args, action_dim, max_action, state_dim, device)

    trained_actor = torch.load(args['model_directory'])
    ddpg.actor.load_state_dict(trained_actor)

    timestep = 0
    for episode in range(args['max_episode']):
        episode_reward = 0
        state = env.reset()
        while True:
            action = ddpg.get_action(state)
            next_state, reward, done, info = env.step(action)
            env.render()

            episode_reward += reward
            state = next_state
            timestep += 1

            if done:
                print('episode: ', episode, ' reward : %.3f' % (episode_reward), ' timestep :', timestep)
                break

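# A minimal, hypothetical driver for the evaluation entry point above. The dict
# keys 'env_name', 'model_directory', and 'max_episode' are the ones main() reads;
# the concrete values (and any extra hyperparameters DDPG may expect in args)
# are illustrative assumptions, not settings from the original repository.
if __name__ == '__main__':
    args = {
        'env_name': 'Pendulum-v0',
        'model_directory': './models/ddpg_actor.pth',
        'max_episode': 10,
    }
    main(args)
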
def main(): # Initialize the ANNs agent = DDPG() rospy.init_node("neuro_deep_planner", anonymous=False) ros_handler = ROSHandler() ros_handler.on_policy = False while not rospy.is_shutdown(): # If we have a new msg we might have to execute an action and need to put the new experience in the buffer if ros_handler.new_msg(): if not ros_handler.is_episode_finished: # Send back the action to execute ros_handler.publish_action(agent.get_action(ros_handler.state)) # Safe the past state and action + the reward and new state into the replay buffer agent.set_experience(ros_handler.state, ros_handler.reward, ros_handler.is_episode_finished) elif ros_handler.new_setting(): agent.noise_flag = ros_handler.noise_flag else: # Train the network! agent.train()
def main():
    with tf.Session() as sess:
        while True:
            try:
                env = CarlaEnv()
                break
            except Exception as e:
                print(e)

        agent = Agent(sess=sess, state_size=env.observation_space.shape[0], action_size=env.action_space.shape[0])

        max_episodes = 1000
        max_steps = 1800

        for i in range(int(max_episodes)):
            state = env.reset()
            print(state.shape)
            ep_reward = 0
            ep_ave_max_q = 0

            # plt.clf()
            # if i:
            #     with open("ddpg_memory.pkl", "wb") as hand:
            #         pickle.dump(replay_buffer, hand)
            #     actor.save_model()
            #     critic.save_model()
            #     print("Agent saved")

            for j in range(int(max_steps)):
                print("epoch: {}, step: {}".format(i, j))
                # env.render()

                # Added exploration noise
                # a = actor.predict(np.reshape(s, (1, 3))) + (1. / (1. + i))
                action = agent.get_action(state)
                # a = controller(s[0], s[1], s[3])
                # a = [a]

                next_state, reward, done, info = env.step(action)
                print("reward: {}".format(reward))

                agent.remember(state, action, reward, done, next_state)

                # Keep adding experience to the memory until
                # there are at least minibatch size samples
                agent.train()

                # Advance to the next state and end the episode when the environment signals done
                state = next_state
                if done:
                    break

def main(): agent = DDPG() print "========================== finish ddpg init" rospy.init_node("robot_follower", anonymous=True) ros_handler = RosHandler() print "========================== start ros_handler" while not rospy.is_shutdown(): if ros_handler.new_msg(): if not ros_handler.end_of_episode: action = agent.get_action(ros_handler.state, ros_handler.old_action) ros_handler.publish_action(action) agent.set_experience(ros_handler.state, ros_handler.reward, ros_handler.end_of_episode) else: agent.train()
def main(): # Initialize the ANNs agent = DDPG() rospy.init_node("neuro_deep_planner", anonymous=False) ros_handler = ROSHandler() ros_handler.on_policy = False # For plotting currently_plotting = False goal_count = 0 crash_count = 0 start_time = 0 # Make sure the directory for the plotting exists if not tf.gfile.Exists(PLOT_PATH): tf.gfile.MakeDirs(PLOT_PATH) f = open(PLOT_PATH + '/results', 'w') while not rospy.is_shutdown(): # If we are plotting results we don't want to train and need to turn of noise! if PLOTTING and not currently_plotting and agent.training_step > 0 and \ agent.training_step % PLOT_INTERVALL == 0: currently_plotting = True agent.noise_flag = False start_time = rospy.get_time() if currently_plotting and rospy.get_time() - start_time > PLOT_TIME: # Plot the results string = str(agent.training_step) + ', ' + str( goal_count) + ', ' + str(crash_count) + '\n' f.write(string) # Reset all parameters currently_plotting = False agent.noise_flag = True goal_count = 0 crash_count = 0 # If we are plotting results we need to count reached goals and crashes if currently_plotting: # Count the positive and negative rewards if ros_handler.new_msg(): if not ros_handler.is_episode_finished: # Send back the action to execute ros_handler.publish_action( agent.get_action(ros_handler.state)) elif ros_handler.reward == 1: goal_count += 1 elif ros_handler.reward == -1: crash_count += 1 # If we're not plotting results else: # If we have a new msg we might have to execute an action and need to put the new experience in the buffer if ros_handler.new_msg(): if not ros_handler.is_episode_finished: # Send back the action to execute ros_handler.publish_action( agent.get_action(ros_handler.state)) # Safe the past state and action + the reward and new state into the replay buffer agent.set_experience(ros_handler.state, ros_handler.reward, ros_handler.is_episode_finished) elif ros_handler.new_setting(): agent.noise_flag = ros_handler.noise_flag else: # Train the network! agent.train()
with tf.device('/{}:0'.format('CPU')):
    agent = DDPG(actor=actor, critic=critic, exprep=exprep, noise=noise, action_bound=ACTION_RANGE)

    # tf.initialize_all_variables() is deprecated; global_variables_initializer is the TF1 equivalent
    sess.run(tf.global_variables_initializer())

    for i in range(NUM_EPISODES):
        cur_state = env.reset()
        cum_reward = 0

        # tensorboard summary
        summary_writer = tf.summary.FileWriter('/tmp/pendulum-log-0' + '/train', graph=tf.get_default_graph())

        if (i % EVALUATE_EVERY) == 0:
            print('====evaluation====')

        for t in range(MAX_STEPS):
            if (i % EVALUATE_EVERY) == 0:
                env.render()
                action = agent.get_action(cur_state, sess)[0]
            else:
                # decaying noise
                action = agent.get_action_noise(cur_state, sess, rate=(NUM_EPISODES - i) / NUM_EPISODES)[0]

            next_state, reward, done, info = env.step(action)

            if (i % EVALUATE_EVERY) == 0:
                print('cur_state: ', end='')
                print(cur_state)
                print('action: ', end='')
                print(action)
                print('reward: ' + str(reward))
                print('------------------------------------------------------')

            if done:
                cum_reward += reward
                agent.add_step(Step(cur_step=cur_state, action=action, next_step=next_state, reward=reward, done=done))
                print("Done! Episode {} finished after {} timesteps, cum_reward: {}".format(i, t + 1, cum_reward))

class Environment:
    """
    Train & simulate wrapper for DDPG on Pendulum-v0

    Args:
        params: dictionary of parameters
            memory_size : size of the replay memory. 100000 needs almost 25 GB of memory; reduce it if needed
            exploration_step : pure exploration steps
            gamma : discount rate
            tau : parameter for the soft target update
            lr_actor : learning rate for the actor network
            lr_critic : learning rate for the critic network
        device_name : name of the device (normally cpu:0 or gpu:0)
    """

    def __init__(self, params, device_name):
        self.env = gym.make('Pendulum-v0')
        self.ddpg = DDPG(input_dim=self.env.observation_space.shape[0],
                         action_dim=self.env.action_space.shape[0],
                         action_scale=(self.env.action_space.low[0], self.env.action_space.high[0]),
                         memory_size=params["memory_size"],
                         gamma=params["gamma"],
                         tau=params["tau"],
                         learning_rate_actor=params["lr_actor"],
                         learning_rate_critic=params["lr_critic"],
                         device_name=device_name)
        self.ddpg.build()
        self.ddpg.summary()
        self.random_process = OUNoise(size=self.env.action_space.shape[0])

        # total steps taken so far
        self.i_step = 0

    def load(self, global_step="latest"):
        """Load saved weights for ddpg

        Args:
            global_step : load a specific step; if "latest", load the latest checkpoint
        """
        self.ddpg.load(global_step)

    def save(self):
        """Save the current weights of the ddpg layers"""
        self.ddpg.save()

    def train(self, episode, max_step, minibatch_size, render=False, verbose=1, val_epi=5, saving=False):
        """Run the game while training the networks

        Args:
            episode : number of training episodes
            max_step : maximum steps per episode
            minibatch_size : minibatch size for replay-memory training
            render : whether to show the game graphics
            verbose : print losses and returns (and save) every `verbose` episodes
            val_epi : number of validation episodes
            saving : whether to save checkpoints
        """
        losses = []
        episode_return = []
        verbose_return = []
        episode_return_val = []

        tr = trange(episode, desc="")
        for i_episode in tr:
            return_episode = 0
            observation = self.env.reset()
            self.random_process.reset()

            for t in range(max_step):
                self.i_step += 1
                if render:
                    self.env.render()

                X = observation.astype(np.float32)
                action_policy = self.ddpg.get_action(tf.convert_to_tensor(X))
                action_policy += self.random_process.sample()
                action_policy = np.clip(action_policy, self.env.action_space.low[0], self.env.action_space.high[0])

                observation, reward, done, info = self.env.step(action_policy)
                return_episode += reward
                X_next = observation.astype(np.float32)

                self.ddpg.replay_memory.append((X, action_policy, reward, X_next, done))

                # training step
                if len(self.ddpg.replay_memory) > minibatch_size:
                    X_batch, action_batch, reward_batch, X_next_batch, done_batch = \
                        self.ddpg.replay_memory.get_batch(minibatch_size)
                    loss_critic, loss_actor = self.ddpg.train(X_batch, action_batch, reward_batch, X_next_batch, done_batch)
                    losses.append((loss_critic, loss_actor))

                if done:
                    break

            episode_return.append(return_episode)
            verbose_return.append(return_episode)
            tr.set_description("%.4f" % (sum(episode_return) / len(episode_return)))

            if i_episode == 0 or ((i_episode + 1) % verbose == 0):
                if len(self.ddpg.replay_memory) <= minibatch_size:
                    stage_tooltip = "EXPLORATION"
                    print(Fore.RED + "[EPISODE %3d / STEP %5d] - %s" % (i_episode + 1, self.i_step, stage_tooltip))
                    print(Fore.GREEN + "Learned Step : %4d" % (self.ddpg.global_step))
                    print(Fore.BLUE + "AVG Return : %.4f" % (sum(verbose_return) / len(verbose_return)))
                    print(Fore.BLUE + "MAX Return : %.4f" % (max(verbose_return)))
                    continue
                else:
                    stage_tooltip = "TRAINING"
                    losses_critic = [l[0] for l in losses]
                    losses_actor = [l[1] for l in losses]

                # validation
                returns = []
                for epi_val in range(val_epi):
                    return_episode_val = 0
                    observation = self.env.reset()
                    for t in range(max_step):
                        if render:
                            self.env.render()
                        action_policy = self.ddpg.get_action(tf.convert_to_tensor(observation.astype(np.float32)))
                        observation, reward, done, info = self.env.step(action_policy)
                        return_episode_val += reward
                        if done:
                            # print(Fore.GREEN + "EPISODE %3d: REWARD: %s" % (i_episode, return_episode))
                            returns.append(return_episode_val)
                            break

                print(Fore.RED + "[EPISODE %3d / STEP %5d] - %s" % (i_episode + 1, self.i_step, stage_tooltip))
                print(Fore.GREEN + "Learned Step : %4d" % (self.ddpg.global_step))
                print(Fore.BLUE + "AVG Return : %.4f" % (sum(verbose_return) / len(verbose_return)))
                print(Fore.BLUE + "MAX Return : %.4f" % (max(verbose_return)))
                print(Fore.LIGHTYELLOW_EX + "AVG LOSS Actor : %.4f" % (sum(losses_actor) / len(losses_actor)))
                print(Fore.LIGHTYELLOW_EX + "AVG LOSS Critic : %.4f" % (sum(losses_critic) / len(losses_critic)))
                print(Fore.LIGHTRED_EX + "AVG VAL[%2d] Return : %.4f" % (val_epi, sum(returns) / len(returns)))
                print(Fore.LIGHTRED_EX + "MAX VAL[%2d] Return : %.4f" % (val_epi, max(returns)))

                verbose_return = []
                losses = []
                episode_return_val.append(sum(returns) / len(returns))

                if saving:
                    self.save()
                time.sleep(1)

        return episode_return

    def simulate(self, episode, max_step=1000, render=False):
        """Run the game with the existing ddpg network

        Args:
            episode : number of episodes
            max_step : maximum steps per episode
            render : whether to show the game graphics
        """
        returns = []
        for i_episode in range(episode):
            return_episode = 0
            observation = self.env.reset()
            for t in range(max_step):
                if render:
                    self.env.render()
                action_policy = self.ddpg.get_action(tf.convert_to_tensor(observation.astype(np.float32)))
                observation, reward, done, info = self.env.step(action_policy)
                return_episode += reward
                if done:
                    print(Fore.GREEN + "EPISODE %3d: REWARD: %s" % (i_episode, return_episode))
                    returns.append(return_episode)
                    break

        print(Fore.RED + "AVG REWARD : %s" % (sum(returns) / len(returns)))
        print(Fore.BLUE + "MAX REWARD : %s" % (max(returns)))

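# Hypothetical usage of the Environment wrapper above. The parameter names are
# taken from its docstring and constructor; the concrete values and the device
# string are illustrative assumptions, not settings from the original repository.
params = {
    "memory_size": 100000,
    "gamma": 0.99,
    "tau": 0.001,
    "lr_actor": 1e-4,
    "lr_critic": 1e-3,
}
runner = Environment(params, device_name="cpu:0")
train_returns = runner.train(episode=200, max_step=200, minibatch_size=64, verbose=10, saving=True)
runner.simulate(episode=5, render=True)
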
def main(): # Initialize the ANNs agent = DDPG() rospy.init_node("neuro_deep_planner", anonymous=False) ros_handler = ROSHandler() ros_handler.on_policy = False # For plotting currently_plotting = False goal_count = 0 crash_count = 0 start_time = 0 # Make sure the directory for the plotting exists if not tf.gfile.Exists(PLOT_PATH): tf.gfile.MakeDirs(PLOT_PATH) f = open(PLOT_PATH + '/results', 'w') while not rospy.is_shutdown(): # If we are plotting results we don't want to train and need to turn of noise! if PLOTTING and not currently_plotting and agent.training_step > 0 and \ agent.training_step % PLOT_INTERVALL == 0: currently_plotting = True agent.noise_flag = False start_time = rospy.get_time() if currently_plotting and rospy.get_time() - start_time > PLOT_TIME: # Plot the results string = str(agent.training_step) + ', ' + str(goal_count) + ', ' + str(crash_count) + '\n' f.write(string) # Reset all parameters currently_plotting = False agent.noise_flag = True goal_count = 0 crash_count = 0 # If we are plotting results we need to count reached goals and crashes if currently_plotting: # Count the positive and negative rewards if ros_handler.new_msg(): if not ros_handler.is_episode_finished: # Send back the action to execute ros_handler.publish_action(agent.get_action(ros_handler.state)) elif ros_handler.reward == 1: goal_count += 1 elif ros_handler.reward == -1: crash_count += 1 # If we're not plotting results else: # If we have a new msg we might have to execute an action and need to put the new experience in the buffer if ros_handler.new_msg(): if not ros_handler.is_episode_finished: # Send back the action to execute ros_handler.publish_action(agent.get_action(ros_handler.state)) # Safe the past state and action + the reward and new state into the replay buffer agent.set_experience(ros_handler.state, ros_handler.reward, ros_handler.is_episode_finished) elif ros_handler.new_setting(): agent.noise_flag = ros_handler.noise_flag else: # Train the network! agent.train()
import tensorflow as tf
import gym

from ddpg import DDPG as Agent

env = gym.make("MountainCarContinuous-v0")
keel = Agent(env, 100, 10000, 0.99, (100, 100, 0.9), (100, 100, 0.9))

while True:
    # Run one episode: classic gym's env.step() returns (next_state, reward, done, info),
    # and the state must be advanced between steps.
    state = env.reset()
    done = False
    while not done:
        action = keel.get_action(state)
        next_state, reward, done, info = env.step(action)
        keel.experience(state, action, reward, done, next_state)
        keel.train()
        state = next_state