def run(self):
    self.t_train = 0
    self.t_test = 0

    # create filtered environment
    self.env = filter_env.makeFilteredEnv(gym.make(FLAGS.env))
    # self.env = gym.make(FLAGS.env)
    self.env.monitor.start(FLAGS.outdir + '/monitor/', video_callable=lambda _: False)
    gym.logger.setLevel(gym.logging.WARNING)

    dimO = self.env.observation_space.shape
    dimA = self.env.action_space.shape
    print(dimO, dimA)

    import pprint
    pprint.pprint(self.env.spec.__dict__, width=1)

    self.agent = ddpg.Agent(dimO=dimO, dimA=dimA)

    returns = []

    # main loop
    while self.t_train < FLAGS.total:

        # test
        T = self.t_test
        R = []
        self.env.monitor.start(FLAGS.outdir + '/monitor/', video_callable=lambda _: False, resume=True)
        while self.t_test - T < FLAGS.test:
            R.append(self.run_episode(test=True, monitor=(len(R) == 0)))
        avr = np.mean(R)
        print('Average test return\t{} after {} timesteps of training'.format(avr, self.t_train))

        # save return
        returns.append((self.t_train, avr))
        np.save(FLAGS.outdir + "/returns.npy", returns)

        # evaluate required number of episodes for gym
        if self.env.spec.reward_threshold is not None and avr > self.env.spec.reward_threshold:
            for i in range(self.env.spec.trials):
                self.run_episode(test=True)
        self.env.monitor.close()

        # train
        T = self.t_train
        R = []
        while self.t_train - T < FLAGS.train:
            R.append(self.run_episode(test=False))
        avr = np.mean(R)
        print('Average training return\t{} after {} timesteps of training'.format(avr, self.t_train))

    self.env.monitor.close()

    # upload results
    if FLAGS.upload:
        gym.upload(FLAGS.outdir + "/monitor", algorithm_id=GYM_ALGO_ID)
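The loop above calls a run_episode helper that is not included in this excerpt. Below is a minimal, hypothetical sketch of such a helper, assuming the agent exposes reset/act/observe (as the replay test further down suggests) and that the environment follows the classic gym reset/step API; none of it is taken from the original source.

# Hypothetical sketch of run_episode (an assumption, not the original implementation).
def run_episode(self, test=False, monitor=False):
    # `monitor` is accepted only so the call sites above type-check; video recording
    # is assumed to be handled by env.monitor outside this helper.
    obs = self.env.reset()
    self.agent.reset(obs)
    episode_return = 0.0
    done = False
    while not done:
        action = self.agent.act(test=test)        # assumed: exploration noise only when test=False
        obs, reward, done, _ = self.env.step(action)
        self.agent.observe(reward, done, obs)     # assumed: stores the transition, trains when not testing
        episode_return += reward
        # the outer loop budgets testing/training by these timestep counters
        if test:
            self.t_test += 1
        else:
            self.t_train += 1
    return episode_return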
def test_ddpg_replay(tmpdir):
    import ddpg
    import numpy as np
    np.set_printoptions(threshold=np.nan)

    ddpg.FLAGS.warmup = 10000
    ddpg.FLAGS.outdir = tmpdir.strpath

    # test replay memory
    a = ddpg.Agent([1], [1])
    a.reset([0])

    T = 10
    actions = []
    for t in range(0, T):
        actions.append(a.act())
        a.observe(t, False, [t + 1])
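The test above fills the replay memory but never asserts anything. A hedged continuation (not in the original) could at least sanity-check the collected actions, assuming the actor output is normalized to [-1, 1]:

    # Hypothetical continuation of test_ddpg_replay (an assumption, not original code):
    # with a normalized actor, every recorded action should lie within [-1, 1].
    actions = np.asarray(actions)
    assert actions.shape[0] == T
    assert np.all(np.abs(actions) <= 1.0 + 1e-6)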
def run(self, t_train=1000000, t_warmup=20000, f_test=20, env='Pendulum-v0', render=False, **kwargs):
    self.t_warmup = t_warmup
    self.t_log = 103

    # self.env = gym.make(s.env)
    # from gym.envs.classic_control.dl import DoubleLinkEnv
    self.env = doublelink.DoubleLinkEnv()
    # self.env.monitor.start('./monitor/', video_callable=lambda _: False)  # TODO: fix on cluster

    dimO = self.env.observation_space.shape
    dimA = self.env.action_space.shape
    print('dimO: ' + str(dimO) + ' dimA: ' + str(dimA))

    # agent
    self.agent = ddpg.Agent(dimO=dimO, dimA=dimA, **kwargs)

    # main loop
    while self.agent.t < t_train:

        # train
        for i in xrange(f_test):
            self.run_episode(test=False)

        # test
        R = np.mean([self.run_episode(test=True, render=render) for _ in range(5)])
        print('Average return ' + str(R) + ' after ' + str(self.agent.t) + ' timesteps of training')
def run(self, env):
    self.t_train = 0
    self.t_test = 0

    # create filtered environment
    # self.env = filter_env.makeFilteredEnv(gym.make(FLAGS.env))
    self.env = filter_env.makeFilteredEnv(gym.make(env))
    self.t_elapsed = []
    # self.env = gym.make(FLAGS.env)

    if tf.gfile.Exists(FLAGS.outdir):
        tf.gfile.DeleteRecursively(FLAGS.outdir)

    # self.env.monitor.start(FLAGS.outdir + '/monitor/', video_callable=lambda _: False)
    # gym.logger.setLevel(gym.logging.WARNING)

    dimO = self.env.observation_space.shape
    dimA = self.env.action_space.shape
    print 'observation space / action space:',
    print(dimO, dimA)

    import pprint
    pprint.pprint(self.env.spec.__dict__, width=1)

    self.agent = ddpg.Agent(dimO=dimO, dimA=dimA)

    returns = []
    it = 0
    episodelengths = []
    testlengths = []

    # per-environment schedule for training, testing and plotting
    # (frequencies are only defined for the three environments below)
    if env == 'Reacher-v1':
        self.train_frequency = 1
        test_frequency = 3
        plot_frequency = 1
    if env == 'MountainCarContinuous-v0':
        test_frequency = 10
        plot_frequency = 1
        self.train_frequency = 16
    if env == 'InvertedPendulum-v1':
        test_frequency = 100
        plot_frequency = 300
        self.train_frequency = 1
    print 'using train frequency', self.train_frequency

    # main loop
    while self.t_train < FLAGS.total:
        it += 1
        episodelengths.append(self.run_episode(test=False))

        if it % test_frequency == 0:
            testlengths.append(self.run_episode(test=True))

        if it % plot_frequency == 0:
            print 'avg time for sim step:', np.mean(np.array(self.t_elapsed))
            plotting.plot_episode_lengths(episodelengths)
            plotting.plot_episode_lengths(testlengths)
            # plotting.plot_replay_memory_2d_state_histogramm(self.agent.rm.observations)
            # plotting.plot_learned_mu(self.agent.act_test, self.env)

    # else:
    #     # test
    #     T = self.t_test
    #     R = []
    #     while self.t_test - T < FLAGS.test:
    #         # print 'running test episode'
    #         R.append(self.run_episode(test=True, monitor=(self.t_test - T < FLAGS.monitor * FLAGS.test)))
    #     avr = np.mean(R)
    #     print('Average test return\t{} after {} timesteps of training'.format(avr, self.t_train))
    #
    #     # save return
    #     returns.append((self.t_train, avr))
    #     np.save(FLAGS.outdir + "/returns.npy", returns)
    #
    #     # evaluate required number of episodes for gym and end training when above threshold
    #     if self.env.spec.reward_threshold is not None and avr > self.env.spec.reward_threshold:
    #         avr = np.mean([self.run_episode(test=True) for _ in range(self.env.spec.trials)])
    #         if avr > self.env.spec.reward_threshold:
    #             break
    #
    #     # train
    #     T = self.t_train
    #     R = []
    #     while self.t_train - T < FLAGS.train:
    #         # print 'running train episode'
    #         R.append(self.run_episode(test=False))
    #     avr = np.mean(R)
    #     print('Average training return\t{} after {} timesteps of training'.format(avr, self.t_train))

    # self.env.monitor.close()

    # upload results
    if FLAGS.upload:
        gym.upload(FLAGS.outdir + "/monitor", algorithm_id=GYM_ALGO_ID)
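The plotting module referenced above is not part of this excerpt. A plausible minimal stand-in for plot_episode_lengths, assuming it is just a thin matplotlib wrapper, could look like this (the function body is an assumption):

# Hypothetical stand-in for plotting.plot_episode_lengths (an assumption, not the
# original module): plots the collected per-episode lengths with matplotlib.
import matplotlib.pyplot as plt

def plot_episode_lengths(lengths):
    plt.figure()
    plt.plot(lengths)
    plt.xlabel('episode')
    plt.ylabel('episode length')
    plt.show()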
state_inp = concat_obs_goal(observation)

# parameters for the agent
d_alpha = 0.000025
d_beta = 0.00025
d_tau = 0.001
d_layer1_size = 256
d_layer2_size = 256
d_layer3_size = 256
d_layer4_size = 256
d_input_dims = state_inp.shape[0]
d_num_actions = env.action_space.shape[0]
d_output_dir = "tmp/fetch_test/"

fetch_agent = ddpg.Agent(d_alpha, d_beta, d_input_dims, d_tau, env, d_num_actions,
                         d_layer1_size, d_layer2_size, d_layer3_size, d_layer4_size,
                         d_output_dir)
fetch_agent.load_models()

score_history = []
num_steps = 50
test_no_episodes = 100
score_file_name = "inference.csv"

for i in range(test_no_episodes):  # total episodes to evaluate
    observation = env.reset()

    # read the desired goal and current state from the goal-conditioned observation
    desired_goal = observation["desired_goal"]
    curr_state = observation["observation"]
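concat_obs_goal is not defined in this excerpt. A minimal sketch of what it presumably does, assuming a goal-conditioned gym observation dict as returned by the Fetch environments, is:

# Hypothetical definition of concat_obs_goal (an assumption, not the original helper):
# flattens a goal-conditioned observation dict into a single state vector by
# concatenating the raw observation with the desired goal.
import numpy as np

def concat_obs_goal(observation):
    return np.concatenate([observation["observation"],
                           observation["desired_goal"]], axis=0)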
def run(self):
    self.t_train = 0
    self.t_test = 0

    # create filtered environment
    self.env = filter_env.makeFilteredEnv(gym.make(FLAGS.env),
                                          skip_space_norm=FLAGS.skip_space_norm,
                                          wolpertinger=FLAGS.wolpertinger)
    # self.env = gym.make(FLAGS.env)
    self.env.monitor.start(FLAGS.outdir + '/monitor/', video_callable=lambda _: False)
    # self.env.monitor.start(FLAGS.outdir + '/monitor/', video_callable=lambda _: True)
    gym.logger.setLevel(gym.logging.WARNING)

    dimO = self.env.observation_space.shape
    dimA = self.env.action_space.shape
    print(dimO, dimA)

    import pprint
    pprint.pprint(self.env.spec.__dict__, width=1)

    wolp = None
    if FLAGS.wolpertinger:
        wolp = wp.Wolpertinger(self.env, i=FLAGS.wp_total_actions,
                               action_set=wp.load_action_set(FLAGS.wp_action_set_file,
                                                             i=FLAGS.wp_total_actions,
                                                             action_shape=dimA[0])).g
    elif FLAGS.fmpolicy:
        wolp = fmp.FMPolicy(self.env).g

    self.agent = ddpg.Agent(dimO=dimO, dimA=dimA,
                            custom_policy=FLAGS.wolpertinger or FLAGS.fmpolicy,
                            env_dtype=str(self.env.action_space.high.dtype))

    returns = []

    # main loop
    while self.t_train < FLAGS.total:

        # test
        T = self.t_test
        R = []
        if self.t_train - T > 0 or FLAGS.train == 0:
            while self.t_test - T < FLAGS.test:
                R.append(self.run_episode(test=True,
                                           monitor=(self.t_test - T < FLAGS.monitor * FLAGS.test),
                                           custom_policy=wolp))
                self.t_test += 1
            avr = np.mean(R)
            # print('Average test return\t{} after {} timesteps of training'.format(avr, self.t_train))
            with open(os.path.join(FLAGS.outdir, "output.log"), mode='a') as f:
                # f.write('Average test return\t{} after {} timesteps of training\n'.format(avr, self.t_train))
                f.write('Average test return\t{} after {} timesteps\n'.format(avr, self.t_train + FLAGS.test))

            # save return
            returns.append((self.t_train, avr))
            np.save(FLAGS.outdir + "/returns.npy", returns)

            s = self.agent.checkpoint_session()
            with open(os.path.join(FLAGS.outdir, "output.log"), mode='a') as f:
                f.write('Checkpoint saved at {}\n'.format(s))

            # evaluate required number of episodes for gym and end training when above threshold
            if self.env.spec.reward_threshold is not None and avr > self.env.spec.reward_threshold:
                # TODO: is it assumed that, at test time, the model does not need the full Wolpertinger policy?
                # TODO: the custom policy is passed to run_episode to avoid the item-not-found exception in the environment
                avr = np.mean([self.run_episode(test=True, custom_policy=wolp)
                               for _ in range(self.env.spec.trials)])
                # print('TRIALS => Average return {}\t Reward Threshold {}'.format(avr, self.env.spec.reward_threshold))
                with open(os.path.join(FLAGS.outdir, "output.log"), mode='a') as f:
                    f.write('TRIALS => Average return {}\t Reward Threshold {}\n'.format(avr, self.env.spec.reward_threshold))
                if avr > self.env.spec.reward_threshold:
                    s = self.agent.checkpoint_session()
                    with open(os.path.join(FLAGS.outdir, "output.log"), mode='a') as f:
                        f.write('Final checkpoint saved at {}\n'.format(s))
                    break

        # train
        T = self.t_train
        R = []
        start_time = time.time()
        while self.t_train - T < FLAGS.train:
            R.append(self.run_episode(test=False, custom_policy=wolp))
            self.t_train += 1
        end_time = time.time()
        avr = np.mean(R)
        # print('Average training return\t{} after {} timesteps of training'.format(avr, self.t_train))
        with open(os.path.join(FLAGS.outdir, "output.log"), mode='a') as f:
            f.write('Average training return\t{} after {} timesteps of training. Batch time: {} sec.\n'
                    .format(avr, self.t_train, end_time - start_time))

    self.env.monitor.close()

    # upload results
    if FLAGS.upload:
        gym.upload(FLAGS.outdir + "/monitor", algorithm_id=GYM_ALGO_ID)
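The .g attribute used above is the Wolpertinger mapping from the actor's continuous proto-action to a member of the discrete action set. A simplified, hypothetical sketch of such a mapping, using a plain nearest-neighbour lookup and omitting the critic re-ranking step of the full method, is:

# Simplified, hypothetical Wolpertinger-style g() (an assumption, not the wp module
# above): snap the actor's continuous proto-action to its nearest neighbour in a
# fixed action set; the full method would re-rank the k nearest candidates with the critic.
import numpy as np

def make_g(action_set):
    action_set = np.asarray(action_set)

    def g(proto_action):
        distances = np.linalg.norm(action_set - np.asarray(proto_action), axis=1)
        return action_set[np.argmin(distances)]

    return g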
def train(hp, simulator, unity_worker_id):
    random.seed(1234)
    np.random.seed(2345)
    torch.manual_seed(4567)

    time_start = time.time()
    run_id = time.strftime('%b%d_%H-%M-%S', time.localtime(time_start))
    run_id = f'{run_id}_{unity_worker_id}'

    file_handler = logging.FileHandler(f'run/log/{run_id}.log')
    file_handler.setFormatter(FORMATTER)
    ROOT_LOGGER.addHandler(file_handler)

    repo = git.Repo()
    logger.info('======= run_id %s started at %s =======', run_id, time.strftime('%b%d_%H-%M-%S_%z'))
    logger.info('using commit: %s (clean=%s): %s',
                repo.head.commit.hexsha, not repo.is_dirty(), repo.head.commit.message)
    logger.info('run_id %s using hyper parameters: %s, unity_worker_id: %s', run_id, hp, unity_worker_id)

    setup_env()
    writer = tensorboardX.SummaryWriter(os.path.join('run/summary', run_id))
    env = UnityEnvironment(file_name=simulator, worker_id=unity_worker_id)

    state_length = env.brains[BRAIN_NAME].vector_observation_space_size
    action_length = env.brains[BRAIN_NAME].vector_action_space_size
    agent = ddpg.Agent(state_length=state_length * NUM_STACKS,
                       action_length=action_length, hp=hp, writer=writer)

    window_rewards = collections.deque(maxlen=SOLVE_NUM_EPISODES)
    last_save_episode = None
    pbar = tqdm.tqdm(total=hp.num_episodes, desc=run_id)

    for i_episode in range(hp.num_episodes):
        states = env.reset(train_mode=True)[BRAIN_NAME].vector_observations
        # this records the episode reward for each agent
        episode_rewards = np.zeros(NUM_HOMOGENEOUS_AGENTS)
        episode_length = 0

        while True:
            episode_length += 1
            actions = agent.act(states)
            env_info = env.step(actions)[BRAIN_NAME]
            next_states = env_info.vector_observations
            rewards = env_info.rewards
            dones = env_info.local_done
            agent.step(states, actions, rewards, next_states, dones)
            episode_rewards += rewards
            if any(dones):
                break
            else:
                states = next_states

        pbar.update(1)
        writer.add_scalar('episode_length', episode_length, i_episode)

        # the episode reward is defined to be the maximum over all agents
        episode_reward = np.max(episode_rewards)
        writer.add_scalar('episode_reward_agent_max', episode_reward, i_episode)
        for i, reward in enumerate(episode_rewards):
            writer.add_scalar(f'agent_{i}/episode_reward', reward, i_episode)

        window_rewards.append(episode_reward)
        mean_reward = np.mean(window_rewards)
        writer.add_scalar(
            f'episode_reward_agent_max_avg_over_{SOLVE_NUM_EPISODES}_episodes',
            mean_reward, i_episode)

        if (len(window_rewards) >= SOLVE_NUM_EPISODES
                and mean_reward >= SOLVE_REWARD
                and (last_save_episode is None or i_episode - last_save_episode >= hp.save_interval)):
            last_save_episode = i_episode
            save_dir = os.path.join('run/model', f'{run_id}_{i_episode}')
            os.makedirs(save_dir, exist_ok=True)
            agent.save(save_dir)
            logger.info('model saved to directory: %s', save_dir)

    time_stop = time.time()
    logger.info('run_id %s completed at %s, time cost: %s seconds',
                run_id,
                time.strftime('%b%d_%H-%M-%S_%z', time.localtime(time_stop)),
                f'{time_stop - time_start:.2f}')
    env.close()
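train() only reads hp.num_episodes and hp.save_interval directly; everything else in hp is forwarded to ddpg.Agent. A hypothetical invocation might look like the sketch below (the field values and the simulator path are placeholders, not taken from the original):

# Hypothetical invocation of train() (values and the simulator path are placeholders).
from types import SimpleNamespace

if __name__ == '__main__':
    hp = SimpleNamespace(num_episodes=2000, save_interval=100)  # plus whatever fields ddpg.Agent expects
    train(hp, simulator='path/to/unity_simulator', unity_worker_id=0)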
def __init__(self, hp):
    self.hp = hp
    self.memory = replay_buffer.ReplayBuffer(hp)
    self.agents = [ddpg.Agent(self.hp) for _ in range(self.hp.num_agents)]
    self.losses = (0., 0.)
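A multi-agent wrapper like this usually exposes an act() that delegates one observation row to each underlying DDPG agent; a hypothetical sketch of such a companion method (not taken from the original class) is:

# Hypothetical companion method for the wrapper above (an assumption, not original code):
# each agent acts on its own observation row.
import numpy as np

def act(self, states):
    return np.vstack([agent.act(state)
                      for agent, state in zip(self.agents, states)])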