def run(self):
    self.train_timestep = 0
    self.test_timestep = 0

    # create normalized environment
    self.env = normalized_env.make_normalized_env(gym.make(FLAGS.env))
    tf.set_random_seed(FLAGS.tfseed)
    np.random.seed(FLAGS.npseed)
    self.env.monitor.start(os.path.join(FLAGS.outdir, 'monitor'), force=FLAGS.force)
    self.env.seed(FLAGS.gymseed)
    gym.logger.setLevel(gym.logging.WARNING)

    dimO = self.env.observation_space.shape
    dimA = self.env.action_space.shape
    pprint.pprint(self.env.spec.__dict__)

    self.agent = Agent(dimO, dimA=dimA)
    test_log = open(os.path.join(FLAGS.outdir, 'test.log'), 'w')
    train_log = open(os.path.join(FLAGS.outdir, 'train.log'), 'w')

    while self.train_timestep < FLAGS.total:
        # test
        reward_list = []
        for _ in range(FLAGS.test):
            reward, timestep = self.run_episode(
                test=True, monitor=np.random.rand() < FLAGS.monitor)
            reward_list.append(reward)
            self.test_timestep += timestep
        avg_reward = np.mean(reward_list)
        print('Average test return {} after {} timesteps of training.'.format(
            avg_reward, self.train_timestep))
        test_log.write("{}\t{}\n".format(self.train_timestep, avg_reward))
        test_log.flush()

        # train
        reward_list = []
        last_checkpoint = np.floor(self.train_timestep / FLAGS.train)
        while np.floor(self.train_timestep / FLAGS.train) == last_checkpoint:
            print('=== Running episode')
            reward, timestep = self.run_episode(test=False, monitor=False)
            reward_list.append(reward)
            self.train_timestep += timestep
            train_log.write("{}\t{}\n".format(self.train_timestep, reward))
            train_log.flush()
        avg_reward = np.mean(reward_list)
        print('Average train return {} after {} timesteps of training.'.format(
            avg_reward, self.train_timestep))
        os.system('{} {}'.format(plotScr, FLAGS.outdir))

    self.env.monitor.close()
    os.makedirs(os.path.join(FLAGS.outdir, "tf"))
    ckpt = os.path.join(FLAGS.outdir, "tf/model.ckpt")
    self.agent.saver.save(self.agent.sess, ckpt)
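# --- Hedged sketch, not part of the original source: every run() variant in
# this file calls self.run_episode(test=..., monitor=...) and expects back a
# (total_reward, timestep_count) pair. Assuming the agent exposes
# reset()/act()/observe() hooks (an assumption about the surrounding class,
# not confirmed by this file), a minimal run_episode could look like this;
# the monitor flag would toggle video recording via the gym monitor, which
# is not sketched here.
def run_episode(self, test=True, monitor=False):
    obs = self.env.reset()
    self.agent.reset(obs)  # assumed agent hook
    total_reward = 0.0
    timestep = 0
    done = False
    while not done:
        action = self.agent.act(test=test)  # assumed agent hook
        obs, reward, done, _ = self.env.step(action)
        self.agent.observe(reward, done, obs, test=test)  # assumed agent hook
        total_reward += reward
        timestep += 1
    return total_reward, timestep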
def run(self):
    self.train_timestep = 0
    self.test_timestep = 0

    # create normalized environment
    self.env = normalized_env.make_normalized_env(gym.make(FLAGS.env))
    tf.set_random_seed(FLAGS.tfseed)
    np.random.seed(FLAGS.npseed)
    self.env.monitor.start(os.path.join(FLAGS.outdir, 'monitor'), force=FLAGS.force)
    self.env.seed(FLAGS.gymseed)
    gym.logger.setLevel(gym.logging.WARNING)

    dimO = self.env.observation_space.shape
    dimA = self.env.action_space.shape
    print(dimO, dimA)
    pprint.pprint(self.env.spec.__dict__)

    self.agent = Agent(dimO, dimA=dimA)
    simple_log_file = open(os.path.join(FLAGS.outdir, 'log.txt'), 'w')

    while self.train_timestep < FLAGS.total:
        # test
        reward_list = []
        for _ in range(FLAGS.test):  # `xrange` in the original Python 2 code
            reward, timestep = self.run_episode(
                test=True, monitor=np.random.rand() < FLAGS.monitor)
            reward_list.append(reward)
            self.test_timestep += timestep
        avg_reward = np.mean(reward_list)
        print('Average test return {} after {} timesteps of training.'.format(
            avg_reward, self.train_timestep))
        # Python 3 replacement for the original `print >> simple_log_file, ...`
        simple_log_file.write("{}\t{}\n".format(self.train_timestep, avg_reward))

        # train
        reward_list = []
        # floor division, so the loop runs until the next FLAGS.train boundary
        # (plain `/` would be true division under Python 3 and break the check)
        last_checkpoint = self.train_timestep // FLAGS.train
        while self.train_timestep // FLAGS.train == last_checkpoint:
            reward, timestep = self.run_episode(test=False, monitor=False)
            reward_list.append(reward)
            self.train_timestep += timestep
        avg_reward = np.mean(reward_list)
        print('Average train return {} after {} timesteps of training.'.format(
            avg_reward, self.train_timestep))

    self.env.monitor.close()
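# --- Hedged sketch, not the original normalized_env module: the
# make_normalized_env() helper used above is not shown in this file. It is
# assumed to rescale Box action spaces so the agent can act in [-1, 1]. A
# minimal stand-in could look like this (older gym versions name the hook
# `_action` instead of `action`):
import gym
import numpy as np

class NormalizedEnv(gym.ActionWrapper):
    """Map actions in [-1, 1] onto the wrapped env's action bounds."""

    def action(self, action):
        low = self.env.action_space.low
        high = self.env.action_space.high
        return low + (np.clip(action, -1.0, 1.0) + 1.0) * 0.5 * (high - low)

def make_normalized_env(env):
    return NormalizedEnv(env)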
def run(self):
    self.train_timestep = 0
    self.test_timestep = 0

    # create normalized environment (Minecraft instead of a gym env)
    maze_def = {'type': FLAGS.maze}
    self.env = normalized_env.make_normalized_env(
        Minecraft(maze_def,
                  reset=FLAGS.reset,
                  grayscale=False,
                  vision_observation=FLAGS.vision,
                  video_dim=(FLAGS.height, FLAGS.width),
                  num_parallel=FLAGS.num_parallel))
    # normalized_env.make_normalized_env(gym.make(FLAGS.env))
    tf.set_random_seed(FLAGS.tfseed)
    np.random.seed(FLAGS.npseed)
    # self.env.monitor.start(os.path.join(FLAGS.outdir, 'monitor'), force=FLAGS.force)
    # self.env.seed(FLAGS.gymseed)
    gym.logger.setLevel(gym.logging.WARNING)

    dimO = self.env.observation_space.shape
    dimA = self.env.action_space.shape
    print(dimO, dimA)
    # pprint.pprint(self.env.spec.__dict__)

    self.agent = Agent(dimO, dimA=dimA)
    simple_log_file = open(os.path.join(FLAGS.outdir, 'log.txt'), 'a')

    # Save the command line args and the current git revision
    # (check_output returns bytes under Python 3, hence the decode)
    git_hash = subprocess.check_output(
        ['git', 'rev-parse', '--short', 'HEAD']).decode().strip()
    simple_log_file.write(" ".join(sys.argv[:] + [git_hash]) + "\n")

    avg_rewards = []
    while self.train_timestep < FLAGS.total:
        # test
        reward_list = []
        for _ in range(FLAGS.test):
            reward, timestep = self.run_episode(
                test=True, monitor=np.random.rand() < FLAGS.monitor)
            reward_list.append(reward)
            self.test_timestep += timestep
        avg_reward = np.mean(reward_list)
        avg_rewards.append(avg_reward)
        print('Average test return {} after {} timesteps of training.'.format(
            avg_reward, self.train_timestep))
        simple_log_file.write("{}\t{}\t{}\t{}\t{}\n".format(
            self.train_timestep, avg_reward, np.std(reward_list),
            np.min(reward_list), np.max(reward_list)))
        simple_log_file.flush()

        # Stopping criterion: after 5e5 timesteps, stop once the history of
        # average test returns has settled (variance below 1)
        if self.train_timestep > 5e5 and len(avg_rewards) > 10 and np.var(avg_rewards) < 1:
            break

        # train
        reward_list = []
        last_checkpoint = self.train_timestep // FLAGS.train
        while self.train_timestep // FLAGS.train == last_checkpoint:
            reward, timestep = self.run_episode(test=False, monitor=False)
            reward_list.append(reward)
            self.train_timestep += timestep
        avg_reward = np.mean(reward_list)
        print('Average train return {} after {} timesteps of training.'.format(
            avg_reward, self.train_timestep))
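# --- Note (an observation, not from the original source): the stopping
# criterion above takes np.var over the *entire* history of test returns, so
# noisy early entries can keep the run alive long after it has plateaued. A
# windowed variant would only consider the most recent checkpoints, e.g.:
#
#     if self.train_timestep > 5e5 and len(avg_rewards) > 10 \
#             and np.var(avg_rewards[-10:]) < 1:
#         break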
def run(self):
    Agents = [Agent1, Agent2, Agent3, Agent4, Agent5]
    rd_seeds = [8, 15, 20, 35]
    for i in range(4):
        if FLAGS.i != 0:
            i = FLAGS.i  # a nonzero FLAGS.i overrides the loop index on every pass
        self.train_timestep = 0
        self.test_timestep = 0

        # create normalized environment
        self.env = normalized_env.make_normalized_env(gym.make(FLAGS.env))
        tf.set_random_seed(rd_seeds[i])
        np.random.seed(rd_seeds[i])
        # self.env.monitor.start(os.path.join(FLAGS.outdir, 'monitor'), force=FLAGS.force)
        # self.env = gym.wrappers.Monitor(self.env, os.path.join(FLAGS.outdir, 'monitor'), force=True)
        self.env.seed(rd_seeds[i])
        gym.logger.set_level(logging.WARNING)

        dimO = self.env.observation_space.shape
        dimA = self.env.action_space.shape
        pprint.pprint(self.env.spec.__dict__)

        if FLAGS.model == "ICNN" or FLAGS.model == "ICNN_ARCH":
            print("Yes, ICNN!")
            self.agent = Agents[i](dimO=dimO, dimA=dimA)
        else:
            self.agent = Agents[i](dimO=dimO, dimA=dimA,
                                   num_layer=FLAGS.num_layer,
                                   num_nodes=FLAGS.num_nodes)

        test_log = open(os.path.join(FLAGS.outdir, 'test.log'), 'w')
        train_log = open(os.path.join(FLAGS.outdir, 'train.log'), 'w')
        x_offline = []
        y_offline = []
        x_online = []
        y_online = []
        reward_best = -2000

        while self.train_timestep <= FLAGS.total:
            # test
            reward_list = []
            for _ in range(FLAGS.test):
                reward, timestep = self.run_episode(
                    test=True, monitor=np.random.rand() < FLAGS.monitor)
                reward_list.append(reward)
                self.test_timestep += timestep
                if reward > reward_best:
                    reward_best = reward
            avg_reward = np.mean(reward_list)
            print('Average test return {} after {} timesteps of training.'.format(
                avg_reward, self.train_timestep))
            x_offline.append(self.train_timestep)
            y_offline.append(avg_reward)
            # test_log.write("{}\t{}\n".format(self.train_timestep, avg_reward))
            # test_log.flush()

            # train
            reward_list = []
            last_checkpoint = np.floor(self.train_timestep / FLAGS.train)
            while np.floor(self.train_timestep / FLAGS.train) == last_checkpoint:
                # print('=== Running episode')
                reward, timestep = self.run_episode(test=False, monitor=False)
                reward_list.append(reward)
                self.train_timestep += timestep
                # train_log.write("{}\t{}\n".format(self.train_timestep, reward))
                # train_log.flush()
            avg_reward = np.mean(reward_list)
            print('Average train return {} after {} timesteps of training.'.format(
                avg_reward, self.train_timestep))
            x_online.append(self.train_timestep)
            y_online.append(avg_reward)
            # os.system('{} {}'.format(plotScr, FLAGS.outdir))

        self.env.close()

        # Map the swept hyperparameter onto the subdirectory/tag used in the
        # output filenames
        if FLAGS.parseq == 0:
            subadr = "pl2norm/"
            parval = FLAGS.pl2norm
        elif FLAGS.parseq == 1:
            subadr = "rate/"
            parval = FLAGS.rate
        elif FLAGS.parseq == 2:
            subadr = "prate/"
            parval = FLAGS.prate
        elif FLAGS.parseq == 4:
            subadr = "mix/"
            if FLAGS.rate == 0.0005 and FLAGS.prate == 0.00005:
                parval = 1
            elif FLAGS.rate == 0.0001 and FLAGS.prate == 0.00001:
                parval = 2
            elif FLAGS.rate == 0.001 and FLAGS.prate == 0.0001:
                parval = 3
            elif FLAGS.l2norm == 0.0001 and FLAGS.pl2norm == 0:
                parval = 4
            elif FLAGS.l2norm == 0.00001 and FLAGS.pl2norm == 0.0005:
                parval = 5
            elif FLAGS.l2norm == 0.00005 and FLAGS.pl2norm == 0.001:
                parval = 6

        if FLAGS.env == "HalfCheetah-v2":
            env_addr = "HalfCheetah/" + subadr
        elif FLAGS.env == "Pendulum-v0":
            env_addr = "Pendulum/" + subadr
        elif FLAGS.env == "Reacher-v2":
            env_addr = "Reacher/"
        elif FLAGS.env == "MountainCarContinuous-v0":
            env_addr = "MCContinuous/"

        # ckpt = FLAGS.outdir + "/" + env_addr + "tf/" + FLAGS.model + "_" + FLAGS.arch + str(parval) + "_" + str(i) + ".ckpt"
        # os.makedirs(os.path.join(ckpt_addr, "tf"))
        # self.agent.saver.save(self.agent.sess, ckpt)

        x_offline.append(reward_best)  # best single test reward, appended after the checkpoints
        print('Saving ckpt at {} timesteps.'.format(self.train_timestep))
        print("Best Reward: {}".format(reward_best))
        np.save(FLAGS.outdir + '/plots/' + env_addr + FLAGS.model + '_xon' +
                FLAGS.arch + str(parval) + "_" + str(i), x_online)
        np.save(FLAGS.outdir + '/plots/' + env_addr + FLAGS.model + '_xoff' +
                FLAGS.arch + str(parval) + "_" + str(i), x_offline)
        np.save(FLAGS.outdir + '/plots/' + env_addr + FLAGS.model + '_yon' +
                FLAGS.arch + str(parval) + "_" + str(i), y_online)
        np.save(FLAGS.outdir + '/plots/' + env_addr + FLAGS.model + '_yoff' +
                FLAGS.arch + str(parval) + "_" + str(i), y_offline)

        # Reset TensorFlow state before the next seed/agent in the sweep
        tf.reset_default_graph()
        tf.Graph().as_default()
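# --- Hedged usage sketch (paths and tags below are illustrative; the layout
# is inferred from the np.save calls above): each run leaves four arrays
# under FLAGS.outdir/plots/, which can be reloaded for plotting. Note that
# x_offline carries reward_best as its final element, so it is one entry
# longer than y_offline.
import numpy as np
import matplotlib.pyplot as plt

base = 'output/plots/Pendulum/rate/ICNN'  # FLAGS.outdir + '/plots/' + env_addr + FLAGS.model (example)
tag = 'fc0.001_0'                         # FLAGS.arch + str(parval) + "_" + str(i) (example)
x_off = np.load(base + '_xoff' + tag + '.npy')  # checkpoint timesteps + trailing best reward
y_off = np.load(base + '_yoff' + tag + '.npy')  # average test return per checkpoint
plt.plot(x_off[:-1], y_off)                     # drop the trailing best-reward entry
plt.xlabel('training timesteps')
plt.ylabel('average test return')
plt.savefig(base + '_curve' + tag + '.png')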