def test_agent(fname, agent, avg=100, seed=43):
    _, env_args = load_args(CONFIG_PATH)
    if fname is not None:
        # if a map file is specified, use it instead of a randomly generated map
        env_args["fname"] = fname
        env_args["random_map"] = False
    env = gym.make("ScavengerHuntMap-v0", **env_args)
    env.seed(seed)
    dist_list = []
    a = agent(env)
    for i in range(avg):
        print("Running %d/%d" % ((i + 1), avg), end="\r")
        obs = env.reset()
        done = False
        dist = 0
        while not done:
            act = a.next_node(obs)
            cl = env.env.map.get_current_loc()
            obs, _, done, info = env.step(act)
            dist += info["cost"]
        dist_list.append(dist)
    return sum(dist_list) / avg, np.std(dist_list)
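# --- Usage sketch (not part of the original snippet). It shows one way to drive
# test_agent with a hypothetical RandomNodeAgent that exposes the next_node(obs)
# interface the harness expects; passing fname=None keeps whatever map settings
# the config file provides. ---
class RandomNodeAgent:
    """Toy agent: picks a random node from the action space at every step."""

    def __init__(self, env):
        self.env = env

    def next_node(self, obs):
        return self.env.action_space.sample()


mean_dist, std_dist = test_agent(None, RandomNodeAgent, avg=20)
print("mean travel distance: %.2f (std %.2f)" % (mean_dist, std_dist))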
print('observation space:', env.observation_space)
print('action space:', env.action_space)
env.render()

action = env.action_space.sample()
print(action)
obs, r, done, info = env.step(action)
print('next observation:', obs)
print('reward:', r)
print('done:', done)
print('info:', info)
print('nb_actions', env.action_space.n)

env = gym.make(ENV_NAME)
np.random.seed(123)
env.seed(123)
nb_actions = env.action_space.n

# Next, we build a very simple model.
model = Sequential()
model.add(Flatten(input_shape=(1,) + env.observation_space.shape))
model.add(Dense(300))
model.add(Activation('relu'))
model.add(Dense(300))
model.add(Activation('relu'))
model.add(Dense(300))
model.add(Activation('relu'))
model.add(Dense(300))
model.add(Activation('relu'))
model.add(Dense(300))
model.add(Activation('relu'))
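# --- Hedged completion sketch (assumption: this snippet targets keras-rl's DQNAgent,
# which the nb_actions naming suggests; the original output head and training call are
# not shown above, so the hyperparameters here are illustrative). ---
from keras.optimizers import Adam
from rl.agents.dqn import DQNAgent
from rl.memory import SequentialMemory
from rl.policy import BoltzmannQPolicy

# Q-value head: one linear output per discrete action.
model.add(Dense(nb_actions))
model.add(Activation('linear'))

memory = SequentialMemory(limit=50000, window_length=1)
policy = BoltzmannQPolicy()
dqn = DQNAgent(model=model, nb_actions=nb_actions, memory=memory,
               nb_steps_warmup=100, target_model_update=1e-2, policy=policy)
dqn.compile(Adam(lr=1e-3), metrics=['mae'])

# Train, then evaluate on a handful of episodes.
dqn.fit(env, nb_steps=50000, visualize=False, verbose=2)
dqn.test(env, nb_episodes=5, visualize=False)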
sys.path.append('./')  # assumes this .py file and the env/ folder live in the same directory
import env

parser = argparse.ArgumentParser(
    description='Train or test neural net motor controller.')
parser.add_argument('--train', dest='train', action='store_true', default=True)
parser.add_argument('--test', dest='test', action='store_false')
args = parser.parse_args()

if __name__ == '__main__':
    # initialize the environment
    env = gym.make(ENV_NAME)
    env = env.unwrapped

    # set random seeds so that runs are reproducible
    env.seed(RANDOMSEED)
    np.random.seed(RANDOMSEED)
    tf.random.set_seed(RANDOMSEED)

    # state dimension, action dimension, and action bound
    s_dim = 50
    a_dim = 50
    a_bound = env.action_space.high
    print('s_dim', s_dim)
    print('a_dim', a_dim)

    # build the DDPG agent
    ddpg = DDPG(a_dim, s_dim, a_bound)

    # training loop:
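    # --- Training-loop sketch (not in the original snippet). It assumes the DDPG class
    # exposes choose_action(s), store_transition(s, a, r, s_), learn() and a memory
    # `pointer` counter, and that the action space is symmetric around zero; the
    # constants below are illustrative hyperparameters. ---
    if args.train:
        MAX_EPISODES = 200
        MAX_EP_STEPS = 200
        MEMORY_CAPACITY = 10000
        VAR = 3.0  # std of Gaussian exploration noise, decayed once the buffer is full
        for episode in range(MAX_EPISODES):
            s = env.reset()
            ep_reward = 0
            for step in range(MAX_EP_STEPS):
                # add exploration noise and clip to the valid action range
                a = ddpg.choose_action(s)
                a = np.clip(np.random.normal(a, VAR), -a_bound, a_bound)
                s_, r, done, _ = env.step(a)
                ddpg.store_transition(s, a, r, s_)
                if ddpg.pointer > MEMORY_CAPACITY:
                    VAR *= 0.9995
                    ddpg.learn()
                s = s_
                ep_reward += r
                if done:
                    break
            print('Episode %d | reward: %.1f' % (episode, ep_reward))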
def launch(args):
    rank = MPI.COMM_WORLD.Get_rank()
    t_total_init = time.time()

    # Make the environment
    if args.algo == 'continuous':
        args.env_name = 'FetchManipulate3ObjectsContinuous-v0'
        args.multi_criteria_her = True
    else:
        args.env_name = 'FetchManipulate3Objects-v0'
    env = gym.make(args.env_name)

    # set random seeds for reproducibility
    env.seed(args.seed + MPI.COMM_WORLD.Get_rank())
    random.seed(args.seed + MPI.COMM_WORLD.Get_rank())
    np.random.seed(args.seed + MPI.COMM_WORLD.Get_rank())
    torch.manual_seed(args.seed + MPI.COMM_WORLD.Get_rank())
    if args.cuda:
        torch.cuda.manual_seed(args.seed + MPI.COMM_WORLD.Get_rank())

    # get saving paths
    if rank == 0:
        logdir, model_path, bucket_path = init_storage(args)
        logger.configure(dir=logdir)
        logger.info(vars(args))

    args.env_params = get_env_params(env)

    if args.algo == 'language':
        language_goal = get_instruction()
        goal_sampler = GoalSampler(args)
    else:
        language_goal = None
        goal_sampler = GoalSampler(args)

    # Initialize RL Agent
    if args.agent == "SAC":
        policy = RLAgent(args, env.compute_reward, goal_sampler)
    else:
        raise NotImplementedError

    # Initialize Rollout Worker
    rollout_worker = RolloutWorker(env, policy, goal_sampler, args)

    # Main interaction loop
    episode_count = 0
    for epoch in range(args.n_epochs):
        t_init = time.time()

        # set up time tracking
        time_dict = dict(goal_sampler=0, rollout=0, gs_update=0, store=0,
                         norm_update=0, policy_train=0, lp_update=0, eval=0,
                         epoch=0)

        # log current epoch
        if rank == 0:
            logger.info('\n\nEpoch #{}'.format(epoch))

        # Cycles loop
        for _ in range(args.n_cycles):
            # Sample goals
            t_i = time.time()
            goals, self_eval = goal_sampler.sample_goal(
                n_goals=args.num_rollouts_per_mpi, evaluation=False)
            if args.algo == 'language':
                language_goal_ep = np.random.choice(
                    language_goal, size=args.num_rollouts_per_mpi)
            else:
                language_goal_ep = None
            time_dict['goal_sampler'] += time.time() - t_i

            # Control biased initializations
            if epoch < args.start_biased_init:
                biased_init = False
            else:
                biased_init = args.biased_init

            # Environment interactions
            t_i = time.time()
            episodes = rollout_worker.generate_rollout(
                goals=goals,                  # list of goal configurations
                self_eval=self_eval,          # whether the agent performs self-evaluations
                true_eval=False,              # these are not offline evaluation episodes
                biased_init=biased_init,      # whether initializations should be biased
                language_goal=language_goal_ep)
            time_dict['rollout'] += time.time() - t_i

            # Goal Sampler updates
            t_i = time.time()
            episodes = goal_sampler.update(episodes, episode_count)
            time_dict['gs_update'] += time.time() - t_i

            # Storing episodes
            t_i = time.time()
            policy.store(episodes)
            time_dict['store'] += time.time() - t_i

            # Updating observation normalization
            t_i = time.time()
            for e in episodes:
                policy._update_normalizer(e)
            time_dict['norm_update'] += time.time() - t_i

            # Policy updates
            t_i = time.time()
            for _ in range(args.n_batches):
                policy.train()
            time_dict['policy_train'] += time.time() - t_i

            episode_count += args.num_rollouts_per_mpi * args.num_workers

        # Updating Learning Progress
        t_i = time.time()
        if goal_sampler.curriculum_learning and rank == 0:
            goal_sampler.update_LP()
        goal_sampler.sync()
        time_dict['lp_update'] += time.time() - t_i

        time_dict['epoch'] += time.time() - t_init
        time_dict['total'] = time.time() - t_total_init

        if args.evaluations:
            if rank == 0:
                logger.info('\tRunning eval ..')

            # Performing evaluations
            t_i = time.time()
            if args.algo == 'language':
                ids = np.random.choice(np.arange(35), size=len(language_goal))
                eval_goals = goal_sampler.valid_goals[ids]
            else:
                eval_goals = goal_sampler.valid_goals
            episodes = rollout_worker.generate_rollout(
                goals=eval_goals,
                self_eval=True,       # this parameter is overridden by true_eval
                true_eval=True,       # these are offline evaluation episodes
                biased_init=False,
                language_goal=language_goal)

            # Extract the results
            if args.algo == 'continuous':
                results = np.array([e['rewards'][-1] == 3. for e in episodes]).astype(np.int)
            elif args.algo == 'language':
                results = np.array([e['language_goal'] in sentence_from_configuration(config=e['ag'][-1], all=True)
                                    for e in episodes]).astype(np.int)
            else:
                results = np.array([str(e['g'][0]) == str(e['ag'][-1]) for e in episodes]).astype(np.int)
            rewards = np.array([e['rewards'][-1] for e in episodes])
            all_results = MPI.COMM_WORLD.gather(results, root=0)
            all_rewards = MPI.COMM_WORLD.gather(rewards, root=0)
            time_dict['eval'] += time.time() - t_i

            # Logs
            if rank == 0:
                assert len(all_results) == args.num_workers  # MPI test
                av_res = np.array(all_results).mean(axis=0)
                av_rewards = np.array(all_rewards).mean(axis=0)
                global_sr = np.mean(av_res)
                log_and_save(goal_sampler, epoch, episode_count, av_res,
                             av_rewards, global_sr, time_dict)

                # Saving policy models
                if epoch % args.save_freq == 0:
                    policy.save(model_path, epoch)
                    goal_sampler.save_bucket_contents(bucket_path, epoch)
                logger.info('\tEpoch #{}: SR: {}'.format(epoch, global_sr))
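# --- Entry-point sketch (not shown in the excerpt above). It assumes a get_args()
# helper that returns the argparse namespace launch() expects; limiting intra-process
# threading is common practice when running one process per MPI rank, e.g.
# `mpirun -np 8 python train.py`. ---
import os

if __name__ == '__main__':
    # avoid oversubscribing cores when several MPI ranks share a machine
    os.environ['OMP_NUM_THREADS'] = '1'
    os.environ['MKL_NUM_THREADS'] = '1'

    args = get_args()  # assumed helper building the args namespace
    launch(args)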
if __name__ == '__main__':
    num_eval = 20
    path = './trained_model/'
    with open(path + 'config.json', 'r') as f:
        params = json.load(f)
    args = SimpleNamespace(**params)

    # Make the environment
    env = gym.make(args.env_name)

    # set random seeds for reproducibility
    args.seed = np.random.randint(1e6)
    env.seed(args.seed + MPI.COMM_WORLD.Get_rank())
    random.seed(args.seed + MPI.COMM_WORLD.Get_rank())
    np.random.seed(args.seed + MPI.COMM_WORLD.Get_rank())
    torch.manual_seed(args.seed + MPI.COMM_WORLD.Get_rank())
    if args.cuda:
        torch.cuda.manual_seed(args.seed + MPI.COMM_WORLD.Get_rank())

    args.env_params = get_env_params(env)

    goal_sampler = GoalSampler(args)
    eval_goals = goal_sampler.valid_goals
    inits = [None] * len(eval_goals)
    all_results = []

    with open(path + 'inst_to_one_hot.pkl', 'rb') as f:
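    # --- Hypothetical continuation sketch (the original script is truncated above).
    # Once a trained policy and a RolloutWorker are available (their construction is
    # not shown here), the num_eval evaluation rounds would typically mirror the
    # offline-evaluation call used in the training loop: ---
    # rollout_worker = RolloutWorker(env, policy, goal_sampler, args)   # assumed setup
    for _ in range(num_eval):
        episodes = rollout_worker.generate_rollout(goals=eval_goals,
                                                   self_eval=True,
                                                   true_eval=True,   # offline evaluation
                                                   biased_init=False,
                                                   language_goal=None)
        results = np.array([str(e['g'][0]) == str(e['ag'][-1]) for e in episodes]).astype(int)
        all_results.append(results)

    results = np.array(all_results)
    print('Av Success Rate: {}'.format(results.mean()))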
def train_dqn(size, agt, eps_start=1.0, eps_end=0.05, eps_decay=0.999):
    env = gym.make('game2048-v0', size=size, norm=FLAGS.norm)
    env.seed(1)
    if FLAGS.norm:
        channels = size * size + 2
    else:
        channels = 1
    agent = model.DQNAgent(size, channels, 4, 0, FLAGS.double_q, FLAGS.dueling)
    if FLAGS.model_file:
        print(f'load {FLAGS.model_file}')
        agent.load(FLAGS.model_file)

    total_steps = 0
    total_scores = 0
    highest_score = 0
    trials = 10000
    eps = eps_start
    scores_window = deque(maxlen=WINDOWS_SIZE)
    rewards_window = deque(maxlen=WINDOWS_SIZE)
    scores = []
    sd_name = 'model_%dx%d.checkpoint' % (size, size)
    random = False

    for trial in range(1, trials + 1):
        obs = env.reset()
        stepno = 0
        rewards = 0
        loss = 0
        while True:
            stepno += 1
            total_steps += 1
            action, _ = agent.choose_action(obs, eps, rand=random)
            obs_, reward, done, _ = env.step(action)
            # if the board did not change, force a random action on the next step
            random = np.all(obs == obs_)
            loss = agent.step(obs, action, reward, obs_, done)
            obs = obs_
            rewards += reward
            if done:
                break

        eps = max(eps_end, eps * eps_decay)
        rewards_window.append(rewards)
        scores_window.append(env.get_score())
        scores.append(rewards)
        # env.render()
        if env.get_score() > highest_score:
            highest_score = env.get_score()
        total_scores += env.get_score()
        print('\rEpisode {}\t Steps: {}\t\t Average Reward: {:.2f}\t\t '
              'Average Scores: {:.2f}\t loss: {:.2f}\t highest: {}\t eps: {:.4f}'
              .format(trial, total_steps, np.mean(rewards_window),
                      np.mean(scores_window), loss, highest_score, eps),
              end="")
        if trial % WINDOWS_SIZE == 0:
            print('\rEpisode {}\t Steps: {}\t\t Average Reward: {:.2f}\t\t '
                  'Average Scores: {:.2f}\t loss: {:.2f}\t highest: {}\t eps: {:.4f}'
                  .format(trial, total_steps, np.mean(rewards_window),
                          np.mean(scores_window), loss, highest_score, eps))
        if trial % 1000 == 0:
            agent.save(sd_name)
            eval(env, agent, 1000, render=False)

    print(f'steps: {total_steps} avg_score: {total_scores / trials} '
          f'highest_score: {highest_score} at size: {size}')
    plot_score(scores, [])
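# --- Sketch of the eval helper referenced above (the real implementation is not shown).
# It assumes the same env.get_score() / agent.choose_action(obs, eps, rand=...) interface
# used in train_dqn, and plays n greedy (eps = 0) episodes to report an average score. ---
def eval(env, agent, n, render=False):
    scores = []
    for _ in range(n):
        obs = env.reset()
        done = False
        force_random = False
        while not done:
            action, _ = agent.choose_action(obs, 0.0, rand=force_random)
            obs_, _, done, _ = env.step(action)
            # fall back to a random move if the greedy action left the board unchanged
            force_random = np.all(obs == obs_)
            obs = obs_
            if render:
                env.render()
        scores.append(env.get_score())
    print(f'eval over {n} games: avg score {np.mean(scores):.1f}, best {np.max(scores)}')
    return np.mean(scores)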