def main():
    """Evaluate a trained meta-learner (MAML-style) on each task in TEST_TASKS.

    For every task, repeatedly reloads the meta-trained policy/baseline,
    samples a batch of episodes on that task, performs `grad_steps` inner
    adaptation updates, and records the post-adaptation return via
    `evaluate`. Results are printed per task.

    NOTE(review): this block was recovered from a source with lost
    indentation — the nesting below is the most plausible reconstruction;
    confirm against the original file.
    """
    env = gym.make(ENV_NAME)
    # Monitor wrapper records evaluation videos under ./videos/<ENV_PREFIX>.
    env = Monitor(env, f"./videos/{ENV_PREFIX}", force=True)
    policy, baseline = load_meta_learner_params(META_POLICY_PATH, BASELINE_PATH, env)
    sampler = BatchSampler(env_name=ENV_NAME, batch_size=20, num_workers=2)
    # optimizer=None: evaluation only — no meta-optimization is performed here.
    learner = MetaLearner(sampler, policy, baseline, optimizer=None)
    for task in TEST_TASKS:
        returns = []
        for i in range(1, EVAL_STEPS + 1):
            for grad_steps in GRAD_STEPS:
                if i % 10 == 0:
                    print(f"Evaluation-step: {i}")
                env.reset_task(task)
                # Reload the pristine meta-parameters so each evaluation run
                # adapts from the same starting point (previous adaptation
                # steps mutated learner.policy in place).
                learner.policy, learner.baseline = load_meta_learner_params(
                    META_POLICY_PATH, BASELINE_PATH, env)
                # Sample a batch of transitions
                sampler.reset_task(task)
                episodes = sampler.sample(learner.policy)
                # Inner-loop adaptation: repeated gradient steps on the SAME
                # batch of episodes (no resampling between steps).
                for _ in range(grad_steps):
                    new_params = learner.adapt(episodes)
                    learner.policy.set_params_with_name(new_params)
                returns.append(evaluate(env, task, learner.policy))
        print("========EVAL RESULTS=======")
        print(f"Task: {task}")
        print(f"Returns: {returns}")
        print(f"Average Return: {np.mean(returns)}")
        print("===========================")
# NOTE(review): the first line below is the TAIL of a constructor/function
# call (keyword arguments and closing paren) whose opening lies outside this
# chunk — presumably something like `metalearner = MetaLearner(...,` — left
# exactly as found; confirm against the full file.
          fast_lr=args.fast_lr, tau=args.tau, device=args.device)
# Build a fresh environment for the new task evaluation.
env = gym.make(args.env_name)  # new task!
episodes = []
# randomly sample task
test_task = sampler.sample_tasks(num_tasks=1)
# set specific task.
# test_task = []
# test_task.append({'velocity': 1.9})
# Point both the sampler and the local env at the sampled task.
sampler.reset_task(test_task[0])
print("new task: ", test_task[0], ", where 1 is forward")
# task = env.unwrapped.sample_tasks(1)
env.unwrapped.reset_task(test_task[0])
observations = env.reset()
# Legacy gym API: step() returns (obs, reward, done, info); index [3] is the
# info dict, which here carries the active 'task'.
# NOTE(review): each env.step([1]) below advances the environment one step as
# a side effect — two consecutive steps are taken just to read the task info.
print("new task: ", env.step([1])[3]['task'], ", where 1 is forward")
_theta = env.step([1])[3]['task']
# Convert the task's heading angle from radians to degrees for display.
degrees = 180 * _theta['theta'] / np.pi
print("new task in degrees: ", degrees)
# Collect training episodes on the new task with the current model.
train_episodes = metalearner.sampler.sample(the_model, gamma=args.gamma,
                                            device=args.device)
print("len of train episoid: ", len(train_episodes))