def _init():
    env = ExamplePushingTrainingEnv(frameskip=3, visualization=False)
    env.seed(seed=rank)
    env.action_space.seed(seed=rank)
    env = FlatObservationWrapper(env)
    return env
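# For context: _init is written as a closure so that each parallel worker
# can seed its environment with its own `rank`.  The enclosing factory is
# not part of this excerpt; the sketch below shows one plausible way to
# wire it up with stable-baselines' SubprocVecEnv (the make_env name and
# the worker count are illustrative assumptions, not from the original
# script).
from stable_baselines.common.vec_env import SubprocVecEnv

def make_env(rank):
    def _init():
        env = ExamplePushingTrainingEnv(frameskip=3, visualization=False)
        env.seed(seed=rank)
        env.action_space.seed(seed=rank)
        return FlatObservationWrapper(env)
    return _init

# e.g. four workers, each with a distinct seed
vec_env = SubprocVecEnv([make_env(rank) for rank in range(4)])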
args = vars(parser.parse_args())
time_steps = int(args["time_steps"])
model_path = str(args["model_path"])
policy_path = os.path.join(model_path,
                           "model_" + str(time_steps) + "_steps")
model = PPO2.load(policy_path)

# define a method for the policy fn of your trained model
def policy_fn(obs):
    return model.predict(obs, deterministic=True)[0]

# we create the same env as we used for training in train_pushing_ppo.py,
# so that the action and observation spaces remain coherent with the
# policy.  however, unlike during training, we set the initialization to
# be the same as in the standard CubeEnv, since this is what the policy
# will be evaluated on eventually.
initializer = cube_env.RandomInitializer(
    difficulty=1)  # difficulty one means pushing
env = ExamplePushingTrainingEnv(initializer=initializer,
                                frameskip=3,
                                visualization=True)
env = FlatObservationWrapper(env)

for _ in range(10):
    obs = env.reset()
    done = False
    while not done:
        obs, rew, done, info = env.step(policy_fn(obs))
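# The snippet above reads from `parser` without showing its definition.
# A minimal sketch consistent with the keys it accesses; the flag names
# and help strings are inferred from args["time_steps"] and
# args["model_path"], not taken from the original file:
import argparse

parser = argparse.ArgumentParser()
parser.add_argument("--time_steps", required=True,
                    help="checkpoint timestep to load, e.g. 78000000")
parser.add_argument("--model_path", required=True,
                    help="directory containing the model_<steps>_steps "
                         "checkpoints")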
def main():
    try:
        difficulty = int(sys.argv[1])
        initial_pose_json = sys.argv[2]
        goal_pose_json = sys.argv[3]
        output_file = sys.argv[4]
    except IndexError:
        print("Incorrect number of arguments.")
        print("Usage:\n"
              "\tevaluate_policy.py <difficulty_level> <initial_pose>"
              " <goal_pose> <output_file>")
        sys.exit(1)

    # the poses are passed as JSON strings, so they need to be converted
    # first
    initial_pose = move_cube.Pose.from_json(initial_pose_json)
    goal_pose = move_cube.Pose.from_json(goal_pose_json)

    # create a FixedInitializer with the given values
    initializer = cube_env.FixedInitializer(difficulty, initial_pose,
                                            goal_pose)

    # if difficulty == 1 (i.e. pushing), we load the policy we trained for
    # that task.  otherwise, we just use the RandomPolicy as a placeholder.
    # naturally, for a submission you would have a policy for each
    # difficulty level.
    if difficulty == 1:
        # we create the same env as we used for training in
        # train_pushing_ppo.py, so that the action and observation spaces
        # remain coherent with the policy.  however, unlike during
        # training, we set the initialization using the initializer, since
        # this is what is expected during evaluation.  if you do not use
        # the initializer, or if you modify the standard CubeEnv in any
        # way that affects the simulation (i.e. the state-action
        # trajectories), the action trajectories you compute will not make
        # sense.
        env = ExamplePushingTrainingEnv(initializer=initializer,
                                        frameskip=3,
                                        visualization=False)
        env = FlatObservationWrapper(env)

        # we load the trained policy
        policy_path = os.path.join("./training_checkpoints",
                                   "model_78000000_steps")
        policy = DQNPolicy(policy_path)
    else:
        env = gym.make(
            "rrc_simulation.gym_wrapper:real_robot_challenge_phase_1-v1",
            initializer=initializer,
            action_type=cube_env.ActionType.POSITION,
            visualization=False,
        )
        policy = RandomPolicy(env.action_space)

    # Execute one episode.  Make sure that the number of simulation steps
    # matches the episode length of the task.  When using the default Gym
    # environment, this is the case when looping until is_done == True.
    # Make sure to adjust this in case your custom environment behaves
    # differently!
    is_done = False
    observation = env.reset()
    accumulated_reward = 0
    while not is_done:
        action = policy.predict(observation)
        observation, reward, is_done, info = env.step(action)
        accumulated_reward += reward
    print("Accumulated reward: {}".format(accumulated_reward))

    # store the action log for evaluation
    env.platform.store_action_log(output_file)
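# RandomPolicy is used above but not defined in this excerpt.  A minimal
# sketch consistent with how it is called (constructed from the action
# space, queried via predict); treat this as an assumption about its
# implementation, not the original class:
class RandomPolicy:
    """Placeholder policy that samples uniformly from the action space."""

    def __init__(self, action_space):
        self.action_space = action_space

    def predict(self, observation):
        # ignore the observation and act randomly
        return self.action_space.sample()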
parser = argparse.ArgumentParser()
parser.add_argument("--output_path", required=True, help="output path")
args = vars(parser.parse_args())
output_path = str(args["output_path"])

total_time_steps = 80000000
validate_every_timesteps = 2000000
model_path = os.path.join(output_path, "training_checkpoints")
os.makedirs(model_path, exist_ok=True)

set_global_seeds(0)
num_of_active_envs = 1
policy_kwargs = dict(layers=[256, 256])
#env = gym.make("real_robot_challenge_phase_1-v1")
env = FlatObservationWrapper(
    ExamplePushingTrainingEnv(frameskip=20, visualization=False))

train_configs = {
    "gamma": 0.99,
    "n_steps": int(120000 / 20),  # 120000 sim steps / frameskip of 20
    "ent_coef": 0.01,
    "learning_rate": 0.00025,
    "vf_coef": 0.5,
    "max_grad_norm": 0.5,
    "nminibatches": 40,
    "noptepochs": 4,
}

model = PPO2(MlpPolicy,
             env,
             verbose=1,
             tensorboard_log=model_path,
             policy_kwargs=policy_kwargs,
             **train_configs)
ckpt_frequency = int(validate_every_timesteps / num_of_active_envs)
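# ckpt_frequency above is computed but not yet used.  A sketch of how the
# run would typically be launched with stable-baselines'
# CheckpointCallback; the name_prefix is an assumption, chosen so that
# saved checkpoints match the "model_<steps>_steps" files loaded by the
# other scripts:
from stable_baselines.common.callbacks import CheckpointCallback

checkpoint_callback = CheckpointCallback(save_freq=ckpt_frequency,
                                         save_path=model_path,
                                         name_prefix="model")
model.learn(total_timesteps=total_time_steps,
            callback=checkpoint_callback)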