Example #1
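# Imports assumed by the code below (not part of the original snippet). The
# project-local helpers (TD31v1, DDPG, ReplayBuffer, stacked_frames,
# create_next_obs, write_into_file, time_format, evaluate_policy) live elsewhere
# in this repository, so their module paths are only a guess and are left commented out.
import os
import time
from collections import deque
from datetime import datetime

import cv2
import gym
import numpy as np
import torch
import robosuite as suite
from PIL import Image
from torch.utils.tensorboard import SummaryWriter
# from agent import TD31v1, DDPG                       # hypothetical module path
# from replay_buffer import ReplayBuffer               # hypothetical module path
# from utils import (stacked_frames, create_next_obs,  # hypothetical module path
#                    write_into_file, time_format, evaluate_policy)
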
def main(args):
    """Evaluates a trained policy in the robosuite environment.

    Args:
        args: parsed command-line arguments

    """
    size = 84
    use_render = True
    print("use render {} ".format(use_render))
    env = suite.make(
        args.env_name,
        has_renderer=use_render,
        use_camera_obs=True,
        ignore_done=True,
        has_offscreen_renderer=True,
        camera_height=size,
        camera_width=size,
        render_collision_mesh=use_render,
        render_visual_mesh=True,
        camera_name='agentview',
        use_object_obs=False,
        camera_depth=False,
        reward_shaping=True,
    )
    state = env.reset()
    state_dim = 200
    action_dim = env.dof
    max_action = float(1)
    min_action = float(-1)

    policy = TD31v1(state_dim, action_dim, max_action, args)
    directory = "24_07_lr_2_lift/pytorch_models/"

    filename = "SawyerLift-701reward_75.21-agentTD3_ad"
    filename = directory + filename
    print("Load ", filename)
    policy.load(filename)
    avg_reward = 0.
    seeds = [x for x in range(10)]
    episode = 1
    for s in seeds:
        torch.manual_seed(s)
        np.random.seed(s)
        print("iteration ", s)
        obs = env.reset()
        obs, state_buffer = stacked_frames(obs, size, args, policy)
        done = False
        for x in range(200):
            action = policy.select_action(np.array(obs))
            obs, reward, done, _ = env.step(action)
            obs, state_buffer = create_next_obs(obs, size, args, state_buffer, policy)
            avg_reward += reward * 10
            if use_render:
                time.sleep(0.02)
                env.render()
        print("episode reward {}".format(avg_reward/episode))
        episode += 1
    avg_reward /= len(seeds)
    print ("---------------------------------------")
    print ("Average Reward over the Evaluation Step: %f" % (avg_reward))
    print ("---------------------------------------")
def train_agent(args, param):
    """Trains the agent in the environment specified by args.

    Args:
        args: parsed command-line arguments
        param: random seed for this training run
    """
    use_gym = False
    args.seed = param
    now = datetime.now()
    dt_string = now.strftime("%d_%m_%Y_%H:%M:%S")
    torch.manual_seed(args.seed)
    np.random.seed(args.seed)
    pathname = str(args.locexp) + "/" + str(args.env_name) + '-agent-' + str(
        args.policy)
    pathname += "_batch_size_" + str(args.batch_size)
    pathname += '_update_freq_' + str(
        args.target_update_freq) + "_num_q_target_" + str(
            args.num_q_target) + "_seed_" + str(args.seed)
    pathname += "_actor_300_200"
    text = "Star_training target_update_freq: {}  num_q_target: {}  use device {} ".format(
        args.target_update_freq, args.num_q_target, args.device)
    print(pathname, text)
    write_into_file(pathname, text)
    arg_text = str(args)
    write_into_file(pathname, arg_text)
    tensorboard_name = str(args.locexp) + '/runs/' + pathname
    writer = SummaryWriter(tensorboard_name)

    if use_gym:
        env = gym.make(args.env_name)
        env.seed(args.seed)
        state_dim = env.observation_space.shape[0]
        action_dim = env.action_space.shape[0]
        max_action = float(env.action_space.high[0])
        args.max_episode_steps = env._max_episode_steps
    else:
        size = 84
        env = suite.make(
            args.env_name,
            has_renderer=False,
            use_camera_obs=True,
            ignore_done=True,
            has_offscreen_renderer=True,
            camera_height=size,
            camera_width=size,
            render_collision_mesh=False,
            render_visual_mesh=True,
            camera_name='agentview',
            use_object_obs=False,
            camera_depth=True,
            reward_shaping=True,
        )
    state_dim = 200
    print("State dim, ", state_dim)
    action_dim = env.dof
    print("action_dim ", action_dim)
    max_action = 1
    args.max_episode_steps = 200

    if args.policy == "TD3_ad":
        policy = TD31v1(state_dim, action_dim, max_action, args)
    elif args.policy == "DDPG":
        policy = DDPG(state_dim, action_dim, max_action, args)

    file_name = str(args.locexp) + "/pytorch_models/{}".format(args.env_name)
    obs_shape = (3, 84, 84)
    action_shape = (action_dim, )
    print("obs", obs_shape)
    print("act", action_shape)
    replay_buffer = ReplayBuffer(obs_shape, action_shape,
                                 int(args.buffer_size), args.device)
    save_env_vid = False
    total_timesteps = 0
    timesteps_since_eval = 0
    episode_num = 0
    done = True
    t0 = time.time()
    scores_window = deque(maxlen=100)
    episode_reward = 0
    evaluations = []
    tb_update_counter = 0
    while total_timesteps < args.max_timesteps:
        tb_update_counter += 1
        # If the episode is done
        if done:
            episode_num += 1
            scores_window.append(episode_reward)
            average_mean = np.mean(scores_window)
            if tb_update_counter > args.tensorboard_freq:
                print("Write tensorboard")
                tb_update_counter = 0
                writer.add_scalar('Reward', episode_reward, total_timesteps)
                writer.add_scalar('Reward mean ', average_mean,
                                  total_timesteps)
                writer.flush()
            # If we are not at the very beginning, we start the training process of the model
            if total_timesteps != 0:
                text = "Total Timesteps: {} Episode Num: {} ".format(
                    total_timesteps, episode_num)
                text += "Episode steps {} ".format(episode_timesteps)
                text += "Reward: {:.2f}  Average Re: {:.2f} Time: {}".format(
                    episode_reward, np.mean(scores_window),
                    time_format(time.time() - t0))

                print(text)
                write_into_file(pathname, text)
            # After the warm-up phase, we train the policy at the end of each episode
            if total_timesteps > args.start_timesteps:
                policy.train(replay_buffer, writer, 200)
            # We periodically evaluate the policy and save it
            if timesteps_since_eval >= args.eval_freq:
                timesteps_since_eval %= args.eval_freq
                evaluations.append(
                    evaluate_policy(policy, writer, total_timesteps, args,
                                    env))
                torch.manual_seed(args.seed)
                np.random.seed(args.seed)
                save_model = file_name + '-{}reward_{:.2f}-agent{}'.format(
                    episode_num, evaluations[-1], args.policy)
                policy.save(save_model)
            # When the training step is done, we reset the state of the environment
            if use_gym:
                obs = env.reset()
            else:
                state = env.reset()
                obs, state_buffer = stacked_frames(state, size, args, policy)

            # Set the Done to False
            done = False
            # Set rewards and episode timesteps to zero
            episode_reward = 0
            episode_timesteps = 0
        # Before args.start_timesteps timesteps, we play random actions
        if total_timesteps < args.start_timesteps:
            if use_gym:
                action = env.action_space.sample()
            else:
                action = np.random.randn(env.dof)
        else:  # After args.start_timesteps timesteps, we switch to the model
            if use_gym:
                action = policy.select_action(np.array(obs))
                # If the explore_noise parameter is not 0, we add noise to the action and we clip it
                if args.expl_noise != 0:
                    action = (action + np.random.normal(
                        0, args.expl_noise,
                        size=env.action_space.shape[0])).clip(
                            env.action_space.low, env.action_space.high)
            else:
                action = (policy.select_action(np.array(obs)) +
                          np.random.normal(
                              0, max_action * args.expl_noise,
                              size=action_dim)).clip(-max_action, max_action)

        if total_timesteps % args.target_update_freq == 0:
            if args.policy == "TD3_ad":
                policy.hardupdate()
        # The agent performs the action in the environment, then reaches the next state and receives the reward
        new_obs, reward, done, _ = env.step(action)
        done = float(done)
        if not use_gym:
            new_obs, state_buffer = create_next_obs(new_obs, size, args,
                                                    state_buffer, policy)
        # We check if the episode is done
        done_bool = 0 if episode_timesteps + 1 == args.max_episode_steps else float(
            done)
        if not use_gym:
            if episode_timesteps + 1 == args.max_episode_steps:
                done = True
        # We increase the total reward
        reward = reward * args.reward_scalling
        episode_reward += reward
        # We store the new transition into the Experience Replay memory (ReplayBuffer)
        if args.debug:
            print("add to buffer obs ", obs.shape)
            print("add to buffer next_obs ", new_obs.shape)
        replay_buffer.add(obs, action, reward, new_obs, done, done_bool)
        # We update the state, the episode timestep, the total timesteps, and the timesteps since the evaluation of the policy
        obs = new_obs
        if total_timesteps > args.start_timesteps:
            policy.train(replay_buffer, writer, 0)
        episode_timesteps += 1
        total_timesteps += 1
        timesteps_since_eval += 1

    # We add the last policy evaluation to our list of evaluations and we save our model
    evaluations.append(
        evaluate_policy(policy, writer, total_timesteps, args, env))

def main(args):
    """Evaluates a trained policy and records each rollout to a video file.

    Args:
        args: parsed command-line arguments

    """
    size = 84
    use_render = False
    # use_render = True
    print("use render {} ".format(use_render))
    env = suite.make(
        args.env_name,
        has_renderer=use_render,
        use_camera_obs=True,
        ignore_done=True,
        has_offscreen_renderer=True,
        camera_height=size,
        camera_width=size,
        render_collision_mesh=use_render,
        render_visual_mesh=True,
        camera_name='agentview',
        use_object_obs=False,
        camera_depth=False,
        reward_shaping=True,
    )
    state = env.reset()
    state_dim = 200
    action_dim = env.dof
    max_action = float(1)
    min_action = float(-1)
    width = size
    height = size
    fourcc = cv2.VideoWriter_fourcc('M', 'J', 'P', 'G')
    fps = 30
    video_filename = 'output.avi'
    video = cv2.VideoWriter(video_filename, fourcc, fps, (width, height))
    os.makedirs("images", exist_ok=True)  # the loop below also saves per-step frames as .jpg
    policy = TD31v1(state_dim, action_dim, max_action, args)
    directory = "24_07_lr_2_lift/pytorch_models/"

    filename = "SawyerLift-701reward_75.21-agentTD3_ad"
    filename = directory + filename
    print("Load ", filename)
    policy.load(filename)
    avg_reward = 0.
    seeds = [x for x in range(args.repeat)]
    episode = 1
    for s in seeds:
        torch.manual_seed(s)
        np.random.seed(s)
        print("iteration ", s)
        obs = env.reset()
        obs, state_buffer = stacked_frames(obs, size, args, policy)
        done = False
        for x in range(args.timesteps):
            action = policy.select_action(np.array(obs))
            obs, reward, done, _ = env.step(action)
            img = obs["image"]
            gray = cv2.normalize(img,
                                 None,
                                 255,
                                 0,
                                 norm_type=cv2.NORM_MINMAX,
                                 dtype=cv2.CV_8U)
            name = "images/state-{}.jpg".format(x + 1000)
            im = Image.fromarray(img)
            im.save(name)
            frame = cv2.merge([gray, gray, gray])
            video.write(frame)
            obs, state_buffer = create_next_obs(obs, size, args, state_buffer,
                                                policy)
            avg_reward += reward * 10
            if use_render:
                time.sleep(0.02)
                env.render()
        print("episode reward {}".format(avg_reward / episode))
        episode += 1
    avg_reward /= len(seeds)
    cv2.destroyAllWindows()
    video.release()
    print("---------------------------------------")
    print("Average Reward over the Evaluation Step: %f" % (avg_reward))
    print("---------------------------------------")