Example #1
    replay_buffer_size = 5e5
    replay_buffer = ReplayBuffer(replay_buffer_size)

    qt_opt = QT_Opt(replay_buffer, hidden_dim)

    if args.train:
        # hyper-parameters
        max_episodes = 2000
        max_steps = 20 if ENV == 'Reacher' else 150  # Pendulum needs 150 steps per episode to learn well; 20 is too few
        frame_idx = 0
        episode_rewards = []

        for i_episode in range(max_episodes):

            if ENV == 'Reacher':
                state = env.reset(SCREEN_SHOT)
            elif ENV == 'Pendulum':
                state = env.reset()
            episode_reward = 0

            for step in range(max_steps):
                # action = qt_opt.policy.act(state)
                action = qt_opt.cem_optimal_action(state)
                if ENV == 'Reacher':
                    next_state, reward, done, _ = env.step(
                        action, SPARSE_REWARD, SCREEN_SHOT)
                elif ENV == 'Pendulum':
                    next_state, reward, done, _ = env.step(action)
                    env.render()
                episode_reward += reward
                replay_buffer.push(state, action, reward, next_state, done)
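# The examples above and below assume a ReplayBuffer that supports push(...), random sampling,
# and length queries (both get_length() and len() are used). A minimal sketch of such a buffer
# is given here for context; it is an illustration, not necessarily the repository's actual
# implementation, and the multi-processing example additionally needs a process-shared variant
# (e.g. a manager-backed buffer).
import random


class SimpleReplayBuffer:
    def __init__(self, capacity):
        self.capacity = int(capacity)
        self.buffer = []
        self.position = 0

    def push(self, state, action, reward, next_state, done):
        # overwrite the oldest transition once the buffer is full
        if len(self.buffer) < self.capacity:
            self.buffer.append(None)
        self.buffer[self.position] = (state, action, reward, next_state, done)
        self.position = (self.position + 1) % self.capacity

    def sample(self, batch_size):
        batch = random.sample(self.buffer, batch_size)
        state, action, reward, next_state, done = map(list, zip(*batch))
        return state, action, reward, next_state, done

    def get_length(self):
        return len(self.buffer)

    def __len__(self):
        return len(self.buffer)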
Example #2
def worker(id, sac_trainer, ENV, rewards_queue, replay_buffer, max_episodes, max_steps, batch_size,
           explore_steps, update_itr, action_itr, AUTO_ENTROPY, DETERMINISTIC, hidden_dim, model_path):
    '''
    Sampling worker for multi-processing: each process interacts with its own environment and pushes transitions into the shared replay buffer.
    '''

    with torch.cuda.device(id % torch.cuda.device_count()):
        sac_trainer.to_cuda()

        print(sac_trainer, replay_buffer)  # the sac_trainer objects are not the same instance, but all networks and optimizers inside them are shared; the replay buffer is the same one.
        if ENV == 'Reacher':
            NUM_JOINTS = 2
            LINK_LENGTH = [200, 140]
            INI_JOING_ANGLES = [0.1, 0.1]

            SCREEN_SIZE = 1000
            SPARSE_REWARD = False
            SCREEN_SHOT = False
            action_range = 10.0

            env = Reacher(screen_size=SCREEN_SIZE, num_joints=NUM_JOINTS, link_lengths=LINK_LENGTH,
                          ini_joint_angles=INI_JOING_ANGLES, target_pos=[369, 430], render=True, change_goal=False)
            action_dim = env.num_actions
            state_dim  = env.num_observations

        elif ENV == 'Pendulum':
            env = NormalizedActions(gym.make("Pendulum-v0"))
            action_dim = env.action_space.shape[0]
            state_dim  = env.observation_space.shape[0]
            action_range = 1.

        frame_idx = 0
        rewards = []
        # training loop
        for eps in range(max_episodes):
            episode_reward = 0
            if ENV == 'Reacher':
                state = env.reset(SCREEN_SHOT)
            elif ENV == 'Pendulum':
                state = env.reset()
            
            for step in range(max_steps):
                if frame_idx > explore_steps:
                    action = sac_trainer.policy_net.get_action(state, deterministic=DETERMINISTIC)
                else:
                    action = sac_trainer.policy_net.sample_action()
        
                try:
                    if ENV == 'Reacher':
                        next_state, reward, done, _ = env.step(action, SPARSE_REWARD, SCREEN_SHOT)
                    elif ENV == 'Pendulum':
                        next_state, reward, done, _ = env.step(action)
                        env.render()
                except KeyboardInterrupt:
                    print('Finished')
                    sac_trainer.save_model(model_path)
                    return  # stop sampling once interrupted; next_state would be undefined below

                replay_buffer.push(state, action, reward, next_state, done)

                state = next_state
                episode_reward += reward
                frame_idx += 1

                # if len(replay_buffer) > batch_size:
                if replay_buffer.get_length() > batch_size:
                    for i in range(update_itr):
                        _ = sac_trainer.update(batch_size, reward_scale=10., auto_entropy=AUTO_ENTROPY,
                                                target_entropy=-1. * action_dim)

                if eps % 10 == 0 and eps > 0:
                    # plot(rewards, id)
                    sac_trainer.save_model(model_path)

                if done:
                    break
            print('Worker: ', id, '| Episode: ', eps, '| Episode Reward: ', episode_reward)
            # if len(rewards) == 0:
            #     rewards.append(episode_reward)
            # else:
            #     rewards.append(rewards[-1] * 0.9 + episode_reward * 0.1)
            rewards_queue.put(episode_reward)

        sac_trainer.save_model(model_path)
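# A minimal sketch of how such workers might be launched with torch.multiprocessing. It assumes
# that sac_trainer, a process-shared replay_buffer, and the hyper-parameters passed as keyword
# arguments are already constructed, and that the trainer's networks live in shared memory so
# every process updates the same parameters. This is an illustration, not the repository's
# actual launcher.
import torch.multiprocessing as mp


def launch_workers(num_workers, sac_trainer, ENV, replay_buffer, **worker_kwargs):
    mp.set_start_method('spawn', force=True)  # 'spawn' is required when CUDA is used in subprocesses
    rewards_queue = mp.Queue()                # workers report their episode rewards here
    processes = []
    for i in range(num_workers):
        p = mp.Process(target=worker,
                       args=(i, sac_trainer, ENV, rewards_queue, replay_buffer),
                       kwargs=worker_kwargs)  # e.g. max_episodes=..., max_steps=..., model_path=...
        p.daemon = True  # workers die with the main process
        p.start()
        processes.append(p)
    return processes, rewards_queue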
    torch.autograd.set_detect_anomaly(True)
    alg = DDPG(replay_buffer, state_space, action_space, hidden_dim)

    if args.train:
        # alg.load_model(model_path)

        # hyper-parameters
        max_episodes = 1000
        max_steps = 100
        frame_idx = 0
        rewards = []

        for i_episode in range(max_episodes):
            q_loss_list = []
            policy_loss_list = []
            state = env.reset()
            episode_reward = 0

            for step in range(max_steps):
                if frame_idx > explore_steps:
                    action = alg.policy_net.get_action(state)
                else:
                    action = alg.policy_net.sample_action()
                next_state, reward, done, _ = env.step(action)
                if ENV !='Reacher':
                    env.render()
                replay_buffer.push(state, action, reward, next_state, done)
                
                state = next_state
                episode_reward += reward
                frame_idx += 1
def worker(id):  # threads can read the global variables directly
    '''
    Sampling worker for multi-threading: all threads share the global trainer and replay buffer.
    '''
    print(sac_trainer, replay_buffer)
    if ENV == 'Reacher':
        env = Reacher(screen_size=SCREEN_SIZE, num_joints=NUM_JOINTS, link_lengths=LINK_LENGTH,
                      ini_joint_angles=INI_JOING_ANGLES, target_pos=[369, 430], render=True, change_goal=False)

    elif ENV == 'Pendulum':
        env = NormalizedActions(gym.make("Pendulum-v0"))
    print(env)
    frame_idx = 0
    rewards = []
    # training loop
    for eps in range(max_episodes):
        episode_reward = 0
        if ENV == 'Reacher':
            state = env.reset(SCREEN_SHOT)
        elif ENV == 'Pendulum':
            state = env.reset()

        for step in range(max_steps):
            if frame_idx > explore_steps:
                action = sac_trainer.policy_net.get_action(
                    state, deterministic=DETERMINISTIC)
            else:
                action = sac_trainer.policy_net.sample_action()

            try:
                if ENV == 'Reacher':
                    next_state, reward, done, _ = env.step(
                        action, SPARSE_REWARD, SCREEN_SHOT)
                elif ENV == 'Pendulum':
                    next_state, reward, done, _ = env.step(action)
                    env.render()
            except KeyboardInterrupt:
                print('Finished')
                sac_trainer.save_model(model_path)
                return  # stop sampling once interrupted; next_state would be undefined below

            replay_buffer.push(state, action, reward, next_state, done)

            state = next_state
            episode_reward += reward
            frame_idx += 1

            if len(replay_buffer) > batch_size:
                for i in range(update_itr):
                    _ = sac_trainer.update(batch_size,
                                           reward_scale=10.,
                                           auto_entropy=AUTO_ENTROPY,
                                           target_entropy=-1. * action_dim)

            if eps % 10 == 0 and eps > 0:
                plot(rewards, id)
                sac_trainer.save_model(model_path)

            if done:
                break
        print('Episode: ', eps, '| Episode Reward: ', episode_reward)
        # if len(rewards) == 0: rewards.append(episode_reward)
        # else: rewards.append(rewards[-1]*0.9+episode_reward*0.1)
    sac_trainer.save_model(model_path)
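# A minimal sketch of how the multi-threading variant above might be started. Because the threads
# live in one process, they can read the global sac_trainer, replay_buffer and hyper-parameters
# directly, which is why this worker only takes an id. This is an illustration under those
# assumptions, not the repository's actual launcher.
import threading


def launch_threads(num_threads):
    threads = []
    for i in range(num_threads):
        t = threading.Thread(target=worker, args=(i,))  # worker reads everything else from globals
        t.daemon = True
        t.start()
        threads.append(t)
    return threads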
NUM_JOINTS = 2
LINK_LENGTH = [200, 140]
INI_JOING_ANGLES = [0.1, 0.1]
SCREEN_SIZE = 1000
SPARSE_REWARD = False
SCREEN_SHOT = False
DETERMINISTIC = False
env = Reacher(screen_size=SCREEN_SIZE, num_joints=NUM_JOINTS, link_lengths=LINK_LENGTH,
              ini_joint_angles=INI_JOING_ANGLES, target_pos=[369, 430], render=True)
ppo = PPO()

if args.train:
    all_ep_r = []

    for ep in range(EP_MAX):
        s = env.reset(SCREEN_SHOT)
        s = s / 100.
        buffer_s, buffer_a, buffer_r = [], [], []
        ep_r = 0
        for t in range(EP_LEN):  # in one episode
            # env.render()
            a = ppo.choose_action(s)
            s_, r, done, distance2goal = env.step(a, SPARSE_REWARD,
                                                  SCREEN_SHOT)
            s_ = s_ / 100.
            buffer_s.append(s)
            buffer_a.append(a)
            # print('r, norm_r: ', r, (r+8)/8)
            '''this normalization makes the Reacher rewards almost identical, so it does not work here'''
            # buffer_r.append((r + 8) / 8)    # normalized reward, found to be useful
            buffer_r.append(r)
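# The snippet stops while the per-step buffers are still being filled. In this style of PPO loop,
# the collected rewards are usually turned into discounted returns before the policy update. The
# helper below sketches that step only; the discount factor gamma and the bootstrap value v_s_
# (the critic's estimate of the final state's value) are assumptions, and the PPO update call
# itself is not shown.
def discounted_returns(buffer_r, v_s_, gamma=0.9):
    # walk the rewards backwards, bootstrapping from the value of the last state
    returns = []
    running = v_s_
    for r in reversed(buffer_r):
        running = r + gamma * running
        returns.append(running)
    returns.reverse()
    return returns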