Example #1
def train(rank, device, args):
    current_time = datetime.now().strftime('%b%d_%H-%M')
    LOGGER_DIR = os.path.join(args.log_dir, args.env, current_time, 'Agent:{}'.format(rank))
    writer = SummaryWriter(LOGGER_DIR)
    MODEL_DIR = os.path.join(LOGGER_DIR, 'models')
    if not os.path.exists(MODEL_DIR):
        os.makedirs(MODEL_DIR)

    env = create_env(args.env, args)

    if args.pri:
        ram = PrioMemoryBuffer(args.buffer_size)
    else:
        ram = MemoryBuffer(args.buffer_size)

    player = DDPGAgent(env.observation_space, env.action_space, ram, writer, device, args)
    if args.model_dir is not None:
        player.load_models(args.model_dir)
    steps_done = 0
    episode_rewards = []
    max_score = -9999
    count_eps = 0
    for _ep in range(1, args.max_eps):
        observation = env.reset()
        total_reward = 0
        count_eps += 1
        for r in range(10000):
            if 'img' in args.obs:
                state = np.expand_dims(observation, axis=0)
            else:
                state = np.float32(observation)
            action, action_rescale = player.get_exploration_action(state)
            new_observation, reward, done, info = env.step(action_rescale)
            steps_done += 1
            total_reward += reward
            ram.add(observation, np.expand_dims(action, axis=0), reward, new_observation)
            observation = new_observation
            # start optimizing once enough warm-up transitions have been collected
            if steps_done > args.start_learning:
                player.optimize()
            if done:
                break

        # logger
        writer.add_scalar('episode/reward', total_reward, steps_done)
        writer.add_scalar('episode/length', r, steps_done)
        episode_rewards.append(total_reward)
        if _ep % args.eval_eps == 0:
            reward_ave = np.array(episode_rewards).mean()
            print('Train, episode %d, steps: %d, reward: %.3f, ave_reward: %.3f' % (count_eps, steps_done, episode_rewards[-1], reward_ave))
            if reward_ave > max_score:
                player.save_models(os.path.join(MODEL_DIR, 'best'))
                max_score = reward_ave
                print('Save Best!')
            else:
                player.save_models(os.path.join(MODEL_DIR, 'new'))
            episode_rewards = []
        # periodically run the garbage collector to limit memory growth
        gc.collect()
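The training loop above touches the replay buffer only through MemoryBuffer(args.buffer_size) / PrioMemoryBuffer(args.buffer_size) and ram.add(observation, action, reward, new_observation); sampling happens inside player.optimize(). The project's buffer classes are not shown here, so the sketch below is only a minimal uniform buffer with that add signature and a hypothetical sample(batch_size) method, to make the data flow concrete.

import random
from collections import deque

import numpy as np


class MemoryBufferSketch:
    """Minimal uniform replay buffer (illustrative sketch, not the project's class)."""

    def __init__(self, capacity):
        self.buffer = deque(maxlen=capacity)  # oldest transitions are dropped automatically

    def add(self, state, action, reward, next_state):
        # Mirrors the call ram.add(observation, action, reward, new_observation) above.
        self.buffer.append((state, action, reward, next_state))

    def sample(self, batch_size):
        # Hypothetical sampling method an optimize() step could rely on.
        batch = random.sample(list(self.buffer), min(batch_size, len(self.buffer)))
        states, actions, rewards, next_states = map(np.asarray, zip(*batch))
        return states, actions, rewards, next_states

    def __len__(self):
        return len(self.buffer)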
Example #2
def test(device, args):

    env = create_env(args.env, args)
    ram = MemoryBuffer(1)
    player = DDPGAgent(env.observation_space, env.action_space, ram, None,
                       device, args)
    if args.model_dir is not None:
        player.load_models(args.model_dir, test=True)
    steps_done = 0
    count_eps = 0
    count_success = 0
    while True:
        episode_rewards = []
        episode_lengths = []
        for _ep in range(1, args.eval_eps):
            if args.ar:
                env.seed(True)
            observation = env.reset()
            total_reward = 0
            episode_action = []
            for steps in range(1000):
                if 'img' in args.obs:
                    state = np.expand_dims(observation, axis=0)
                else:
                    state = np.float32(observation)

                action, action_rescale = player.get_exploitation_action(state)
                episode_action.append(action)
                new_observation, reward, done, info = env.step(action_rescale)
                observation = new_observation
                total_reward += reward
                steps_done += 1

                if args.render:
                    env.render()
                if done:
                    episode_rewards.append(total_reward)
                    count_eps += 1
                    episode_lengths.append(steps)
                    if reward > 1:
                        count_success += 1.0
                    break
            # periodically run the garbage collector to limit memory growth
            gc.collect()

        reward_ave = np.array(episode_rewards).mean()
        length_ave = np.array(episode_lengths).mean()
        print(
            'Test, episode %d, steps: %d, Success_rate: %.3f, ave_reward: %.3f, ave_length: %.3f'
            % (count_eps, steps_done, count_success / count_eps, reward_ave,
               length_ave))

    env.close()
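Example #1 draws actions with player.get_exploration_action(state), while this test loop calls player.get_exploitation_action(state); both return the raw actor output plus a rescaled action that is passed to env.step. The agent class itself is not shown, so the sketch below only illustrates the usual DDPG split these names suggest: the exploitation action is the deterministic actor output, exploration adds Gaussian noise (compare the noise_sigma hyperparameter in Example #3), and rescaling maps the actor's [-1, 1] range onto the environment's action bounds. The method bodies are assumptions, not the project's implementation.

import numpy as np


def rescale_action(action, low, high):
    # Assumed convention: map an actor output in [-1, 1] onto the env's [low, high] box.
    return low + (np.clip(action, -1.0, 1.0) + 1.0) * 0.5 * (high - low)


class ActionSelectionSketch:
    def __init__(self, actor, action_low, action_high, noise_sigma=0.2):
        self.actor = actor                    # callable: state -> action in [-1, 1]
        self.low, self.high = action_low, action_high
        self.noise_sigma = noise_sigma        # exploration noise scale

    def get_exploitation_action(self, state):
        # Greedy, test-time action: the deterministic actor output, no noise.
        action = np.clip(self.actor(state), -1.0, 1.0)
        return action, rescale_action(action, self.low, self.high)

    def get_exploration_action(self, state):
        # Training-time action: actor output perturbed by Gaussian exploration noise.
        action = self.actor(state)
        action = np.clip(action + np.random.normal(0.0, self.noise_sigma, np.shape(action)), -1.0, 1.0)
        return action, rescale_action(action, self.low, self.high)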
Example #3
        'tau': 0.01,
        'noise_sigma': 0.2
    }
else:
    print('Environment unknown!')
    exit()

num_episodes = 200
num_steps = 250

#------------------------------#
#------Hyperparameters---------#
#------------------------------#

agent = DDPGAgent(env, hyperparameter)
agent.load_models(model_path)

rets = []
render = False
for e in range(num_episodes):
    ret = 0
    s = env.reset()
    for step in range(num_steps):
        a = agent.take_action(s, greedy=True)
        s_next, r, done, _ = env.step(a)
        # Press Enter in the console to activate/deactivate Rendering
        render = rendering(env, render, r)
        ret += r
        if done:
            break
        s = s_next
    rets.append(ret)
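The hyperparameter dict at the top of this example includes tau = 0.01; the snippet itself never uses it, but in DDPG tau conventionally controls the soft update of the target actor and critic after each optimization step: target <- tau * source + (1 - tau) * target. The helper below is a generic sketch of that rule, assuming PyTorch modules; it is not taken from this project. It would typically be called for both the actor and the critic right after each optimization step.

import torch


def soft_update(target_net, source_net, tau=0.01):
    # Standard DDPG target-network update: target <- tau * source + (1 - tau) * target.
    with torch.no_grad():
        for target_param, source_param in zip(target_net.parameters(), source_net.parameters()):
            target_param.mul_(1.0 - tau)
            target_param.add_(tau * source_param)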
Example #4
    environment.close()


if __name__ == '__main__':
    parser = argparse.ArgumentParser()
    parser.add_argument('--testing', type=int, default=0)
    parser.add_argument('--mode', type=int, default=PENDULUM)
    args = parser.parse_args()

    mode = args.mode
    player2 = lh.BasicOpponent()

    if mode == TRAIN_SHOOTING:
        imitation_data = "imitations_shooting.pt"
    elif mode == TRAIN_DEFENSE:
        imitation_data = "imitations_defense.pt"
    else:
        imitation_data = "imitations_normal.pt"

    environment, action_size = create_environment(mode, args.testing)
    agent = DDPGAgent(environment.observation_space.shape[0], action_size,
                      environment.action_space.high[0],
                      environment.action_space.low[0], imitation_data)
    if args.testing:
        agent.load_models()
        for _ in range(20):
            test(environment, agent, mode, player2, True)
    else:
        #agent.load_models()
        train(environment, agent, mode, player2)
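The DDPGAgent here is constructed with an imitation_data filename (one .pt file per game mode). The snippet does not show how the agent consumes it; one common pattern is to load recorded transitions and pre-fill the replay buffer before training starts. The loader below is purely hypothetical: it assumes the file holds a list of (state, action, reward, next_state) tuples, which may not match the project's actual format.

import torch


def prefill_buffer_from_imitations(path, buffer):
    # Hypothetical: seed a replay buffer with recorded transitions before training.
    transitions = torch.load(path)  # assumed: a list of (state, action, reward, next_state) tuples
    for state, action, reward, next_state in transitions:
        buffer.add(state, action, reward, next_state)
    return len(transitions)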