Example #1
def main():
    env = gym.make('CartPole-v0').unwrapped

    batch_size = 128
    gamma = 0.999
    eps_start = 0.9
    eps_end = 0.05
    eps_decay = 200
    target_update = 10

    n_actions = env.action_space.n
    env.reset()
    init_screen = get_screen(env)
    _, _, screen_height, screen_width = init_screen.shape
    episode_durations = []

    agent = Agent(gamma=gamma,
                  n_actions=n_actions,
                  screen_height=screen_height,
                  screen_width=screen_width,
                  batch_size=batch_size)

    n_games = 500
    scores = []
    eps_history = []

    for i in range(n_games):
        done = False
        length = 0
        state = env.reset()
        last_screen = get_screen(env)
        current_screen = get_screen(env)
        state = current_screen - last_screen
        # env.render()
        while not done:
            action = agent.select_action(state)
            _, reward, done, _ = env.step(action.item())
            last_screen = current_screen
            current_screen = get_screen(env)

            next_state = current_screen - last_screen if not done else None
            agent.store_transition(state, action, next_state, reward)
            state = next_state

            agent.learn()
            length += 1
        episode_durations.append(length)
        # plot_durations()

        if i % target_update == 0:
            agent.update_target()

    #imagine pretty plots
    print('done')
    env.close()
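
None of these examples define get_screen itself. A minimal sketch of the frame-grabbing helper they assume, for the CartPole case above, returning a (1, C, H, W) float tensor so that the `_, _, screen_height, screen_width = init_screen.shape` unpacking works (the cropping logic of the original PyTorch DQN tutorial is omitted; treat this as an assumption, not the project's actual helper):

import numpy as np
import torch
import torchvision.transforms as T
from PIL import Image

_resize = T.Compose([T.ToPILImage(),
                     T.Resize(40, interpolation=Image.CUBIC),
                     T.ToTensor()])

def get_screen(env):
    # gym (pre-0.26) returns an (H, W, C) uint8 frame; reorder to (C, H, W)
    screen = env.render(mode='rgb_array').transpose((2, 0, 1))
    screen = np.ascontiguousarray(screen, dtype=np.float32) / 255
    screen = torch.from_numpy(screen)
    # downscale and add a batch dimension -> (1, C, H, W)
    return _resize(screen).unsqueeze(0)
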
Example #2
def main():
    global args
    args = parser.parse_args()

    env = make(game='SonicTheHedgehog-Genesis', state='LabyrinthZone.Act1')
    current_state = get_screen(env)

    for epoch in range(args.epochs):
        env.reset()
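        # --- hypothetical continuation (not part of the source snippet): roll the
        # --- episode out with random actions, tracking frame differences the way
        # --- the other examples do
        last_state = current_state
        current_state = get_screen(env)
        state = current_state - last_state
        done = False
        while not done:
            _, reward, done, _ = env.step(env.action_space.sample())
            last_state, current_state = current_state, get_screen(env)
            state = current_state - last_state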
Example #3
    def test(self, env, n_epochs=30, verbose=False):
        rewards = []
        self.policy_net = self.policy_net.cuda()
        self.target_net = self.target_net.cuda()
        self.target_net.eval()

        for epoch in range(n_epochs):
            env.reset()
            done = False
            epoch_rewards = []
            video = []

            last_screen = get_screen(env)
            current_screen = get_screen(env)
            state = current_screen - last_screen

            while not done:
                if epoch % 5 == 0:
                    video.append(last_screen)
                action = self.select_action(state, 0.)

                _, reward, done, _ = env.step(action[0, 0])
                last_screen = current_screen
                current_screen = get_screen(env)

                if not done:
                    next_state = current_screen - last_screen
                else:
                    next_state = None

                epoch_rewards.append(reward)
                reward = Tensor([reward])
                state = next_state

                logging.debug(
                    'Test epoch {} :  reward= {}, duration= {}'.format(
                        epoch, np.sum(epoch_rewards), len(epoch_rewards)))
            rewards.append(np.sum(epoch_rewards))

            if epoch % 5 == 0:
                self.make_video(video, ext='_test_' + str(epoch))

            logging.info('Performance estimate: {} +/- {}'.format(
                np.mean(rewards), np.std(rewards)))
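
make_video is defined elsewhere in the class. A hedged sketch of what it could look like for the list of (1, C, H, W) screen tensors collected above, using imageio (the library choice and the self.filename-based output path are assumptions):

import imageio
import numpy as np

def make_video(self, frames, ext=''):
    # convert each (1, C, H, W) float tensor in [0, 1] into an (H, W, C) uint8 frame
    images = [(f.squeeze(0).permute(1, 2, 0).cpu().numpy() * 255).astype(np.uint8)
              for f in frames]
    imageio.mimsave(self.filename + ext + '.gif', images, fps=30)
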
Example #4
File: trainingDQN.py  Project: mxxhcm/code
def trainDQN(file_name="DQN",
             env=GridworldEnv(1),
             batch_size=128,
             gamma=0.999,
             eps_start=0.9,
             eps_end=0.05,
             eps_decay=1000,
             is_plot=False,
             num_episodes=500,
             max_num_steps_per_episode=1000,
             learning_rate=0.0001,
             memory_replay_size=10000):
    """
    DQN training routine. Returns rewards and durations logs.
    Plots the environment screen.
    """
    if is_plot:
        env.reset()
        plt.ion()
        plt.figure()
        plt.imshow(get_screen(env).cpu().squeeze(0).squeeze(0).numpy(),
                   interpolation='none')
        plt.title("")
        plt.draw()
        plt.pause(0.00001)

    num_actions = env.action_space.n
    model = DQN(num_actions)
    optimizer = optim.Adam(model.parameters(), lr=learning_rate)

    use_cuda = torch.cuda.is_available()
    if use_cuda:
        model.cuda()

    memory = ReplayMemory(memory_replay_size)

    episode_durations = []
    mean_durations = []
    episode_rewards = []
    mean_rewards = []
    steps_done = 0  # total steps
    for i_episode in range(num_episodes):
        if i_episode % 20 == 0:
            clear_output()
        print("Cur episode:", i_episode, "steps done:", steps_done,
                "exploration factor:", eps_end + (eps_start - eps_end) * \
                math.exp(-1. * steps_done / eps_decay))
        # Initialize the environment and state
        env.reset()
        # last_screen = env.current_grid_map
        # (1, 1, 8, 8)
        current_screen = get_screen(env)
        state = current_screen  # - last_screen
        for t in count():
            # Select and perform an action
            action = select_action(state, model, num_actions, eps_start,
                                   eps_end, eps_decay, steps_done)
            steps_done += 1
            _, reward, done, _ = env.step(action[0, 0])
            reward = Tensor([reward])

            # Observe new state
            last_screen = current_screen
            current_screen = get_screen(env)
            if not done:
                next_state = current_screen  # - last_screen
            else:
                next_state = None

            # Store the transition in memory
            memory.push(state, action, next_state, reward)

            # Move to the next state
            state = next_state
            # plot_state(state)
            # env.render()

            # Perform one step of the optimization (on the target network)
            optimize_model(model, optimizer, memory, batch_size, gamma)
            if done or t + 1 >= max_num_steps_per_episode:
                episode_durations.append(t + 1)
                episode_rewards.append(env.episode_total_reward)
                if is_plot:
                    plot_durations(episode_durations, mean_durations)
                    plot_rewards(episode_rewards, mean_rewards)
                break

    print('Complete')
    env.render(close=True)
    env.close()
    if is_plot:
        plt.ioff()
        plt.show()

    ## Store Results

    np.save(file_name + '-dqn-rewards', episode_rewards)
    np.save(file_name + '-dqn-durations', episode_durations)

    return model, episode_rewards, episode_durations
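
select_action is imported from elsewhere in the project. A plausible epsilon-greedy version, consistent with the exploration factor printed in the loop above and with the (1, 1) action indexing `action[0, 0]` (an assumption, not the project's actual helper):

import math
import random
import torch

def select_action(state, model, num_actions,
                  eps_start, eps_end, eps_decay, steps_done):
    # epsilon decays exponentially with the total number of steps taken so far
    eps_threshold = eps_end + (eps_start - eps_end) * \
        math.exp(-1. * steps_done / eps_decay)
    if random.random() > eps_threshold:
        with torch.no_grad():
            # greedy action: index of the largest Q-value, shaped (1, 1)
            return model(state).max(1)[1].view(1, 1)
    return torch.LongTensor([[random.randrange(num_actions)]])
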
def trainD(file_name="Distral_1col",
           list_of_envs=[GridworldEnv(4), GridworldEnv(5)],
           batch_size=128,
           gamma=0.999,
           alpha=0.9,
           beta=5,
           eps_start=0.9,
           eps_end=0.05,
           eps_decay=5,
           is_plot=False,
           num_episodes=200,
           max_num_steps_per_episode=1000,
           learning_rate=0.001,
           memory_replay_size=10000,
           memory_policy_size=1000):
    """
    Soft Q-learning training routine. Returns rewards and durations logs.
    Plots the environment screen.
    """
    num_actions = list_of_envs[0].action_space.n
    num_envs = len(list_of_envs)
    policy = PolicyNetwork(num_actions)
    models = [DQN(num_actions)
              for _ in range(0, num_envs)]  ### Add torch.nn.ModuleList (?)
    memories = [
        ReplayMemory(memory_replay_size, memory_policy_size)
        for _ in range(0, num_envs)
    ]

    use_cuda = torch.cuda.is_available()
    if use_cuda:
        policy.cuda()
        for model in models:
            model.cuda()

    optimizers = [
        optim.Adam(model.parameters(), lr=learning_rate) for model in models
    ]
    policy_optimizer = optim.Adam(policy.parameters(), lr=learning_rate)
    # optimizer = optim.RMSprop(model.parameters(), )

    episode_durations = [[] for _ in range(num_envs)]
    episode_rewards = [[] for _ in range(num_envs)]

    steps_done = np.zeros(num_envs)
    episodes_done = np.zeros(num_envs)
    current_time = np.zeros(num_envs)

    # Initialize environments
    for env in list_of_envs:
        env.reset()

    while np.min(episodes_done) < num_episodes:
        # TODO: add max_num_steps_per_episode

        # Optimization is given by alternating minimization scheme:
        #   1. do the step for each env
        #   2. do one optimization step for each env using "soft-q-learning".
        #   3. do one optimization step for the policy

        for i_env, env in enumerate(list_of_envs):
            # print("Cur episode:", i_episode, "steps done:", steps_done,
            #         "exploration factor:", eps_end + (eps_start - eps_end) * \
            #         math.exp(-1. * steps_done / eps_decay))

            # last_screen = env.current_grid_map
            current_screen = get_screen(env)
            state = current_screen  # - last_screen
            # Select and perform an action
            action = select_action(state, policy, models[i_env], num_actions,
                                   eps_start, eps_end, eps_decay,
                                   episodes_done[i_env], alpha, beta)
            steps_done[i_env] += 1
            current_time[i_env] += 1
            _, reward, done, _ = env.step(action[0, 0])
            reward = Tensor([reward])

            # Observe new state
            last_screen = current_screen
            current_screen = get_screen(env)
            if not done:
                next_state = current_screen  # - last_screen
            else:
                next_state = None

            # Store the transition in memory
            time = Tensor([current_time[i_env]])
            memories[i_env].push(state, action, next_state, reward, time)

            # Perform one step of the optimization (on the target network)
            optimize_model(policy, models[i_env], optimizers[i_env],
                           memories[i_env], batch_size, alpha, beta, gamma)
            if done:
                print(
                    "ENV:", i_env, "iter:", episodes_done[i_env], "\treward:",
                    env.episode_total_reward, "\tit:", current_time[i_env],
                    "\texp_factor:", eps_end + (eps_start - eps_end) *
                    math.exp(-1. * episodes_done[i_env] / eps_decay))
                env.reset()
                episodes_done[i_env] += 1
                episode_durations[i_env].append(current_time[i_env])
                current_time[i_env] = 0
                episode_rewards[i_env].append(env.episode_total_reward)
                if is_plot:
                    plot_rewards(episode_rewards, i_env)

        optimize_policy(policy, policy_optimizer, memories, batch_size,
                        num_envs, gamma)

    print('Complete')
    env.render(close=True)
    env.close()
    if is_plot:
        plt.ioff()
        plt.show()

    ## Store Results

    np.save(file_name + '-distral-2col-rewards', episode_rewards)
    np.save(file_name + '-distral-2col-durations', episode_durations)

    return models, policy, episode_rewards, episode_durations
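
For the Distral routine, select_action additionally mixes the distilled policy with the task-specific model. A hedged sketch of the action distribution it presumably samples from, following the Distral formulation pi_i(a|s) ∝ pi_0(a|s)^alpha * exp(beta * f_i(s, a)) (the function names and output shapes here are assumptions):

import torch
import torch.nn.functional as F

def distral_action_probs(state, policy, model, alpha, beta):
    # distilled policy pi_0 over actions, shape (1, num_actions)
    pi0 = F.softmax(policy(state), dim=1)
    # task-specific values f_i(s, a) from this environment's model
    f_i = model(state)
    # unnormalised log pi_i = alpha * log pi_0 + beta * f_i, then renormalise
    logits = alpha * torch.log(pi0 + 1e-8) + beta * f_i
    return F.softmax(logits, dim=1)

# sampling an action index from the resulting distribution:
# action = torch.multinomial(distral_action_probs(state, policy, model, alpha, beta), 1)
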
Example #6
def main():
    # getting positions of rigid bodies in real time
    client = NatClient()
    arena_rb = client.rigid_bodies['Arena']
    rat_rb = client.rigid_bodies['Rat']


    window = pyglet.window.Window(resizable=True, fullscreen=True, screen=get_screen(1))  # Opening the basic pyglet window


    # Load Arena
    remove_image_lines_from_mtl('assets/3D/grass_scene.mtl')
    arena_filename = 'assets/3D/grass_scene.obj'  # arena scanned, cleaned up in Blender, and exported without flipped normals
    arena_reader = rc.WavefrontReader(arena_filename)  # load the arena mesh through a WavefrontReader
    arena = arena_reader.get_mesh("Arena", position=arena_rb.position)  # turn the Wavefront data into a mesh so a texture can be put on top of it
    arena.uniforms['diffuse'] = 1., 1., 1.  # give the arena a white diffuse material
    arena.rotation = arena.rotation.to_quaternion()  # track the arena's full rotation (not just xyz) so it can follow the real arena if it gets bumped

    # Load the projector as a Ratcave camera, set light to its position
    projector = rc.Camera.from_pickle('assets/3D/projector.pkl')  # load the projector's pickle file, which gives us the projector's position and orientation
    projector.position.x += .004
    projector.projection = rc.PerspectiveProjection(fov_y =40.5, aspect=1.777777778)
    light = rc.Light(position=projector.position)

    ## Make Virtual Scene ##
    fields = []
    for x, z in itertools.product([-.8, 0, .8], [-1.6, 0, 1.6]):
        field = load_textured_mesh(arena_reader, 'grass', 'grass.png')
        field.position.x += x
        field.position.z += z
        fields.append(field)

    ground = load_textured_mesh(arena_reader, 'Ground', 'dirt.png')
    sky = load_textured_mesh(arena_reader, 'Sky', 'sky.png')
    snake = load_textured_mesh(arena_reader, 'Snake', 'snake.png')

    rat_camera = rc.Camera(projection=rc.PerspectiveProjection(aspect=1, fov_y=90, z_near=.001, z_far=10), position=rat_rb.position)  # put the camera on top of the rat's head

    meshes = [ground, sky, snake] + fields
    for mesh in meshes:
        mesh.uniforms['diffuse'] = 1., 1., 1.
        mesh.uniforms['flat_shading'] = False
        mesh.parent = arena

    virtual_scene = rc.Scene(meshes=meshes, light=light, camera=rat_camera, bgColor=(0, 0, 255))  # set up the virtual scene that gets projected onto the arena mesh
    virtual_scene.gl_states.states = virtual_scene.gl_states.states[:-1]


    ## Make Cubemapping work on arena
    cube_texture = rc.TextureCube(width=4096, height=4096)  # use cube mapping to render the virtual scene onto the arena's texture
    framebuffer = rc.FBO(texture=cube_texture)  # create a framebuffer that renders into the cube texture (in tutorial 4 this was the blue screen)
    arena.textures.append(cube_texture)

    # Stereo
    vr_camgroup = rc.StereoCameraGroup(distance=.05)
    vr_camgroup.rotation = vr_camgroup.rotation.to_quaternion()



    # update the arena's position in xyz and its rotation
    def update(dt):
        """main update function: put any movement or tracking steps in here, because it will be run constantly!"""
        vr_camgroup.position, vr_camgroup.rotation.xyzw = rat_rb.position, rat_rb.quaternion  # drive the VR camera group from the rat's tracked position and orientation
        arena.uniforms['playerPos'] = rat_rb.position
        arena.position, arena.rotation.xyzw = arena_rb.position, arena_rb.quaternion
        arena.position.y -= .02

    pyglet.clock.schedule(update)  # making it so that the app updates in real time


    @window.event
    def on_draw():

        ## Render virtual scene onto cube texture
        with framebuffer:
            with cube_shader:

                for mask, camside in zip([(True, False, False, True), (False, True, True, True)], [vr_camgroup.left, vr_camgroup.right]):
                    gl.glColorMask(*mask)
                    virtual_scene.camera.position.xyz = camside.position_global
                    virtual_scene.draw360_to_texture(cube_texture)

        ## Render real scene onto screen
        gl.glColorMask(True, True, True, True)
        window.clear()
        with cube_shader:  # use the cube shader to draw the 6-sided cube map, updated with the viewer's position and angle
            rc.clear_color(255, 0, 0)
            with projector, light:
                arena.draw()

    # actually run everything.
    pyglet.app.run()
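
load_textured_mesh is another project-level helper not shown here. A hedged sketch of what it might do with the WavefrontReader and ratcave's texture API (the texture directory and filename handling are assumptions):

def load_textured_mesh(reader, mesh_name, texture_filename):
    """Hypothetical helper: pull a named mesh from the reader and attach a texture."""
    mesh = reader.get_mesh(mesh_name)
    texture = rc.Texture.from_image('assets/textures/' + texture_filename)  # assumed path
    mesh.textures.append(texture)
    return mesh
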
Example #7
    def train(self,
              env,
              n_epochs=30,
              epsilon_init=1.,
              epsilon_schedule='exp',
              eps_decay=None,
              lr=0.001,
              batch_size=32):
        if epsilon_schedule == 'linear':
            eps_range = np.linspace(epsilon_init, 0., n_epochs)
        elif epsilon_schedule == 'constant':
            eps_range = [epsilon_init for _ in range(n_epochs)]
        elif epsilon_schedule == 'exp':
            if not eps_decay:
                eps_decay = n_epochs // 4
            eps_range = [
                epsilon_init * math.exp(-1. * i / eps_decay)
                for i in range(n_epochs)
            ]

        history_file = open(self.filename + 'history', mode='a+')
        self.policy_net = self.policy_net.cuda()
        self.target_net = self.target_net.cuda()
        self.target_net.eval()
        self.optimizer = optim.RMSprop(self.policy_net.parameters(), lr=lr)

        losses, rewards, change_history = [], [], []

        for epoch in range(n_epochs):
            env.reset()
            last_screen = get_screen(env)
            current_screen = get_screen(env)
            state = current_screen - last_screen
            done = False
            epoch_losses = []
            epoch_rewards = []
            video = []

            while not done:
                if epoch % 10 == 1:
                    video.append(last_screen)
                action = self.select_action(state, eps_range[epoch])

                _, reward, done, _ = env.step(action[0, 0])

                last_screen = current_screen
                current_screen = get_screen(env)

                reward = Tensor([reward])
                if not done:
                    next_state = current_screen - last_screen
                else:
                    next_state = None

                self.memory.push(state, action, next_state, reward)
                state = next_state
                loss = self.update(batch_size=batch_size)

                epoch_losses.append(loss)
                epoch_rewards.append(reward)

            history_file.write(
                'Epoch {}: loss= {}, reward= {}, duration= {}\n'.format(
                    epoch, np.mean(epoch_losses), np.sum(epoch_rewards),
                    len(epoch_rewards)))

            losses.append(np.mean(epoch_losses))
            rewards.append(np.sum(epoch_rewards))

            if epoch % 10 == 1:
                self.target_net.load_state_dict(self.policy_net.state_dict())
                self.save(ext=str(epoch))
                self.make_video(video, ext='_train_' + str(epoch))

                with open(self.filename + '.train_losses', 'a+') as f:
                    for l in losses:
                        f.write(str(l) + '\n')
                losses = []
                with open(self.filename + '.train_rewards', 'a+') as f:
                    for r in rewards:
                        f.write(str(r) + '\n')
                rewards = []
        self.save()
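
self.update is not included in the snippet. A hedged sketch of a standard DQN optimisation step it could correspond to (the memory.sample call, the transition layout, and the Huber loss are assumptions borrowed from the usual PyTorch DQN recipe, not this project's code):

import torch
import torch.nn.functional as F

def dqn_update_step(policy_net, target_net, optimizer, memory, batch_size, gamma=0.999):
    if len(memory) < batch_size:
        return None
    # assumed: memory.sample returns (state, action, next_state, reward) tuples
    states, actions, next_states, rewards = zip(*memory.sample(batch_size))
    state_batch = torch.cat(states)
    action_batch = torch.cat(actions)
    reward_batch = torch.cat(rewards)
    non_final_mask = torch.tensor([s is not None for s in next_states],
                                  device=state_batch.device)
    non_final_next = torch.cat([s for s in next_states if s is not None])
    # Q(s, a) for the actions actually taken
    q_sa = policy_net(state_batch).gather(1, action_batch)
    # bootstrapped targets from the frozen target network
    next_v = torch.zeros(batch_size, device=state_batch.device)
    next_v[non_final_mask] = target_net(non_final_next).max(1)[0].detach()
    target = reward_batch + gamma * next_v
    loss = F.smooth_l1_loss(q_sa, target.unsqueeze(1))
    optimizer.zero_grad()
    loss.backward()
    optimizer.step()
    return loss.item()
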
Example #8
def train(Learner):
    # global args
    # args = parser.parse_args()
    # Learner = DQN().to(device)

    # env = retro.make(game='Airstriker-Genesis', state='Level1')

    criterion = L2_loss(0.99).to(device)

    if use_cuda:
        Learner = Learner.cuda()
        criterion = criterion.cuda()

    optimizer = optim.SGD(Learner.parameters(), lr=0.01)

    eps_threshold = 0.3
    RM = ReplayMemory(900)
    A_agent = ActorAgent(Learner, args)
    print("Start Episodes")
    for i_episode in range(50000):
        env.reset()
        A_agent.reset(Learner, args)
        last_state = get_screen(env)
        current_state = get_screen(env)
        state = current_state - last_state
        # state_var = torch.autograd.Variable(state)
        state_var = state.to(device)
        total_reward = 0
        # if i_episode % 50 == 0:
        #     if not eps_threshold < 0.1:
        #         eps_threshold -= 0.001
        start = time.time()
        for t in count():
            if t == 0:
                print("episode begin")
            action_q = A_agent.act(state_var, eps_threshold)
            """
            if is_cuda:
                action_q = action_q.cpu()
                _, action = action_q.data.max(2)
            else:
                _, action = action_q.data.max(2)
            """
            _, action = action_q.data.max(2)

            action_numpy = action.squeeze(0).numpy()
            # print(list(action_numpy))
            for i in range(4):
                _, reward, done, _ = env.step(action_numpy)
                total_reward += reward
            last_state = current_state
            current_state = get_screen(env)
            state = current_state - last_state
            # state_var = torch.autograd.Variable(state)
            state_var = state.to(device)
            # save the state after the action
            A_agent.add_to_buffer(reward, action_q, state_var)

            # store the state in ReplayMemory
            if len(A_agent.localbuffer) > 10:
                p, error = calc_priority_TDerror(Learner, criterion, A_agent,
                                                 10)

                RM.push(p, error)

            if done:
                break
            if t == 500:
                print("Total time: {0:.2f}".format(time.time() - start))
                # break
            # Optimize Learner model
            # if t%100==0 and len(A_agent.localbuffer)>80 and len(RM)>=30:
            env.render()

        # update Learner part
        for i in range(4):
            if len(RM.memory) >= 30:
                error_batch = RM.priority_sample(30)

                optimizer.zero_grad()
                # error_batch.backward(retain_graph=True)
                error_batch.backward()
                # for param in Learner.parameters():
                #     param.grad.data.clamp_(-1, 1)
                optimizer.step()
                print("{0}\t{1}\tLoss:{2:.2f}\tTotal:{3:.2f}\tReward:{4:.2f}".
                      format(
                          i_episode,
                          t,
                          float(error_batch),
                          total_reward,
                          reward,
                      ))
            else:
                break

        if i_episode % 5 == 0:
            env.reset()
            last_state = get_screen(env)
            current_state = get_screen(env)
            state = current_state - last_state
            state_var = state.to(device)
            val_reward = 0
            for t in count():
                with torch.no_grad():

                    action_q = Learner(state_var)
                    _, action = action_q.data.max(2)
                    action_numpy = action.squeeze(0).numpy()
                    for i in range(4):
                        _, reward, done, _ = env.step(action_numpy)
                        val_reward += reward
                    last_state = current_state
                    current_state = get_screen(env)
                    state = current_state - last_state
                    state_var = state.to(device)

                    if done:
                        break
                    env.render()

            print("Validation:\tepisode{0}\tReward: {1:.2f}".format(
                i_episode, val_reward))

            with open("result.txt", "a") as f:
                f.write("episode{0}\tReward: {1:.2f}".format(
                    i_episode, val_reward))
                f.write("\n")

        RM.reset()
        # break

        with open("total_reward.txt", "a") as f:
            f.write("{0}\t{1}".format(i_episode, total_reward))
            f.write("\n")
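
The ReplayMemory used here differs from the usual transition buffer: it is pushed (priority, TD-error) pairs and sampled by priority, returning something that supports .backward(). A hedged sketch of a stand-in consistent with the calls above (push(p, error), priority_sample(n), reset(), .memory) rather than the project's actual class:

import torch

class PriorityErrorMemory:
    """Hypothetical stand-in for the RM object used above."""
    def __init__(self, capacity):
        self.capacity = capacity
        self.memory = []  # list of (priority, td_error) pairs

    def push(self, priority, error):
        if len(self.memory) >= self.capacity:
            self.memory.pop(0)
        self.memory.append((float(priority), error))

    def priority_sample(self, n):
        # sample n errors with probability proportional to their priority
        priorities = torch.tensor([p for p, _ in self.memory])
        idx = torch.multinomial(priorities, n, replacement=True)
        errors = [self.memory[i][1] for i in idx.tolist()]
        return torch.stack(errors).mean()  # scalar loss usable with .backward()

    def reset(self):
        self.memory = []
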
Example #9
EPS_START = 0.9
EPS_END = 0.05
EPS_DECAY = 200
TARGET_UPDATE = 10
num_episodes = 50

resize = T.Compose([
    T.ToTensor(),
    T.ToPILImage(),
    T.Resize(40, interpolation=Image.CUBIC),
    T.ToTensor()
])

if __name__ == "__main__":
    env = gym.make(env_name)
    init_screen = get_screen(env)
    screen_height, screen_width, _ = init_screen.shape

    # Get number of actions from gym action space
    n_actions = env.action_space.n

    policy_net = DQN(screen_height, screen_width, n_actions).to(device)
    target_net = DQN(screen_height, screen_width, n_actions).to(device)
    target_net.load_state_dict(policy_net.state_dict())
    target_net.eval()

    optimizer = optim.RMSprop(policy_net.parameters())
    memory = ReplayMemory(10000)

    steps_done = 0
Example #10
device = torch.device("cuda" if torch.cuda.is_available() else "cpu")
env.reset()

BATCH_SIZE = 128
# GAMMA is the discount factor
GAMMA = 0.999
EPS_START = 0.9
EPS_END = 0.05
EPS_DECAY = 200

TARGET_UPDATE = 10

AVERAGE_SIZE = 10
episode_durations = []

init_screen = get_screen(env, device)
_, _, screen_height, screen_width = init_screen.shape

policy_net = DQN(screen_height, screen_width).to(device)
target_net = DQN(screen_height, screen_width).to(device)

target_net.load_state_dict(policy_net.state_dict())
target_net.eval()

optimizer = optim.RMSprop(policy_net.parameters())
memory = ReplayMemory(10000)

steps_done = 0
num_episodes = 300
for i_episode in range(num_episodes):
    env.reset()
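    # --- hypothetical continuation (not in the source); this setup mirrors the
    # --- standard PyTorch DQN tutorial, whose episode body looks roughly like this
    # --- (select_action, optimize_model and `from itertools import count` assumed):
    last_screen = get_screen(env, device)
    current_screen = get_screen(env, device)
    state = current_screen - last_screen
    for t in count():
        action = select_action(state)                     # assumed eps-greedy helper
        _, reward, done, _ = env.step(action.item())
        reward = torch.tensor([reward], device=device)
        last_screen = current_screen
        current_screen = get_screen(env, device)
        next_state = current_screen - last_screen if not done else None
        memory.push(state, action, next_state, reward)
        state = next_state
        optimize_model()                                  # assumed DQN update step
        if done:
            episode_durations.append(t + 1)
            break
    # sync the target network every TARGET_UPDATE episodes
    if i_episode % TARGET_UPDATE == 0:
        target_net.load_state_dict(policy_net.state_dict())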
Example #11
from collections import deque
import gym
import numpy as np
from agent import DDPG
from utils import get_screen

env = gym.make('Pendulum-v0')

agent = DDPG(env, memory=False)
agent.load_model()

env.reset()
pixel = env.render(mode='rgb_array')
state = deque([get_screen(pixel) for _ in range(3)], maxlen=3)
cumulative_reward = 0
for timestep in range(200):
    action = agent.get_action(np.array(state)[np.newaxis])
    _, reward, _, _ = env.step(action * 2)
    pixel = env.render(mode='rgb_array')
    state_ = state.copy()
    state_.append(get_screen(pixel))
    state = state_
    cumulative_reward += reward
print('Cumulative Reward: {}'.format(cumulative_reward))
Example #12
def trainD(file_name="Distral_1col", list_of_envs=[GridworldEnv(4),
            GridworldEnv(5)], batch_size=128, gamma=0.999, alpha=0.9,
            beta=5, eps_start=0.9, eps_end=0.05, eps_decay=5,
            is_plot=False, num_episodes=200,
            max_num_steps_per_episode=1000, learning_rate=0.001,
            memory_replay_size=10000, memory_policy_size=1000):
    """
    Soft Q-learning training routine. Returns rewards and durations logs.
    Plots the environment screen.
    """
    # action dimension
    num_actions = list_of_envs[0].action_space.n
    # total envs
    num_envs = len(list_of_envs)
    # pi_0
    policy = PolicyNetwork(num_actions)
    # Q value, every environment has one, used to calculate A_i,
    models = [DQN(num_actions) for _ in range(0, num_envs)]   ### Add torch.nn.ModuleList (?)
    # replay buffer for env ???
    memories = [ReplayMemory(memory_replay_size, memory_policy_size) for _ in range(0, num_envs)]

    use_cuda = torch.cuda.is_available()
    device = torch.device("cuda" if use_cuda else "cpu")
    # device = "cpu"
    print(device)
    # model
    policy = policy.to(device)
    for i in range(len(models)):
        models[i] = models[i].to(device)

    # optimizer for every Q model
    optimizers = [optim.Adam(model.parameters(), lr=learning_rate)
                    for model in models]
    # optimizer for policy
    policy_optimizer = optim.Adam(policy.parameters(), lr=learning_rate)
    # optimizer = optim.RMSprop(model.parameters(), )

    # info list for each environment
    episode_durations = [[] for _ in range(num_envs)]   # list of local steps
    episode_rewards = [[] for _ in range(num_envs)]     # list of list of episode reward

    episodes_done = np.zeros(num_envs)      # episode num
    steps_done = np.zeros(num_envs)         # global timesteps for each env
    current_time = np.zeros(num_envs)       # local timesteps for each env

    # Initialize environments
    for env in list_of_envs:
        env.reset()

    while np.min(episodes_done) < num_episodes:
        policy.train()
        for model in models:
            model.train()

        # TODO: add max_num_steps_per_episode

        # Optimization is given by alternating minimization scheme:
        #   1. do the step for each env
        #   2. do one optimization step for each env using "soft-q-learning".
        #   3. do one optimization step for the policy

        #   1. do the step for each env
        for i_env, env in enumerate(list_of_envs):
            # print("Cur episode:", i_episode, "steps done:", steps_done,
            #         "exploration factor:", eps_end + (eps_start - eps_end) * \
            #         math.exp(-1. * steps_done / eps_decay))
        
            # last_screen = env.current_grid_map
            # ===========update step info begin========================
            current_screen = get_screen(env)
            # state
            state = current_screen # - last_screen
            # action chosen by pi_1~pi_i
            action = select_action(state, policy, models[i_env], num_actions,
                                    eps_start, eps_end, eps_decay,
                                    episodes_done[i_env], alpha, beta, device)
            # global_steps
            steps_done[i_env] += 1
            # local steps
            current_time[i_env] += 1
            # reward
            _, reward, done, _ = env.step(action[0, 0])
            reward = Tensor([reward])

            # next state
            last_screen = current_screen
            current_screen = get_screen(env)
            if not done:
                next_state = current_screen # - last_screen
            else:
                next_state = None

            # add to buffer
            time = Tensor([current_time[i_env]])
            memories[i_env].push(state, action, next_state, reward, time)

            #   2. do one optimization step for each env using "soft-q-learning".
            # Perform one step of the optimization (on the target network)
            optimize_model(policy, models[i_env], optimizers[i_env],
                            memories[i_env], batch_size, alpha, beta, gamma, device)
            # ===========update step info end ========================


            # ===========update episode info begin ====================
            if done:
                print("ENV:", i_env, "iter:", episodes_done[i_env],
                    "\treward:", env.episode_total_reward,
                    "\tit:", current_time[i_env], "\texp_factor:", eps_end +
                    (eps_start - eps_end) * math.exp(-1. * episodes_done[i_env] / eps_decay))
                # reset env
                env.reset()
                # episode steps
                episodes_done[i_env] += 1
                # append each episode local timesteps list for every env
                episode_durations[i_env].append(current_time[i_env])
                # reset local timesteps
                current_time[i_env] = 0
                # append total episode_reward to list
                episode_rewards[i_env].append(env.episode_total_reward)
                if is_plot:
                    plot_rewards(episode_rewards, i_env)
            # ===========update episode info end ====================

        #   3. do one optimization step for the policy
        # after all envs has performed one step, optimize policy
        optimize_policy(policy, policy_optimizer, memories, batch_size,
                    num_envs, gamma, device)

    print('Complete')
    env.render(close=True)
    env.close()
    if is_plot:
        plt.ioff()
        plt.show()

    ## Store Results

    np.save(file_name + '-distral-2col-rewards', episode_rewards)
    np.save(file_name + '-distral-2col-durations', episode_durations)

    return models, policy, episode_rewards, episode_durations
Example #13
import torch
import gym
from utils import get_screen

# Import environment
env = gym.make('Acrobot-v1')
# Define model and load pre-trained weights
trained_Q_network = torch.load('trained_DQN_model',
                               map_location=lambda storage, loc: storage)

# Reset env
_ = env.reset()
current_screen = get_screen(env)
state = current_screen
# Present trained behaviour over episodes
num_test_episodes = 30

episodes_passed = 0
acc_episodic_reward = 0.0

while episodes_passed < num_test_episodes:
    # Choose action greedily
    action = trained_Q_network.select_action(state)
    # Act on env
    _, reward, done, _ = env.step(action)
    last_screen = current_screen
    current_screen = get_screen(env)
    next_state = current_screen - last_screen
    # Add to accumulative reward
    acc_episodic_reward += reward
    # When episode is done - reset and print
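    # --- hypothetical continuation (not in the source), following the comment above:
    if done:
        episodes_passed += 1
        print('Episode {}: reward = {}'.format(episodes_passed, acc_episodic_reward))
        acc_episodic_reward = 0.0
        _ = env.reset()
        current_screen = get_screen(env)
        state = current_screen
    else:
        state = next_state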
Example #14
def trainSQL0(file_name="SQL0", env=GridworldEnv(1), batch_size=128,
            gamma=0.999, beta=5, eps_start=0.9, eps_end=0.05, eps_decay=1000,
            is_plot=False, num_episodes=500, max_num_steps_per_episode=1000,
            learning_rate=0.001, memory_replay_size=10000):
    """
    Soft Q-learning training routine when the observation vector is the input.
    Returns rewards and durations logs.
    Plots the environment screen.
    """
    if is_plot:
        env.reset()
        plt.ion()
        plt.figure()
        plt.imshow(get_screen(env).cpu().squeeze(0).squeeze(0).numpy(),
                   interpolation='none')
        plt.draw()
        plt.pause(0.00001)

    num_actions = env.action_space.n
    input_size = env.observation_space.shape[0]
    model = DQN(input_size, num_actions)
    optimizer = optim.Adam(model.parameters(), lr=learning_rate)
    # optimizer = optim.RMSprop(model.parameters(), )

    use_cuda = torch.cuda.is_available()
    if use_cuda:
        model.cuda()

    memory = ReplayMemory(memory_replay_size)

    episode_durations = []
    mean_durations = []
    episode_rewards = []
    mean_rewards = []

    steps_done, t = 0, 0
    # plt.ion()
    for i_episode in range(num_episodes):
        if i_episode % 20 == 0:
            clear_output()
        if i_episode != 0:
            print("Cur episode:", i_episode, "steps done:", episode_durations[-1],
                    "exploration factor:", eps_end + (eps_start - eps_end) * \
                    math.exp(-1. * steps_done / eps_decay), "reward:", env.episode_total_reward)
        # Initialize the environment and state
        state = torch.from_numpy( env.reset() ).type(torch.FloatTensor).view(-1,input_size)

        for t in count():
            # Select and perform an action
            action = select_action(state, model, num_actions,
                                    eps_start, eps_end, eps_decay, steps_done)
            next_state_tmp, reward, done, _ = env.step(action[0, 0])
            reward = Tensor([reward])

            # Observe new state
            next_state = torch.from_numpy( next_state_tmp ).type(torch.FloatTensor).view(-1,input_size)

            if done:
                next_state = None

            # Store the transition in memory
            memory.push(state, action, next_state, reward)

            # Move to the next state
            state = next_state
            # plot_state(state)
            # env.render()

            # Perform one step of the optimization (on the target network)
            optimize_model(model, optimizer, memory, batch_size, gamma, beta)  #### Difference w.r.t DQN
            if done or t + 1 >= max_num_steps_per_episode:
                episode_durations.append(t + 1)
                episode_rewards.append(env.episode_total_reward)  ##### Modify for OpenAI envs such as CartPole
                if is_plot:
                    plot_durations(episode_durations, mean_durations)
                    plot_rewards(episode_rewards, mean_rewards)
                steps_done += 1
                break

    print('Complete')
    env.render(close=True)
    env.close()
    if is_plot:
        plt.ioff()
        plt.show()

    ## Store Results

    np.save(file_name + '-sql0-rewards', episode_rewards)
    np.save(file_name + '-sql0-durations', episode_durations)

    return model, episode_rewards, episode_durations
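
The soft Q-learning flavour enters only through the beta parameter handed to optimize_model; the soft state value it implies is V(s) = (1/beta) * log sum_a exp(beta * Q(s, a)). A minimal standalone sketch of that backup term (new code, for illustration only):

import torch

def soft_state_value(q_values, beta):
    # V(s) = 1/beta * log sum_a exp(beta * Q(s, a)); logsumexp keeps it numerically stable
    return torch.logsumexp(beta * q_values, dim=1) / beta

# example: soft value of one row of Q-values with beta = 5
print(soft_state_value(torch.tensor([[1.0, 2.0, 0.5]]), beta=5))
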