Example #1
def load_database(n_parts, DB_ID, buffer_size, level):
    # Reassemble a replay buffer that was saved to disk in several pickled parts.
    database = ExperienceBuffer(buffer_size, level)
    for i in range(n_parts):
        path = LOAD_PATH + DB_ID + '/SAC_training_level1_database_part_' + str(i) + '.p'
        with open(path, 'rb') as f:
            database.buffer += pickle.load(f)
    return database
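For context, the snippets in this collection assume an ExperienceBuffer class with roughly the interface sketched below (a buffer attribute plus append/extend/sample and len support). This is only an illustrative sketch of what the examples rely on, not the actual implementation used in these projects:

import collections
import random

class ExperienceBuffer:
    # Illustrative replay-buffer sketch; the real class behind these examples
    # may handle the `level` argument and sampling differently.
    def __init__(self, capacity, level=1):
        self.buffer = collections.deque(maxlen=capacity)
        self.level = level

    def __len__(self):
        return len(self.buffer)

    def append(self, experience):
        self.buffer.append(experience)

    def extend(self, experiences):
        self.buffer.extend(experiences)

    def sample(self, batch_size):
        # convert to list so random.sample accepts the deque
        return random.sample(list(self.buffer), batch_size)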
Example #2
def train(params, log_dir, local_log, random_seed, trial, agent_id):

    # define device on which to run
    device = torch.device(params["DEVICE"])

    # create env and add Malmo-specific configuration
    env = make_env(params["DEFAULT_ENV_NAME"])
    env.configure(client_pool=[('127.0.0.1', 10000), ('127.0.0.1', 10001)])
    env.configure(allowDiscreteMovement=["move", "turn"]) # , log_level="INFO")
    env.configure(videoResolution=[84,84])
    env.configure(stack_frames=4)
    env = wrap_env_malmo(env)

    if random_seed:
        env.seed(random_seed)

    print("Observation Space: ", env.observation_space)
    print("Action Space: ", env.action_space)

    # initialize agent
    buffer = ExperienceBuffer(params["REPLAY_SIZE"])
    net = DQN(env.observation_space.shape, env.action_space.n, params["DEVICE"]).to(device)
    tgt_net = DQN(env.observation_space.shape, env.action_space.n, params["DEVICE"]).to(device)
    epsilon = params["EPSILON_START"]
    gamma = params["GAMMA"]
    tau = params["SOFT_UPDATE_TAU"]
    agent = Agent('agent' + str(agent_id), env, buffer, net, tgt_net, gamma, epsilon, tau,
        trial, log_dir, params)

    # other variables
    agent.optimizer = optim.Adam(agent.net.parameters(), lr=params["LEARNING_RATE"])
    agent.print_color = COLORS[agent_id]

    local_log[agent.alias+"-"+str(trial)] = {"rewards": [],"steps": []}

    # fill buffer with initial size - don't count these episodes
    agent.fill_buffer()

    # training loop
    ep_count = 0
    while not agent.completed:

        ep_count += 1

        episode_over = False
        episode_start = time.time()        
        while not episode_over:

            # play step
            frame_start = time.time()
            episode_over, done_reward = agent.play_step(device=device)
            agent.frame_idx += 1

            #### Following methods run on an episode basis
            if done_reward is not None:

                # calculate episode speed
                agent.ep_speed = time.time() - episode_start
                # reset trackers
                episode_start = time.time()

                # save to local log as well
                local_log[agent.alias+"-"+str(trial)]["rewards"].append(agent.total_rewards[-1])
                local_log[agent.alias+"-"+str(trial)]["steps"].append(agent.total_steps[-1])

                if params["INDEPENDENT_EVALUATION"]:
                    offline_evaluation(params, agent)
                else:
                    online_evaluation(params, agent)

                ## check if problem has been solved
                if agent.mean_reward is not None:
                    if agent.mean_reward > params["MEAN_REWARD_BOUND"]:
                        print(colored("%s solved in %d episodes!" % (agent.alias, len(agent.total_rewards)), agent.print_color))
                        agent.completed = True

                # if no sign of converging, also break
                if len(agent.total_rewards) >= params["MAX_GAMES_PLAYED"]:
                    agent.completed = True

            #### Following methods run on a frame basis
            # decay epsilon linearly on frames
            agent.epsilon = max(params["EPSILON_FINAL"], params["EPSILON_START"] - \
                (agent.frame_idx-params["REPLAY_START_SIZE"]) / params["EPSILON_DECAY_LAST_FRAME"])
            
            # update at every frame using soft updates
            if params["SOFT"]:
                agent.soft_update_target_network()
            # or hard updates
            else:
                if agent.frame_idx % params["SYNC_TARGET_FRAMES"] == 0:
                    agent.hard_update_target_network()
            
            ## learn
            loss_t = agent.learn(device)

            # record
            agent.frame_speed = 1000 / (time.time() - frame_start)
            if params["DEBUG"]: 
                agent.record_frame(loss_t.detach().item()) # detach required?


    # delete buffer to force gc later, it occupies too much memory
    del buffer
    # closes tensorboard writer
    agent.writer.close()
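For reference, train() reads all hyperparameters through the params dictionary. The sketch below lists the keys the function above actually accesses, with placeholder values chosen purely for illustration (they are assumptions, not the values used in the original experiments):

# Placeholder values only; the real experiments define these elsewhere.
params = {
    "DEVICE": "cpu",
    "DEFAULT_ENV_NAME": "MinecraftBasic-v0",   # hypothetical Malmo mission id
    "REPLAY_SIZE": 10_000,
    "REPLAY_START_SIZE": 1_000,
    "EPSILON_START": 1.0,
    "EPSILON_FINAL": 0.02,
    "EPSILON_DECAY_LAST_FRAME": 100_000,
    "GAMMA": 0.99,
    "SOFT": True,
    "SOFT_UPDATE_TAU": 0.005,
    "SYNC_TARGET_FRAMES": 1_000,
    "LEARNING_RATE": 1e-4,
    "MEAN_REWARD_BOUND": 100.0,
    "MAX_GAMES_PLAYED": 5_000,
    "INDEPENDENT_EVALUATION": False,
    "DEBUG": False,
}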
Example #3
    
    
    agent = create_third_level_agent(concept_path, args.load_concept_id, args.n_concepts, noisy=noisy, 
        n_heads=n_heads, init_log_alpha=args.init_log_alpha, latent_dim=args.vision_latent_dim, 
        parallel=args.parallel_q_nets, lr=args.lr, lr_alpha=args.lr_alpha, lr_actor=args.lr_actor, min_entropy_factor=args.entropy_factor, 
        lr_c=args.lr_c, lr_Alpha=args.lr_c_Alpha, entropy_update_rate=args.entropy_update_rate, init_Epsilon=args.init_epsilon_MC,
        delta_Epsilon=args.delta_epsilon_MC)
    
    if args.load_id is not None:
        if args.load_best:
            agent.load(MODEL_PATH + env_name + '/best_', args.load_id)
        else:
            agent.load(MODEL_PATH + env_name + '/last_', args.load_id)
    agents = collections.deque(maxlen=args.n_agents)
    agents.append(agent)
    
    os.makedirs(MODEL_PATH + env_name, exist_ok=True)

    database = ExperienceBuffer(buffer_size, level=2)

    trainer = Trainer(optimizer_kwargs=optimizer_kwargs)
    returns = trainer.loop(env, agents, database, n_episodes=n_episodes, render=args.render, 
                            max_episode_steps=n_steps_in_second_level_episode, 
                            store_video=store_video, wandb_project=wandb_project, 
                            MODEL_PATH=MODEL_PATH, train=(not args.eval),
                            initialization=initialization, init_buffer_size=init_buffer_size,
                            save_step_each=save_step_each, train_each=args.train_each, 
                            n_step_td=n_step_td, train_n_MC=args.train_n_mc, rest_n_MC=args.rest_n_mc,
                            eval_MC=args.eval_MC)
    G = returns.mean()    
    print("Mean episode return: {:.2f}".format(G)) 
    parser.add_argument(
        "--vision_latent_dim",
        default=DEFAULT_VISION_LATENT_DIM,
        help="Dimensionality of feature vector added to inner state, default="
        + str(DEFAULT_VISION_LATENT_DIM))
    args = parser.parse_args()

    render_kwargs = {
        'pixels': {
            'width': 168,
            'height': 84,
            'camera_name': 'front_camera'
        }
    }

    database = ExperienceBuffer(args.buffer_size, level=3)
    trainer = Trainer()

    env_model_pairs = load_env_model_pairs(args.file)
    n_envs = len(env_model_pairs)
    n_episodes = (args.buffer_size * args.save_step_each) // args.n_steps
    store_video = False

    for env_number, (env_name, model_id) in enumerate(env_model_pairs.items()):
        task_database = ExperienceBuffer(args.buffer_size // n_envs, level=2)

        env = AntPixelWrapper(
            PixelObservationWrapper(gym.make(env_name).unwrapped,
                                    pixels_only=False,
                                    render_kwargs=render_kwargs.copy()))
Example #5
def train(params, log_dir, local_log, random_seed, trial):

    # define device on which to run
    device = torch.device(params["DEVICE"])

    ## Marlo specifics
    # get join tokens
    env = init_environment(params["DEFAULT_ENV_NAME"])

    agents = []
    for aid in range(params["NUM_AGENTS"]):

        # initialize buffer
        if params["SHARING"] and params["PRIORITIZED_SHARING"]:
            buffer = ExperienceBufferGridImage(params["REPLAY_SIZE"])
        else:
            buffer = ExperienceBuffer(params["REPLAY_SIZE"])

        # initialize agent        
        net = DQN(env.observation_space.shape, env.action_space.n, params["DEVICE"]).to(device)
        tgt_net = DQN(env.observation_space.shape, env.action_space.n, params["DEVICE"]).to(device)
        epsilon = params["EPSILON_START"]
        gamma = params["GAMMA"]
        tau = params["SOFT_UPDATE_TAU"]
        agent = Agent('agent' + str(aid), env, buffer, net, tgt_net, gamma, epsilon, tau,
            trial, log_dir, params)

        # other variables
        agent.optimizer = optim.Adam(agent.net.parameters(), lr=params["LEARNING_RATE"])
        agent.print_color = COLORS[aid]

        local_log[agent.alias+"-"+str(trial)] = {"rewards": [],"steps": []}

        # fill buffer with initial size - don't count these episodes
        agent.fill_buffer()

        agents.append(agent)

    # training loop
    ep_count = 0
    while sum(map(lambda agent:agent.completed, agents)) != len(agents):

        # overall count of episodes
        ep_count += 1

        # sharing
        if params["SHARING"] and ep_count % params["SHARING_INTERVAL"] == 0 and ep_count > 0:
            if params["PRIORITIZED_SHARING"]:
                share(agents, params["BATCH_SIZE_TRANSFER"], params["REPLAY_START_SIZE"], params["SHARING_THRESHOLD"])
            else:
                share_no_mask(agents, params["BATCH_SIZE_TRANSFER"], params["REPLAY_START_SIZE"])

        # each agent does one episode
        for agent in agents:

            # skip agents that have already completed the task
            if not agent.completed:

                episode_over = False
                episode_start = time.time()        
                while not episode_over:

                    # play step
                    frame_start = time.time()
                    episode_over, done_reward = agent.play_step(device=device)
                    agent.frame_idx += 1

                    #### Following methods run on an episode basis
                    if done_reward is not None:

                        # calculate episode speed
                        agent.ep_speed = 1 / (time.time() - episode_start)
                        # reset trackers
                        episode_start = time.time()

                        # save to local log as well
                        local_log[agent.alias+"-"+str(trial)]["rewards"].append(agent.total_rewards[-1])
                        local_log[agent.alias+"-"+str(trial)]["steps"].append(agent.total_steps[-1])

                        if params["INDEPENDENT_EVALUATION"]:
                            offline_evaluation(params, agent, log_dir)
                        else:
                            online_evaluation(params, agent, log_dir)

                        ## check if problem has been solved
                        # need a minimum number of episodes to evaluate
                        if len(agent.total_rewards) >= params["NUMBER_EPISODES_MEAN"]:                                
                            # and mean reward has to go above boundary
                            if agent.mean_reward >= params["MEAN_REWARD_BOUND"]:
                                print(colored("%s solved in %d episodes!" % (agent.alias, len(agent.total_rewards)), agent.print_color))
                                agent.completed = True

                        # if no sign of converging, also break
                        if len(agent.total_rewards) >= params["MAX_GAMES_PLAYED"]:
                            agent.completed = True

                    #### Following methods run on a frame basis
                    # decay epsilon linearly on frames
                    agent.epsilon = max(params["EPSILON_FINAL"], params["EPSILON_START"] - \
                        agent.frame_idx / params["EPSILON_DECAY_LAST_FRAME"])
                    
                    # update at every frame using soft updates
                    if params["SOFT"]:
                        agent.soft_update_target_network()
                    # or hard updates
                    else:
                        if agent.frame_idx % params["SYNC_TARGET_FRAMES"] == 0:
                            agent.hard_update_target_network()
                    
                    ## learn
                    loss_t = agent.learn(device)

                    # record
                    agent.frame_speed = 1 / (time.time() - frame_start)
                    if params["DEBUG"]: 
                        agent.record_frame(loss_t.detach().item()) # detach required?


    # delete buffers to force gc later, they occupy too much memory
    del buffer
    for agent in agents:
        del agent.exp_buffer
        # close each agent's tensorboard writer
        agent.writer.close()
Example #6
def DQN_experiment(params, log_dir, random_seed=None):

    # define device on which to run
    device = torch.device(params["DEVICE"])

    # fix replay start size to be equal to replay size
    params["REPLAY_START_SIZE"] = params["REPLAY_SIZE"]

    ## initialize global variables
    # initialize local log trackers 
    log_episodes_count = []
    log_ma_steps = []
    log_md_steps = []
    log_ma_rewards = []
    log_md_rewards = []

    colors=['green','red','blue','yellow','cyan','magenta','grey','white']

    # run several trials and average the results to compensate for stochasticity
    for trial in range(params["NUM_TRIALS"]):

        # initialize environment
        agents = []

        # need to be one env per agent
        env = make_env(params["DEFAULT_ENV_NAME"])
        if random_seed:
            env.seed(random_seed)

        # initialize agents
        for idx in range(params["NUM_AGENTS"]):

            # initialize agent
            buffer = ExperienceBuffer(params["REPLAY_SIZE"], env)
            net = DQN(env.observation_space.shape[0], env.action_space.n, params["DEVICE"]).to(device)
            tgt_net = DQN(env.observation_space.shape[0], env.action_space.n, params["DEVICE"]).to(device)
            epsilon = params["EPSILON_START"]
            gamma = params["GAMMA"]
            tau = params["SOFT_UPDATE_TAU"]
            agent = Agent('agent' + str(idx+1), env, buffer, net, tgt_net, gamma, epsilon, tau, trial, log_dir)

            # other variables
            agent.optimizer = optim.Adam(agent.net.parameters(), lr=params["LEARNING_RATE"])
            agent.print_color = colors[idx]

            agents.append(agent)    


        ######### training loop
        ################################

        ts = time.time() # track start time


        ######### 1. Fill the replay buffers
        ################################

        # all agents fill their buffers before training starts
        for agent in agents:
            while True:
            
                # add frame count
                agent.frame_idx += 1

                # play step
                episode_over, done_reward = agent.play_step(device=device)
                if params["DEBUG"]: agent.record()

                # stop filling once the minimum buffer size has been reached; no learning yet
                if len(agent.exp_buffer) >= params["REPLAY_START_SIZE"]:
                    agent.reset()
                    break    


        ######### 2. Training loop: agents alternate
        ################################

        episode_start = time.time()        
        ep_count = 0
        # while all agents have not completed:    
        while sum(map(lambda agent:agent.completed, agents)) != len(agents):

            ep_count += 1

            # agents alternate
            for agent in agents:

                ## Before the agents act, do one round of experience sharing,
                # given the sharing interval has been reached and it is not the first episode
                if params["SHARING"] and ep_count % params["SHARING_INTERVAL"] == 0 and ep_count > 0:

                    # agent 1 requests
                    student, teacher = agents[0], agents[1]
                    transfer_mask = student.request_share(threshold=0)
                    transfer_batch = teacher.exp_buffer.sample_with_mask(student.steps[-1], transfer_mask)
                    student.exp_buffer.extend(transfer_batch)

                    # agent 2 requests
                    student, teacher = agents[1], agents[0]
                    transfer_mask = student.request_share(threshold=0)
                    transfer_batch = teacher.exp_buffer.sample_with_mask(student.steps[-1], transfer_mask)
                    student.exp_buffer.extend(transfer_batch)


                # check whether the agent has already completed the task
                # if it has, move on to the next agent
                if not agent.completed:

                    # play until episode is over
                    episode_over = False
                    while not episode_over:

                        # add frame count
                        agent.frame_idx += 1

                        # play step
                        episode_over, done_reward = agent.play_step(device=device)

                        if done_reward is not None:

                            # calculate speed
                            agent.speed = (agent.frame_idx - agent.ts_frame) / (time.time() - ts)
                            agent.ts_frame = agent.frame_idx
                            ts = time.time()

                            # get time between episodes

                            ## verify completion and report metrics
                            if params["INDEPENDENT_EVALUATION"]:

                                if len(agent.total_rewards) % params["TRACKING_INTERVAL"] == 0:
                                    agent.test_rewards = []
                                    evaluation_start = time.time()
                                    for _ in range(100):
                                        done_reward = False
                                        while not done_reward:
                                            _, done_reward = agent.play_step(device=device, test=True)
                                        agent.test_rewards.append(done_reward)
                                    evaluation_time = time.time() - evaluation_start

                                    # only report after one episode ends
                                    agent.mean_reward = np.mean(agent.test_rewards)
                                    agent.std_reward = np.std(agent.test_rewards)

                                    # calculate elapsed time
                                    episode_end = time.time()
                                    episode_speed = params["TRACKING_INTERVAL"] / (episode_end - episode_start) 
                                    episode_start = time.time()

                                    # report
                                    print(colored("%s, %d: done %d episodes, mean reward %.2f, std reward %.2f, eps %.2f, speed %d f/s, ep_speed %.2f e/s, eval_time %.2f s" % (
                                        agent.alias, agent.frame_idx, len(agent.total_rewards), agent.mean_reward, agent.std_reward, agent.epsilon, agent.speed, episode_speed, evaluation_time
                                    ), agent.print_color))
                                    
                                    ## check if reward has improved from last iteration
                                    if agent.mean_reward is not None:
                                        if agent.mean_reward > params["MEAN_REWARD_BOUND"]:
                                            print(colored("%s solved in %d episodes!" % (agent.alias, len(agent.total_rewards)), agent.print_color))
                                            # save final version
                                            # torch.save(agent.net.state_dict(), "weights/" + params["DEFAULT_ENV_NAME"] + "-" + agent.alias + "-best.dat")
                                            # mark as completed
                                            agent.completed = True
                                            # save local log
                                            log_episodes_count.append(len(agent.total_rewards))
                                            log_ma_rewards.append(agent.mean_reward)
                                            log_md_rewards.append(agent.std_reward)

                            ## approach to track evaluation using moving averages:
                            else:
                                # only report after one episode ends
                                agent.mean_reward = np.mean(agent.total_rewards[-params["NUMBER_EPISODES_MEAN"]:])
                                agent.std_reward = np.std(agent.total_rewards[-params["NUMBER_EPISODES_MEAN"]:])

                                # calculate elapsed time
                                episode_end = time.time()
                                episode_speed = 1 / (episode_end - episode_start)
                                episode_start = time.time()

                                # report
                                if len(agent.total_rewards) % params["TRACKING_INTERVAL"] == 0:
                                    print(colored("%s, %d: done %d episodes, mean reward %.2f, std reward %.2f, eps %.2f, speed %d f/s, ep_speed %.2f e/s" % (
                                        agent.alias, agent.frame_idx, len(agent.total_rewards), agent.mean_reward, agent.std_reward, agent.epsilon, agent.speed, episode_speed
                                    ), agent.print_color))
                                
                                ## check if reward has improved from last iteration
                                if agent.mean_reward is not None:
                                    if agent.mean_reward > params["MEAN_REWARD_BOUND"]:
                                        print(colored("%s solved in %d episodes!" % (agent.alias, len(agent.total_rewards)), agent.print_color))
                                        # save final version
                                        # torch.save(agent.net.state_dict(), "weights/" + params["DEFAULT_ENV_NAME"] + "-" + agent.alias + "-best.dat")
                                        # mark as completed
                                        agent.completed = True
                                        # save local log
                                        log_episodes_count.append(len(agent.total_rewards))
                                        log_ma_rewards.append(np.mean(agent.total_rewards[-params["REPORTING_INTERVAL"]:]))
                                        log_md_rewards.append(np.std(agent.total_rewards[-params["REPORTING_INTERVAL"]:]))
                                        log_ma_steps.append(np.mean(agent.total_steps[-params["REPORTING_INTERVAL"]:]))
                                        log_md_steps.append(np.std(agent.total_steps[-params["REPORTING_INTERVAL"]:]))

                        # if no sign of converging, also break
                        # but don't store the result
                        if len(agent.total_rewards) > params["MAX_GAMES_PLAYED"]:
                            agent.completed = True

                        # decay epsilon after the first episodes that fill the buffer
                        # decay epsilon linearly on frames
                        agent.epsilon = max(params["EPSILON_FINAL"], params["EPSILON_START"] - (agent.frame_idx-params["REPLAY_START_SIZE"]) / params["EPSILON_DECAY_LAST_FRAME"])
                            
                        # update at every frame using soft updates
                        if params["SOFT"]:
                            agent.soft_update_target_network()
                        else:                        
                            if agent.frame_idx % params["SYNC_TARGET_FRAMES"] == 0:
                                agent.tgt_net.load_state_dict(agent.net.state_dict())
                            
                        ## learn
                        # zero gradients
                        agent.optimizer.zero_grad()
                        # sample from buffer
                        batch = agent.exp_buffer.sample(params["BATCH_SIZE"])
                        # calculate loss
                        # loss calculation is kept on the agent as a static method rather than as a free function
                        loss_t = agent.calc_loss(batch, device=device)
                        # calculate gradients
                        loss_t.backward()
                        # gradient clipping
                        if params["GRADIENT_CLIPPING"]: nn.utils.clip_grad_norm_(net.parameters(), params["GRAD_L2_CLIP"])
                        # optimize
                        agent.optimizer.step()

                        # track agent parameters, including loss function
                        # detach loss before extracting value - not sure if needed, but better safe than sorry
                        if params["DEBUG"]: agent.record(loss_t.detach().item())


    for agent in agents:
        agent.writer.close()

    # return local log with results
    local_log = {
        "episodes_count": log_episodes_count,
        "ma_steps": log_ma_steps,
        "md_steps": log_md_steps,
        "ma_rewards": log_ma_rewards,
        "md_rewards": log_md_rewards
    }
    return local_log
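A minimal sketch of how the returned dictionary might be consumed after a run, assuming params and log_dir are already defined (the aggregation shown here is illustrative, not part of the original code):

local_log = DQN_experiment(params, log_dir, random_seed=42)
if local_log["episodes_count"]:
    print("Trials solved: %d" % len(local_log["episodes_count"]))
    print("Mean episodes to solve: %.1f" % np.mean(local_log["episodes_count"]))
    print("Mean reward at completion: %.2f +/- %.2f"
          % (np.mean(local_log["ma_rewards"]), np.mean(local_log["md_rewards"])))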
Example #7
    # Initialize Weights & Biases project
    if wandb_project:
        wandb.init(project=project_name)

        # Log hyperparameters in WandB project
        wandb.config.update(args)
        wandb.config.active_multitask = DEFAULT_ACTIVE_MULTITASK
        wandb.config.active_dc_torque = DAFAULT_DC_TORQUE

    env = gym.make(args.env_name)

    agent = generate_agent(env, args.load_id, args.load_best,
                           actor_critic_kwargs)

    database = ExperienceBuffer(args.buffer_size, level=1)

    trainer = Trainer(optimizer_kwargs=optimizer_kwargs)

    returns = trainer.loop(env,
                           agent,
                           database,
                           n_episodes=n_episodes,
                           render=args.render,
                           max_episode_steps=args.n_steps_in_episode,
                           store_video=store_video,
                           wandb_project=wandb_project,
                           MODEL_PATH=MODEL_PATH,
                           train=(not args.eval),
                           initialization=args.initialization,
                           init_buffer_size=args.init_buffer_size)