# Example no. 1
def train(params: dict) -> None:
    """Train a population of agents with an ES-style loop on a gym environment.

    Builds the environment named by ``params['env_name']``, constructs a
    ``Learner`` master object holding the agent population, then runs up to
    ``params['max_iter']`` iterations of: gradient estimation
    (``population_update`` / ``individual_update``), an Adam step, per-agent
    evaluation rollouts, behavioral-embedding bookkeeping, and optional
    Bayesian-bandit adaptation of the novelty weight ``params['w_nov']``.
    Each iteration appends statistics to
    ``data/<dir>/results/Seed<seed>.csv`` and saves best-so-far weights per
    agent under ``data/<dir>/weights/``.

    Args:
        params: configuration dict. Keys read here include 'env_name',
            'steps', 'sensings', 'num_agents', 'num_evals', 'max_iter',
            'learning_rate', 'w_nov', 'flush', 'dir', 'seed'. NOTE: this
            function mutates ``params`` in place, writing 'ob_dim',
            'ac_dim', 'num_sensings' and 'n_iter'.

    Returns:
        None. All results are emitted via files and mutations of ``params``.
    """
    
    env = gym.make(params['env_name'])
    # Record env dimensions in params (side effect visible to callers).
    params['ob_dim'] = env.observation_space.shape[0]
    params['ac_dim'] = env.action_space.shape[0]

    master = Learner(params)
        
    # Counters: episodes, iterations, cumulative env timesteps.
    n_eps = 0
    n_iter = 0
    ts_cumulative = 0
    # Per-iteration logs, later written out as CSV columns.
    ts, rollouts, rewards, max_rwds, dists, min_dists, agents, lambdas = [0], [0], [], [], [], [], [], []
    params['num_sensings'] = params['sensings']

    master.agent=0
    # get initial states so you can get behavioral embeddings
    # One rollout per agent with trajectory data (incl_data=True); each
    # population entry is (reward-like value, trajectory) — s[0] below pulls
    # the state from each trajectory step. TODO confirm rollout return shape.
    population = [master.agents[x].rollout(env, params['steps'], incl_data=True) for x in master.agents.keys()]
    all_states =  [s[0] for x in population for s in x[1]]
    master.selected = select_states(master, params, all_states)
    master.update_embeddings(params, population)
    master.calc_pairwise_dists(params)
    master.select_agent()
    master.get_agent()
    #initial reward
    reward = master.policy.rollout(env, params['steps'], incl_data=False)

    # Seed the logs with the pre-training baseline.
    rewards.append(reward)
    agents.append(master.agent)
    dists.append(master.dists)
    max_reward=reward
    max_rwds.append(max_reward)
    min_dists.append(master.min_dist)
    
    # Negative w_nov is a sentinel: adapt the novelty weight online with a
    # Bayesian bandit instead of keeping it fixed; start from 0.
    if params['w_nov'] < 0:
        bb = BayesianBandits()
        params['w_nov'] = 0
    lambdas.append(params['w_nov'])
        
    while n_iter < params['max_iter']:

        print('Iter: %s, Eps: %s, Mean: %s, Max: %s, Best: %s, MeanD: %s, MinD: %s, Lam: %s' %(n_iter, n_eps, np.round(reward,4), np.round(max_reward,4), master.agent, np.round(master.dists,4), np.round(master.min_dist,4), params['w_nov']))
       
        # After the first iteration, re-pick which agent to train when there
        # is more than one (initial selection was done above the loop).
        if (n_iter>0) & (params['num_agents'] > 1):
            master.calc_pairwise_dists(params)
            master.select_agent()
            master.get_agent()

        ## Main Function Call
        params['n_iter'] = n_iter
        if params['num_agents'] > 1:
            gradient, timesteps = population_update(master, params)
            # 2 rollouts per sensing per agent — presumably antithetic
            # perturbation pairs; confirm against the update implementations.
            n_eps += 2*params['num_sensings'] * params['num_agents']
        else:
            gradient, timesteps = individual_update(master, params)
            n_eps += 2*params['num_sensings']
            
        ts_cumulative += timesteps
        # Keep the state archive bounded by subsampling down to num_sensings.
        all_states += master.states
        if params['num_sensings'] < len(all_states):
            all_states = sample(all_states, params['num_sensings'])

        # Normalize the gradient by (norm / N) with a small eps.
        # NOTE(review): eps is added outside the norm/N ratio, i.e.
        # denominator is (norm/N + 1e-8), not norm/(N + 1e-8) — looks
        # intentional (ES-style scaling) but worth confirming.
        gradient /= (np.linalg.norm(gradient) / master.policy.N + 1e-8)
                
        n_iter += 1
        # Adam returns the full concatenated update vector across agents;
        # sliced per-agent below. TODO confirm Adam's signature/state handling.
        update = Adam(gradient, master, params['learning_rate'], n_iter)

        # Apply each agent's slice of the update and evaluate it.
        rwds, trajectories = [], []
        if params['num_evals'] > 0:
            # Shared eval seeds so all agents are scored on the same episodes.
            seeds = [int(np.random.uniform()*10000) for _ in range(params['num_evals'])]
        for i in range(params['num_agents']):
            master.agent = i
            master.get_agent()
            # Agent i owns parameter slice [i*N, (i+1)*N) of the flat update.
            master.policy.update(master.policy.params + update[(i*master.policy.N):((i+1)*master.policy.N)])
            if params['num_evals'] > 0:
                # Average reward over the fixed-seed evaluation rollouts.
                reward = 0
                for j in range(params['num_evals']):
                    r, traj = master.policy.rollout(env, params['steps'], incl_data=True, seed =seeds[j])
                    reward += r
                reward /= params['num_evals']
            else:
                reward, traj = master.policy.rollout(env, params['steps'], incl_data=True)
            rwds.append(reward)
            trajectories.append(traj)
            # Persist weights whenever an agent beats its own best.
            if reward > master.best[i]:
                master.best[i] = reward
                np.save('data/%s/weights/Seed%s_Agent%s' %(params['dir'], params['seed'], i), master.policy.params)
            master.reward[i].append(reward)
            master.update_agent()
        # Summary stats for this iteration; switch focus to the best agent.
        reward = np.mean(rwds)
        max_reward = max(rwds)
        traj = trajectories[np.argmax(rwds)]
        master.agent = np.argmax(rwds)

        
        # Update selected states
        master.selected = select_states(master, params, all_states)
        master.update_embeddings(params)
        
        # Embed the best agent's trajectory against the selected states.
        master.embedding = embed(params, traj, master.policy, master.selected)
        rewards.append(reward)
        max_rwds.append(max_reward)
        master.reward[master.agent].append(reward)
        if reward > master.best[master.agent]:
            master.best[master.agent] = reward
            np.save('data/%s/weights/Seed%s_Agent%s' %(params['dir'], params['seed'], master.agent), master.policy.params)
        
        ## update the bandits
        # 'bb' only exists when w_nov started negative (see above); the
        # NameError guard deliberately makes this a no-op otherwise.
        try:
            bb.update_dists(reward)
            params['w_nov'] = bb.sample()
        except NameError:
            pass
        
        lambdas.append(params['w_nov'])
        rollouts.append(n_eps)
        agents.append(master.agent)
        dists.append(master.dists)
        min_dists.append(master.min_dist)
        ts.append(ts_cumulative)
        master.update_agent()

        # Periodically restart ray workers to release resources —
        # presumably to avoid memory growth in long runs; confirm.
        if n_iter % params['flush'] == 0:
            reset_ray(master, params)
            master.init_workers(params)
        
        # Rewrite the full results CSV every iteration (crash-safe logging).
        out = pd.DataFrame({'Rollouts': rollouts, 'Reward': rewards, 'Max': max_rwds, 'Timesteps': ts, 'Dists': dists, 'Min_Dist':min_dists, 'Agent': agents, 'Lambda': lambdas})
        out.to_csv('data/%s/results/Seed%s.csv' %(params['dir'], params['seed']), index=False)