def train(params):
    """Run the training loop for the agent population described by ``params``.

    Builds the gym environment and a ``Learner``, takes one initial rollout per
    agent to seed the state buffer and embeddings, then iterates until
    ``params['max_iter']`` updates have been applied.  Each iteration:

    1. (population only, after the first iteration) recomputes pairwise
       distances and re-selects the active agent;
    2. obtains a gradient from ``population_update`` / ``individual_update``,
       rescales it and turns it into a parameter update via ``Adam``;
    3. applies each agent's slice of the update, evaluates the agent and
       saves its weights when it beats its previous best;
    4. refreshes the selected states / embeddings, optionally resamples the
       novelty weight from the bandit, and appends one row to every log;
    5. rewrites the results CSV.

    Args:
        params: Mutable configuration dict.  Keys read here: ``env_name``,
            ``steps``, ``sensings``, ``w_nov``, ``max_iter``, ``num_agents``,
            ``num_evals``, ``learning_rate``, ``flush``, ``dir``, ``seed``.
            Keys written here: ``ob_dim``, ``ac_dim``, ``num_sensings``,
            ``n_iter`` and ``w_nov``.  A negative ``w_nov`` on entry enables
            the ``BayesianBandits`` schedule and resets ``w_nov`` to 0.

    Returns:
        None.  Results are written to
        ``data/<dir>/weights/Seed<seed>_Agent<i>`` (via ``np.save``) and
        ``data/<dir>/results/Seed<seed>.csv``.
    """
    env = gym.make(params['env_name'])
    params['ob_dim'] = env.observation_space.shape[0]
    params['ac_dim'] = env.action_space.shape[0]

    master = Learner(params)

    n_eps = 0
    n_iter = 0
    ts_cumulative = 0
    # `ts` and `rollouts` start with a 0 entry; the remaining logs receive
    # their first entry from the initial rollout below, so every log has the
    # same length when the DataFrame is built.
    ts, rollouts = [0], [0]
    rewards, max_rwds, dists, min_dists, agents, lambdas = [], [], [], [], [], []
    params['num_sensings'] = params['sensings']

    master.agent = 0

    # Get initial states so the behavioral embeddings can be computed.
    population = [
        master.agents[x].rollout(env, params['steps'], incl_data=True)
        for x in master.agents.keys()
    ]
    all_states = [s[0] for x in population for s in x[1]]
    master.selected = select_states(master, params, all_states)
    master.update_embeddings(params, population)
    master.calc_pairwise_dists(params)
    master.select_agent()
    master.get_agent()

    # Initial reward of the selected agent (row 0 of the logs).
    reward = master.policy.rollout(env, params['steps'], incl_data=False)
    rewards.append(reward)
    agents.append(master.agent)
    dists.append(master.dists)
    max_reward = reward
    max_rwds.append(max_reward)
    min_dists.append(master.min_dist)

    # Explicit sentinel instead of relying on NameError later on: the bandit
    # only exists when the caller asked for an adaptive novelty weight.
    bandit = None
    if params['w_nov'] < 0:
        bandit = BayesianBandits()
        params['w_nov'] = 0
    lambdas.append(params['w_nov'])

    while n_iter < params['max_iter']:

        print('Iter: %s, Eps: %s, Mean: %s, Max: %s, Best: %s, MeanD: %s, MinD: %s, Lam: %s' %(n_iter, n_eps, np.round(reward,4), np.round(max_reward,4), master.agent, np.round(master.dists,4), np.round(master.min_dist,4), params['w_nov']))

        # The first iteration reuses the distances/selection computed above.
        if n_iter > 0 and params['num_agents'] > 1:
            master.calc_pairwise_dists(params)
            master.select_agent()
            master.get_agent()

        ## Main Function Call
        params['n_iter'] = n_iter
        if params['num_agents'] > 1:
            gradient, timesteps = population_update(master, params)
            n_eps += 2 * params['num_sensings'] * params['num_agents']
        else:
            gradient, timesteps = individual_update(master, params)
            n_eps += 2 * params['num_sensings']

        ts_cumulative += timesteps
        all_states += master.states
        # Cap the state buffer at `num_sensings` entries.
        if params['num_sensings'] < len(all_states):
            all_states = sample(all_states, params['num_sensings'])

        # Rescale the gradient; the epsilon guards against a zero norm.
        gradient /= (np.linalg.norm(gradient) / master.policy.N + 1e-8)

        n_iter += 1
        update = Adam(gradient, master, params['learning_rate'], n_iter)

        rwds, trajectories = [], []
        if params['num_evals'] > 0:
            # Same evaluation seeds for every agent within this iteration.
            seeds = [int(np.random.uniform()*10000) for _ in range(params['num_evals'])]
        for i in range(params['num_agents']):
            master.agent = i
            master.get_agent()
            # Agent i owns the i-th contiguous slice of length N of `update`.
            start = i * master.policy.N
            stop = (i + 1) * master.policy.N
            master.policy.update(master.policy.params + update[start:stop])

            if params['num_evals'] > 0:
                # Average over the seeded evaluations; `traj` ends up being
                # the trajectory of the last evaluation.
                reward = 0
                for j in range(params['num_evals']):
                    r, traj = master.policy.rollout(env, params['steps'], incl_data=True, seed=seeds[j])
                    reward += r
                reward /= params['num_evals']
            else:
                reward, traj = master.policy.rollout(env, params['steps'], incl_data=True)

            rwds.append(reward)
            trajectories.append(traj)
            if reward > master.best[i]:
                master.best[i] = reward
                np.save('data/%s/weights/Seed%s_Agent%s' %(params['dir'], params['seed'], i), master.policy.params)
            master.reward[i].append(reward)
            master.update_agent()

        reward = np.mean(rwds)
        max_reward = max(rwds)
        best_idx = np.argmax(rwds)
        traj = trajectories[best_idx]

        master.agent = best_idx
        # Update selected states
        master.selected = select_states(master, params, all_states)
        master.update_embeddings(params)
        master.embedding = embed(params, traj, master.policy, master.selected)
        rewards.append(reward)
        max_rwds.append(max_reward)
        # NOTE(review): `reward` is now the population mean, and no
        # get_agent() call follows the reassignment of master.agent above, so
        # master.policy is presumably still the last agent evaluated in the
        # loop -- confirm that the bookkeeping below (and the update_agent()
        # call further down) targets the intended agent.
        master.reward[master.agent].append(reward)
        if reward > master.best[master.agent]:
            master.best[master.agent] = reward
            np.save('data/%s/weights/Seed%s_Agent%s' %(params['dir'], params['seed'], master.agent), master.policy.params)

        ## update the bandits
        if bandit is not None:
            bandit.update_dists(reward)
            params['w_nov'] = bandit.sample()
        lambdas.append(params['w_nov'])

        rollouts.append(n_eps)
        agents.append(master.agent)
        dists.append(master.dists)
        min_dists.append(master.min_dist)
        ts.append(ts_cumulative)
        master.update_agent()

        # Every `flush` iterations, reset ray and re-create the workers.
        if n_iter % params['flush'] == 0:
            reset_ray(master, params)
            master.init_workers(params)

        # Rewrite the results CSV with everything logged so far.
        out = pd.DataFrame({'Rollouts': rollouts, 'Reward': rewards, 'Max': max_rwds, 'Timesteps': ts, 'Dists': dists, 'Min_Dist':min_dists, 'Agent': agents, 'Lambda': lambdas})
        out.to_csv('data/%s/results/Seed%s.csv' %(params['dir'], params['seed']), index=False)