def test_lio(config):

    seed = config.main.seed
    np.random.seed(seed)
    random.seed(seed)
    tf.set_random_seed(seed)

    dir_name = config.main.dir_name
    exp_name = config.main.exp_name
    log_path = os.path.join('..', 'results', exp_name, dir_name)
    model_name = config.main.model_name

    n_test = config.alg.n_test

    env = room_symmetric.Env(config.env)

    if config.lio.use_actor_critic:
        from lio_ac import LIO
    else:
        from lio_agent import LIO

    list_agents = []
    for agent_id in range(env.n_agents):
        list_agents.append(
            LIO(config.lio, env.l_obs, env.l_action, config.nn,
                'agent_%d' % agent_id, config.env.r_multiplier,
                env.n_agents, agent_id))

    for agent_id in range(env.n_agents):
        list_agents[agent_id].receive_list_of_agents(list_agents)
        list_agents[agent_id].create_policy_gradient_op()
        list_agents[agent_id].create_update_op()

    for agent_id in range(env.n_agents):
        list_agents[agent_id].create_reward_train_op()

    if config.lio.asymmetric:
        assert config.env.n_agents == 2
        for agent_id in range(env.n_agents):
            list_agents[agent_id].set_can_give(
                agent_id != config.lio.idx_recipient)

    config_proto = tf.ConfigProto()
    if config.main.use_gpu:
        config_proto.device_count['GPU'] = 1
        config_proto.gpu_options.allow_growth = True
    else:
        config_proto.device_count['GPU'] = 0
    sess = tf.Session(config=config_proto)

    saver = tf.train.Saver()
    print("Restoring variables from %s" % dir_name)
    saver.restore(sess, os.path.join(log_path, model_name))

    _ = evaluate.test_room_symmetric(n_test, env, sess, list_agents,
                                     log=True, log_path=log_path)
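# A minimal, hypothetical driver for test_lio, shown only to illustrate the expected
# config layout (the config.main, config.alg, config.lio, config.env, config.nn groups
# referenced above). The module name 'config_room_lio' and its get_config() helper are
# assumptions for illustration, not part of the code in this file.
if __name__ == '__main__':
    import config_room_lio

    config = config_room_lio.get_config()
    test_lio(config)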
def train(config):
    # set random seed
    seed = 1234
    np.random.seed(seed)
    random.seed(seed)
    torch.manual_seed(seed)

    results_one = []
    results_two = []
    reward_one_to_two = []
    reward_two_to_one = []

    # init game env
    env = room_symmetric.Env(config.env)
    agents = []
    for i in range(env.n_agents):
        agents.append(Actor(i, 7, env.n_agents))

    # epoch start
    for epoch in range(5000):
        if (epoch + 1) % 500 == 0:
            print("Epoch: ", epoch + 1, "/5000")

        """The first trajectory generation"""
        trajs = [Trajectory() for _ in range(env.n_agents)]
        list_obs = env.reset()
        list_obs_next = None
        done = None
        while not done:
            if list_obs_next is not None:
                list_obs = list_obs_next
            # decide actions from observations
            list_act, list_act_hot = action_sampling(agents, list_obs)
            # give incentivisation
            inctv_to, inctv_from = give_incentivisation(agents, list_act_hot, config)
            # execute step
            list_obs_next, env_rewards, done = env.step(list_act, inctv_to)
            # save trajectory
            for agent in agents:
                trajs[agent.id].add(agent.get_obs(), agent.get_action(),
                                    agent.get_action_hot(),
                                    env_rewards[agent.id],
                                    inctv_from[agent.id])

        for agent in agents:
            agent.update_policy(trajs)

        """The second trajectory generation"""
        # Generate a new trajectory
        trajs_new = [Trajectory() for _ in range(env.n_agents)]
        list_obs = env.reset()
        list_obs_next = None
        done = False
        result_one_new = 0
        result_two_new = 0
        while not done:
            if list_obs_next is not None:
                list_obs = list_obs_next
            # decide actions from observations
            list_act, list_act_hot = action_sampling(agents, list_obs)
            # give incentivisation
            inctv_to, new_inctv_from_others = give_incentivisation(agents, list_act_hot, config)
            reward_one_to_two.append(inctv_to[0])
            reward_two_to_one.append(inctv_to[1])
            # execute step
            list_obs_next, env_rewards, done = env.step(list_act, inctv_to)
            for agent in agents:
                trajs_new[agent.id].add(agent.get_obs(), agent.get_action(),
                                        agent.get_action_hot(),
                                        env_rewards[agent.id],
                                        new_inctv_from_others[agent.id])
            result_one_new += env_rewards[0]
            result_two_new += env_rewards[1]
            if done:
                results_one.append(result_one_new)
                results_two.append(result_two_new)

        # compute new log prob act
        log_prob_act_other = [[] for _ in range(config.env.n_agents)]
        for agent in agents:
            states_new = [trajectory.get_state() for trajectory in trajs_new]
            actions_new = [trajectory.get_action() for trajectory in trajs_new]
            logits, _ = agent.policy_net(states_new[agent.id], agent.new_params)
            # grad_graph(logits, 'logits')
            log_prob = F.log_softmax(logits, dim=-1)
            log_prob_act = torch.stack(
                [log_prob[i][actions_new[agent.id][i]]
                 for i in range(len(actions_new[agent.id]))], dim=0)
            log_prob_act_other[agent.id] = log_prob_act

        # optimizer.zero_grad()
        # loss_p = [torch.Tensor() for _ in range(2)]
        # for agent in agents:
        #     loss_p[agent.id] = agent.update_rewards_giving(trajs, trajs_new, log_prob_act_other)
        # loss = loss_p[0] + loss_p[1]
        # loss.backward()
        # optimizer.step()

        for agent in agents:
            agent.update_rewards_giving(trajs, trajs_new, log_prob_act_other)
        for agent in agents:
            agent.update_to_new_params()

    return results_one, results_two, reward_one_to_two, reward_two_to_one
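# The helpers action_sampling and give_incentivisation used above are not defined in this
# file. Below is a minimal sketch of both, assuming the Actor interface that the inline
# variants of this loop elsewhere in this file use (set_obs, action_sampling, get_action,
# get_action_hot, give_reward). It illustrates the intended behaviour and is not the
# original implementation; a later variant also passes an epsilon argument for
# exploration, which is omitted here.
import torch


def action_sampling(agents, list_obs):
    """Sample one action per agent from its current policy."""
    list_act, list_act_hot = [], []
    for agent in agents:
        agent.set_obs(list_obs[agent.id])             # store current observation
        agent.action_sampling()                       # sample an action internally
        list_act.append(agent.get_action())           # integer action index
        list_act_hot.append(agent.get_action_hot())   # one-hot encoding of the action
    return list_act, list_act_hot


def give_incentivisation(agents, list_act_hot, config):
    """Each agent computes incentives for the others, conditioned on all one-hot actions."""
    n_agents = config.env.n_agents
    inctv_from = torch.zeros(n_agents)  # total incentive each agent receives
    inctv_to = []                       # total incentive each agent gives to the others
    for agent in agents:
        reward = agent.give_reward(list_act_hot)      # incentive vector over all agents
        for idx in range(n_agents):
            if idx != agent.id:
                inctv_from[idx] += reward[idx]
        # total given to others; detached because the env only needs a numeric value
        inctv_to.append((reward.sum() - reward[agent.id]).detach().numpy())
    return inctv_to, inctv_from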
def train(config):

    seed = config.main.seed
    np.random.seed(seed)
    random.seed(seed)
    tf.set_random_seed(seed)

    dir_name = config.main.dir_name
    exp_name = config.main.exp_name
    log_path = os.path.join('..', 'results', exp_name, dir_name)
    model_name = config.main.model_name
    save_period = config.main.save_period

    os.makedirs(log_path, exist_ok=True)

    # Keep a record of parameters used for this run
    with open(os.path.join(log_path, 'config.json'), 'w') as f:
        json.dump(config, f, indent=4, sort_keys=True)

    n_episodes = int(config.alg.n_episodes)
    n_eval = config.alg.n_eval
    period = config.alg.period

    epsilon = config.lio.epsilon_start
    epsilon_step = (epsilon - config.lio.epsilon_end) / config.lio.epsilon_div

    if config.env.name == 'er':
        env = room_symmetric.Env(config.env)
    elif config.env.name == 'ipd':
        env = ipd_wrapper.IPD(config.env)

    if config.lio.decentralized:
        from lio_decentralized import LIO
    elif config.lio.use_actor_critic:
        from lio_ac import LIO
    else:
        from lio_agent import LIO

    list_agents = []
    for agent_id in range(env.n_agents):
        if config.lio.decentralized:
            list_agent_id_opp = list(range(env.n_agents))
            del list_agent_id_opp[agent_id]
            list_agents.append(
                LIO(config.lio, env.l_obs, env.l_action, config.nn,
                    'agent_%d' % agent_id, config.env.r_multiplier,
                    env.n_agents, agent_id, list_agent_id_opp))
        else:
            list_agents.append(
                LIO(config.lio, env.l_obs, env.l_action, config.nn,
                    'agent_%d' % agent_id, config.env.r_multiplier,
                    env.n_agents, agent_id))

    for agent_id in range(env.n_agents):
        if config.lio.decentralized:
            list_agents[agent_id].create_opp_modeling_op()
        else:
            list_agents[agent_id].receive_list_of_agents(list_agents)
        list_agents[agent_id].create_policy_gradient_op()
        list_agents[agent_id].create_update_op()
        if config.lio.use_actor_critic:
            list_agents[agent_id].create_critic_train_op()

    for agent_id in range(env.n_agents):
        list_agents[agent_id].create_reward_train_op()

    # This handles the special case of two asymmetric agents,
    # one of which is the reward-giver and the other is the recipient
    if config.lio.asymmetric:
        assert config.env.n_agents == 2
        for agent_id in range(env.n_agents):
            list_agents[agent_id].set_can_give(
                agent_id != config.lio.idx_recipient)

    config_proto = tf.ConfigProto()
    if config.main.use_gpu:
        config_proto.device_count['GPU'] = 1
        config_proto.gpu_options.allow_growth = True
    else:
        config_proto.device_count['GPU'] = 0
    sess = tf.Session(config=config_proto)
    sess.run(tf.global_variables_initializer())

    if config.lio.use_actor_critic:
        for agent in list_agents:
            sess.run(agent.list_initialize_v_ops)

    list_agent_meas = []
    if config.env.name == 'er':
        list_suffix = ['reward_total', 'n_lever', 'n_door', 'received',
                       'given', 'r-lever', 'r-start', 'r-door']
    elif config.env.name == 'ipd':
        list_suffix = ['given', 'received', 'reward_env', 'reward_total']
    for agent_id in range(1, env.n_agents + 1):
        for suffix in list_suffix:
            list_agent_meas.append('A%d_%s' % (agent_id, suffix))

    saver = tf.train.Saver(max_to_keep=config.main.max_to_keep)

    header = 'episode,step_train,step,'
    header += ','.join(list_agent_meas)
    if config.env.name == 'er':
        header += ',steps_per_eps\n'
    else:
        header += '\n'
    with open(os.path.join(log_path, 'log.csv'), 'w') as f:
        f.write(header)

    step = 0
    step_train = 0
    for idx_episode in range(1, n_episodes + 1):

        list_buffers = run_episode(sess, env, list_agents, epsilon,
                                   prime=False)
        step += len(list_buffers[0].obs)

        if config.lio.decentralized:
            for idx, agent in enumerate(list_agents):
                agent.train_opp_model(sess, list_buffers, epsilon)

        for idx, agent in enumerate(list_agents):
            agent.update(sess, list_buffers[idx], epsilon)

        list_buffers_new = run_episode(sess, env, list_agents,
                                       epsilon, prime=True)
        step += len(list_buffers_new[0].obs)

        for agent in list_agents:
            if agent.can_give:
                agent.train_reward(sess, list_buffers,
                                   list_buffers_new, epsilon)

        for idx, agent in enumerate(list_agents):
            if config.lio.decentralized:
                agent.train_opp_model(sess, list_buffers_new, epsilon)
            else:
                agent.update_main(sess)

        step_train += 1

        if idx_episode % period == 0:

            if config.env.name == 'er':
                (reward_total, n_move_lever, n_move_door, rewards_received,
                 rewards_given, steps_per_episode, r_lever,
                 r_start, r_door) = evaluate.test_room_symmetric(
                     n_eval, env, sess, list_agents)
                matrix_combined = np.stack([reward_total, n_move_lever,
                                            n_move_door, rewards_received,
                                            rewards_given, r_lever,
                                            r_start, r_door])
            elif config.env.name == 'ipd':
                given, received, reward_env, reward_total = evaluate.test_ipd(
                    n_eval, env, sess, list_agents)
                matrix_combined = np.stack(
                    [given, received, reward_env, reward_total])

            s = '%d,%d,%d' % (idx_episode, step_train, step)
            for idx in range(env.n_agents):
                s += ','
                if config.env.name == 'er':
                    s += ('{:.3e},{:.3e},{:.3e},{:.3e},{:.3e},'
                          '{:.3e},{:.3e},{:.3e}').format(
                              *matrix_combined[:, idx])
                elif config.env.name == 'ipd':
                    s += '{:.3e},{:.3e},{:.3e},{:.3e}'.format(
                        *matrix_combined[:, idx])
            if config.env.name == 'er':
                s += ',%.2f\n' % steps_per_episode
            else:
                s += '\n'
            with open(os.path.join(log_path, 'log.csv'), 'a') as f:
                f.write(s)

        if idx_episode % save_period == 0:
            saver.save(sess, os.path.join(
                log_path, '%s.%d' % (model_name, idx_episode)))

        if epsilon > config.lio.epsilon_end:
            epsilon -= epsilon_step

    saver.save(sess, os.path.join(log_path, model_name))
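# run_episode above returns one experience buffer per agent; only the .obs attribute is
# referenced directly in train(). Below is a minimal sketch of such a per-agent buffer.
# The remaining field names are assumptions about what the LIO update would consume,
# not the original Buffer class.
class Buffer(object):
    """Per-agent trajectory storage, filled step by step during run_episode."""

    def __init__(self):
        self.obs = []              # observations; len(obs) equals the episode length
        self.action = []           # actions taken by this agent
        self.reward = []           # extrinsic environment reward
        self.r_from_others = []    # total incentive received from other agents (assumed)
        self.obs_next = []         # next observations
        self.done = []             # episode-termination flags

    def add(self, transition):
        obs, action, reward, r_from_others, obs_next, done = transition
        self.obs.append(obs)
        self.action.append(action)
        self.reward.append(reward)
        self.r_from_others.append(r_from_others)
        self.obs_next.append(obs_next)
        self.done.append(done)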
def train(config):
    # set seeds for the training
    # seed = config.main.seed
    seed = 12345
    np.random.seed(seed)
    random.seed(seed)
    torch.manual_seed(seed)

    n_episodes = int(config.alg.n_episodes)
    n_eval = config.alg.n_eval
    period = config.alg.period

    results_one = []
    results_two = []
    reward_one_to_two = []
    reward_two_to_one = []

    # init the environment
    env = room_symmetric.Env(config.env)
    agents = []
    for i in range(config.env.n_agents):
        agents.append(Actor(i, 7, config.env.n_agents))

    # epoch start
    for epoch in range(6000):
        trajs = [Trajectory() for _ in range(env.n_agents)]
        list_obs = env.reset()
        list_obs_next = None
        done = False
        result_one = 0
        result_two = 0
        while not done:
            list_act = []
            list_act_hot = []
            if list_obs_next is not None:
                list_obs = list_obs_next
            # set observations and decide actions
            for agent in agents:
                agent.set_obs(list_obs[agent.id])
                agent.action_sampling()
                list_act_hot.append(agent.get_action_hot())
                list_act.append(agent.get_action())

            list_rewards = []
            total_reward_given_to_each_agent = np.zeros(env.n_agents)
            # give rewards
            for agent in agents:
                reward = agent.give_reward(list_act_hot)
                reward[agent.id] = 0
                total_reward_given_to_each_agent += reward
                reward = np.delete(reward, agent.id)
                list_rewards.append(reward)

            # execute step
            list_obs_next, env_rewards, done = env.step(list_act, list_rewards)

            for agent in agents:
                reward_given = total_reward_given_to_each_agent[agent.id]
                trajs[agent.id].add(agent.get_obs(), agent.get_action(),
                                    agent.get_action_hot(),
                                    env_rewards[agent.id], reward_given)
            result_one += env_rewards[0]
            result_two += env_rewards[1]
            # if done:
            #     results_one.append(result_one)
            #     results_two.append(result_two)

        for agent in agents:
            agent.update_policy(trajs[agent.id])

        # Generate a new trajectory
        trajs_new = [Trajectory() for _ in range(env.n_agents)]
        list_obs = env.reset()
        list_obs_next = None
        done = False
        result_one_new = 0
        result_two_new = 0
        while not done:
            list_act = []
            list_act_hot = []
            if list_obs_next is not None:
                list_obs = list_obs_next
            # set observations and decide actions
            for agent in agents:
                agent.set_obs(list_obs[agent.id])
                agent.action_sampling()
                list_act_hot.append(agent.get_action_hot())
                list_act.append(agent.get_action())

            list_rewards = []
            total_reward_given_to_each_agent = np.zeros(env.n_agents)
            # give rewards
            for agent in agents:
                reward = agent.give_reward(list_act_hot)
                reward[agent.id] = 0
                total_reward_given_to_each_agent += reward
                reward = np.delete(reward, agent.id)
                list_rewards.append(reward)
                if agent.id == 0:
                    reward_one_to_two.append(reward)
                else:
                    reward_two_to_one.append(reward)

            # execute step
            list_obs_next, env_rewards, done = env.step(list_act, list_rewards)

            for agent in agents:
                reward_given = total_reward_given_to_each_agent[agent.id]
                trajs_new[agent.id].add(agent.get_obs(), agent.get_action(),
                                        agent.get_action_hot(),
                                        env_rewards[agent.id], reward_given)
            result_one_new += env_rewards[0]
            result_two_new += env_rewards[1]
            if done:
                results_one.append(result_one_new)
                results_two.append(result_two_new)

        for agent in agents:
            agent.update_rewards_giving(trajs, trajs_new)

    return results_one, results_two, reward_one_to_two, reward_two_to_one
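# A minimal sketch of the Trajectory container used by the PyTorch variants, matching the
# add() / get_state() / get_action() call sites above. The internal storage layout and the
# use of torch.stack in get_state() are assumptions, not the original class.
import torch


class Trajectory(object):
    """Stores one agent's episode: observations, actions, and rewards."""

    def __init__(self):
        self.obs = []
        self.actions = []
        self.actions_hot = []
        self.env_rewards = []
        self.inctv_received = []   # total incentive received from other agents

    def add(self, obs, action, action_hot, env_reward, inctv_received):
        self.obs.append(obs)
        self.actions.append(action)
        self.actions_hot.append(action_hot)
        self.env_rewards.append(env_reward)
        self.inctv_received.append(inctv_received)

    def get_state(self):
        # batch the stored observations for a single policy forward pass
        return torch.stack(
            [torch.as_tensor(o, dtype=torch.float32) for o in self.obs], dim=0)

    def get_action(self):
        return self.actions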
def train(config):
    # set seeds for the training
    seed = config.main.seed
    np.random.seed(seed)
    random.seed(seed)
    tf.set_random_seed(seed)

    # namespace
    dir_name = config.main.dir_name
    exp_name = config.main.exp_name
    log_path = os.path.join('..', 'results', exp_name, dir_name)
    model_name = config.main.model_name
    save_period = config.main.save_period

    # create folder for results
    os.makedirs(log_path, exist_ok=True)

    # record parameters of this run
    with open(os.path.join(log_path, 'config.json'), 'w') as f:
        json.dump(config, f, indent=4, sort_keys=True)

    # set hyper-parameters
    n_episodes = int(config.alg.n_episodes)
    n_eval = config.alg.n_eval
    period = config.alg.period

    # parameters for the epsilon-greedy method
    epsilon = config.lio.epsilon_start
    epsilon_step = (epsilon - config.lio.epsilon_end) / config.lio.epsilon_div

    # make env
    env = room_symmetric.Env(config.env)

    from lio_agent import LIO

    # init lio agents
    list_agents = []
    for agent_id in range(env.n_agents):
        list_agents.append(
            LIO(config.lio, env.l_obs, env.l_action, config.nn,
                'agent_%d' % agent_id, config.env.r_multiplier,
                env.n_agents, agent_id))

    # init training optimizers
    for agent_id in range(env.n_agents):
        list_agents[agent_id].receive_list_of_agents(list_agents)
        list_agents[agent_id].create_policy_gradient_op()
        list_agents[agent_id].create_update_op()

    for agent_id in range(env.n_agents):
        list_agents[agent_id].create_reward_train_op()

    # This handles the special case of two asymmetric agents,
    # one of which is the reward-giver and the other is the recipient
    if config.lio.asymmetric:
        assert config.env.n_agents == 2
        for agent_id in range(env.n_agents):
            list_agents[agent_id].set_can_give(
                agent_id != config.lio.idx_recipient)

    config_proto = tf.ConfigProto()
    if config.main.use_gpu:
        config_proto.device_count['GPU'] = 1
        config_proto.gpu_options.allow_growth = True
    else:
        config_proto.device_count['GPU'] = 0
    sess = tf.Session(config=config_proto)
    sess.run(tf.global_variables_initializer())

    # measurement names for the log file
    list_agent_meas = []
    list_suffix = ['reward_total', 'n_lever', 'n_door', 'received',
                   'given', 'r-lever', 'r-start', 'r-door']
    for agent_id in range(1, env.n_agents + 1):
        for suffix in list_suffix:
            list_agent_meas.append('A%d_%s' % (agent_id, suffix))

    # save models
    saver = tf.train.Saver(max_to_keep=config.main.max_to_keep)

    header = 'episode,step_train,step,'
    header += ','.join(list_agent_meas)
    header += ',steps_per_eps\n'
    with open(os.path.join(log_path, 'log.csv'), 'w') as f:
        f.write(header)

    # episode start
    step = 0
    step_train = 0
    for idx_episode in range(1, n_episodes + 1):

        # generate a trajectory
        list_buffers = run_episode(sess, env, list_agents, epsilon,
                                   prime=False)
        step += len(list_buffers[0].obs)

        for idx, agent in enumerate(list_agents):
            agent.update(sess, list_buffers[idx], epsilon)

        # generate a new trajectory with new parameters of the incentive function
        list_buffers_new = run_episode(sess, env, list_agents,
                                       epsilon, prime=True)
        step += len(list_buffers_new[0].obs)

        # train incentive function
        for agent in list_agents:
            if agent.can_give:
                agent.train_reward(sess, list_buffers,
                                   list_buffers_new, epsilon)

        for idx, agent in enumerate(list_agents):
            agent.update_main(sess)

        step_train += 1

        # add results to the result file
        if idx_episode % period == 0:
            (reward_total, n_move_lever, n_move_door, rewards_received,
             rewards_given, steps_per_episode, r_lever,
             r_start, r_door) = evaluate.test_room_symmetric(
                 n_eval, env, sess, list_agents)
            matrix_combined = np.stack([reward_total, n_move_lever,
                                        n_move_door, rewards_received,
                                        rewards_given, r_lever,
                                        r_start, r_door])

            s = '%d,%d,%d' % (idx_episode, step_train, step)
            for idx in range(env.n_agents):
                s += ','
                s += ('{:.3e},{:.3e},{:.3e},{:.3e},{:.3e},'
                      '{:.3e},{:.3e},{:.3e}').format(*matrix_combined[:, idx])
            s += ',%.2f\n' % steps_per_episode
            with open(os.path.join(log_path, 'log.csv'), 'a') as f:
                f.write(s)

        if idx_episode % save_period == 0:
            saver.save(sess, os.path.join(
                log_path, '%s.%d' % (model_name, idx_episode)))

        if epsilon > config.lio.epsilon_end:
            epsilon -= epsilon_step

    saver.save(sess, os.path.join(log_path, model_name))
def train(config):
    # set seeds for the training
    # seed = config.main.seed
    seed = 1234
    np.random.seed(seed)
    random.seed(seed)
    torch.manual_seed(seed)

    n_episodes = int(config.alg.n_episodes)
    n_eval = config.alg.n_eval
    period = config.alg.period

    results_one = []
    results_two = []
    reward_one_to_two = []
    reward_two_to_one = []

    # init the environment
    env = room_symmetric.Env(config.env)
    agents = []
    for i in range(config.env.n_agents):
        agents.append(Actor(i, 7, config.env.n_agents))

    # epoch start
    for epoch in range(5000):
        trajs = [Trajectory() for _ in range(env.n_agents)]
        list_obs = env.reset()
        list_obs_next = None
        done = False
        result_one = 0
        result_two = 0
        while not done:
            list_act = []
            list_act_hot = []
            if list_obs_next is not None:
                list_obs = list_obs_next
            # set observations and decide actions
            for agent in agents:
                agent.set_obs(list_obs[agent.id])
                agent.action_sampling()
                list_act_hot.append(agent.get_action_hot())
                list_act.append(agent.get_action())

            list_rewards = []
            total_reward_given_to_each_agent = torch.zeros(env.n_agents)
            reward = [None for _ in range(env.n_agents)]
            # give rewards
            for agent in agents:
                reward[agent.id] = agent.give_reward(list_act_hot)
                for idx in range(env.n_agents):
                    if idx != agent.id:
                        # incentive received by each agent
                        total_reward_given_to_each_agent[idx] += reward[agent.id][idx]
                # total incentive this agent gives to others at this step
                reward_sum = (reward[agent.id].sum()
                              - reward[agent.id][agent.id]).detach().numpy()
                list_rewards.append(reward_sum)

            # execute step
            list_obs_next, env_rewards, done = env.step(list_act, list_rewards)

            for agent in agents:
                reward_given = total_reward_given_to_each_agent[agent.id]
                trajs[agent.id].add(agent.get_obs(), agent.get_action(),
                                    agent.get_action_hot(),
                                    env_rewards[agent.id], reward_given)
            result_one += env_rewards[0]
            result_two += env_rewards[1]

        for agent in agents:
            agent.update_policy(trajs[agent.id])

        # Generate a new trajectory
        trajs_new = [Trajectory() for _ in range(env.n_agents)]
        list_obs = env.reset()
        list_obs_next = None
        done = False
        result_one_new = 0
        result_two_new = 0
        while not done:
            list_act = []
            list_act_hot = []
            if list_obs_next is not None:
                list_obs = list_obs_next
            # set observations and decide actions
            for agent in agents:
                agent.set_obs(list_obs[agent.id])
                agent.action_sampling(agent.new_params)
                list_act_hot.append(agent.get_action_hot())
                list_act.append(agent.get_action())

            list_rewards = []
            total_reward_given_to_each_agent = torch.zeros(env.n_agents)
            reward_new = [None for _ in range(env.n_agents)]
            # give rewards
            for agent in agents:
                reward_new[agent.id] = agent.give_reward(list_act_hot)
                reward_sum = torch.zeros(1)
                for idx in range(env.n_agents):
                    if idx != agent.id:
                        total_reward_given_to_each_agent[idx] += reward_new[agent.id][idx]
                        # accumulate the total reward this agent gives to others
                        reward_sum += reward_new[agent.id][idx]
                reward_sum = (reward_new[agent.id].sum()
                              - reward_new[agent.id][agent.id]).detach().numpy()
                list_rewards.append(reward_sum)
                if agent.id == 0:
                    reward_one_to_two.append(reward_sum)
                else:
                    reward_two_to_one.append(reward_sum)

            # execute step
            list_obs_next, env_rewards, done = env.step(list_act, list_rewards)

            for agent in agents:
                reward_given = total_reward_given_to_each_agent[agent.id]
                trajs_new[agent.id].add(agent.get_obs(), agent.get_action(),
                                        agent.get_action_hot(),
                                        env_rewards[agent.id], reward_given)
            result_one_new += env_rewards[0]
            result_two_new += env_rewards[1]
            if done:
                results_one.append(result_one_new)
                results_two.append(result_two_new)

        # compute new log prob act
        log_prob_act_other = [[] for _ in range(config.env.n_agents)]
        for agent in agents:
            states_new = [trajectory.get_state() for trajectory in trajs_new]
            actions_new = [trajectory.get_action() for trajectory in trajs_new]
            logits, _ = agent.policy_net(states_new[agent.id], agent.new_params)
            # grad_graph(logits, 'logits')
            log_prob = F.log_softmax(logits, dim=-1)
            log_prob_act = torch.stack(
                [log_prob[i][actions_new[agent.id][i]]
                 for i in range(len(actions_new[agent.id]))], dim=0)
            log_prob_act_other[agent.id] = log_prob_act

        for agent in agents:
            agent.update_rewards_giving(trajs, trajs_new, log_prob_act_other)
        for agent in agents:
            agent.update_to_new_params()

    return results_one, results_two, reward_one_to_two, reward_two_to_one
def train(config):
    # set random seed
    seed = 1234
    np.random.seed(seed)
    random.seed(seed)
    torch.manual_seed(seed)

    num_epoch = 5000
    lr_a = 0.01
    epsilon = 0.5
    epsilon_end = 0.1
    epsilon_div = 1e3
    epsilon_step = (epsilon - epsilon_end) / epsilon_div

    results_one = []
    results_two = []
    reward_one_to_two = []
    reward_two_to_one = []

    # init game env
    env = room_symmetric.Env(config.env)
    agents = []
    for i in range(env.n_agents):
        agents.append(Actor(i, 7, env.n_agents, lr=0.01))

    # epoch start
    for epoch in range(num_epoch):
        if (epoch + 1) % 500 == 0:
            print("Epoch: ", epoch + 1, "/", num_epoch)

        """The first trajectory generation"""
        trajs = [Trajectory() for _ in range(env.n_agents)]
        list_obs = env.reset()
        list_obs_next = None
        done = None
        while not done:
            if list_obs_next is not None:
                list_obs = list_obs_next
            # decide actions from observations
            list_act, list_act_hot = action_sampling(agents, list_obs, epsilon)
            # give incentivisation
            inctv_to, inctv_from = give_incentivisation(
                agents, list_act_hot, config)
            # execute step
            list_obs_next, env_rewards, done = env.step(list_act, inctv_to)
            # save trajectory
            for agent in agents:
                trajs[agent.id].add(agent.get_obs(), agent.get_action(),
                                    agent.get_action_hot(),
                                    env_rewards[agent.id],
                                    inctv_from[agent.id])

        for agent in agents:
            agent.update_policy(trajs, lr=lr_a)

        """The second trajectory generation"""
        # Generate a new trajectory
        trajs_new = [Trajectory() for _ in range(env.n_agents)]
        list_obs = env.reset()
        list_obs_next = None
        done = False
        result_one_new = 0
        result_two_new = 0
        while not done:
            if list_obs_next is not None:
                list_obs = list_obs_next
            # decide actions from observations
            list_act, list_act_hot = action_sampling(agents, list_obs, epsilon)
            # give incentivisation
            inctv_to, new_inctv_from_others = give_incentivisation(
                agents, list_act_hot, config)
            reward_one_to_two.append(inctv_to[0])
            reward_two_to_one.append(inctv_to[1])
            # execute step
            list_obs_next, env_rewards, done = env.step(list_act, inctv_to)
            for agent in agents:
                trajs_new[agent.id].add(agent.get_obs(), agent.get_action(),
                                        agent.get_action_hot(),
                                        env_rewards[agent.id],
                                        new_inctv_from_others[agent.id])
            result_one_new += env_rewards[0]
            result_two_new += env_rewards[1]
            if done:
                results_one.append(result_one_new)
                results_two.append(result_two_new)

        # compute new log prob act
        log_prob_act_other = compute_log_prob_act_other(agents, trajs_new, config)

        for agent in agents:
            agent.update_rewards_giving(trajs, trajs_new, log_prob_act_other)
        for agent in agents:
            agent.update_to_new_params()

        if epsilon > epsilon_end:
            epsilon -= epsilon_step

    return results_one, results_two, reward_one_to_two, reward_two_to_one
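# A minimal sketch of compute_log_prob_act_other, factoring out the computation that the
# earlier PyTorch variants perform inline: for each agent, evaluate the log-probability of
# the actions from the second (new) trajectory under that agent's updated policy parameters
# (agent.new_params). The Actor.policy_net interface is assumed to match the inline code.
import torch
import torch.nn.functional as F


def compute_log_prob_act_other(agents, trajs_new, config):
    log_prob_act_other = [[] for _ in range(config.env.n_agents)]
    states_new = [trajectory.get_state() for trajectory in trajs_new]
    actions_new = [trajectory.get_action() for trajectory in trajs_new]
    for agent in agents:
        # forward pass through the policy with the post-update parameters
        logits, _ = agent.policy_net(states_new[agent.id], agent.new_params)
        log_prob = F.log_softmax(logits, dim=-1)
        # pick the log-probability of the action actually taken at each step
        log_prob_act = torch.stack(
            [log_prob[i][actions_new[agent.id][i]]
             for i in range(len(actions_new[agent.id]))], dim=0)
        log_prob_act_other[agent.id] = log_prob_act
    return log_prob_act_other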