Example #1
import numpy as np
import tqdm


def play(env, agent, config, evaluation=False):
  if evaluation:
    print('\nEvaluation:')
  mean_rewards = 0
  max_reward = 0
  max_mean_rewards = 0
  num_episodes = config['eval_episodes'] if evaluation else config['train_episodes']
  for episode in tqdm.tqdm(range(num_episodes)):
    observations, infos = env.reset()
    infos_array = dict_to_array(infos, config['environment_batch_size'])
    rewards = [0] * config['environment_batch_size']
    dones = [False] * config['environment_batch_size']
    max_score = max([info['max_score'] for info in infos_array])

    steps = 0
    # TODO: maybe condition on max_steps as well.
    while not all(dones):
      win_factor = max_mean_rewards / float(max_score) if config['use_adaptive_epsilon'] else None
      actions = agent.choose_actions(observations, infos_array, dones, evaluation,
                                     win_factor=win_factor)
      new_observations, new_rewards, new_dones, new_infos = env.step(actions)
      new_infos_array = dict_to_array(new_infos, config['environment_batch_size'])
      for idx, done in enumerate(dones):
        if not done and not evaluation:
          # The env reports cumulative scores, so store the per-step delta.
          # The transition's terminal flag must describe the *new* state;
          # dones[idx] is always False inside this branch, so new_dones[idx]
          # is the intended value.
          agent.add_state(observations[idx],
                          infos_array[idx],
                          actions[idx],
                          new_observations[idx],
                          new_infos_array[idx],
                          new_rewards[idx] - rewards[idx],
                          new_dones[idx])
      observations = new_observations
      infos_array = new_infos_array
      rewards = new_rewards
      dones = new_dones
      if not evaluation and steps % config['update_frequency'] == 0:
        agent.train()
      steps += 1
    mean_rewards = np.mean(rewards)
    max_reward = max(max_reward, max(rewards))
    max_mean_rewards = max(max_mean_rewards, np.mean(rewards))
    wins_percentage = sum([info['has_won'] for info in infos_array]) * 100. / len(infos_array)
    print('Mean rewards: {}({}), steps: {}, max reward: {}({}), wins percentage - {}'.format(
        mean_rewards, max_mean_rewards, steps, max_reward, max_score, wins_percentage))
    agent.end_episode()
    if not evaluation and mean_rewards > 0.99 * max_score:
      break
  return
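The loop above assumes a dict_to_array helper and several config keys, none of which are shown in the snippet. The sketch below is a plausible reconstruction of the helper based on how it is called (splitting the batched info dict returned by the environment into one info dict per environment instance), together with purely illustrative config values; neither is the original implementation.

# Minimal sketch of the dict_to_array helper assumed by play(). It splits a
# dict of batched values, e.g. {'max_score': [s0, s1], 'has_won': [w0, w1]},
# into per-environment dicts: [{'max_score': s0, ...}, {'max_score': s1, ...}].
def dict_to_array(batched_dict, batch_size):
  return [{key: values[idx] for key, values in batched_dict.items()}
          for idx in range(batch_size)]

# Illustrative values for the config keys play() reads; not from the source.
config = {
    'train_episodes': 1000,
    'eval_episodes': 10,
    'environment_batch_size': 16,
    'update_frequency': 4,         # train the agent every 4 environment steps
    'use_adaptive_epsilon': True,  # scale exploration by best mean score so far
}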
Example #2
def __group_by_field(self, i, field):
    '''Group a test index subset i by field (SNP=0, sample=1).'''
    size = self.problem.genotype.data.shape[field]
    # Count how many times each distinct field value occurs in the subset.
    group_count = util.dict_to_array(statutil.group_by_value(self.test_index[field][i]))
    # Scatter the counts into a dense array indexed by field value.
    result = np.zeros((size,), dtype=int)
    result[group_count['k']] = group_count['v']
    return result
Example #3
def __group_by_field(g, i, field):
    '''Group a test index subset i by field (SNP=0, sample=1).'''
    # Count how many times each distinct field value occurs in the subset.
    group_count = util.dict_to_array(statutil.group_by_value(i[field]))
    # Scatter the counts into a dense array sized by the genotype data shape.
    result = np.zeros((g.shape[field],), dtype=int)
    result[group_count['k']] = group_count['v']
    return result