Example #1
File: run.py, Project: zhangzongliang/ddpg
  def run(self):
    self.t_train = 0
    self.t_test = 0

    # create filtered environment
    self.env = filter_env.makeFilteredEnv(gym.make(FLAGS.env))
    # self.env = gym.make(FLAGS.env)
    
    self.env.monitor.start(FLAGS.outdir+'/monitor/',video_callable=lambda _: False)
    gym.logger.setLevel(gym.logging.WARNING)

    dimO = self.env.observation_space.shape
    dimA = self.env.action_space.shape
    print(dimO,dimA)

    import pprint
    pprint.pprint(self.env.spec.__dict__,width=1)

    self.agent = ddpg.Agent(dimO=dimO,dimA=dimA)

    returns = []

    # main loop
    while self.t_train < FLAGS.total:

      # test
      T = self.t_test
      R = []
      self.env.monitor.start(FLAGS.outdir+'/monitor/',video_callable=lambda _: False,resume=True)
      while self.t_test - T < FLAGS.test:
        R.append(self.run_episode(test=True,monitor=(len(R)==0)))
      avr = np.mean(R)
      print('Average test return\t{} after {} timesteps of training'.format(avr,self.t_train))
      # save return
      returns.append((self.t_train, avr))
      np.save(FLAGS.outdir+"/returns.npy",returns)

      # evaluate required number of episodes for gym
      if self.env.spec.reward_threshold is not None and avr > self.env.spec.reward_threshold:
        for i in range(self.env.spec.trials):
          self.run_episode(test=True)

      self.env.monitor.close()


      # train
      T = self.t_train
      R = []
      while self.t_train - T < FLAGS.train:
        R.append(self.run_episode(test=False))
      avr = np.mean(R)
      print('Average training return\t{} after {} timesteps of training'.format(avr,self.t_train))

    self.env.monitor.close()
    # upload results
    if FLAGS.upload:
      gym.upload(FLAGS.outdir+"/monitor",algorithm_id = GYM_ALGO_ID)
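
Examples #1, #3, #4, and #6 all drive training and evaluation through a run_episode helper that these excerpts do not show. Below is a minimal, hypothetical sketch of such a helper, assuming the agent interface visible in Example #2 (reset(obs), act(), observe(reward, done, next_obs)); the real helpers in these projects also take a test flag that switches off exploration noise and learning, which is omitted here.

def run_episode(env, agent, max_steps=1000, render=False):
    """Hypothetical episode runner: one rollout, returns the episode return."""
    obs = env.reset()
    agent.reset(obs)                  # agent interface as seen in Example #2
    total_reward = 0.0
    for _ in range(max_steps):
        if render:
            env.render()
        action = agent.act()          # act() uses the last observation the agent was given
        obs, reward, done, _ = env.step(action)
        agent.observe(reward, done, obs)  # the agent stores the transition (and may learn) internally
        total_reward += reward
        if done:
            break
    return total_reward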
Example #2
def test_ddpg_replay(tmpdir):
    import ddpg
    import sys
    import numpy as np
    np.set_printoptions(threshold=sys.maxsize)  # recent NumPy rejects threshold=np.nan

    ddpg.FLAGS.warmup = 10000
    ddpg.FLAGS.outdir = tmpdir.strpath
    # test replay memory
    a = ddpg.Agent([1], [1])
    a.reset([0])
    T = 10
    actions = []
    for t in range(0, T):
        actions.append(a.act())
        a.observe(t, False, [t + 1])
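
The excerpt stops before the test asserts anything. A hypothetical way to finish the function body, under the assumption that act() returns one numeric action per step, is to sanity-check what was collected:

    # hypothetical continuation (not the project's actual assertions): check that
    # one finite action was produced for each recorded transition
    assert len(actions) == T
    actions = np.asarray(actions, dtype=np.float64)
    assert np.all(np.isfinite(actions))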
Example #3
  def run(self,t_train = 1000000,t_warmup=20000,f_test=20,env='Pendulum-v0',render=False,**kwargs):
    self.t_warmup = t_warmup
    self.t_log = 103
    #self.env = gym.make(s.env)
    #from gym.envs.classic_control.dl import DoubleLinkEnv
    self.env = doublelink.DoubleLinkEnv()
    # self.env.monitor.start('./monitor/',video_callable=lambda _: False) TODO: fix on cluster
    dimO = self.env.observation_space.shape
    dimA = self.env.action_space.shape
    print('dimO: '+str(dimO) +'  dimA: '+str(dimA))
    # agent
    self.agent = ddpg.Agent(dimO=dimO,dimA=dimA,**kwargs)

    # main loop
    while self.agent.t < t_train:
      # train
      for i in xrange(f_test): self.run_episode(test=False)

      # test
      R = np.mean([self.run_episode(test=True,render=render) for _ in range(5)])
      print('Average return '+str(R)+ ' after '+str(self.agent.t)+' timesteps of training')
Example #4
    def run(self, env):
        self.t_train = 0
        self.t_test = 0

        # create filtered environment
        # self.env = filter_env.makeFilteredEnv(gym.make(FLAGS.env))
        self.env = filter_env.makeFilteredEnv(gym.make(env))

        self.t_elapsed = []

        # self.env = gym.make(FLAGS.env)

        if tf.gfile.Exists(FLAGS.outdir):
            tf.gfile.DeleteRecursively(FLAGS.outdir)
        # self.env.monitor.start(FLAGS.outdir + '/monitor/', video_callable=lambda _: False)
        # gym.logger.setLevel(gym.logging.WARNING)

        dimO = self.env.observation_space.shape
        dimA = self.env.action_space.shape
        print 'observation space, action space:',
        print(dimO, dimA)

        import pprint
        pprint.pprint(self.env.spec.__dict__, width=1)

        self.agent = ddpg.Agent(dimO=dimO, dimA=dimA)

        returns = []

        it = 0
        episodelengths = []
        testlengths = []

        # fallback defaults so the three frequencies are always defined; the
        # original code only sets them for the environments handled below
        test_frequency = 10
        plot_frequency = 10
        self.train_frequency = 1

        if env == 'Reacher-v1':
            self.train_frequency = 1
            test_frequency = 3
            plot_frequency = 1

        if env == 'MountainCarContinuous-v0':
            test_frequency = 10
            plot_frequency = 1
            self.train_frequency = 16

        if env == 'InvertedPendulum-v1':
            test_frequency = 100
            plot_frequency = 300
            self.train_frequency = 1

        print 'using train frequency', self.train_frequency

        # main loop
        while self.t_train < FLAGS.total:

            it += 1

            episodelengths.append(self.run_episode(test=False))

            if it % test_frequency == 0:
                testlengths.append(self.run_episode(test=True))

            if it % plot_frequency == 0:
                print 'avg time for sim step:', np.mean(
                    np.array(self.t_elapsed))
                plotting.plot_episode_lengths(episodelengths)
                plotting.plot_episode_lengths(testlengths)
                # plotting.plot_replay_memory_2d_state_histogramm(self.agent.rm.observations)
                # plotting.plot_learned_mu(self.agent.act_test, self.env)

            # else:
            #     # test
            #     T = self.t_test
            #     R = []
            #
            #     while self.t_test - T < FLAGS.test:
            #         # print 'running test episode'
            #         R.append(self.run_episode(test=True, monitor=(self.t_test - T < FLAGS.monitor * FLAGS.test)))
            #     avr = np.mean(R)
            #     print('Average test return\t{} after {} timesteps of training'.format(avr, self.t_train))
            #     # save return
            #     returns.append((self.t_train, avr))
            #     np.save(FLAGS.outdir + "/returns.npy", returns)
            #
            #     # evaluate required number of episodes for gym and end training when above threshold
            #     if self.env.spec.reward_threshold is not None and avr > self.env.spec.reward_threshold:
            #         avr = np.mean([self.run_episode(test=True) for _ in range(self.env.spec.trials)])
            #         if avr > self.env.spec.reward_threshold:
            #             break
            #
            #     # train
            #     T = self.t_train
            #     R = []
            #     while self.t_train - T < FLAGS.train:
            #         # print 'running train episode'
            #         R.append(self.run_episode(test=False))
            #     avr = np.mean(R)
            #     print('Average training return\t{} after {} timesteps of training'.format(avr, self.t_train))

        # self.env.monitor.close()
        # upload results
        if FLAGS.upload:
            gym.upload(FLAGS.outdir + "/monitor", algorithm_id=GYM_ALGO_ID)
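
Example #4 relies on a plotting module that is not part of the excerpt. A minimal matplotlib stand-in for plot_episode_lengths (illustrative only, not the project's implementation) could look like this:

import matplotlib
matplotlib.use('Agg')  # render to a file; no display needed on a cluster
import matplotlib.pyplot as plt

def plot_episode_lengths(lengths, path='episode_lengths.png'):
    # illustrative stand-in for the project's plotting.plot_episode_lengths
    plt.figure()
    plt.plot(lengths)
    plt.xlabel('episode')
    plt.ylabel('episode length')
    plt.savefig(path)
    plt.close()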
Example #5
state_inp = concat_obs_goal(observation)

# parameters for the agent
d_alpha = 0.000025
d_beta = 0.00025
d_tau = 0.001
d_layer1_size = 256
d_layer2_size = 256
d_layer3_size = 256
d_layer4_size = 256
d_input_dims = state_inp.shape[0]
d_num_actions = env.action_space.shape[0]
d_output_dir = "tmp/fetch_test/"

fetch_agent = ddpg.Agent(d_alpha, d_beta, d_input_dims, d_tau, env,
                         d_num_actions, d_layer1_size, d_layer2_size,
                         d_layer3_size, d_layer4_size, d_output_dir)

fetch_agent.load_models()

score_history = []
num_steps = 50
test_no_episodes = 100
score_file_name = "inference.csv"

for i in range(test_no_episodes):  # total evaluation episodes

    observation = env.reset()  # sample a new desired goal

    desired_goal = observation["desired_goal"]
    curr_state = observation["observation"]
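
The episode loop in Example #5 is cut off after extracting the goal and state. The sketch below restates the loop and fills in a plausible body; the fetch_agent.choose_action() call is an assumption (the excerpt only shows the constructor and load_models()), and writing the scores with np.savetxt is likewise illustrative.

import numpy as np

for i in range(test_no_episodes):
    observation = env.reset()
    score = 0.0
    for step in range(num_steps):
        state_inp = concat_obs_goal(observation)         # observation + desired goal
        action = fetch_agent.choose_action(state_inp)    # assumed inference method
        observation, reward, done, info = env.step(action)
        score += reward
        if done:
            break
    score_history.append(score)

# persist the per-episode scores for later analysis
np.savetxt(score_file_name, np.asarray(score_history), delimiter=",")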
Example #6
  def run(self):
    self.t_train = 0
    self.t_test = 0

    # create filtered environment
    self.env = filter_env.makeFilteredEnv(gym.make(FLAGS.env), skip_space_norm=FLAGS.skip_space_norm,
                                          wolpertinger=FLAGS.wolpertinger)
    # self.env = gym.make(FLAGS.env)
    
    self.env.monitor.start(FLAGS.outdir+'/monitor/',video_callable=lambda _: False)
    # self.env.monitor.start(FLAGS.outdir+'/monitor/',video_callable=lambda _: True)
    gym.logger.setLevel(gym.logging.WARNING)

    dimO = self.env.observation_space.shape
    dimA = self.env.action_space.shape
    print(dimO,dimA)

    import pprint
    pprint.pprint(self.env.spec.__dict__,width=1)

    wolp = None
    if FLAGS.wolpertinger:
        wolp = wp.Wolpertinger(self.env, i=FLAGS.wp_total_actions,
                               action_set=wp.load_action_set(FLAGS.wp_action_set_file,
                                                             i=FLAGS.wp_total_actions, action_shape=dimA[0])
                               ).g
    elif FLAGS.fmpolicy:
        wolp = fmp.FMPolicy(self.env).g

    self.agent = ddpg.Agent(dimO=dimO, dimA=dimA, custom_policy=FLAGS.wolpertinger or FLAGS.fmpolicy,
                            env_dtype=str(self.env.action_space.high.dtype))

    returns = []

    # main loop
    while self.t_train < FLAGS.total:

      # test
      T = self.t_test
      R = []
      if self.t_train - T > 0 or FLAGS.train == 0:
        while self.t_test - T < FLAGS.test:
          R.append(self.run_episode(test=True, monitor=(self.t_test - T < FLAGS.monitor * FLAGS.test), custom_policy=wolp))
          self.t_test += 1
        avr = np.mean(R)
        # print('Average test return\t{} after {} timesteps of training'.format(avr,self.t_train))
        with open(os.path.join(FLAGS.outdir, "output.log"), mode='a') as f:
          # f.write('Average test return\t{} after {} timesteps of training\n'.format(avr, self.t_train))
          f.write('Average test return\t{} after {} timesteps\n'.format(avr, self.t_train + FLAGS.test))
        # save return
        returns.append((self.t_train, avr))
        np.save(FLAGS.outdir+"/returns.npy",returns)

        s = self.agent.checkpoint_session()
        with open(os.path.join(FLAGS.outdir, "output.log"), mode='a') as f:
            f.write('Checkpoint saved at {} \n'.format(s))

        # evaluate required number of episodes for gym and end training when above threshold
        if self.env.spec.reward_threshold is not None and avr > self.env.spec.reward_threshold:
          # TODO: is it assumed that, when testing, the model does not have to use the full Wolpertinger policy?
          # TODO: the custom policy is passed to run_episode to avoid the item-not-found exception in the environment
          avr = np.mean([self.run_episode(test=True, custom_policy=wolp) for _ in range(self.env.spec.trials)]) # trials???
          # print('TRIALS => Average return{}\t Reward Threshold {}'.format(avr, self.env.spec.reward_threshold))
          with open(os.path.join(FLAGS.outdir, "output.log"), mode='a') as f:
            f.write('TRIALS => Average return{}\t Reward Threshold {}\n'.format(avr, self.env.spec.reward_threshold))
          if avr > self.env.spec.reward_threshold:
            s = self.agent.checkpoint_session()
            with open(os.path.join(FLAGS.outdir, "output.log"), mode='a') as f:
                f.write('Final Checkpoint saved at {} \n'.format(s))
            break

      # train
      T = self.t_train
      R = []
      start_time = time.time()
      while self.t_train - T < FLAGS.train:
        R.append(self.run_episode(test=False, custom_policy=wolp))
        self.t_train += 1
      end_time = time.time()
      avr = np.mean(R)
      # print('Average training return\t{} after {} timesteps of training'.format(avr,self.t_train))
      with open(os.path.join(FLAGS.outdir, "output.log"), mode='a') as f:
        f.write('Average training return\t{} after {} timesteps of training. Batch time: {} sec.\n'
                .format(avr, self.t_train, end_time - start_time))

    self.env.monitor.close()
    f.close()
    # upload results
    if FLAGS.upload:
      gym.upload(FLAGS.outdir+"/monitor",algorithm_id = GYM_ALGO_ID)
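
Example #6 hands the agent a custom_policy callable, wp.Wolpertinger(...).g, whose implementation is not shown. In the Wolpertinger architecture (Dulac-Arnold et al., 2015) that callable maps the actor's continuous proto-action onto the nearest member(s) of a fixed discrete action set. A hypothetical NumPy-only stand-in for such a mapping:

import numpy as np

def make_nearest_neighbor_policy(action_set, k=1):
    """Return g(proto_action): pick the k nearest discrete actions to a proto-action.

    action_set: array of shape (num_actions, action_dim) holding the candidate actions.
    This is an illustrative stand-in for wp.Wolpertinger(...).g, not the project's code.
    """
    action_set = np.asarray(action_set, dtype=np.float64)

    def g(proto_action):
        proto_action = np.asarray(proto_action, dtype=np.float64).reshape(1, -1)
        dists = np.linalg.norm(action_set - proto_action, axis=1)
        nearest = np.argsort(dists)[:k]
        # the full Wolpertinger policy would re-rank these k candidates with the
        # critic's Q-values; with k=1 this reduces to a plain nearest-neighbor lookup
        return action_set[nearest[0]] if k == 1 else action_set[nearest]

    return g

# usage: g = make_nearest_neighbor_policy(np.linspace(-2, 2, 41).reshape(-1, 1))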
Example #7
def train(hp, simulator, unity_worker_id):
    random.seed(1234)
    np.random.seed(2345)
    torch.manual_seed(4567)

    time_start = time.time()
    run_id = time.strftime('%b%d_%H-%M-%S', time.localtime(time_start))
    run_id = f'{run_id}_{unity_worker_id}'

    file_handler = logging.FileHandler(f'run/log/{run_id}.log')
    file_handler.setFormatter(FORMATTER)
    ROOT_LOGGER.addHandler(file_handler)

    repo = git.Repo()

    logger.info('======= run_id %s started at %s =======', run_id,
                time.strftime('%b%d_%H-%M-%S_%z'))
    logger.info('using commit: %s (clean=%s): %s', repo.head.commit.hexsha,
                not repo.is_dirty(), repo.head.commit.message)

    logger.info('run_id %s using hyper parameters: %s, unity_worker_id: %s',
                run_id, hp, unity_worker_id)

    setup_env()

    writer = tensorboardX.SummaryWriter(os.path.join('run/summary', run_id))
    env = UnityEnvironment(file_name=simulator, worker_id=unity_worker_id)
    state_length = env.brains[BRAIN_NAME].vector_observation_space_size
    action_length = env.brains[BRAIN_NAME].vector_action_space_size

    agent = ddpg.Agent(state_length=state_length * NUM_STACKS,
                       action_length=action_length,
                       hp=hp,
                       writer=writer)

    window_rewards = collections.deque(maxlen=SOLVE_NUM_EPISODES)

    last_save_episode = None

    pbar = tqdm.tqdm(total=hp.num_episodes, desc=run_id)
    for i_episode in range(hp.num_episodes):
        states = env.reset(train_mode=True)[BRAIN_NAME].vector_observations

        # this records the episode reward for each agent
        episode_rewards = np.zeros(NUM_HOMOGENEOUS_AGENTS)

        episode_length = 0

        while True:
            episode_length += 1
            actions = agent.act(states)
            env_info = env.step(actions)[BRAIN_NAME]
            next_states = env_info.vector_observations
            rewards = env_info.rewards
            dones = env_info.local_done

            agent.step(states, actions, rewards, next_states, dones)
            episode_rewards += rewards

            if any(dones):
                break
            else:
                states = next_states

        pbar.update(1)
        writer.add_scalar('episode_length', episode_length, i_episode)

        # the episode reward is defined to be the maximum of all agents
        episode_reward = np.max(episode_rewards)
        writer.add_scalar('episode_reward_agent_max', episode_reward,
                          i_episode)
        for i, reward in enumerate(episode_rewards):
            writer.add_scalar(f'agent_{i}/episode_reward', reward, i_episode)

        window_rewards.append(episode_reward)
        mean_reward = np.mean(window_rewards)
        writer.add_scalar(
            f'episode_reward_agent_max_avg_over_{SOLVE_NUM_EPISODES}_episodes',
            mean_reward, i_episode)

        if (len(window_rewards) >= SOLVE_NUM_EPISODES
                and mean_reward >= SOLVE_REWARD
                and (last_save_episode is None
                     or i_episode - last_save_episode >= hp.save_interval)):

            last_save_episode = i_episode
            save_dir = os.path.join('run/model', f'{run_id}_{i_episode}')
            os.makedirs(save_dir, exist_ok=True)
            agent.save(save_dir)
            logger.info('model saved to directory: %s', save_dir)

    time_stop = time.time()
    logger.info('run_id %s completed at %s, time cost: %s seconds', run_id,
                time.strftime('%b%d_%H-%M-%S_%z', time.localtime(time_stop)),
                f'{time_stop - time_start:.2f}')
    env.close()
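
train() in Example #7 reads only num_episodes and save_interval from hp directly and forwards the whole object to ddpg.Agent. A hypothetical invocation, assuming hp is a plain namespace; every field beyond those two is a placeholder for whatever the project's agent actually consumes:

from types import SimpleNamespace

# hypothetical hyperparameter bundle; only num_episodes and save_interval are
# read directly by train() above
hp = SimpleNamespace(
    num_episodes=2000,
    save_interval=100,
    # further fields (learning rates, buffer size, noise scale, ...) would go
    # here, matching whatever ddpg.Agent in this project actually reads
)

# train(hp, simulator='path/to/UnitySimulator.x86_64', unity_worker_id=1)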
Example #8
 def __init__(self, hp):
     self.hp = hp
     self.memory = replay_buffer.ReplayBuffer(hp)
     self.agents = [ddpg.Agent(self.hp) for _ in range(self.hp.num_agents)]
     self.losses = (0., 0.)
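
Example #8 shows only the constructor of a multi-agent wrapper: several ddpg.Agent instances sharing one replay buffer. A hypothetical sketch of how such a wrapper commonly dispatches acting per agent; the agent_cls(hp) and agent.act(state) calls mirror the constructor above, but their exact signatures are assumptions:

import numpy as np

class MultiAgentSketch:
    """Illustrative wrapper mirroring the constructor in Example #8."""

    def __init__(self, hp, agent_cls, replay_buffer):
        self.hp = hp
        self.memory = replay_buffer   # one replay buffer shared by all agents
        self.agents = [agent_cls(hp) for _ in range(hp.num_agents)]
        self.losses = (0., 0.)

    def act(self, states):
        # one observation row per agent; each agent acts only on its own row
        return np.stack([agent.act(state) for agent, state in zip(self.agents, states)])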