Example #1
    def make_obs_ph(name):
        obs_shape = env.observation_space.shape

        # if flatten_obs:
        #     flattened_env_shape = 1
        #     for dim_size in env.observation_space.shape:
        #         flattened_env_shape *= dim_size
        #     obs_shape = (flattened_env_shape,)

        return U.BatchInput(obs_shape, name=name)
Example #2
    def _observation_ph_generator(self, name):
        env = self.env

        if isinstance(env.observation_space, (MultiBinary, Discrete)):
            batch_shape = (env.observation_space.n, )
        elif isinstance(env.observation_space, Box):
            batch_shape = env.observation_space.shape
        else:
            raise ValueError("Unexpected observation space")

        return tf_util.BatchInput(batch_shape, name=name)
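The branches above size the placeholder as (n,) for Discrete and MultiBinary spaces, which presumes each observation is fed in as a length-n vector; a raw Discrete observation (a single integer) would therefore need one-hot encoding first. A minimal sketch of that convention (the helper name one_hot_obs is illustrative, not from the source):

import numpy as np

def one_hot_obs(obs, n):
    # Encode a Discrete observation (an int in [0, n)) as a length-n float32 vector,
    # matching a BatchInput placeholder created with batch_shape=(n,).
    vec = np.zeros(n, dtype=np.float32)
    vec[obs] = 1.0
    return vec

# Example: for Discrete(4), one_hot_obs(2, 4) -> [0., 0., 1., 0.]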
Example #3
    def _build_model(self):
        sess = U.get_session()
        if sess is None:
            sess = U.make_session(8)
            sess.__enter__()
        self.act, self.train, self.update_target, self.debug = deepq.build_train(
            make_obs_ph=lambda name: U.BatchInput(shape=[2, self.state_size],
                                                  name=name),
            q_func=self.model2,
            num_actions=self.action_size,
            optimizer=tf.train.AdamOptimizer(learning_rate=5e-4),
            scope=self.scope,
            double_q=True,
            param_noise=True)

        # Initialize the TF environment
        U.initialize()
        self.update_target()
Example #4
 def make_obs_ph(name):
     return U.BatchInput((32, 32), name=name)
Example #5
def main():
    with U.make_session(8):
        env = gym.make("Pendulum-v0")

        act, train, update_target, debug = deepq.build_train(
            make_obs_ph=lambda name: U.BatchInput(env.observation_space.shape,
                                                  name=name),
            q_func=model,
            num_actions=env.action_space.n,  # build_train expects an integer count of discrete actions
            optimizer=tf.train.AdamOptimizer(learning_rate=5e-4),
        )

        # Create the replay buffer
        replay_buffer = ReplayBuffer(50000)
        # Create the schedule for exploration starting from 1 (every action is random) down to
        # 0.02 (98% of actions are selected according to values predicted by the model).
        exploration = LinearSchedule(schedule_timesteps=10000,
                                     initial_p=1.0,
                                     final_p=0.02)

        # Initialize the parameters and copy them to the target network.
        U.initialize()
        update_target()

        episode_rewards = [0.0]
        obs = env.reset()
        for t in itertools.count():
            env.render()

            # Take action and update exploration to the newest value
            action = act(obs[None], update_eps=exploration.value(t))[0]
            new_obs, rew, done, _ = env.step(action)
            # Store transition in the replay buffer.
            replay_buffer.add(obs, action, rew, new_obs, float(done))
            obs = new_obs

            episode_rewards[-1] += rew
            if done:
                obs = env.reset()
                episode_rewards.append(0)

            is_solved = t > 100 and np.mean(episode_rewards[-101:-1]) >= 200
            if is_solved:
                # Show off the result
                env.render()
            else:
                # Minimize the error in Bellman's equation on a batch sampled from replay buffer.
                if t > 1000:
                    obses_t, actions, rewards, obses_tp1, dones = replay_buffer.sample(
                        32)
                    train(obses_t, actions, rewards, obses_tp1, dones,
                          np.ones_like(rewards))
                # Update target network periodically.
                if t % 1000 == 0:
                    update_target()

            if done and len(episode_rewards) % 10 == 0:
                logger.record_tabular("steps", t)
                logger.record_tabular("episodes", len(episode_rewards))
                logger.record_tabular(
                    "mean episode reward",
                    round(np.mean(episode_rewards[-101:-1]), 1))
                logger.record_tabular("% time spent exploring",
                                      int(100 * exploration.value(t)))
                logger.dump_tabular()
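Note that deepq's build_train expects an integer number of discrete actions, while Pendulum-v0 exposes a continuous Box action space, so env.action_space.n only exists after the torque range has been discretized. A minimal sketch of such a wrapper, assuming evenly spaced bins (the class name DiscretizedActions and n_bins=5 are illustrative, not from the source):

import gym
import numpy as np

class DiscretizedActions(gym.ActionWrapper):
    # Map a small set of discrete indices onto evenly spaced points of a 1-D Box action space.
    def __init__(self, env, n_bins=5):
        super().__init__(env)
        low, high = env.action_space.low[0], env.action_space.high[0]
        self._torques = np.linspace(low, high, n_bins)
        self.action_space = gym.spaces.Discrete(n_bins)

    def action(self, action_index):
        # Translate the discrete index chosen by the policy back into a Box action.
        return np.array([self._torques[action_index]], dtype=np.float32)

# Usage sketch: env = DiscretizedActions(gym.make("Pendulum-v0"), n_bins=5)
#               then num_actions=env.action_space.n in build_train.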
Example #6
    return info['rewards']


if __name__ == '__main__':
    with U.make_session(4) as sess:
        args = parse_args()
        env, _ = dqn_core.make_env_atari(args.env)

        if args.random_action > 0:
            env = dqn_core.ActionRandomizer(env, args.random_action)

        model_parent_path = dqn_core.parent_path(args.model_dir)
        old_args = json.load(open(model_parent_path + '/args.json'))

        var_func, cvar_func = dqn_core.models.atari_model()
        act = dqn_core.build_act(make_obs_ph=lambda name: U.BatchInput(
            env.observation_space.shape, name=name),
                                 var_func=var_func,
                                 cvar_func=cvar_func,
                                 num_actions=env.action_space.n,
                                 nb_atoms=old_args['nb_atoms'])
        U.load_state(os.path.join(args.model_dir, "saved"))

        rewards = run(env, act, args.stochastic, args.nb_episodes)

    print('---------------------')
    for alpha in np.arange(0.05, 1.05, 0.05):
        v, cv = var_cvar_from_samples(rewards, alpha)
        print('CVaR_{:.2f} = {}'.format(alpha, cv))
Example #7
 def make_obs_ph(name):
     return U.BatchInput((16, 16), name=name)
Example #8
 def make_obs_ph(name):
   return U.BatchInput((84,84,4), name=name)
Example #9
def main(_):
    print("Used flags:", FLAGS)
    config = configparser.ConfigParser()
    config.read(FLAGS.config_file)
    timer = time.time()

    ps_hosts = FLAGS.ps_hosts.split(",") if FLAGS.ps_hosts else config.get(FLAGS.config, 'ps_hosts').split(",")
    worker_hosts = FLAGS.worker_hosts.split(",") if FLAGS.worker_hosts else config.get(FLAGS.config, 'worker_hosts').split(",")
    job = FLAGS.job_name
    task = FLAGS.task_index
    learning_rate = config.getfloat(FLAGS.config, 'learning_rate')
    batch_size = config.getint(FLAGS.config, 'batch_size')
    memory_size = config.getint(FLAGS.config, 'memory_size')
    target_update = config.getint(FLAGS.config, 'target_update')
    seed = FLAGS.seed if FLAGS.seed else config.getint(FLAGS.config, 'seed')
    max_comm_rounds = config.getint(FLAGS.config, 'comm_rounds')
    epochs = config.getint(FLAGS.config, 'start_epoch')
    end_epoch = config.getint(FLAGS.config, 'end_epoch')
    epoch_decay = config.getint(FLAGS.config, 'epoch_decay')
    # epoch_decay_rate = (epochs - end_epoch) / epoch_decay
    epoch = LinearSchedule(epoch_decay, end_epoch, epochs)
    backup = config.getint(FLAGS.config, 'backup')  # unused in async
    sync = config.getboolean(FLAGS.config, 'sync')
    gradient_prio = False if not sync else config.getboolean(FLAGS.config, 'gradient_prio')
    sync_workers = len(worker_hosts)-backup
    mute = FLAGS.mute if FLAGS.mute else config.getboolean(FLAGS.config, 'mute')
    animate = 0
    draw = 0

    print("Config:\nps_hosts={}\nworker_hosts={}\njob_name={}\ntask_index={}\nlearning_rate={}\n"
          "batch_size={}\nmemory_size={}\ntarget_update={}\nseed={}\ncomm_rounds={}\nepochs={}\n"
          "end_epoch={}\nepoch_decay={}\nnbackup={}\nsync={}"
          .format(ps_hosts, worker_hosts, job, task, learning_rate, batch_size, memory_size, target_update,
                  seed, max_comm_rounds, epochs, end_epoch, epoch_decay, backup, sync))

    cluster = tf.train.ClusterSpec({'ps': ps_hosts, 'worker': worker_hosts})
    chief = True if job == 'worker' and task == 0 else False
    print("/job:", job, "/task:", task, " - Chief: ", chief, sep='')

    # Create server
    server = tf.train.Server(cluster, job_name=job, task_index=task)

    run_code = "{}-{}-p-{}-w-{}-E-{}-b-{}-m-{}-N-{}-lr-{}-B-{}-s-{}-".\
        format(datetime.now().strftime("%y%m%d-%H%M%S"), env_name, len(ps_hosts), len(worker_hosts),
               epochs, batch_size, memory_size, target_update, learning_rate, backup, seed)
    run_code += "-sync" if sync else "-async"

    # Set a unique random seed for each client
    seed = ((seed * 10) + task)
    random.seed(seed)

    if not mute:
        print("Run code:", run_code)

    # Start parameter servers
    if job == 'ps':
        server.join()

    # Start training
    with U.make_session(num_cpu=4, target=server.target) as sess:
        # Create the environment
        env = gym.make(env_name)
        env.seed(seed)
        tf.set_random_seed(seed)

        # Create all the functions necessary to train the model
        act, train, global_opt,  update_target, update_weights, sync_opt, debug = deepq.build_train(
            make_obs_ph=lambda name: U.BatchInput(env.observation_space.shape, name=name),
            q_func=model,
            num_actions=env.action_space.n,
            optimizer=tf.train.AdamOptimizer(learning_rate=learning_rate),
            # optimizer=tf.train.GradientDescentOptimizer(learning_rate=learning_rate),
            chief=chief,
            server=server,
            workers=sync_workers
        )
        # Create the replay buffer
        replay_buffer = ReplayBuffer(memory_size)
        # Create the schedule for exploration starting from 1 (every action is random) down to
        # 0.02 (98% of actions are selected according to values predicted by the model).
        exploration = LinearSchedule(schedule_timesteps=10000, initial_p=1.0, final_p=0.02)

        if not chief:
            if not mute:
                print("Worker {}/{} will sleep (3s) for chief to initialize variables".format(task+1, len(worker_hosts)))
            time.sleep(4)

        # Initialize the parameters and copy them to the target network.
        U.initialize(chief=chief)

        if chief:
            sess.run(debug['run_code'].assign(run_code))
            if not mute:
                print("Set global run code to:", run_code)

        if not mute:
            print("initialized variables, sleeping for 1 sec")
        time.sleep(2)

        if not chief:
            while not sess.run(tf.is_variable_initialized(debug['run_code'])):
                if not mute:
                    print("Global run code not yet initialized")
                time.sleep(2)
            run_code = str(sess.run(debug['run_code']).decode())
            if run_code == '':
                if not mute:
                    print("Run code empty. Trying to fetch again...")
                time.sleep(5)
            if not mute:
                print("Read global run code:", run_code)

        run_code += "(w" + str(task) + ")"
        print("Final run_code:", run_code)

        t_global_old = update_weights()[0][0]
        update_target()
        exp_gen = 1000  # For how many timesteps should we only generate experience (not train)
        t_start = exp_gen
        comm_rounds = 0
        comm_rounds_global = 0
        dt = 0
        write_csv(run_code, log=["episode", "reward" + str(task), "avg_reward" + str(task), "t_global", "cr"])

        episode_rewards = [0.0]
        cr_reward = 0
        obs = env.reset()
        for t in itertools.count():
            # Take action and update exploration to the newest value
            action = act(obs[None], update_eps=exploration.value(t))[0]
            new_obs, rew, done, _ = env.step(action)
            # Store transition in the replay buffer.
            replay_buffer.add(obs, action, rew, new_obs, float(done))
            obs = new_obs

            episode_rewards[-1] += rew
            cr_reward += rew

            # Animate every <animate> episodes
            if not mute and chief and animate > 0 and (len(episode_rewards) % animate) == 0:
                if done:
                    print("ep", len(episode_rewards), "ended with reward:", episode_rewards[-1])
                env.render()

            if done:
                if not mute and chief and draw > 0 and len(episode_rewards) % draw == 0:
                    env.render()
                avg_rew = np.round(np.mean(np.array(episode_rewards[-100:])), 1)
                write_csv(run_code, [len(episode_rewards), episode_rewards[-1], avg_rew, debug['t_global']()[0], comm_rounds_global])

                obs = env.reset()
                episode_rewards.append(0)

            [converged] = sync_opt['check_converged']()
            is_solved = t > 100 and np.mean(episode_rewards[-101:-1]) >= max_reward or converged
            if is_solved or comm_rounds >= max_comm_rounds:
                sync_opt['set_converged']([True])
                if not mute:
                    print("Converged was set to", sync_opt['check_converged']()[0])
                write_csv_final(run_code, str(len(episode_rewards)), worker_hosts, chief, comm_rounds_global, mute)
                print("Converged after:  ", len(episode_rewards), "episodes")
                print("Agent total steps:", t)
                print("Global steps:     ", debug['t_global']()[0])
                sec = round(time.time() - timer)
                print("Total time:", sec // 3600, "h", (sec % 3600) // 60, "min", sec % 60, "s")
                return
            else:
                if t >= exp_gen:
                # if t >= batch_size:
                    obses_t, actions, rewards, obses_tp1, dones = replay_buffer.sample(batch_size)
                    td_error = train(obses_t, actions, rewards, obses_tp1, dones, np.ones_like(rewards))

                    if t - t_start >= np.round(epoch.value(comm_rounds)):  

                        cr_old = comm_rounds_global

                        # Apply gradients to weights in PS
                        if sync:
                            # Tell the ps we are done and want to submit score
                            [[comm_rounds_global], [worker_count]] = sync_opt['request_submit']()

                            if comm_rounds_global == comm_rounds:
                                if worker_count <= sync_workers:
                                    # If allowed to submit score, do it
                                    [comm_rounds_global] = sync_opt['submit_score']([cr_reward])

                                    if chief: 
                                        [submits] = sync_opt['set_submit']([0])
                                        while worker_count != sync_workers:
                                            if sync_opt['check_converged']()[0]:
                                                if not mute:
                                                    print("Other worker converged! Finishing in check_wc")
                                                break
                                            worker_count = sync_opt['check_wc']()[0]

                                    while sync_opt['check_submit']()[0] == -1:
                                        if sync_opt['check_converged']()[0]:
                                            if not mute:
                                                print("Other worker converged! Finishing in check_submit")
                                            break
                                      
                                        pass

                                    if sync_opt['check_converged']()[0]:
                                        if not mute:
                                            print("Other worker converged! Continuing before submit")
                                        continue

                                    # Now all eligible workers have sent their score and gradient round has started
                                    # Submit gradient
                                    # TODO 4th argument overrides everything else unless it is set to -1 in the code
                                    [[dt], [comm_rounds_global], [factor]] = global_opt([t - t_start], [t_global_old],
                                                                              [cr_reward], [1/len(worker_hosts)], [True])

                                    submits = sync_opt['inc_submit']()
                                    if chief:
                                        while not sync_opt['check_submit']()[0] == sync_workers:
                                            if sync_opt['check_converged']()[0]:
                                                if not mute:
                                                    print("Other worker converged! Finishing in check_submit (chief)")
                                                break
                                          
                                            pass
                                        # print("Round", comm_rounds, "finished")
                                        [w] = sync_opt['reset_wc']()[0]
                                        # print("Worker count reset to:", w)
                                        sync_opt['reset_score']()
                                        submits = sync_opt['set_submit']([-1])
                                        # print("Submit round finished. Submits set to:", submits[0])
                                        [r] = sync_opt['inc_comm_round']()[0]
                                        # print("New round started:", r)

                                    # Normal workers wait until GCR > CR
                                    if not chief:
                                        while sync_opt['check_round']()[0] <= comm_rounds:
                                            if sync_opt['check_converged']()[0]:
                                                if not mute:
                                                    print("Other worker converged! Finishing in check_round")
                                                break
                                            # print("Worker submitted, waiting for next round:", comm_rounds + 1)
                                            # time.sleep(0.1)
                                            pass

                                else: #elif worker_count > sync_workers:
                                    # If not allowed to submit score, wait for next round to start
                                    if not mute:
                                        print("Worker finished too late but before new round started (", comm_rounds_global, ")")
                                        print("WC(", worker_count, ") > N(", sync_workers, ")", sep="")
                                    target = np.floor(comm_rounds_global + 1)  # +1 if x.0, +0.5 if x.5
                                    while not sync_opt['check_round']()[0] >= target:
                                        pass

                            elif comm_rounds_global > comm_rounds:
                                # This means the worker is behind. Do nothing and start next round
                                if not mute:
                                    print("Communication round ", comm_rounds, "missed. Actual round:", comm_rounds_global)
                                # TODO How to handle round count when skipping rounds?
                                comm_rounds = comm_rounds_global - 1

                            elif comm_rounds_global < comm_rounds:
                                print("WARNING! Worker ahead of global:", comm_rounds, ">", comm_rounds_global)
                                time.sleep(5)

                        else:
                            sync_opt['inc_comm_round']()
                            [[dt], [comm_rounds_global], [factor]] = global_opt([t - t_start], [t_global_old], [0], [-1], [False])

                        # Update the local weights with the new global weights from PS
                        t_global_old = update_weights()[0][0]

                        comm_rounds += 1
                        # print("Round finished. Increasing local comm_round to:", comm_rounds)
                        cr_reward = 0
                        # TODO RE-ENABLE comm-rounds LOGGING
                        # write_csv(run_code, [comm_rounds, t, dt, epoch.value(comm_rounds)], comm_rounds=True)

                        t_start = t
                if t % target_update == 0:
                    update_target()

            if not mute and done and len(episode_rewards) % 10 == 0:
                last_rewards = episode_rewards[-101:-1]
                logger.record_tabular("steps", t)
                logger.record_tabular("global steps", debug['t_global']()[0])
                logger.record_tabular("communication rounds", comm_rounds)
                logger.record_tabular("episodes", len(episode_rewards))
                logger.record_tabular("mean episode reward", np.round(np.mean(episode_rewards[-101:-1]), 4))
                logger.record_tabular("% time spent exploring", int(100 * exploration.value(t)))
                # logger.record_tabular("last gradient factor", np.round(factor, 4))
                logger.dump_tabular()
                rew_ill = ['●' if x >= max_reward else str(int(np.floor(x / (max_reward/10)))) if x >= (max_reward/10) else '_' for x in last_rewards]
                streak = 0
                for i in reversed(rew_ill):
                    if i == "●":
                        streak += 1
                    else:
                        break
                #print("[" + ''.join(rew_ill) + "] ([● " + str(rew_ill.count('●')) + " | " + str(rew_ill.count('9')) +
                      " | " + str(rew_ill.count('8')) + " | " + str(rew_ill.count('7')) +
                      " | " + str(rew_ill.count('6')) + " | " + str(rew_ill.count('5')) +
                      " | " + str(rew_ill.count('4')) + " | " + str(rew_ill.count('3')) +
                      " | " + str(rew_ill.count('2')) + " | " + str(rew_ill.count('1')) +
                      " | " + str(rew_ill.count('_')) + " _]/" + str(len(rew_ill)) + " {S:" + str(streak) + "})", sep='')
Example #10
 def make_obs_ph(name):
   return U.BatchInput((64, 64), name=name)
Example #11
 def make_placeholder(name):
     """Make a placeholder input."""
     return tf_util.BatchInput(env.observation_space.shape, name=name)
Example #12
 def make_obs_ph(name):
     import dqn.tf_util as U
     return U.BatchInput(observation_shape, name=name)
Example #13
 def make_obs_ph(name):
     return U.BatchInput((observation_space_shape[0] +
                          env_transfer.observation_space.shape[0], ),
                         name=name)
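The placeholder above is sized to the sum of the base and transfer observation dimensions, which suggests the two observation vectors are concatenated before being fed to act/train. A minimal sketch of that combination (the helper name combine_obs and its arguments are illustrative, not from the source):

import numpy as np

def combine_obs(obs, transfer_obs):
    # Stack the two 1-D observations so the result matches a placeholder of shape
    # (observation_space_shape[0] + env_transfer.observation_space.shape[0],).
    return np.concatenate([np.asarray(obs, dtype=np.float32),
                           np.asarray(transfer_obs, dtype=np.float32)])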
Example #14
 def make_obs_ph(name):
   return U.BatchInput(env.observation_spec()["screen"], name=name)
Example #15
 def make_obs_ph(name):
     return U.BatchInput((num_actions, num_actions), name=name)
Example #16
 def make_obs_ph(name):
     return U.BatchInput(observation_space_shape, name=name)
Example #17
 def make_obs_ph(name):
     return U.BatchInput((env.observation_space.shape[0] * 2, ), name=name)
Example #18
    def evaluate(self, num_episodes, render=False):
        with U.make_session(NUM_CORES):
            self.t0 = time.time()
            env = self.env.env

            # Create all the functions necessary to train the model
            act, train, update_target, debug = deepq.build_train(
                    make_obs_ph=lambda name: U.BatchInput(env.observation_space.shape, name=name),
                    q_func=model,
                    num_actions=env.action_space.n,
                    optimizer=tf.train.AdamOptimizer(learning_rate=5e-4)
            )
            # Create the replay buffer
            replay_buffer = ReplayBuffer(50000)
            # Create the schedule for exploration starting from 1 (every action is random) down to
            # 0.02 (98% of actions are selected according to values predicted by the model).
            exploration = LinearSchedule(schedule_timesteps=10000, initial_p=1.0, final_p=0.02)

            # Initialize the parameters and copy them to the target network.
            U.initialize()
            update_target()

            self.episode_count += 1
            state = env.reset()
            self.scores = [0.0]
            episode_q = []

            for t in itertools.count():
                action = act(state[None], update_eps=exploration.value(t))[0]
                observation, reward, done, _ = env.step(action)
                replay_buffer.add(state, action, reward, observation, float(done))

                state = observation
                self.scores[-1] += reward

                episode_q.append(float(debug['q_values'](state[None]).max()))

                if render:
                    env.render()

                if done:
                    print('{0}, score: {1} ({2})'.format(len(self.scores), self.scores[-1], np.mean(self.scores[-100:])))
                    self.evaluation.info['q_values'].append(np.mean(episode_q))

                    if len(self.scores) >= num_episodes:
                        return self.final_evaluation()

                    state = env.reset()
                    episode_q = []
                    self.scores.append(0)

                    if self.env.solved(self.scores):
                        self.evaluation.info['solved'] = len(self.scores)

                # Minimize the error in Bellman's equation on a batch sampled from replay buffer.
                if t > 1000:
                    obses_t, actions, rewards, obses_tp1, dones = replay_buffer.sample(32)
                    train(obses_t, actions, rewards, obses_tp1, dones, np.ones_like(rewards))

                # Update target network periodically.
                if t % 1000 == 0:
                    update_target()

            U.reset()
            return self.final_evaluation()
Example #19
        # out = layers.fully_connected(out, num_outputs=num_actions, activation_fn=None)
        out = layers.fully_connected(out, num_outputs=512, activation_fn=tf.nn.relu)
        out = layers.fully_connected(out, num_outputs=256, activation_fn=tf.nn.relu)
        out = layers.fully_connected(out, num_outputs=256, activation_fn=tf.nn.relu)
        out = layers.fully_connected(out, num_outputs=num_actions, activation_fn=None)
        # out = layers.layer_norm(out, center=True, scale=True)
        return out


if __name__ == '__main__':
    with U.make_session(8):
        # Create the environment
        env = gym.make("CartPole-v0")
        # Create all the functions necessary to train the model
        act, train, update_target, debug = deepq.build_train(
            make_obs_ph=lambda name: U.BatchInput(env.observation_space.shape, name=name),
            q_func=model,
            num_actions=env.action_space.n,
            optimizer=tf.train.AdamOptimizer(learning_rate=5e-4),
            param_noise=False
        )
        # Create the replay buffer
        replay_buffer = PrioritizedReplayBuffer(50000, alpha=0.6)
        # Create the schedule for exploration starting from 1 (every action is random) down to
        # 0.02 (98% of actions are selected according to values predicted by the model).
        exploration = LinearSchedule(schedule_timesteps=10000, initial_p=1.0, final_p=0.02)

        # Initialize the parameters and copy them to the target network.
        U.initialize()
        update_target()
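The snippet above is cut off right after initialization, so the loop that actually uses the PrioritizedReplayBuffer is not shown. A hedged sketch of how such a loop typically samples with importance weights and writes updated priorities back, modeled on the loop in Example #5 (the beta value 0.4 and the 1e-6 priority floor are assumptions, not from the source):

        obs = env.reset()
        for t in itertools.count():
            action = act(obs[None], update_eps=exploration.value(t))[0]
            new_obs, rew, done, _ = env.step(action)
            replay_buffer.add(obs, action, rew, new_obs, float(done))
            obs = new_obs if not done else env.reset()

            if t > 1000:
                # Prioritized sampling also returns importance weights and buffer indices.
                (obses_t, actions, rewards, obses_tp1, dones,
                 weights, batch_idxes) = replay_buffer.sample(32, beta=0.4)
                td_errors = train(obses_t, actions, rewards, obses_tp1, dones, weights)
                # Feed |TD error| back so informative transitions are sampled more often.
                replay_buffer.update_priorities(batch_idxes, np.abs(td_errors) + 1e-6)
            if t % 1000 == 0:
                update_target()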
Example #20
    compass_channel /= 180.0

    return np.concatenate([pov, compass_channel], axis=-1)


if __name__ == '__main__':
    with U.make_session(8):
        # Create the environment
        env = gym.make("MineRLNavigateDense-v0")
        spaces = env.observation_space.spaces['pov']
        shape = list(spaces.shape)
        shape[-1] += 1

        # Create all the functions necessary to train the model
        act, train, update_target, debug = deepq.build_train(
            make_obs_ph=lambda name: U.BatchInput(shape, name=name),
            q_func=model,
            num_actions=4,
            optimizer=tf.train.AdamOptimizer(learning_rate=5e-4),
        )
        # Create the replay buffer
        replay_buffer = ReplayBuffer(30000)
        # Create the schedule for exploration starting from 1 (every action is random) down to
        # 0.02 (98% of actions are selected according to values predicted by the model).
        exploration = LinearSchedule(schedule_timesteps=100000,
                                     initial_p=1.0,
                                     final_p=0.02)

        # Initialize the parameters and copy them to the target network.
        U.initialize()
        update_target()