示例#1
0
    def __init__(self, env_name, config, upload_dir=None):
        """Set up the evolution-strategies driver.

        Tags ``config`` with the algorithm name, runs the base
        ``Algorithm`` initializer, builds the local policy and optimizer,
        fetches the shared noise table and launches the remote workers.

        Args:
            env_name: Name passed to ``gym.make`` and to every worker.
            config: Mapping of settings; ``"stepsize"`` and
                ``"num_workers"`` are read here. It is updated in place
                with ``"alg": "EvolutionStrategies"``.
            upload_dir: Forwarded unchanged to ``Algorithm.__init__``.
        """
        config.update({"alg": "EvolutionStrategies"})
        Algorithm.__init__(self, env_name, config, upload_dir=upload_dir)

        policy_params = {"ac_noise_std": 0.01}

        # Local policy, optimizer and observation statistics. The session
        # is made before the policy is constructed, as in the original.
        local_env = gym.make(env_name)
        utils.make_session(single_threaded=False)
        self.policy = policies.GenericPolicy(
            local_env.observation_space,
            local_env.action_space,
            **policy_params)
        tf_util.initialize()
        self.optimizer = optimizers.Adam(self.policy, config["stepsize"])
        self.ob_stat = utils.RunningStat(
            local_env.observation_space.shape, eps=1e-2)

        # Create the shared noise table.
        print("Creating shared noise table.")
        noise_ref = create_shared_noise.remote()
        self.noise = SharedNoiseTable(ray.get(noise_ref))

        # Create the actors; each one receives the same noise reference.
        print("Creating actors.")
        spawned = []
        for _ in range(config["num_workers"]):
            spawned.append(
                Worker.remote(config, policy_params, env_name, noise_ref))
        self.workers = spawned

        # Progress counters and the wall-clock start time.
        self.episodes_so_far = 0
        self.timesteps_so_far = 0
        self.tstart = time.time()
        self.iteration = 0
示例#2
0
    def __init__(self, env_name, config, upload_dir=None):
        """Set up the policy-gradient driver.

        Tags ``config`` with the algorithm name, runs the base
        ``Algorithm`` initializer, picks a preprocessor from the
        environment name, then builds one local ``Agent`` and
        ``config["num_agents"]`` remote agents.

        Args:
            env_name: Environment name forwarded to ``Algorithm.__init__``.
            config: Mapping of settings; ``"kl_coeff"`` and
                ``"num_agents"`` are read here. It is updated in place
                with ``"alg": "PolicyGradient"``.
            upload_dir: Forwarded unchanged to ``Algorithm.__init__``.
        """
        config.update({"alg": "PolicyGradient"})
        Algorithm.__init__(self, env_name, config, upload_dir=upload_dir)

        # TODO(ekl): preprocessor should be associated with the env elsewhere
        no_preprocessing_envs = (
            "CartPole-v0",
            "CartPole-v1",
            "Hopper-v1",
            "Walker2d-v1",
            "Humanoid-v1",
        )
        if self.env_name == "Pong-ram-v3":
            chosen = AtariRamPreprocessor()
        elif self.env_name in no_preprocessing_envs:
            chosen = NoPreprocessor()
        else:
            # "Pong-v0" and every unlisted environment get the pixel
            # preprocessor.
            chosen = AtariPixelPreprocessor()
        self.preprocessor = chosen

        self.global_step = 0
        self.j = 0
        self.kl_coeff = config["kl_coeff"]

        # The local model and the remote agents take the same arguments
        # except for the trailing boolean flag.
        self.model = Agent(
            self.env_name, 1, self.preprocessor, self.config, self.logdir,
            False)
        remote_agents = []
        for _ in range(config["num_agents"]):
            remote_agents.append(
                RemoteAgent.remote(
                    self.env_name, 1, self.preprocessor, self.config,
                    self.logdir, True))
        self.agents = remote_agents
        self.start_time = time.time()
示例#3
0
  def __init__(self, env_name, config):
    """Set up the policy-gradient driver.

    Runs the base ``Algorithm`` initializer, picks a preprocessor from the
    environment name, then builds one local ``Agent`` and
    ``config["num_agents"]`` remote agents.

    Args:
      env_name: Environment name forwarded to ``Algorithm.__init__``.
      config: Mapping of settings; ``"kl_coeff"`` and ``"num_agents"``
        are read here.
    """
    Algorithm.__init__(self, env_name, config)

    # TODO(ekl) the preprocessor should be associated with the env elsewhere
    if self.env_name == "Pong-ram-v3":
      chosen = AtariRamPreprocessor()
    elif self.env_name in ("CartPole-v0", "Walker2d-v1"):
      chosen = NoPreprocessor()
    else:
      # "Pong-v0" and every unlisted environment get the pixel preprocessor.
      chosen = AtariPixelPreprocessor()
    self.preprocessor = chosen

    self.global_step = 0
    self.j = 0
    self.kl_coeff = config["kl_coeff"]

    # The local model and the remote agents differ only in the last flag.
    self.model = Agent(
        self.env_name, 1, self.preprocessor, self.config, False)
    remote_agents = []
    for _ in range(config["num_agents"]):
      remote_agents.append(
          RemoteAgent.remote(
              self.env_name, 1, self.preprocessor, self.config, True))
    self.agents = remote_agents
示例#4
0
    def __init__(self, env_name, config):
        """Set up the evolution-strategies driver.

        Runs the base ``Algorithm`` initializer, fetches the shared noise
        table, launches the remote workers and then builds the local
        policy and optimizer.

        Args:
            env_name: Name passed to ``gym.make`` and to every worker.
            config: Settings object; the ``num_workers`` and ``stepsize``
                attributes are read here.
        """
        Algorithm.__init__(self, env_name, config)

        policy_params = dict([
            ("ac_bins", "continuous:"),
            ("ac_noise_std", 0.01),
            ("nonlin_type", "tanh"),
            ("hidden_dims", [256, 256]),
            ("connection_type", "ff"),
        ])

        # Create the shared noise table.
        print("Creating shared noise table.")
        noise_ref = create_shared_noise.remote()
        self.noise = SharedNoiseTable(ray.get(noise_ref))

        # Create the actors; each one receives the same noise reference.
        print("Creating actors.")
        spawned = []
        for _ in range(config.num_workers):
            spawned.append(
                Worker.remote(config, policy_params, env_name, noise_ref))
        self.workers = spawned

        # Local policy, optimizer and observation statistics. The session
        # is made before the policy is constructed, as in the original.
        local_env = gym.make(env_name)
        utils.make_session(single_threaded=False)
        self.policy = policies.MujocoPolicy(
            local_env.observation_space,
            local_env.action_space,
            **policy_params)
        tf_util.initialize()
        self.optimizer = optimizers.Adam(self.policy, config.stepsize)
        self.ob_stat = utils.RunningStat(
            local_env.observation_space.shape, eps=1e-2)

        # Progress counters and the wall-clock start time.
        self.episodes_so_far = 0
        self.timesteps_so_far = 0
        self.tstart = time.time()
        self.iteration = 0
示例#5
0
文件: dqn.py 项目: xgong/ray
    def __init__(self, env_name, config, upload_dir=None):
        """Set up the DQN trainer.

        Tags ``config`` with the algorithm name, runs the base
        ``Algorithm`` initializer, wraps the gym environment, builds the
        training functions via ``build_train``, creates the replay buffer
        and exploration schedule, and resets the environment.

        Args:
            env_name: Name passed to ``gym.make``.
            config: Mapping of settings; it is updated in place with
                ``"alg": "DQN"``. Keys read here include ``"num_cpu"``,
                ``"lr"``, ``"gamma"``, ``"buffer_size"``,
                ``"prioritized_replay"`` and the exploration and schedule
                settings.
            upload_dir: Forwarded unchanged to ``Algorithm.__init__``.
        """
        config.update({"alg": "DQN"})
        Algorithm.__init__(self, env_name, config, upload_dir=upload_dir)

        wrapped_env = ScaledFloatFrame(wrap_dqn(gym.make(env_name)))
        self.env = wrapped_env

        q_network = models.cnn_to_mlp(
            convs=[(32, 8, 4), (64, 4, 2), (64, 3, 1)],
            hiddens=[256],
            dueling=True)

        # The session is entered here and never exited in this method.
        session = U.make_session(num_cpu=config["num_cpu"])
        session.__enter__()

        def make_obs_ph(name):
            return U.BatchInput(wrapped_env.observation_space.shape, name=name)

        train_fns = build_train(
            make_obs_ph=make_obs_ph,
            q_func=q_network,
            num_actions=wrapped_env.action_space.n,
            optimizer=tf.train.AdamOptimizer(learning_rate=config["lr"]),
            gamma=config["gamma"],
            grad_norm_clipping=10)
        self.act, self.optimize, self.update_target, self.debug = train_fns

        # Create the replay buffer
        if not config["prioritized_replay"]:
            self.replay_buffer = ReplayBuffer(config["buffer_size"])
            self.beta_schedule = None
        else:
            self.replay_buffer = PrioritizedReplayBuffer(
                config["buffer_size"],
                alpha=config["prioritized_replay_alpha"])
            beta_iters = config["prioritized_replay_beta_iters"]
            if beta_iters is None:
                # Fall back to the overall schedule length.
                beta_iters = config["schedule_max_timesteps"]
            self.beta_schedule = LinearSchedule(
                beta_iters,
                initial_p=config["prioritized_replay_beta0"],
                final_p=1.0)

        # Create the schedule for exploration starting from 1.
        exploration_steps = int(
            config["exploration_fraction"] * config["schedule_max_timesteps"])
        self.exploration = LinearSchedule(
            schedule_timesteps=exploration_steps,
            initial_p=1.0,
            final_p=config["exploration_final_eps"])

        # Initialize the parameters and copy them to the target network.
        U.initialize()
        self.update_target()

        # Per-episode bookkeeping and the first observation.
        self.episode_rewards = [0.0]
        self.episode_lengths = [0.0]
        self.saved_mean_reward = None
        self.obs = self.env.reset()
        self.num_timesteps = 0
        self.num_iterations = 0
示例#6
0
 def __init__(self, env_name, config):
     """Set up the driver: local env, local policy and remote runners.

     Args:
         env_name: Name passed to ``create_env`` and to every runner.
         config: Mapping of settings; ``"num_workers"`` is read here.
     """
     Algorithm.__init__(self, env_name, config)
     self.env = create_env(env_name)
     obs_shape = self.env.observation_space.shape
     action_count = self.env.action_space.n
     self.policy = LSTMPolicy(obs_shape, action_count, 0)

     # One remote runner per worker, each given its index.
     runners = []
     for index in range(config["num_workers"]):
         runners.append(Runner.remote(env_name, index))
     self.agents = runners

     self.parameters = self.policy.get_weights()
     self.iteration = 0
示例#7
0
文件: a3c.py 项目: xgong/ray
 def __init__(self, env_name, config, upload_dir=None):
     """Set up the A3C driver: local env, local policy and remote runners.

     Args:
         env_name: Name passed to ``create_env`` and to every runner.
         config: Mapping of settings; ``"num_workers"`` is read here. It
             is updated in place with ``"alg": "A3C"``.
         upload_dir: Forwarded unchanged to ``Algorithm.__init__``.
     """
     config.update({"alg": "A3C"})
     Algorithm.__init__(self, env_name, config, upload_dir=upload_dir)
     self.env = create_env(env_name)
     obs_shape = self.env.observation_space.shape
     action_count = self.env.action_space.n
     self.policy = LSTMPolicy(obs_shape, action_count, 0)

     # One remote runner per worker, each given its index and the logdir.
     runners = []
     for index in range(config["num_workers"]):
         runners.append(Runner.remote(env_name, index, self.logdir))
     self.agents = runners

     self.parameters = self.policy.get_weights()
     self.iteration = 0
示例#8
0
    def __init__(self, env_name, config, upload_dir=None):
        """Set up the DQN trainer on an explicit TensorFlow session.

        Tags ``config`` with the algorithm name, runs the base
        ``Algorithm`` initializer, builds the environment, session and
        ``DQNGraph``, creates the replay buffer and exploration schedule,
        initializes variables and resets the environment.

        Args:
            env_name: Name passed to ``gym.make``; names containing
                ``"NoFrameskip"`` get the ``wrap_dqn`` and
                ``ScaledFloatFrame`` wrappers.
            config: Mapping of settings; it is updated in place with
                ``"alg": "DQN"``. Keys read here include ``"num_cpu"``,
                ``"buffer_size"``, ``"prioritized_replay"`` and the
                exploration and schedule settings.
            upload_dir: Forwarded unchanged to ``Algorithm.__init__``.
        """
        config.update({"alg": "DQN"})
        Algorithm.__init__(self, env_name, config, upload_dir=upload_dir)

        gym_env = gym.make(env_name)
        # TODO(ekl): replace this with RLlib preprocessors
        needs_wrapping = "NoFrameskip" in env_name
        if needs_wrapping:
            gym_env = ScaledFloatFrame(wrap_dqn(gym_env))
        self.env = gym_env

        # The same thread count is used for both parallelism settings.
        thread_count = config["num_cpu"]
        session_config = tf.ConfigProto(
            inter_op_parallelism_threads=thread_count,
            intra_op_parallelism_threads=thread_count)
        self.sess = tf.Session(config=session_config)
        self.dqn_graph = models.DQNGraph(gym_env, config)

        # Create the replay buffer
        if not config["prioritized_replay"]:
            self.replay_buffer = ReplayBuffer(config["buffer_size"])
            self.beta_schedule = None
        else:
            self.replay_buffer = PrioritizedReplayBuffer(
                config["buffer_size"],
                alpha=config["prioritized_replay_alpha"])
            beta_iters = config["prioritized_replay_beta_iters"]
            if beta_iters is None:
                # Fall back to the overall schedule length.
                beta_iters = config["schedule_max_timesteps"]
            self.beta_schedule = LinearSchedule(
                beta_iters,
                initial_p=config["prioritized_replay_beta0"],
                final_p=1.0)

        # Create the schedule for exploration starting from 1.
        exploration_steps = int(
            config["exploration_fraction"] * config["schedule_max_timesteps"])
        self.exploration = LinearSchedule(
            schedule_timesteps=exploration_steps,
            initial_p=1.0,
            final_p=config["exploration_final_eps"])

        # Initialize the parameters and copy them to the target network.
        self.sess.run(tf.global_variables_initializer())
        self.dqn_graph.update_target(self.sess)

        # Per-episode bookkeeping, the first observation and the writer.
        self.episode_rewards = [0.0]
        self.episode_lengths = [0.0]
        self.saved_mean_reward = None
        self.obs = self.env.reset()
        self.num_timesteps = 0
        self.num_iterations = 0
        self.file_writer = tf.summary.FileWriter(self.logdir, self.sess.graph)
示例#9
0
    def __init__(self, env_name, config, upload_dir=None):
        """Set up the policy-gradient driver.

        Tags ``config`` with the algorithm name, runs the base
        ``Algorithm`` initializer, then builds one local ``Agent`` and
        ``config["num_agents"]`` remote agents.

        Args:
            env_name: Environment name forwarded to ``Algorithm.__init__``.
            config: Mapping of settings; ``"kl_coeff"`` and
                ``"num_agents"`` are read here. It is updated in place
                with ``"alg": "PolicyGradient"``.
            upload_dir: Forwarded unchanged to ``Algorithm.__init__``.
        """
        config.update({"alg": "PolicyGradient"})
        Algorithm.__init__(self, env_name, config, upload_dir=upload_dir)

        self.global_step = 0
        self.j = 0
        self.kl_coeff = config["kl_coeff"]

        # The local model and the remote agents differ only in the last flag.
        self.model = Agent(self.env_name, 1, self.config, self.logdir, False)
        remote_agents = []
        for _ in range(config["num_agents"]):
            remote_agents.append(
                RemoteAgent.remote(
                    self.env_name, 1, self.config, self.logdir, True))
        self.agents = remote_agents
        self.start_time = time.time()
示例#10
0
    def __init__(self, env_name, config, upload_dir=None):
        """Set up the policy-gradient driver with optional TF logging.

        Tags ``config`` with the algorithm name, runs the base
        ``Algorithm`` initializer, builds one local ``Agent`` and
        ``config["num_agents"]`` remote agents, then creates the summary
        writer (when enabled) and a ``tf.train.Saver``.

        Args:
            env_name: Environment name forwarded to ``Algorithm.__init__``.
            config: Mapping of settings; ``"kl_coeff"``, ``"num_agents"``
                and ``"write_logs"`` are read here. It is updated in
                place with ``"alg": "PolicyGradient"``.
            upload_dir: Forwarded unchanged to ``Algorithm.__init__``.
        """
        config.update({"alg": "PolicyGradient"})
        Algorithm.__init__(self, env_name, config, upload_dir=upload_dir)

        self.global_step = 0
        self.j = 0
        self.kl_coeff = config["kl_coeff"]

        # The local model and the remote agents differ only in the last flag.
        self.model = Agent(self.env_name, 1, self.config, self.logdir, False)
        remote_agents = []
        for _ in range(config["num_agents"]):
            remote_agents.append(
                RemoteAgent.remote(
                    self.env_name, 1, self.config, self.logdir, True))
        self.agents = remote_agents
        self.start_time = time.time()

        # TF cannot write logs to S3 at the moment, so a writer is only
        # created when logging is enabled and the logdir starts with "file".
        log_locally = config["write_logs"] and self.logdir.startswith("file")
        self.file_writer = (
            tf.summary.FileWriter(self.logdir, self.model.sess.graph)
            if log_locally else None)
        self.saver = tf.train.Saver(max_to_keep=None)