def __init__(self, env_name, config, upload_dir=None): config.update({"alg": "EvolutionStrategies"}) Algorithm.__init__(self, env_name, config, upload_dir=upload_dir) policy_params = { "ac_noise_std": 0.01 } env = gym.make(env_name) utils.make_session(single_threaded=False) self.policy = policies.GenericPolicy( env.observation_space, env.action_space, **policy_params) tf_util.initialize() self.optimizer = optimizers.Adam(self.policy, config["stepsize"]) self.ob_stat = utils.RunningStat(env.observation_space.shape, eps=1e-2) # Create the shared noise table. print("Creating shared noise table.") noise_id = create_shared_noise.remote() self.noise = SharedNoiseTable(ray.get(noise_id)) # Create the actors. print("Creating actors.") self.workers = [ Worker.remote(config, policy_params, env_name, noise_id) for _ in range(config["num_workers"])] self.episodes_so_far = 0 self.timesteps_so_far = 0 self.tstart = time.time() self.iteration = 0
def __init__(self, env_name, config, upload_dir=None): config.update({"alg": "PolicyGradient"}) Algorithm.__init__(self, env_name, config, upload_dir=upload_dir) # TODO(ekl): preprocessor should be associated with the env elsewhere if self.env_name == "Pong-v0": preprocessor = AtariPixelPreprocessor() elif self.env_name == "Pong-ram-v3": preprocessor = AtariRamPreprocessor() elif self.env_name == "CartPole-v0" or self.env_name == "CartPole-v1": preprocessor = NoPreprocessor() elif self.env_name == "Hopper-v1": preprocessor = NoPreprocessor() elif self.env_name == "Walker2d-v1": preprocessor = NoPreprocessor() elif self.env_name == "Humanoid-v1": preprocessor = NoPreprocessor() else: preprocessor = AtariPixelPreprocessor() self.preprocessor = preprocessor self.global_step = 0 self.j = 0 self.kl_coeff = config["kl_coeff"] self.model = Agent(self.env_name, 1, self.preprocessor, self.config, self.logdir, False) self.agents = [ RemoteAgent.remote(self.env_name, 1, self.preprocessor, self.config, self.logdir, True) for _ in range(config["num_agents"]) ] self.start_time = time.time()
def __init__(self, env_name, config):
    Algorithm.__init__(self, env_name, config)

    # TODO(ekl) the preprocessor should be associated with the env elsewhere
    if self.env_name == "Pong-v0":
        preprocessor = AtariPixelPreprocessor()
    elif self.env_name == "Pong-ram-v3":
        preprocessor = AtariRamPreprocessor()
    elif self.env_name == "CartPole-v0":
        preprocessor = NoPreprocessor()
    elif self.env_name == "Walker2d-v1":
        preprocessor = NoPreprocessor()
    else:
        preprocessor = AtariPixelPreprocessor()

    self.preprocessor = preprocessor
    self.global_step = 0
    self.j = 0
    self.kl_coeff = config["kl_coeff"]
    self.model = Agent(
        self.env_name, 1, self.preprocessor, self.config, False)
    self.agents = [
        RemoteAgent.remote(
            self.env_name, 1, self.preprocessor, self.config, True)
        for _ in range(config["num_agents"])]
def __init__(self, env_name, config):
    Algorithm.__init__(self, env_name, config)

    policy_params = {
        "ac_bins": "continuous:",
        "ac_noise_std": 0.01,
        "nonlin_type": "tanh",
        "hidden_dims": [256, 256],
        "connection_type": "ff"
    }

    # Create the shared noise table.
    print("Creating shared noise table.")
    noise_id = create_shared_noise.remote()
    self.noise = SharedNoiseTable(ray.get(noise_id))

    # Create the actors.
    print("Creating actors.")
    self.workers = [
        Worker.remote(config, policy_params, env_name, noise_id)
        for _ in range(config.num_workers)]

    env = gym.make(env_name)
    utils.make_session(single_threaded=False)
    self.policy = policies.MujocoPolicy(
        env.observation_space, env.action_space, **policy_params)
    tf_util.initialize()
    self.optimizer = optimizers.Adam(self.policy, config.stepsize)
    self.ob_stat = utils.RunningStat(env.observation_space.shape, eps=1e-2)

    self.episodes_so_far = 0
    self.timesteps_so_far = 0
    self.tstart = time.time()
    self.iteration = 0
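# Hedged usage sketch (not from the source): unlike the dict-based variant
# above, this constructor accesses config.num_workers and config.stepsize as
# attributes, so it expects a config object rather than a dict. The namedtuple
# and values below are illustrative assumptions only.
from collections import namedtuple

ESConfig = namedtuple("ESConfig", ["num_workers", "stepsize"])
es_config = ESConfig(num_workers=4, stepsize=0.01)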
def __init__(self, env_name, config, upload_dir=None): config.update({"alg": "DQN"}) Algorithm.__init__(self, env_name, config, upload_dir=upload_dir) env = gym.make(env_name) env = ScaledFloatFrame(wrap_dqn(env)) self.env = env model = models.cnn_to_mlp(convs=[(32, 8, 4), (64, 4, 2), (64, 3, 1)], hiddens=[256], dueling=True) sess = U.make_session(num_cpu=config["num_cpu"]) sess.__enter__() def make_obs_ph(name): return U.BatchInput(env.observation_space.shape, name=name) self.act, self.optimize, self.update_target, self.debug = build_train( make_obs_ph=make_obs_ph, q_func=model, num_actions=env.action_space.n, optimizer=tf.train.AdamOptimizer(learning_rate=config["lr"]), gamma=config["gamma"], grad_norm_clipping=10) # Create the replay buffer if config["prioritized_replay"]: self.replay_buffer = PrioritizedReplayBuffer( config["buffer_size"], alpha=config["prioritized_replay_alpha"]) prioritized_replay_beta_iters = ( config["prioritized_replay_beta_iters"]) if prioritized_replay_beta_iters is None: prioritized_replay_beta_iters = ( config["schedule_max_timesteps"]) self.beta_schedule = LinearSchedule( prioritized_replay_beta_iters, initial_p=config["prioritized_replay_beta0"], final_p=1.0) else: self.replay_buffer = ReplayBuffer(config["buffer_size"]) self.beta_schedule = None # Create the schedule for exploration starting from 1. self.exploration = LinearSchedule( schedule_timesteps=int(config["exploration_fraction"] * config["schedule_max_timesteps"]), initial_p=1.0, final_p=config["exploration_final_eps"]) # Initialize the parameters and copy them to the target network. U.initialize() self.update_target() self.episode_rewards = [0.0] self.episode_lengths = [0.0] self.saved_mean_reward = None self.obs = self.env.reset() self.num_timesteps = 0 self.num_iterations = 0
def __init__(self, env_name, config):
    Algorithm.__init__(self, env_name, config)

    self.env = create_env(env_name)
    self.policy = LSTMPolicy(
        self.env.observation_space.shape, self.env.action_space.n, 0)
    self.agents = [
        Runner.remote(env_name, i)
        for i in range(config["num_workers"])]
    self.parameters = self.policy.get_weights()
    self.iteration = 0
def __init__(self, env_name, config, upload_dir=None): config.update({"alg": "A3C"}) Algorithm.__init__(self, env_name, config, upload_dir=upload_dir) self.env = create_env(env_name) self.policy = LSTMPolicy(self.env.observation_space.shape, self.env.action_space.n, 0) self.agents = [ Runner.remote(env_name, i, self.logdir) for i in range(config["num_workers"]) ] self.parameters = self.policy.get_weights() self.iteration = 0
def __init__(self, env_name, config, upload_dir=None): config.update({"alg": "DQN"}) Algorithm.__init__(self, env_name, config, upload_dir=upload_dir) env = gym.make(env_name) # TODO(ekl): replace this with RLlib preprocessors if "NoFrameskip" in env_name: env = ScaledFloatFrame(wrap_dqn(env)) self.env = env num_cpu = config["num_cpu"] tf_config = tf.ConfigProto(inter_op_parallelism_threads=num_cpu, intra_op_parallelism_threads=num_cpu) self.sess = tf.Session(config=tf_config) self.dqn_graph = models.DQNGraph(env, config) # Create the replay buffer if config["prioritized_replay"]: self.replay_buffer = PrioritizedReplayBuffer( config["buffer_size"], alpha=config["prioritized_replay_alpha"]) prioritized_replay_beta_iters = ( config["prioritized_replay_beta_iters"]) if prioritized_replay_beta_iters is None: prioritized_replay_beta_iters = ( config["schedule_max_timesteps"]) self.beta_schedule = LinearSchedule( prioritized_replay_beta_iters, initial_p=config["prioritized_replay_beta0"], final_p=1.0) else: self.replay_buffer = ReplayBuffer(config["buffer_size"]) self.beta_schedule = None # Create the schedule for exploration starting from 1. self.exploration = LinearSchedule( schedule_timesteps=int(config["exploration_fraction"] * config["schedule_max_timesteps"]), initial_p=1.0, final_p=config["exploration_final_eps"]) # Initialize the parameters and copy them to the target network. self.sess.run(tf.global_variables_initializer()) self.dqn_graph.update_target(self.sess) self.episode_rewards = [0.0] self.episode_lengths = [0.0] self.saved_mean_reward = None self.obs = self.env.reset() self.num_timesteps = 0 self.num_iterations = 0 self.file_writer = tf.summary.FileWriter(self.logdir, self.sess.graph)
def __init__(self, env_name, config, upload_dir=None): config.update({"alg": "PolicyGradient"}) Algorithm.__init__(self, env_name, config, upload_dir=upload_dir) self.global_step = 0 self.j = 0 self.kl_coeff = config["kl_coeff"] self.model = Agent(self.env_name, 1, self.config, self.logdir, False) self.agents = [ RemoteAgent.remote( self.env_name, 1, self.config, self.logdir, True) for _ in range(config["num_agents"])] self.start_time = time.time()
def __init__(self, env_name, config, upload_dir=None): config.update({"alg": "PolicyGradient"}) Algorithm.__init__(self, env_name, config, upload_dir=upload_dir) self.global_step = 0 self.j = 0 self.kl_coeff = config["kl_coeff"] self.model = Agent(self.env_name, 1, self.config, self.logdir, False) self.agents = [ RemoteAgent.remote( self.env_name, 1, self.config, self.logdir, True) for _ in range(config["num_agents"])] self.start_time = time.time() # TF does not support to write logs to S3 at the moment write_tf_logs = config["write_logs"] and self.logdir.startswith("file") if write_tf_logs: self.file_writer = tf.summary.FileWriter( self.logdir, self.model.sess.graph) else: self.file_writer = None self.saver = tf.train.Saver(max_to_keep=None)