def __init__(self, env_creator, config, logdir):
    DQNEvaluator.__init__(self, env_creator, config, logdir)

    # Create extra workers if needed
    if self.config["num_workers"] > 1:
        remote_cls = ray.remote(num_cpus=1)(DQNEvaluator)
        self.workers = [
            remote_cls.remote(env_creator, config, logdir)
            for _ in range(self.config["num_workers"])]
    else:
        self.workers = []

    # Create the replay buffer
    if config["prioritized_replay"]:
        self.replay_buffer = PrioritizedReplayBuffer(
            config["buffer_size"],
            alpha=config["prioritized_replay_alpha"])
        prioritized_replay_beta_iters = \
            config["prioritized_replay_beta_iters"]
        if prioritized_replay_beta_iters is None:
            prioritized_replay_beta_iters = \
                config["schedule_max_timesteps"]
        self.beta_schedule = LinearSchedule(
            prioritized_replay_beta_iters,
            initial_p=config["prioritized_replay_beta0"],
            final_p=1.0)
    else:
        self.replay_buffer = ReplayBuffer(config["buffer_size"])
        self.beta_schedule = None

    self.samples_to_prioritize = None
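For context, a minimal sketch of how the replay buffer and beta schedule created above are typically consumed during a training step, following the baselines `PrioritizedReplayBuffer` API. The `batch_size` and `compute_td_error` arguments are assumptions of this sketch, not part of the snippet.

import numpy as np

def replay_step(replay_buffer, beta_schedule, t, batch_size, compute_td_error):
    """Sample a batch and, for prioritized replay, refresh priorities."""
    if beta_schedule is not None:
        # Prioritized sampling also returns importance weights and the
        # indexes needed to update priorities afterwards.
        (obses_t, actions, rewards, obses_tp1, dones, weights,
         batch_idxes) = replay_buffer.sample(
             batch_size, beta=beta_schedule.value(t))
        td_errors = compute_td_error(
            obses_t, actions, rewards, obses_tp1, dones, weights)
        # Small epsilon keeps every transition sampleable.
        replay_buffer.update_priorities(
            batch_idxes, np.abs(td_errors) + 1e-6)
    else:
        obses_t, actions, rewards, obses_tp1, dones = \
            replay_buffer.sample(batch_size)
    return obses_t, actions, rewards, obses_tp1, dones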
def __init__(self, env_name, config, upload_dir=None):
    config.update({"alg": "DQN"})

    Algorithm.__init__(self, env_name, config, upload_dir=upload_dir)

    env = gym.make(env_name)
    env = ScaledFloatFrame(wrap_dqn(env))
    self.env = env
    model = models.cnn_to_mlp(
        convs=[(32, 8, 4), (64, 4, 2), (64, 3, 1)],
        hiddens=[256],
        dueling=True)
    sess = U.make_session(num_cpu=config["num_cpu"])
    sess.__enter__()

    def make_obs_ph(name):
        return U.BatchInput(env.observation_space.shape, name=name)

    self.act, self.optimize, self.update_target, self.debug = build_train(
        make_obs_ph=make_obs_ph,
        q_func=model,
        num_actions=env.action_space.n,
        optimizer=tf.train.AdamOptimizer(learning_rate=config["lr"]),
        gamma=config["gamma"],
        grad_norm_clipping=10)

    # Create the replay buffer
    if config["prioritized_replay"]:
        self.replay_buffer = PrioritizedReplayBuffer(
            config["buffer_size"],
            alpha=config["prioritized_replay_alpha"])
        prioritized_replay_beta_iters = (
            config["prioritized_replay_beta_iters"])
        if prioritized_replay_beta_iters is None:
            prioritized_replay_beta_iters = (
                config["schedule_max_timesteps"])
        self.beta_schedule = LinearSchedule(
            prioritized_replay_beta_iters,
            initial_p=config["prioritized_replay_beta0"],
            final_p=1.0)
    else:
        self.replay_buffer = ReplayBuffer(config["buffer_size"])
        self.beta_schedule = None

    # Create the schedule for exploration starting from 1.
    self.exploration = LinearSchedule(
        schedule_timesteps=int(config["exploration_fraction"] *
                               config["schedule_max_timesteps"]),
        initial_p=1.0,
        final_p=config["exploration_final_eps"])

    # Initialize the parameters and copy them to the target network.
    U.initialize()
    self.update_target()

    self.episode_rewards = [0.0]
    self.episode_lengths = [0.0]
    self.saved_mean_reward = None
    self.obs = self.env.reset()
    self.num_timesteps = 0
    self.num_iterations = 0
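A minimal sketch of how the `act` function and exploration schedule built above are typically used per environment step in the baselines-style loop; the surrounding `step_env` helper and the use of `episode_rewards` for bookkeeping are assumptions of this sketch.

import numpy as np

def step_env(agent, t):
    # Anneal epsilon with the LinearSchedule and act epsilon-greedily;
    # `update_eps` follows the baselines build_train convention.
    action = agent.act(
        np.array(agent.obs)[None],
        update_eps=agent.exploration.value(t))[0]
    new_obs, reward, done, _ = agent.env.step(action)
    agent.replay_buffer.add(agent.obs, action, reward, new_obs, float(done))
    agent.obs = new_obs
    agent.episode_rewards[-1] += reward
    if done:
        agent.obs = agent.env.reset()
        agent.episode_rewards.append(0.0)
    return done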
def _init(self):
    config = self.config
    env = gym.make(self.env_name)
    # TODO(ekl): replace this with RLlib preprocessors
    if "NoFrameskip" in self.env_name:
        env = ScaledFloatFrame(wrap_dqn(env))
    self.env = env

    num_cpu = config["num_cpu"]
    tf_config = tf.ConfigProto(
        inter_op_parallelism_threads=num_cpu,
        intra_op_parallelism_threads=num_cpu)
    self.sess = tf.Session(config=tf_config)
    self.dqn_graph = models.DQNGraph(env, config)

    # Create the replay buffer
    if config["prioritized_replay"]:
        self.replay_buffer = PrioritizedReplayBuffer(
            config["buffer_size"],
            alpha=config["prioritized_replay_alpha"])
        prioritized_replay_beta_iters = (
            config["prioritized_replay_beta_iters"])
        if prioritized_replay_beta_iters is None:
            prioritized_replay_beta_iters = (
                config["schedule_max_timesteps"])
        self.beta_schedule = LinearSchedule(
            prioritized_replay_beta_iters,
            initial_p=config["prioritized_replay_beta0"],
            final_p=1.0)
    else:
        self.replay_buffer = ReplayBuffer(config["buffer_size"])
        self.beta_schedule = None

    # Create the schedule for exploration starting from 1.
    self.exploration = LinearSchedule(
        schedule_timesteps=int(
            config["exploration_fraction"] *
            config["schedule_max_timesteps"]),
        initial_p=1.0,
        final_p=config["exploration_final_eps"])

    # Initialize the parameters and copy them to the target network.
    self.sess.run(tf.global_variables_initializer())
    self.dqn_graph.update_target(self.sess)

    self.episode_rewards = [0.0]
    self.episode_lengths = [0.0]
    self.saved_mean_reward = None
    self.obs = self.env.reset()
    self.num_timesteps = 0
    self.num_iterations = 0
    self.file_writer = tf.summary.FileWriter(self.logdir, self.sess.graph)
    self.saver = tf.train.Saver(max_to_keep=None)
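A minimal sketch of how the `file_writer` and `saver` created above might be driven after each training iteration; the `mean_100ep_reward` tag and the "checkpoint" filename are assumptions of this sketch.

import os
import tensorflow as tf

def log_iteration(agent, mean_100ep_reward):
    # Write a scalar summary for TensorBoard and checkpoint the session.
    summary = tf.Summary(value=[tf.Summary.Value(
        tag="mean_100ep_reward", simple_value=mean_100ep_reward)])
    agent.file_writer.add_summary(summary, agent.num_timesteps)
    agent.file_writer.flush()
    agent.saver.save(
        agent.sess, os.path.join(agent.logdir, "checkpoint"),
        global_step=agent.num_iterations)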
def __init__(self, env_creator, config, logdir): env = env_creator() env = wrap_dqn(env, config["model"]) self.env = env self.config = config tf_config = tf.ConfigProto(**config["tf_session_args"]) self.sess = tf.Session(config=tf_config) self.dqn_graph = models.DQNGraph(env, config, logdir) # Create the replay buffer if config["prioritized_replay"]: self.replay_buffer = PrioritizedReplayBuffer( config["buffer_size"], alpha=config["prioritized_replay_alpha"]) prioritized_replay_beta_iters = \ config["prioritized_replay_beta_iters"] if prioritized_replay_beta_iters is None: prioritized_replay_beta_iters = \ config["schedule_max_timesteps"] self.beta_schedule = LinearSchedule( prioritized_replay_beta_iters, initial_p=config["prioritized_replay_beta0"], final_p=1.0) else: self.replay_buffer = ReplayBuffer(config["buffer_size"]) self.beta_schedule = None # Create the schedule for exploration starting from 1. self.exploration = LinearSchedule( schedule_timesteps=int(config["exploration_fraction"] * config["schedule_max_timesteps"]), initial_p=1.0, final_p=config["exploration_final_eps"]) # Initialize the parameters and copy them to the target network. self.sess.run(tf.global_variables_initializer()) self.dqn_graph.update_target(self.sess) self.set_weights_time = RunningStat(()) self.sample_time = RunningStat(()) self.grad_time = RunningStat(()) # Note that workers don't need target vars to be synced self.variables = ray.experimental.TensorFlowVariables( tf.group(self.dqn_graph.q_t, self.dqn_graph.q_tp1), self.sess) self.episode_rewards = [0.0] self.episode_lengths = [0.0] self.saved_mean_reward = None self.obs = self.env.reset() self.file_writer = tf.summary.FileWriter(logdir, self.sess.graph)
def __init__(self, env_creator, config, logdir):
    env = env_creator()
    # TODO(ekl): replace this with RLlib preprocessors
    if "NoFrameskip" in env.spec.id:
        env = ScaledFloatFrame(wrap_dqn(env))
    self.env = env
    self.config = config

    num_cpu = config["num_cpu"]
    tf_config = tf.ConfigProto(
        inter_op_parallelism_threads=num_cpu,
        intra_op_parallelism_threads=num_cpu)
    self.sess = tf.Session(config=tf_config)
    self.dqn_graph = models.DQNGraph(env, config)

    # Create the replay buffer
    if config["prioritized_replay"]:
        self.replay_buffer = PrioritizedReplayBuffer(
            config["buffer_size"],
            alpha=config["prioritized_replay_alpha"])
        prioritized_replay_beta_iters = \
            config["prioritized_replay_beta_iters"]
        if prioritized_replay_beta_iters is None:
            prioritized_replay_beta_iters = \
                config["schedule_max_timesteps"]
        self.beta_schedule = LinearSchedule(
            prioritized_replay_beta_iters,
            initial_p=config["prioritized_replay_beta0"],
            final_p=1.0)
    else:
        self.replay_buffer = ReplayBuffer(config["buffer_size"])
        self.beta_schedule = None

    # Create the schedule for exploration starting from 1.
    self.exploration = LinearSchedule(
        schedule_timesteps=int(config["exploration_fraction"] *
                               config["schedule_max_timesteps"]),
        initial_p=1.0,
        final_p=config["exploration_final_eps"])

    # Initialize the parameters and copy them to the target network.
    self.sess.run(tf.global_variables_initializer())
    self.dqn_graph.update_target(self.sess)
    self.variables = ray.experimental.TensorFlowVariables(
        tf.group(self.dqn_graph.q_tp1, self.dqn_graph.q_t), self.sess)

    self.episode_rewards = [0.0]
    self.episode_lengths = [0.0]
    self.saved_mean_reward = None
    self.obs = self.env.reset()
    self.file_writer = tf.summary.FileWriter(logdir, self.sess.graph)
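Finally, a sketch of the complementary driver-side pattern: pulling weights from the local evaluator via `variables.get_weights()` and broadcasting them to the remote workers created in the first snippet. The `set_weights` remote method is an assumed wrapper around `variables.set_weights`, not shown above.

import ray

def sync_remote_workers(local_evaluator, remote_workers):
    # Put the local weights in the object store once so every worker
    # fetches the same copy instead of re-serializing per call.
    weights = ray.put(local_evaluator.variables.get_weights())
    for worker in remote_workers:
        worker.set_weights.remote(weights)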