def __init__(self, name, model, lstm_model, obs_shape_n, act_space_n,
             agent_index, args, local_q_func=False):
    self.args = args
    self.name = name
    self.n = len(obs_shape_n)
    self.agent_index = agent_index
    obs_ph_n = []
    for i in range(self.n):
        obs_ph_n.append(
            U.BatchInput(obs_shape_n[i], name="observation" + str(i)).get())
    self.local_q_func = local_q_func
    self.act, self.p_debug = p_act(scope=self.name,
                                   make_obs_ph_n=obs_ph_n,
                                   act_space_n=act_space_n,
                                   p_index=self.agent_index,
                                   p_func=model,
                                   lstm_model=lstm_model,
                                   num_units=self.args.num_units,
                                   use_lstm=False,
                                   reuse=False)
    # Create experience buffer
    self.replay_buffer = ReplayBuffer(args.buffer_size)
    self.max_replay_buffer_len = args.batch_size * args.max_episode_len
    self.replay_sample_index = None
class MADDPGAgentTrainer(AgentTrainer):
    def __init__(self, name, model, lstm_model, obs_shape_n, act_space_n,
                 agent_index, args, local_q_func=False, reuse=False,
                 session=None):
        self.args = args
        self.name = name
        self.n = len(obs_shape_n)
        self.agent_index = agent_index
        obs_ph_n = []
        for i in range(self.n):
            obs_shape = [args.history_length] + list(obs_shape_n[i])
            obs_ph_n.append(
                U.BatchInput(obs_shape, name="observation" + str(i)).get())
        self.local_q_func = local_q_func
        self.act, self.p_debug = p_act(scope=self.name,
                                       make_obs_ph_n=obs_ph_n,
                                       act_space_n=act_space_n,
                                       p_index=self.agent_index,
                                       p_func=model,
                                       lstm_model=lstm_model,
                                       num_units=self.args.num_units,
                                       use_lstm=self.args.use_lstm,
                                       reuse=reuse,
                                       session=session)
        # Create experience buffer
        self.replay_buffer = ReplayBuffer(args.buffer_size, args.history_length)
        self.max_replay_buffer_len = args.batch_size * args.max_episode_len
        self.replay_sample_index = None

    def action(self, obs):
        # obs is a queue holding the last `history_length` observations;
        # stack it and add a batch dimension before querying the policy.
        obs = np.array(obs.queue)
        return self.act(obs[None])[0]

    def experience(self, obs, act, rew, new_obs, done, terminal):
        # Store transition in the replay buffer.
        self.replay_buffer.add(obs, act, rew, new_obs, float(done))
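# Illustrative sketch (not part of the original trainer): how the frame-history
# queue consumed by action() above can be maintained. action() assumes `obs` is
# a queue.Queue whose `.queue` deque holds the last `history_length` raw
# observations; stacking it and indexing with [None] adds the batch dimension
# expected by the policy built in p_act. The sizes used here are made up.
def _demo_observation_history(history_length=4, obs_dim=8):
    import queue
    import numpy as np

    history = queue.Queue(maxsize=history_length)
    for _ in range(history_length):        # pre-fill with zero frames
        history.put(np.zeros(obs_dim))

    new_frame = np.random.rand(obs_dim)    # newest environment observation
    history.get()                          # drop the oldest frame
    history.put(new_frame)                 # append the newest frame

    stacked = np.array(history.queue)      # shape: [history_length, obs_dim]
    batched = stacked[None]                # shape: [1, history_length, obs_dim]
    return batched.shape                   # (1, 4, 8)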
class MADDPGAgentTrainer(AgentTrainer):
    def __init__(self, env_name, name, model, obs_shape_n, act_space_n,
                 agent_index, args, local_q_func=False):
        self.env_name = env_name
        self.name = name
        self.n = len(obs_shape_n)
        self.agent_index = agent_index
        self.args = args
        obs_ph_n = []
        for i in range(self.n):
            obs_ph_n.append(
                U.BatchInput(obs_shape_n[i], name="observation" + str(i)).get())

        # Create all the functions necessary to train the model
        self.q_train, self.q_update, self.q_debug = q_train(
            scope=self.env_name + self.name,
            make_obs_ph_n=obs_ph_n,
            act_space_n=act_space_n,
            q_index=agent_index,
            q_func=model,
            optimizer=tf.train.AdamOptimizer(learning_rate=args.lr),
            grad_norm_clipping=0.5,
            local_q_func=local_q_func,
            num_units=args.num_units)
        self.act, self.p_train, self.p_update, self.p_debug = p_train(
            scope=self.env_name + self.name,
            make_obs_ph_n=obs_ph_n,
            act_space_n=act_space_n,
            p_index=agent_index,
            p_scope="common_" + self.name,
            p_func=model,
            q_func=model,
            optimizer=tf.train.AdamOptimizer(learning_rate=args.lr),
            grad_norm_clipping=0.5,
            local_q_func=local_q_func,
            num_units=args.num_units)
        # Create experience buffer
        self.replay_buffer = ReplayBuffer(args.buffer_size)
        self.max_replay_buffer_len = args.batch_size * args.max_episode_len
        self.replay_sample_index = None

    def action(self, obs):
        return self.act(obs[None])[0]

    def experience(self, obs, act, rew, new_obs, done, terminal):
        # Store transition in the replay buffer.
        self.replay_buffer.add(obs, act, rew, new_obs, float(done))

    def preupdate(self):
        self.replay_sample_index = None

    def update(self, agents, t):
        if len(self.replay_buffer) < self.max_replay_buffer_len:
            # replay buffer is not large enough
            return
        if not t % 100 == 0:
            # only update every 100 steps
            return

        self.replay_sample_index = self.replay_buffer.make_index(
            self.args.batch_size)
        # collect replay sample from all agents
        obs_n = []
        obs_next_n = []
        act_n = []
        index = self.replay_sample_index
        for i in range(self.n):  # buffer
            obs, act, rew, obs_next, done = agents[i].replay_buffer.sample_index(index)
            obs_n.append(obs)
            obs_next_n.append(obs_next)
            act_n.append(act)
        obs, act, rew, obs_next, done = self.replay_buffer.sample_index(index)

        # train q network
        num_sample = 1
        target_q = 0.0
        for j in range(num_sample):
            target_act_next_n = [
                agents[i].p_debug['target_act'](obs_next_n[i])
                for i in range(self.n)
            ]
            target_q_next = self.q_debug['target_q_values'](
                *(obs_next_n + target_act_next_n))
            target_q += rew + self.args.gamma * (1.0 - done) * target_q_next
        target_q /= num_sample
        q_loss = self.q_train(*(obs_n + act_n + [target_q]))

        # train p network
        p_loss = self.p_train(*(obs_n + act_n))

        self.p_update()
        self.q_update()

        return [
            q_loss, p_loss,
            np.mean(target_q),
            np.mean(rew),
            np.mean(target_q_next),
            np.std(target_q)
        ]
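# Minimal NumPy sketch of the critic target computed in update() above:
# y = r + gamma * (1 - done) * Q'(o', a'), averaged over `num_sample` draws.
# The random arrays stand in for the target-network outputs; only the
# arithmetic mirrors the trainer, nothing here touches TensorFlow.
def _demo_td_target(batch_size=1024, gamma=0.95, num_sample=1):
    import numpy as np

    rew = np.random.rand(batch_size)
    done = np.random.randint(0, 2, size=batch_size).astype(np.float64)

    target_q = 0.0
    for _ in range(num_sample):
        # stand-in for self.q_debug['target_q_values'](obs_next_n + target_act_next_n)
        target_q_next = np.random.rand(batch_size)
        target_q += rew + gamma * (1.0 - done) * target_q_next
    target_q /= num_sample
    return target_q                        # regression target fed to q_train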
class MADDPGAgentTrainer(AgentTrainer):
    def __init__(self, name, model, lstm_model, obs_shape_n, act_space_n,
                 agent_index, actors, args, local_q_func=False, session=None,
                 lstm_scope=None):
        self.actors = actors
        self.name = name
        self.n = len(obs_shape_n)
        self.agent_index = agent_index
        self.args = args
        self.history_length = args.history_length
        obs_ph_n = []
        for i in range(self.n):
            obs_shape = [args.history_length] + list(obs_shape_n[i])
            obs_ph_n.append(
                U.BatchInput(obs_shape, name="observation" + str(i)).get())

        # Create all the functions necessary to train the model
        self.q_train, self.q_update, self.q_debug = q_train(
            scope=self.name,
            make_obs_ph_n=obs_ph_n,
            act_space_n=act_space_n,
            q_index=agent_index,
            q_func=model,
            lstm_model=lstm_model,
            lstm_scope=lstm_scope,
            optimizer=tf.train.AdamOptimizer(learning_rate=args.lr),
            args=self.args,
            grad_norm_clipping=0.5,
            local_q_func=local_q_func,
            num_units=args.num_units,
            reuse=False,
            use_lstm=self.args.use_lstm,
            session=session)
        self.replay_buffer = ReplayBuffer(args.buffer_size, args.history_length)
        self.max_replay_buffer_len = args.batch_size * args.max_episode_len
        self.replay_sample_index = None

    def experience(self, obs, act, rew, new_obs, done, terminal):
        # Store transition in the replay buffer.
        self.replay_buffer.add(obs, act, rew, new_obs, float(done))

    def preupdate(self):
        self.replay_sample_index = None

    def update(self, agents, t):
        # Train the critic.
        if len(self.replay_buffer) < self.max_replay_buffer_len:
            # replay buffer is not large enough
            return
        if not t % 100 == 0:
            # only update every 100 steps
            return

        self.replay_sample_index = self.replay_buffer.make_index(
            self.args.batch_size)
        # collect replay sample from all agents
        obs_n = []
        obs_next_n = []
        act_n = []
        index = self.replay_sample_index
        for i in range(self.n):  # buffer
            obs, act, rew, obs_next, done = agents[i].replay_buffer.sample_index(index)
            obs_n.append(obs)
            obs_next_n.append(obs_next)
            act_n.append(act)
        obs, act, rew, obs_next, done = self.replay_buffer.sample_index(index)

        # train q network
        num_sample = 1
        target_q = 0.0
        for i in range(num_sample):
            # Each target actor acts on its own agent's next observation.
            target_act_next_n = [
                self.actors[j].p_debug['target_act'](obs_next_n[j])
                for j in range(self.n)
            ]
            target_q_next = self.q_debug['target_q_values'](
                *(obs_next_n + target_act_next_n))
            target_q += rew + self.args.gamma * (1.0 - done) * target_q_next
        target_q /= num_sample
        q_loss = self.q_train(*(obs_n + act_n + [target_q]))

        self.q_update()
        return [q_loss, np.mean(target_q), np.mean(rew), np.std(target_q)]
class MADDPGAgentTrainer(AgentTrainer):
    def __init__(self, name, agents_number, agent_index, actors, act_space_n,
                 args, common_obs_shape, sep_obs_shape, model, lstm_model,
                 cnn_model, cnn_scope=None, lstm_scope=None, reuse=False,
                 local_q_func=False, session=None):
        self.actors = actors
        self.name = name
        self.n = agents_number
        self.agent_index = agent_index
        self.args = args
        self.history_length = args.history_length
        common_obs_shape = [args.history_length] + list(common_obs_shape)
        common_obs_ph = U.BatchInput(common_obs_shape,
                                     name="common_observation").get()
        sep_obs_shape = [args.history_length] + list(sep_obs_shape[1:])
        sep_obs_ph_n = [
            U.BatchInput(sep_obs_shape, name="common_observation" + str(i)).get()
            for i in range(self.n)
        ]

        # Create all the functions necessary to train the model
        self.q_train, self.q_update, self.q_debug = q_train(
            scope=self.name,
            make_common_obs_ph=common_obs_ph,
            make_sep_obs_ph_n=sep_obs_ph_n,
            act_space_n=act_space_n,
            cnn_model=cnn_model,
            cnn_scope=cnn_scope,
            q_index=agent_index,
            q_func=model,
            lstm_model=lstm_model,
            lstm_scope=lstm_scope,
            optimizer=tf.train.AdamOptimizer(learning_rate=args.lr),
            args=self.args,
            grad_norm_clipping=0.5,
            local_q_func=local_q_func,
            num_units=args.num_units,
            reuse=False,
            use_lstm=self.args.use_lstm,
            session=session)
        self.replay_buffer = ReplayBuffer(args.buffer_size, args.history_length)
        self.max_replay_buffer_len = args.batch_size * args.max_episode_len
        self.replay_sample_index = None

    def experience(self, obs, act, rew, new_obs, done, terminal):
        # Store transition in the replay buffer.
        self.replay_buffer.add(obs, act, rew, new_obs, done)

    def preupdate(self):
        self.replay_sample_index = None

    def update(self, agents, t, agent_index):
        # Train the critic.
        if len(self.replay_buffer) < self.max_replay_buffer_len:
            # replay buffer is not large enough
            return
        if not t % 100 == 0:
            # only update every 100 steps
            return

        self.replay_sample_index = agents[0].replay_buffer.make_index(
            self.args.batch_size, agent_index)
        # collect replay sample from all agents
        index = self.replay_sample_index
        (common_obs_n, sep_obs_n), act_n, rew_n, \
            (common_obs_next_n, sep_obs_next_n), done_n = \
            agents[0].replay_buffer.sample_index(index)
        act, rew, done = act_n[:, agent_index], rew_n[:, agent_index], done_n[:, agent_index]
        # obs, obs_next = sep_obs_n[:, agent_index], sep_obs_next_n[:, agent_index]
        sep_obs_n_list, sep_obs_next_n_list, act_n_list = [], [], []
        for i in range(self.n):
            sep_obs_n_list.append(sep_obs_n[:, :, i])
            sep_obs_next_n_list.append(sep_obs_next_n[:, :, i])
            act_n_list.append(act_n[:, i])

        # train q network
        num_sample = 1
        target_q = 0.0
        for i in range(num_sample):
            # sep_obs_next_n[:, :, j] is agent j's (separate) next observation;
            # p_debug expects inputs shaped like [[?, 4, 30, 30, 3], [?, 4, 4]].
            target_act_next_n = [
                self.actors[j].p_debug['target_act'](common_obs_n,
                                                     sep_obs_next_n[:, :, j])
                for j in range(self.n)
            ]
            target_q_next = self.q_debug['target_q_values'](
                *([common_obs_next_n] + sep_obs_next_n_list + target_act_next_n))
            target_q += rew + self.args.gamma * (1.0 - done) * target_q_next
        target_q /= num_sample
        q_loss = self.q_train(*([common_obs_n] + sep_obs_n_list + act_n_list +
                                [target_q]))

        self.q_update()
        return [q_loss, np.mean(target_q), np.mean(rew), np.std(target_q)]
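# Illustrative sketch of the slicing done in update() above: the shared buffer
# returns per-agent observations stacked along axis 2 as
# [batch, history, n_agents, H, W, C], and the trainer peels off one
# [batch, history, H, W, C] slice per agent with sep_obs_n[:, :, i].
# The concrete sizes (4-step history, 30x30x3 frames) follow the shape comment
# above and are assumptions, not requirements.
def _demo_split_separate_obs(batch=32, history=4, n_agents=3):
    import numpy as np

    sep_obs_n = np.zeros((batch, history, n_agents, 30, 30, 3))
    per_agent = [sep_obs_n[:, :, i] for i in range(n_agents)]
    return [x.shape for x in per_agent]    # n_agents entries of (32, 4, 30, 30, 3)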
class MADDPGAgentTrainer(AgentTrainer):
    def __init__(self, name, model, lstm_model, obs_shape_n, act_space_n,
                 agent_index, actor_env, args, local_q_func=False,
                 session=None):
        self.args = args
        self.name = name
        self.n = len(obs_shape_n)
        self.agent_index = agent_index
        obs_ph_n = []
        for i in range(self.n):
            obs_shape = [args.history_length] + list(obs_shape_n[i])
            obs_ph_n.append(
                U.BatchInput(obs_shape, name="observation" + str(i)).get())

        optimizer = tf.train.AdamOptimizer(learning_rate=self.args.lr)
        self.p_train, self.p_update = p_train(scope=self.name,
                                              p_scope=actor_env,
                                              make_obs_ph_n=obs_ph_n,
                                              act_space_n=act_space_n,
                                              p_index=self.agent_index,
                                              p_func=model,
                                              q_func=model,
                                              lstm_model=lstm_model,
                                              optimizer=optimizer,
                                              grad_norm_clipping=0.5,
                                              local_q_func=local_q_func,
                                              num_units=self.args.num_units,
                                              reuse=True,
                                              use_lstm=self.args.use_lstm,
                                              session=session,
                                              args=args)
        # Create experience buffer
        self.replay_buffer = ReplayBuffer(args.buffer_size, args.history_length)
        self.max_replay_buffer_len = args.batch_size * args.max_episode_len
        self.replay_sample_index = None

    def experience(self, obs, act, rew, new_obs, done, terminal):
        # Store transition in the replay buffer.
        self.replay_buffer.add(obs, act, rew, new_obs, float(done))

    def preupdate(self):
        self.replay_sample_index = None

    def update(self, agents, critics, t, index):
        # Train the actor.
        # if len(self.replay_buffer) < self.max_replay_buffer_len:
        if len(critics[0].replay_buffer) < self.max_replay_buffer_len:
            return
        if not t % 100 == 0:
            # only update every 100 steps
            return

        # Reuse the minibatch indices already sampled by the matching critic.
        self.replay_sample_index = critics[index].replay_sample_index
        # collect replay sample from all agents
        obs_n = []
        obs_next_n = []
        act_n = []
        index = self.replay_sample_index
        for i in range(self.n):  # buffer
            obs, act, rew, obs_next, done = critics[i].replay_buffer.sample_index(index)
            obs_n.append(obs)
            obs_next_n.append(obs_next)
            act_n.append(act)

        # train p network
        p_loss = self.p_train(*(obs_n + act_n))
        self.p_update()
        return [p_loss]
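# Toy stand-in for the index-based replay-buffer API used above. make_index /
# sample_index are assumed to behave like a MADDPG-style buffer: draw random
# indices once, then gather the same transitions for every consumer. This shows
# why the actor can safely reuse critics[index].replay_sample_index: both
# trainers gather with identical indices and train on the same minibatch.
def _demo_shared_indices(buffer_len=10000, batch_size=4):
    import numpy as np

    storage = np.arange(buffer_len)        # pretend transitions
    make_index = lambda n: np.random.randint(0, buffer_len, size=n)
    sample_index = lambda idx: storage[idx]

    idx = make_index(batch_size)           # sampled once by the critic
    critic_batch = sample_index(idx)
    actor_batch = sample_index(idx)        # actor reuses the same indices
    assert (critic_batch == actor_batch).all()
    return idx, critic_batch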
class MADDPGAgentTrainer(AgentTrainer):
    def __init__(self, name, agents_number, act_space_n, agent_index,
                 actor_scope, args, common_obs_shape, sep_obs_shape, model,
                 lstm_model, cnn_model, cnn_scope=None, lstm_scope=None,
                 reuse=False, local_q_func=False, session=None):
        self.args = args
        self.name = name
        self.n = agents_number
        self.agent_index = agent_index
        optimizer = tf.train.AdamOptimizer(learning_rate=self.args.lr)
        sep_obs_shape = [args.history_length] + list(sep_obs_shape[1:])
        common_obs_shape = [args.history_length] + list(common_obs_shape)
        common_obs_ph = U.BatchInput(common_obs_shape,
                                     name="common_observation").get()
        sep_obs_ph_n = [
            U.BatchInput(sep_obs_shape, name="common_observation" + str(i)).get()
            for i in range(self.n)
        ]

        self.p_train, self.p_update = p_train(scope=self.name,
                                              p_scope=actor_scope,
                                              make_common_obs_ph=common_obs_ph,
                                              make_sep_obs_ph_n=sep_obs_ph_n,
                                              act_space_n=act_space_n,
                                              p_index=self.agent_index,
                                              p_func=model,
                                              q_func=model,
                                              cnn_model=cnn_model,
                                              cnn_scope=cnn_scope,
                                              lstm_model=lstm_model,
                                              lstm_scope=lstm_scope,
                                              optimizer=optimizer,
                                              grad_norm_clipping=0.5,
                                              local_q_func=local_q_func,
                                              num_units=self.args.num_units,
                                              reuse=True,
                                              use_lstm=self.args.use_lstm,
                                              session=session,
                                              args=args)
        # Create experience buffer
        self.replay_buffer = ReplayBuffer(args.buffer_size, args.history_length)
        self.max_replay_buffer_len = args.batch_size * args.max_episode_len
        self.replay_sample_index = None

    def experience(self, obs, act, rew, new_obs, done, terminal):
        # Store transition in the replay buffer.
        self.replay_buffer.add(obs, act, rew, new_obs, float(done))

    def preupdate(self):
        self.replay_sample_index = None

    def update(self, agents, critics, t, agent_index):
        # Train the actor.
        # if len(self.replay_buffer) < self.max_replay_buffer_len:
        if len(critics[0].replay_buffer) < self.max_replay_buffer_len:
            return
        if not t % 100 == 0:
            # only update every 100 steps
            return

        self.replay_sample_index = critics[0].replay_buffer.make_index(
            self.args.batch_size, agent_index)
        # collect replay sample from all agents
        index = self.replay_sample_index
        (common_obs_n, sep_obs_n), act_n, rew_n, \
            (common_obs_next_n, sep_obs_next_n), done_n = \
            critics[0].replay_buffer.sample_index(index)
        act, rew, done = act_n[:, agent_index], rew_n[:, agent_index], done_n[:, agent_index]
        # obs, obs_next = sep_obs_n[:, agent_index], sep_obs_next_n[:, agent_index]
        sep_obs_n_list, sep_obs_next_n_list, act_n_list = [], [], []
        for i in range(self.n):
            sep_obs_n_list.append(sep_obs_n[:, :, i])
            sep_obs_next_n_list.append(sep_obs_next_n[:, :, i])
            act_n_list.append(act_n[:, i])

        # train p network
        p_loss = self.p_train(*([common_obs_n] + sep_obs_n_list + act_n_list))
        self.p_update()
        return [p_loss]
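# Hedged sketch of what grad_norm_clipping=0.5 in the p_train/q_train calls
# above usually amounts to: each gradient tensor is rescaled so its L2 norm is
# at most the clip value before the Adam update is applied. Whether the repo's
# optimizer helper clips per tensor or by global norm depends on its tf_util
# implementation; this NumPy version only illustrates per-tensor clipping.
def _demo_clip_grad_by_norm(clip_norm=0.5):
    import numpy as np

    grads = [np.random.randn(64, 64), np.random.randn(64)]
    clipped = []
    for g in grads:
        norm = np.linalg.norm(g)
        # rescale only when the norm exceeds the threshold
        clipped.append(g * (clip_norm / norm) if norm > clip_norm else g)
    return [np.linalg.norm(g) for g in clipped]   # each norm is now <= clip_norm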
class MADDPGAgentTrainer(AgentTrainer):
    def __init__(self, name, agents_number, act_space_n, agent_index, args,
                 common_obs_shape, sep_obs_shape, model, lstm_model, cnn_model,
                 lstm_scope=None, cnn_scope=None, reuse=False, session=None,
                 local_q_func=False):
        self.args = args
        self.name = name
        self.n = agents_number
        self.agent_index = agent_index
        self.local_q_func = local_q_func
        sep_obs_shape = [args.history_length] + list(sep_obs_shape[1:])
        common_obs_shape = [args.history_length] + list(common_obs_shape)
        common_obs_ph = U.BatchInput(common_obs_shape,
                                     name="common_observation").get()
        sep_obs_ph_n = [
            U.BatchInput(sep_obs_shape, name="common_observation" + str(i)).get()
            for i in range(self.n)
        ]

        self.act, self.p_debug = p_act(
            make_common_obs_ph=common_obs_ph,
            make_sep_obs_ph_n=sep_obs_ph_n,
            act_space_n=act_space_n,
            p_index=self.agent_index,
            p_func=model,
            lstm_model=lstm_model,
            cnn_model=cnn_model,
            lstm_scope=lstm_scope,
            cnn_scope=cnn_scope,
            use_lstm=self.args.use_lstm,
            use_cnn=self.args.use_cnn,
            reuse=reuse,
            session=session,
            scope=self.name,
            num_units=self.args.num_units,
        )
        # Create experience buffer
        self.replay_buffer = ReplayBuffer(args.buffer_size, args.history_length)
        self.max_replay_buffer_len = args.batch_size * args.max_episode_len
        self.replay_sample_index = None

    def action(self, common_obs, sep_obs):
        # obs = np.array(obs.queue)
        # print(obs)
        return self.act(common_obs[None], sep_obs[None])[0]

    def experience(self, obs, act, rew, new_obs, done, terminal):
        # Store transition in the replay buffer.
        self.replay_buffer.add(obs, act, rew, new_obs, float(done))