class DDQN(Agent):
    """Double DQN: a DQN with a separate target network that is periodically synced."""
    deep = deepfunctions.DeepQ  # assumed: same Q-network factory as DQN

    def __init__(self, env, gamma, batch_size, memory_max, double_update=100000,
                 train_steps=1000000, log_freq=1000, eps_start=1, eps_decay=-1,
                 eps_min=0.1):
        model = self.deep(env)
        self.agent_type = "DDQN"
        super(DDQN, self).__init__(model)
        # The target network starts as an exact copy of the online network
        self.target_model = self.deep(env)
        self.target_model.net.set_weights(self.model.net.get_weights())
        self.discount = gamma
        self.env = env
        self.memory_max = memory_max
        self.eps = eps_start
        self.train_steps = train_steps
        self.batch_size = batch_size
        self.done = 0
        self.log_freq = log_freq
        self.progbar = Progbar(self.memory_max)
        self.memory = dummy_obj.ReplayMemory(
            self.memory_max,
            ["t", "state", "action", "reward", "next_state", "terminated"])
        self.eps_decay = eps_decay
        if eps_decay == -1:
            self.eps_decay = 1 / train_steps
        self.eps_min = eps_min
        self.update_double = double_update
class DDPG(Agent):
    """
    Deep Deterministic Policy Gradient
    """
    name = "DDPG"

    def __init__(self, env, deep_func, gamma, batch_size, memory_min, memory_max,
                 update_double=10000, train_steps=1000000, log_freq=1000,
                 eps_start=1, eps_decay=-1, eps_min=0.1):
        super(DDPG, self).__init__()
        self.env = env
        self.Q = self.model = deep_func(env)
        self.target_Q = deep_func(env)
        self.Q.summary()
        self.discount = gamma
        self.memory_min = memory_min
        self.memory_max = memory_max
        self.eps = eps_start
        self.train_steps = train_steps
        self.batch_size = batch_size
        self.done = 0
        self.log_freq = log_freq
        self.progbar = Progbar(self.memory_max)
        self.memory = ReplayMemory(
            self.memory_max,
            ["state", "action", "reward", "next_state", "terminated"])
        self.eps_decay = eps_decay
        if eps_decay == -1:
            self.eps_decay = 1 / train_steps
        self.eps_min = eps_min
        self.update_double = update_double
        self.actions = []
        self.path_generator = self.roller()
        self.past_rewards = collections.deque([], 50)

    def act(self, state):
        # Epsilon-greedy action selection
        if np.random.rand() < self.eps:
            return np.random.randint(self.env.action_space.n)
        return np.argmax(self.Q.predict(state))

    def train(self):
        # Fill the replay memory up to memory_min before learning starts
        self.progbar.__init__(self.memory_min)
        while self.memory.size < self.memory_min:
            self.path_generator.__next__()
        while self.done < self.train_steps:
            to_log = 0
            self.progbar.__init__(self.update_double)
            old_theta = self.Q.flattener.get()
            th0 = self.Q.net.dense[0].weight.detach().clone()
            self.target_Q.copy(self.Q)
            while to_log < self.update_double:
                self.path_generator.__next__()
                rollout = self.memory.sample(self.batch_size)
                state_batch = torch.tensor(rollout["state"], dtype=torch.float, device=device)
                action_batch = torch.tensor(rollout["action"], dtype=torch.long, device=device)
                reward_batch = torch.tensor(rollout["reward"], dtype=torch.float, device=device)
                non_final_batch = torch.tensor(1 - rollout["terminated"], dtype=torch.float, device=device)
                next_state_batch = torch.tensor(rollout["next_state"], dtype=torch.float, device=device)
                # Q(s, a) for the actions that were actually taken
                current_q = self.Q(state_batch).gather(1, action_batch.unsqueeze(1)).view(-1)
                # Double-Q target: the online network picks a', the target network evaluates it
                _, a_prime = self.Q(next_state_batch).max(1)
                next_max_q = self.target_Q(next_state_batch).gather(1, a_prime.unsqueeze(1)).view(-1)
                target_q = reward_batch + self.discount * non_final_batch * next_max_q.squeeze()
                # Compute loss (the target is detached so gradients only flow through current_q)
                loss = self.Q.total_loss(current_q, target_q.detach())
                # Optimize the model
                self.Q.optimizer.zero_grad()
                loss.backward()
                self.Q.optimize()
                self.progbar.add(self.batch_size,
                                 values=[("Loss", float(loss.detach().cpu().numpy()))])
                to_log += self.batch_size
            self.target_Q.copy(self.Q)
            new_theta = self.Q.flattener.get()
            th1 = self.Q.net.dense[0].weight.detach()
            self.log("Delta Theta L1",
                     float((new_theta - old_theta).mean().abs().detach().cpu().numpy()))
            self.log("Delta Dense Theta L1",
                     float((th0 - th1).mean().abs().detach().cpu().numpy()))
            self.log("Av 50ep rew", np.mean(self.past_rewards))
            self.log("Max 50ep rew", np.max(self.past_rewards))
            self.log("Min 50ep rew", np.min(self.past_rewards))
            self.log("Epsilon", self.eps)
            self.log("Done", self.done)
            self.log("Total", self.train_steps)
            self.target_Q.copy(self.Q)
            self.print()
            # self.play()
            self.save(self.env.name)

    def set_eps(self, x):
        self.eps = max(x, self.eps_min)

    def roller(self):
        state = self.env.reset()
        ep_reward = 0
        while True:
            episode = self.memory.empty_episode()
            for i in range(self.batch_size):
                # save current state
                episode["state"].append(state)
                # act
                action = self.act(state)
                self.actions.append(action)
                state, rew, done, info = self.env.step(action)
                episode["next_state"].append(state)
                episode["action"].append(action)
                episode["reward"].append(rew)
                episode["terminated"].append(done)
                ep_reward += rew
                self.set_eps(self.eps - self.eps_decay)
                if done:
                    self.past_rewards.append(ep_reward)
                    state = self.env.reset()
                    ep_reward = 0
                self.done += 1
                if not self.done % self.update_double:
                    self.update = True
            # record the episodes
            self.memory.record(episode)
            if self.memory.size < self.memory_min:
                self.progbar.add(self.batch_size, values=[("Loss", 0.0)])
            yield True

    def play(self, name='play'):
        name = name + self.env.name + str(self.eps)
        eps = self.eps
        self.set_eps(0)
        state = self.env.reset(record=True)
        done = False
        while not done:
            action = self.act(state)
            state, _, done, info = self.env.step(action)
        self.env.save_episode(name)
        self.set_eps(eps)

    def load(self):
        super(DDPG, self).load(self.env.name)
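# A minimal, self-contained sketch of the double-Q target used in DDPG.train():
# the online network chooses the argmax action a', the frozen target network
# evaluates it. The two nn.Linear stand-ins and all shapes are illustrative
# assumptions, not the repo's DeepQ models.
def _sketch_double_q_target(gamma=0.99, batch=4, n_obs=8, n_actions=3):
    import torch
    import torch.nn as nn
    online = nn.Linear(n_obs, n_actions)
    target = nn.Linear(n_obs, n_actions)
    target.load_state_dict(online.state_dict())  # periodic copy, as in target_Q.copy(self.Q)
    s, s_next = torch.randn(batch, n_obs), torch.randn(batch, n_obs)
    a = torch.randint(n_actions, (batch,))
    r, non_final = torch.randn(batch), torch.ones(batch)
    current_q = online(s).gather(1, a.unsqueeze(1)).view(-1)
    a_prime = online(s_next).argmax(dim=1)                            # online net selects a'
    next_q = target(s_next).gather(1, a_prime.unsqueeze(1)).view(-1)  # target net evaluates it
    target_q = r + gamma * non_final * next_q
    loss = nn.functional.mse_loss(current_q, target_q.detach())
    loss.backward()
    return float(loss)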
class ReductionLayer(Layer):

    def __init__(self, patch_size, dictionary_size, alpha, **kwargs):
        super(ReductionLayer, self).__init__(**kwargs)
        self.dict_size = dictionary_size
        self.patch_layer = patchlayer.PatchLayer(patch_size)
        self.alpha = alpha
        self.progbar = Progbar(100, stateful_metrics=["loss"])
        self.old_D = 0.0

    def build(self, input_shape):
        self.patch_layer.build(input_shape)
        self.input_shape_t = self.patch_layer.compute_output_shape(input_shape)
        self.dim = self.input_shape_t[-1]
        self.filters = self.dict_size
        self.strides = (1, self.dim)
        self.kernel_shape = (1, self.dim, self.dict_size)
        # Dictionary D (row-normalized) and its ridge pseudo-inverse (D^T D + alpha*I)^-1 D^T
        self.D0 = K.random_normal_variable((self.dim, self.dict_size), mean=0, scale=1)
        self.D = tf.matmul(tf.diag(1 / tf.norm(self.D0, axis=1)), self.D0)
        self.D_ols = tf.matmul(
            tf.linalg.inv(
                tf.matmul(self.D, self.D, transpose_a=True)
                + self.alpha * tf.eye(self.dict_size)),
            self.D, transpose_b=True)
        self.kernel = K.reshape(self.D_ols, self.kernel_shape)
        self.D_kernel = K.reshape(tf.matmul(self.D, self.D_ols), (1, self.dim, self.dim))
        self.trainable_weights = [self.D0]

    def call(self, inputs):
        # Project each patch onto the dictionary (ridge-regression codes)
        beta = K.conv1d(self.patch_layer(inputs),
                        self.kernel,
                        strides=1,
                        padding='valid',
                        data_format='channels_last',
                        dilation_rate=1)
        return beta

    def fit(self, X, Y, batch_size=64):
        print("Fitting the reduction")
        n = len(X)
        self.progbar.__init__(n)
        for i in range(0, n, batch_size):
            weights = np.ones(min(n, i + batch_size) - i)
            inputs = X[i:min(i + batch_size, n)]
            targets = Y[i:min(n, i + batch_size)]
            self.fit_op([inputs, targets, weights])
            self.progbar.add(min(batch_size, n - i),
                             values=[('loss', self.loss([inputs, targets, weights])[0])])

    def display_update(self):
        res = np.linalg.norm(K.eval(self.D) - self.old_D)
        self.old_D = K.eval(self.D)
        return res

    def set_D(self, D):
        K.set_value(self.D_ridge, D)

    def compile(self, model):
        self.optimizer = tf.train.RMSPropOptimizer(0.001)
        self.opt = self.optimizer.minimize(model.total_loss, var_list=[self.D0])
        self.fit_op = K.function(
            [model.input, model.targets[0], model.sample_weights[0]],
            [self.opt])
        self.loss = K.function(
            [model.input, model.targets[0], model.sample_weights[0]],
            [model.total_loss])
        print("Reduction Layer Compiled, batch %d" % self.patch_layer.patch_size,
              "\n",
              "Output shape:", self.compute_output_shape(self.input_shape_t))

    def compute_output_shape(self, input_shape):
        return self.patch_layer.compute_output_shape(input_shape)[:2] + (self.dict_size,)

    def get_config(self):
        config = {
            'rank': 1,
            'filters': self.dict_size,
            'kernel_size': self.kernel_shape,
            'strides': self.strides,
            'padding': 'valid',
            'data_format': 'channels_last',
            'activation': 'linear',
            'kernel_initializer': 'personal'
        }
        base_config = super(ReductionLayer, self).get_config()
        return dict(list(base_config.items()) + list(config.items()))
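# A small numpy sketch of what ReductionLayer.build() precomputes: a row-normalized
# dictionary D and its ridge pseudo-inverse D_ols = inv(D^T D + alpha*I) D^T, so the
# conv1d in call() yields the ridge-regression codes beta = patches @ D_ols.T.
# Sizes and the random dictionary are illustrative.
def _sketch_ridge_codes(dim=16, dict_size=32, alpha=0.1, n_patches=100):
    import numpy as np
    rng = np.random.default_rng(0)
    D0 = rng.normal(size=(dim, dict_size))
    D = D0 / np.linalg.norm(D0, axis=1, keepdims=True)                 # row-normalize, as in build()
    D_ols = np.linalg.inv(D.T @ D + alpha * np.eye(dict_size)) @ D.T   # (dict_size, dim)
    patches = rng.normal(size=(n_patches, dim))                        # flattened patches
    beta = patches @ D_ols.T                                           # codes, (n_patches, dict_size)
    # each row of beta minimizes ||patch - D @ code||^2 + alpha * ||code||^2
    return beta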
class DDPG(Agent):
    deep = deepfunctions.DeepQ

    def __init__(self, env, gamma, memory_max, batch_size, train_steps=1000000,
                 log_freq=1000, eps_start=1, eps_decay=-1, eps_min=0.1):
        model = self.deep(env)
        super(DDPG, self).__init__(model)
        self.discount = gamma
        self.env = env
        self.memory_max = memory_max
        self.eps = eps_start
        self.train_steps = train_steps
        self.batch_size = batch_size
        self.done = 0
        self.log_freq = log_freq
        self.progbar = Progbar(self.memory_max)
        self.memory = dummy_obj.Memory(
            self.memory_max, self.batch_size,
            ["t", "state", "action", "reward", "next_state", "terminated"])
        self.eps_decay = eps_decay
        if eps_decay == -1:
            self.eps_decay = 1 / train_steps
        self.eps_min = eps_min

    def act(self, state):
        # Epsilon-greedy over the greedy action of the Q network
        if np.random.rand() < self.eps:
            return self.env.action_space.sample()
        return np.argmax(self.model.predict(state))

    def setup_agent(self):
        # The symbolic update graph was left unfinished here; train() works in
        # numpy through model.predict / model.train_on_batch instead.
        pass

    def train(self):
        to_log = 0
        self.progbar.__init__(self.batch_size * self.log_freq)
        while self.done < self.train_steps:
            _ = self.env.reset()
            old_theta = self.model.flattener.get()
            avg_rew = 0
            max_rew = 0
            min_rew = 0
            while to_log < self.log_freq:
                self.get_episode()
                rollout = self.memory.sample()
                actions = rollout["action"]
                rewards = rollout["reward"]
                not_final = np.logical_not(rollout["terminated"])
                avg_rew += np.mean(rewards)
                max_rew, min_rew = max(np.max(rewards), max_rew), min(min_rew, np.min(rewards))
                # One-step Q-learning targets (note: next-state predictions are
                # reused as the regression base for the non-taken actions)
                target_q = self.model.predict(rollout["next_state"])
                max_Q_prim = np.max(target_q, axis=1)
                for i in range(len(actions)):
                    target_q[i, actions[i]] = rewards[i] + not_final[i] * self.discount * max_Q_prim[i]
                self.model.train_on_batch(rollout["state"], target_q)
                to_log += 1
            new_theta = self.model.flattener.get()
            self.log("Theta MSE", np.linalg.norm(new_theta - old_theta))
            self.log("Average reward", avg_rew / self.log_freq)
            self.log("Max reward", max_rew)
            self.log("Min reward", min_rew)
            self.log("Epsilon", self.eps)
            self.log("Done", self.done)
            self.log("Total", self.train_steps)
            self.print_log()
            self.play()
            self.save(self.env.name)
            self.progbar.__init__(self.batch_size * self.log_freq)
            to_log = 0

    def set_eps(self, x):
        self.eps = max(x, self.eps_min)

    def get_episode(self):
        episode = self.memory.empty_episode()
        state = self.env.current_state()
        for i in range(self.batch_size):
            self.progbar.add(1)
            self.done += 1
            # save current state
            episode["state"].append(state)
            # act
            action = self.act(state)
            state, rew, done = self.env.step(action)
            episode["next_state"].append(state)
            episode["t"].append(i)
            episode["action"].append(action)
            episode["reward"].append(rew)
            episode["terminated"].append(done)
            self.set_eps(self.eps - self.eps_decay)
            if done:
                state = self.env.reset()
        # record the episodes
        self.memory.record(episode)
        del episode

    def play(self, name='play'):
        name = name + self.env.name + str(self.eps)
        eps = self.eps
        self.set_eps(0)
        state = self.env.reset()
        done = False
        while not done:
            action = self.act(state)
            state, _, done = self.env.step(action)
        self.env.draw(name)
        self.set_eps(eps)
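# A vectorized numpy equivalent of the per-sample target loop in train() above
# (and in DQN.train() below): the taken action's entry becomes
# r + gamma * max_a' Q(s', a') on non-terminal transitions. Arrays are random stand-ins.
def _sketch_q_targets(gamma=0.99, batch=5, n_actions=3):
    import numpy as np
    rng = np.random.default_rng(1)
    q_next = rng.normal(size=(batch, n_actions))   # model.predict(next_states)
    actions = rng.integers(0, n_actions, size=batch)
    rewards = rng.normal(size=batch)
    not_final = rng.integers(0, 2, size=batch)
    target_q = q_next.copy()                       # train() reuses next-state predictions as the base
    target_q[np.arange(batch), actions] = rewards + not_final * gamma * q_next.max(axis=1)
    return target_q                                # passed to model.train_on_batch(states, target_q)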
class TRPO(Agent):
    options = {
        "cg_damping": (1e-1, "Add multiple of the identity to Fisher matrix during CG"),
        "max_kl": (1e-2, "KL divergence between old and new policy (averaged over state-space)"),
        "linesearch_accept": (1e-1, "Linesearch accept ratio")
    }
    deep = deepfunctions.DeepPolicy

    def __init__(self, env, gamma, max_steps):
        self.agent_type = "TRPO"
        policy = self.deep(env)
        self.old_policy = self.deep(env)
        super(TRPO, self).__init__(policy)
        self.discount = gamma
        self.env = env
        self.max_steps = max_steps
        self.setup_agent()
        self.baseline = deepfunctions.BaselineValueFunction(env)
        self.episodes = []
        self.progbar = Progbar(100)

    def setup_agent(self):
        self.states = self.model.input
        self.actions = K.placeholder(ndim=1, dtype='int32')
        self.advantages = K.placeholder(ndim=1)
        current_pi = self.model.output
        old_pi = self.old_policy(self.states)
        log_likeli_pi = utils.loglikelihood(self.actions, current_pi)
        log_likeli_old_pi = utils.loglikelihood(self.actions, old_pi)
        N = K.cast(K.shape(self.states)[0], dtype='float32')
        # Policy gradient: importance-weighted surrogate objective
        surrogate_loss = (-1.0 / N) * K.sum(
            K.exp(log_likeli_pi - log_likeli_old_pi) * self.advantages)
        # Gradient of the surrogate objective w.r.t. the flattened policy parameters
        policy_gradient = self.model.flattener.flatgrad(surrogate_loss)
        kl_firstfixed = K.mean(utils.entropy(current_pi))
        grads = self.model.flattener.flatgrad(kl_firstfixed)
        flat_tangent = K.placeholder(ndim=1)
        grad_vector_product = K.sum(grads * flat_tangent)
        # Fisher-vector product
        fisher_vector_product = self.model.flattener.flatgrad(grad_vector_product)
        entropy = K.mean(utils.entropy(current_pi))
        losses = [surrogate_loss, kl_firstfixed, entropy]
        self.loss_names = ["Surrogate", "KL", "Entropy"]
        args = [self.states, self.actions, self.advantages]
        self.compute_policy_gradient = K.function(args, [policy_gradient])
        self.compute_losses = K.function(args, losses)
        self.compute_fisher_vector_product = K.function([flat_tangent] + args,
                                                        [fisher_vector_product])

    def train(self):
        self.rollout()
        states = np.concatenate([episode["state"] for episode in self.episodes], axis=0)
        actions = np.concatenate([episode["action"] for episode in self.episodes], axis=0)
        advantages = np.concatenate([episode["advantage"] for episode in self.episodes], axis=0)
        args = (states, actions, advantages)
        thprev = self.model.flattener.get_value()
        self.old_policy.flattener.set_value(thprev)
        g = self.compute_policy_gradient([*args])[0]
        losses_before = self.compute_losses([*args])
        if np.allclose(g, 0):
            print("got zero gradient. not updating")
        else:
            print("Using Conjugate gradient")
            stepdir = m_utils.conjugate_gradient(
                lambda x: self.fisher_vector_product(x, args), -g)
            shs = .5 * stepdir.dot(self.fisher_vector_product(stepdir, args))
            lm = np.sqrt(shs / self.options["max_kl"][0])
            print("Lagrange multiplier:", lm, "norm(g):", np.linalg.norm(g))
            fullstep = stepdir / lm

            def loss(th):
                self.model.flattener.set_value(th)
                return self.compute_losses([*args])[0]

            success, theta = m_utils.linesearch(loss, thprev, fullstep)
            print("Line-Search Success", success)
            self.model.flattener.set_value(theta)
        losses_after = self.compute_losses([*args])
        for (lname, lbefore, lafter) in zip(self.loss_names, losses_before, losses_after):
            self.log(lname + "_before", lbefore)
            self.log(lname + "_after", lafter)
        self.print_log()
        self.model.save(self.env.name)

    def act(self, state, train=False):
        proba = self.model.predict(state)
        if train:
            action = utils.choice_weighted(proba)
        else:
            action = np.argmax(proba)
        return action

    def fisher_vector_product(self, p, args):
        return (self.compute_fisher_vector_product([p] + [*args])[0]
                + self.options["cg_damping"][0] * p)

    def rollout(self):
        self.episodes = []
        self.collected = 0
        self.progbar.__init__(self.max_steps)
        while self.collected < self.max_steps:
            self.get_episode()
        self.compute_advantage()
        self.baseline.fit(self.episodes)

    def get_episode(self):
        state = self.env.reset()
        episode = {s: [] for s in ["t", "state", "action", "reward", "terminated"]}
        i = 0
        while self.collected < self.max_steps:
            episode["t"].append(i)
            episode["state"].append(state)
            # act
            action = self.act(state, train=True)
            state, rew, done, info = self.env.step(action)
            episode["action"].append(action)
            episode["reward"].append(rew)
            episode["terminated"].append(done)
            i += 1
            self.collected += 1
            self.progbar.add(1, values=[('Info', info)])
            if done:
                break
        for k, v in episode.items():
            episode[k] = np.array(v)
        episode["return"] = discount(np.array(episode["reward"]), self.discount)
        self.episodes.append(episode)

    def compute_advantage(self):
        # Compute baseline, advantage
        for episode in self.episodes:
            b = episode["baseline"] = self.baseline.predict(episode)
            b1 = np.append(b, 0 if episode["terminated"][-1] else b[-1])
            deltas = episode["reward"] + self.discount * b1[1:] - b1[:-1]
            episode["advantage"] = discount(deltas, self.discount)
        alladv = np.concatenate([episode["advantage"] for episode in self.episodes])
        # Standardize advantage
        std = alladv.std()
        mean = alladv.mean()
        for episode in self.episodes:
            episode["advantage"] = (episode["advantage"] - mean) / std

    def play(self, name='play'):
        state = self.env.reset()
        done = False
        while not done:
            action = self.act(state)
            state, _, done, _ = self.env.step(action)
        self.env.draw(name)
class Roller(object):

    def __init__(self, env, agent, memory_max):
        self.env = env
        self.agent = agent
        self.memory_max = memory_max
        self.progbar = Progbar(self.memory_max)
        self.memory = dummy_obj.Memory(
            self.memory_max,
            ["t", "state", "action", "next_state", "reward", "terminated"])

    def rollout(self, num_steps):
        collected = 0
        self.progbar.__init__(num_steps)
        self.agent.set_epsilon(1)
        self.agent.theta = 0
        while collected < num_steps:
            collected += self.get_episode(num_steps - collected + 1, 1 / num_steps)
        roll = self.memory.random_sample(num_steps)
        return roll

    def get_episode(self, length, eps):
        state = self.env.reset()
        episode = self.memory.empty_episode()
        i = 0
        while i < length:
            self.progbar.add(1)
            # save current state
            episode["state"].append(state)
            # act
            action = self.agent.act(state)
            state, rew, done = self.env.step(action)
            episode["next_state"].append(state)
            episode["t"].append(i)
            episode["action"].append(action)
            episode["reward"].append(rew)
            episode["terminated"].append(done)
            self.agent.decrement_eps(eps)
            i += 1
            if done:
                state = self.env.reset()
                break
        # record the episodes
        self.memory.record(episode)
        del episode
        return i

    def compute_advantage(self):
        # Compute baseline, advantage
        for episode in self.episodes:
            b = episode["baseline"] = self.baseline.predict(episode)
            b1 = np.append(b, 0 if episode["terminated"][-1] else b[-1])
            deltas = episode["reward"] + self.discount * b1[1:] - b1[:-1]
            episode["advantage"] = discount(deltas, self.discount)
        alladv = np.concatenate([episode["advantage"] for episode in self.episodes])
        # Standardize advantage
        std = alladv.std()
        mean = alladv.mean()
        for episode in self.episodes:
            episode["advantage"] = (episode["advantage"] - mean) / std

    def play(self, name='play'):
        eps = self.agent.eps
        self.agent.set_epsilon(0)
        state = self.env.reset()
        done = False
        while not done:
            action = self.agent.act(state)
            state, _, done = self.env.step(action)
        self.env.draw(name)
        self.agent.set_epsilon(eps)
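# Usage sketch for Roller (hedged: SomeEnv and SomeAgent are placeholders for the
# repo's env wrappers and agents; the agent must expose act(), set_epsilon() and
# decrement_eps() as called above):
#
#   roller = Roller(SomeEnv(), SomeAgent(), memory_max=100000)
#   batch = roller.rollout(num_steps=10000)   # fills the memory, returns a random sample
#   roller.play("roller_demo")                # greedy rollout with epsilon forced to 0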
class DQN(Agent):
    deep = deepfunctions.DeepQ

    def __init__(self, env, gamma, batch_size, memory_max, train_steps=1000000,
                 log_freq=1000, eps_start=1, eps_decay=-1, eps_min=0.1):
        model = self.deep(env)
        self.agent_type = "DQN"
        super(DQN, self).__init__(model)
        self.discount = gamma
        self.env = env
        self.memory_max = memory_max
        self.eps = eps_start
        self.train_steps = train_steps
        self.batch_size = batch_size
        self.done = 0
        self.log_freq = log_freq
        self.progbar = Progbar(self.memory_max)
        self.memory = dummy_obj.ReplayMemory(
            self.memory_max,
            ["t", "state", "action", "reward", "next_state", "terminated"])
        self.eps_decay = eps_decay
        if eps_decay == -1:
            self.eps_decay = 1 / train_steps
        self.eps_min = eps_min

    def act(self, state):
        # Epsilon-greedy action selection
        if np.random.rand() < self.eps:
            return self.env.action_space.sample()
        return np.argmax(self.model.predict(state))

    def setup_model(self):
        # Symbolic update graph (not used by train(), which works in numpy via
        # predict / train_on_batch). The target below is the standard one-step
        # Q-learning target; the original left it undefined.
        current_state = K.placeholder(shape=(None,) + self.env.observation_space.shape)
        next_state = K.placeholder(shape=(None,) + self.env.observation_space.shape)
        action = K.placeholder(ndim=1, dtype='int32')
        terminated = K.placeholder(ndim=1)
        reward = K.placeholder(ndim=1)
        current_Q = self.model.net(current_state)
        next_Q = self.model.net(next_state)
        target_Q = reward + self.discount * (1 - terminated) * K.max(next_Q, axis=1)
        action_Q = K.sum(current_Q * K.one_hot(action, self.env.action_space.n), axis=1)
        optimizer = tf.train.RMSPropOptimizer(0.001)
        loss = K.mean(K.square(K.stop_gradient(target_Q) - action_Q))
        self.train_op = K.function(
            [current_state, next_state, action, reward, terminated],
            [loss], updates=[optimizer.minimize(loss)])

    def train(self):
        to_log = 0
        self.progbar.__init__(self.batch_size * self.log_freq)
        while self.done < self.train_steps:
            _ = self.env.reset()
            old_theta = self.model.flattener.get()
            avg_rew = 0
            max_rew = 0
            min_rew = 0
            while to_log < self.log_freq:
                self.get_episode()
                rollout = self.memory.sample(self.batch_size)
                actions = rollout["action"]
                rewards = rollout["reward"]
                not_final = np.logical_not(rollout["terminated"])
                avg_rew += np.mean(rewards)
                max_rew, min_rew = max(np.max(rewards), max_rew), min(min_rew, np.min(rewards))
                # One-step Q-learning targets (note: next-state predictions are
                # reused as the regression base for the non-taken actions)
                target_q = self.model.predict(rollout["next_state"])
                max_Q_prim = np.max(target_q, axis=1)
                for i in range(len(actions)):
                    target_q[i, actions[i]] = rewards[i] + not_final[i] * self.discount * max_Q_prim[i]
                self.model.train_on_batch(rollout["state"], target_q)
                to_log += 1
            new_theta = self.model.flattener.get()
            self.log("Theta MSE", np.linalg.norm(new_theta - old_theta))
            self.log("Average reward", avg_rew / self.log_freq)
            self.log("Max reward", max_rew)
            self.log("Min reward", min_rew)
            self.log("Epsilon", self.eps)
            self.log("Done", self.done)
            self.log("Total", self.train_steps)
            self.print_log()
            self.play()
            self.save(self.env.name)
            self.progbar.__init__(self.batch_size * self.log_freq)
            to_log = 0

    def set_eps(self, x):
        self.eps = max(x, self.eps_min)

    def get_episode(self):
        episode = self.memory.empty_episode()
        state = self.env.current_state()
        for i in range(self.batch_size):
            # save current state
            episode["state"].append(state)
            # act
            action = self.act(state)
            state, rew, done, info = self.env.step(action)
            episode["next_state"].append(state)
            episode["t"].append(i)
            episode["action"].append(action)
            episode["reward"].append(rew)
            episode["terminated"].append(done)
            self.set_eps(self.eps - self.eps_decay)
            if done:
                state = self.env.reset()
            self.progbar.add(1, values=[("Info", info)])
            self.done += 1
            if not self.done % self.update_double:
                self.update = True
        # record the episodes
        self.memory.record(episode)
        del episode

    def play(self, name='play'):
        name = name + self.env.name + str(self.eps)
        eps = self.eps
        self.set_eps(0)
        state = self.env.reset()
        done = False
        while not done:
            action = self.act(state)
            state, _, done, _ = self.env.step(action)
        self.env.draw(name)
        self.set_eps(eps)
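# Usage sketch for DQN (hedged: the env wrapper and its .name / .draw() API are
# assumed from the calls above; the constructor arguments are illustrative):
#
#   agent = DQN(env, gamma=0.99, batch_size=32, memory_max=100000,
#               train_steps=1000000, log_freq=1000)
#   agent.train()       # epsilon-greedy rollouts plus one-step Q-target regression
#   agent.play("demo")  # greedy episode, rendered through env.draw()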