def main():
    """Evaluate the saved PPO policy on CartPole-v1 and log per-episode stats.

    Restores ``model/model.ckpt``, runs up to ``ITERATION`` episodes with the
    deterministic policy (``stochastic=False``) and writes episode length,
    episode reward and the PPO summary to ``./log/test``. No training step is
    performed here. The run stops early after 100 consecutive episodes whose
    summed reward is at least 195.
    """
    env = gym.make('CartPole-v1')
    env.seed(0)
    ob_space = env.observation_space
    Policy = Policy_net('policy', env)
    Old_Policy = Policy_net('old_policy', env)
    PPO = PPOTrain(Policy, Old_Policy, gamma=GAMMA)
    saver = tf.train.Saver()

    with tf.Session() as sess:
        writer = tf.summary.FileWriter('./log/test', sess.graph)
        try:
            sess.run(tf.global_variables_initializer())
            saver.restore(sess, 'model/model.ckpt')
            obs = env.reset()
            # `reward` is carried across episodes: it is appended *before*
            # env.step(), so rewards[0] is 0 for the first episode and -1 after.
            reward = 0
            success_num = 0

            for iteration in range(ITERATION):  # one episode per iteration
                observations = []
                actions = []
                v_preds = []
                rewards = []
                run_policy_steps = 0
                env.render()

                while True:  # step until the environment reports `done`
                    run_policy_steps += 1
                    # Add a leading batch axis for the Policy.obs placeholder.
                    obs = np.stack([obs]).astype(dtype=np.float32)
                    act, v_pred = Policy.act(obs=obs, stochastic=False)
                    act = act.item()
                    v_pred = v_pred.item()

                    observations.append(obs)
                    actions.append(act)
                    v_preds.append(v_pred)
                    rewards.append(reward)

                    next_obs, reward, done, info = env.step(act)
                    if done:
                        # The state after a terminal state has value 0.
                        v_preds_next = v_preds[1:] + [0]
                        obs = env.reset()
                        reward = -1
                        break
                    obs = next_obs

                episode_reward = sum(rewards)
                writer.add_summary(
                    tf.Summary(value=[
                        tf.Summary.Value(tag='episode_length',
                                         simple_value=run_policy_steps)
                    ]), iteration)
                writer.add_summary(
                    tf.Summary(value=[
                        tf.Summary.Value(tag='episode_reward',
                                         simple_value=episode_reward)
                    ]), iteration)

                # End condition of the test: 100 successes in a row.
                if episode_reward >= 195:
                    success_num += 1
                    if success_num >= 100:
                        print('Iteration: ', iteration)
                        print('Clear!!')
                        break
                else:
                    success_num = 0

                gaes = PPO.get_gaes(rewards=rewards,
                                    v_preds=v_preds,
                                    v_preds_next=v_preds_next)

                # Convert lists to numpy arrays for feeding tf.placeholder.
                observations = np.reshape(observations,
                                          newshape=[-1] + list(ob_space.shape))
                actions = np.array(actions).astype(dtype=np.int32)
                rewards = np.array(rewards).astype(dtype=np.float32)
                v_preds_next = np.array(v_preds_next).astype(dtype=np.float32)
                gaes = np.array(gaes).astype(dtype=np.float32)
                # The epsilon keeps a zero spread (e.g. a one-step episode)
                # from turning every value into NaN.
                gaes = (gaes - gaes.mean()) / (gaes.std() + 1e-8)

                summary = PPO.get_summary(obs=observations,
                                          actions=actions,
                                          rewards=rewards,
                                          v_preds_next=v_preds_next,
                                          gaes=gaes)[0]
                writer.add_summary(summary, iteration)
        finally:
            # Flush and close the event file even if an episode raised.
            writer.close()
def main():
    """Train a PPO policy on the physical cart-pole rig driven over serial.

    Each control step samples an action from the policy, moves an index into
    ``pwm_list`` up (action 1) or down (any other action), commands the motor,
    then reads the pendulum and cart encoders through ``PD`` to build the next
    observation ``[angle, angle_velocity, cart, cart_velocity]`` and the
    reward. An episode ends when the pendulum angle exceeds
    ``angle_thres_deg``; the motor is then stopped and the loop waits until
    the pendulum is back within +/-1.5 degrees. After each episode the policy
    is trained for 4 epochs on 64 randomly sampled steps. A checkpoint is
    written every 10th episode and once more after 100 consecutive episodes
    with summed reward >= 195.
    """
    angle = 0.0
    angle_thres_deg = 15
    cart = 0.0
    t.tic()
    pwm_index = 1
    pwm_list = [("L", 180), ("L", 0), ("R", 180)]
    pwm_list_size = len(pwm_list)

    # Serial port for Arduino
    ser = None
    if SERIAL_AVAILABLE:
        ser = serial.Serial('COM20', 115200)  # Initialize serial port
        print("connected to: " + ser.portstr)  # Confirm connection

    env = gym.make('CartPole-v0')
    env.seed(0)
    ob_space = env.observation_space
    Policy = Policy_net('policy', env)
    Old_Policy = Policy_net('old_policy', env)
    PPO = PPOTrain(Policy, Old_Policy, gamma=GAMMA)
    saver = tf.train.Saver()

    with tf.Session() as sess:
        if LOAD:
            saver.restore(
                sess, "./model/model_iter_{:d}_rewards_{:d}.ckpt".format(
                    load_iteration, load_rewards))
        else:
            sess.run(tf.global_variables_initializer())
        writer = tf.summary.FileWriter('./log/train', sess.graph)
        try:
            obs = env.reset()
            # `reward` is appended before it is recomputed, so rewards[0] is
            # 0 for the first episode and -1 for every later one.
            reward = 0
            success_num = 0

            for iteration in range(ITERATION):  # one episode per iteration
                observations = []
                actions = []
                v_preds = []
                rewards = []
                run_policy_steps = 0

                while True:  # one control step per pass, until the pole falls
                    run_policy_steps += 1
                    # Add a leading batch axis for the Policy.obs placeholder.
                    obs = np.stack([obs]).astype(dtype=np.float32)
                    act, v_pred = Policy.act(obs=obs, stochastic=True)
                    # ndarray.item() replaces np.asscalar, which newer NumPy
                    # releases no longer provide.
                    act = act.item()
                    v_pred = v_pred.item()

                    observations.append(obs)
                    actions.append(act)
                    v_preds.append(v_pred)
                    rewards.append(reward)

                    # Action 1 moves one slot up in pwm_list, anything else
                    # one slot down; the index saturates at both ends.
                    if act == 1:
                        if pwm_index < pwm_list_size - 1:
                            pwm_index += 1
                    elif pwm_index > 0:
                        pwm_index -= 1
                    direction, pwm = pwm_list[pwm_index]
                    print(direction)
                    print(pwm)
                    if SERIAL_AVAILABLE:
                        # NOTE(review): the magnitude is hard-coded to 180 and
                        # `pwm` is only printed, so the ("L", 0) slot still
                        # drives the motor -- confirm this is intended.
                        PD.writePWM(ser, 180, direction)

                    # NOTE(review): everything below reads from `ser`
                    # unconditionally, so the loop only works with the rig
                    # attached -- confirm.
                    last_angle = angle
                    # convert encoder counts (1200) to degrees
                    angle_deg = PD.getPEncoderPos(ser) * 360 / 1200
                    # convert degrees to radians
                    angle = angle_deg * 2 * math.pi / 360
                    angle_velocity = (angle - last_angle) / t.tocvalue()
                    last_cart = cart
                    cart = PD.getMEncoderPos(ser)
                    cart_velocity = (cart - last_cart) / t.tocvalue()
                    t.tic()

                    # NOTE(review): the first term adds 6 *outside* the
                    # (.9 / 7) scaling while the second adds 5 *inside* the
                    # (0.1 / 6) scaling; kept exactly as written -- confirm
                    # the parenthesisation is intended.
                    reward = ((.9 / 7) * min(6 - abs(angle_deg), 1) + 6) + (
                        (0.1 / 6) * (min(5 - abs(cart / 1000), 1) + 5))
                    next_obs = [angle, angle_velocity, cart, cart_velocity]
                    print("x: ", PD.getMEncoderPos(ser))

                    if abs(angle_deg) > angle_thres_deg:
                        # The state after a terminal state has value 0.
                        v_preds_next = v_preds[1:] + [0]
                        print("reward: ", sum(rewards))
                        obs = env.reset()
                        reward = -1
                        print("Iteration: ", iteration)
                        print('Waiting to reset')
                        PD.writePWM(ser, 0, direction)  # stop the motor
                        if iteration % 10 == 0:
                            # sum(rewards) is a float here and '{:d}' rejects
                            # floats, so truncate to int for the file name
                            # (the loader formats load_rewards with '{:d}').
                            saver.save(
                                sess,
                                "./model/model_iter_{:d}_rewards_{:d}.ckpt".
                                format(iteration, int(sum(rewards))))
                            print('Scoot scoot!! Model saved.')
                        # Block until the pendulum is back near upright.
                        while angle_deg > 1.5 or angle_deg < -1.5:
                            time.sleep(0.1)
                            angle_deg = PD.getPEncoderPos(ser) * 360 / 1200
                        print('Entered iteration {:1f}'.format(iteration + 1))
                        break
                    obs = next_obs

                episode_reward = sum(rewards)
                writer.add_summary(
                    tf.Summary(value=[
                        tf.Summary.Value(tag='episode_length',
                                         simple_value=run_policy_steps)
                    ]), iteration)
                writer.add_summary(
                    tf.Summary(value=[
                        tf.Summary.Value(tag='episode_reward',
                                         simple_value=episode_reward)
                    ]), iteration)

                # Stop after 100 successful episodes in a row.
                if episode_reward >= 195:
                    success_num += 1
                    if success_num >= 100:
                        saver.save(sess, './model/model.ckpt')
                        print('Clear!! Model saved.')
                        break
                else:
                    success_num = 0

                gaes = PPO.get_gaes(rewards=rewards,
                                    v_preds=v_preds,
                                    v_preds_next=v_preds_next)

                # Convert lists to numpy arrays for feeding tf.placeholder.
                observations = np.reshape(observations,
                                          newshape=[-1] + list(ob_space.shape))
                actions = np.array(actions).astype(dtype=np.int32)
                rewards = np.array(rewards).astype(dtype=np.float32)
                v_preds_next = np.array(v_preds_next).astype(dtype=np.float32)
                gaes = np.array(gaes).astype(dtype=np.float32)
                if iteration > 0:
                    # The epsilon keeps a zero spread (e.g. a one-step
                    # episode) from turning every value into NaN.
                    gaes = (gaes - gaes.mean()) / (gaes.std() + 1e-8)

                PPO.assign_policy_parameters()

                inp = [observations, actions, rewards, v_preds_next, gaes]

                # train
                for epoch in range(4):
                    # indices are in [low, high), drawn with replacement
                    sample_indices = np.random.randint(
                        low=0, high=observations.shape[0], size=64)
                    sampled_inp = [
                        np.take(a=a, indices=sample_indices, axis=0)
                        for a in inp
                    ]
                    PPO.train(obs=sampled_inp[0],
                              actions=sampled_inp[1],
                              rewards=sampled_inp[2],
                              v_preds_next=sampled_inp[3],
                              gaes=sampled_inp[4])

                summary = PPO.get_summary(obs=inp[0],
                                          actions=inp[1],
                                          rewards=inp[2],
                                          v_preds_next=inp[3],
                                          gaes=inp[4])[0]
                writer.add_summary(summary, iteration)
        finally:
            # Release the event file and the serial port even on an error.
            writer.close()
            if ser is not None:
                ser.close()
def main():
    """Train a PPO policy on CartPole-v0, logging rewards to CSV and TensorBoard.

    Runs up to ``ITERATION`` episodes with the stochastic policy. After each
    episode the summed reward is appended to a timestamped CSV under
    ``data/`` and printed, then the policy is trained for 4 epochs on 64
    randomly sampled steps. Every 500th episode is rendered. Training stops
    early, saving ``./model/model.ckpt``, after 100 consecutive episodes with
    summed reward >= 195.
    """
    env = gym.make('CartPole-v0')
    env.seed(0)
    ob_space = env.observation_space
    Policy = Policy_net('policy', env)
    Old_Policy = Policy_net('old_policy', env)
    PPO = PPOTrain(Policy, Old_Policy, gamma=GAMMA)
    saver = tf.train.Saver()

    name = 'Model_Noise'
    filename = "data/{n}_{ts:%H_%M_%S}.csv".format(n=name, ts=datetime.now())
    # Buffering=1 is line-buffered, so each row reaches disk as it is written.
    with open(filename, "w", 1) as result:
        result.write("Iteration, Reward \n")
        with tf.Session() as sess:
            writer = tf.summary.FileWriter('./log/train', sess.graph)
            try:
                sess.run(tf.global_variables_initializer())
                obs = env.reset()
                # `reward` is appended before env.step(), so rewards[0] is 0
                # for the first episode and -1 for every later one.
                reward = 0
                success_num = 0

                for iteration in range(ITERATION):  # one episode per iteration
                    observations = []
                    actions = []
                    v_preds = []
                    rewards = []
                    run_policy_steps = 0

                    while True:  # step until the environment reports `done`
                        if iteration % 500 == 0:
                            env.render()
                        run_policy_steps += 1
                        # Add a leading batch axis for the Policy.obs placeholder.
                        obs = np.stack([obs]).astype(dtype=np.float32)
                        act, v_pred = Policy.act(obs=obs, stochastic=True)
                        # ndarray.item() replaces np.asscalar, which newer
                        # NumPy releases no longer provide.
                        act = act.item()
                        v_pred = v_pred.item()

                        observations.append(obs)
                        actions.append(act)
                        v_preds.append(v_pred)
                        rewards.append(reward)

                        next_obs, reward, done, info = env.step(act)
                        if done:
                            # The state after a terminal state has value 0.
                            v_preds_next = v_preds[1:] + [0]
                            obs = env.reset()
                            reward = -1
                            break
                        obs = next_obs

                    episode_reward = sum(rewards)
                    writer.add_summary(
                        tf.Summary(value=[
                            tf.Summary.Value(tag='episode_length',
                                             simple_value=run_policy_steps)
                        ]), iteration)
                    writer.add_summary(
                        tf.Summary(value=[
                            tf.Summary.Value(tag='episode_reward',
                                             simple_value=episode_reward)
                        ]), iteration)
                    result.write("{:d},{:2f}\n".format(iteration,
                                                       episode_reward))
                    print("Rewards: {:2f}, Iterations: {:d}".format(
                        episode_reward, iteration))

                    # Stop after 100 successful episodes in a row.
                    if episode_reward >= 195:
                        success_num += 1
                        if success_num >= 100:
                            saver.save(sess, './model/model.ckpt')
                            print('Clear!! Model saved.')
                            break
                    else:
                        success_num = 0

                    gaes = PPO.get_gaes(rewards=rewards,
                                        v_preds=v_preds,
                                        v_preds_next=v_preds_next)

                    # Convert lists to numpy arrays for feeding tf.placeholder.
                    observations = np.reshape(
                        observations, newshape=[-1] + list(ob_space.shape))
                    actions = np.array(actions).astype(dtype=np.int32)
                    rewards = np.array(rewards).astype(dtype=np.float32)
                    v_preds_next = np.array(v_preds_next).astype(
                        dtype=np.float32)
                    gaes = np.array(gaes).astype(dtype=np.float32)
                    # The epsilon keeps a zero spread (e.g. a one-step
                    # episode) from turning every value into NaN.
                    gaes = (gaes - gaes.mean()) / (gaes.std() + 1e-8)

                    PPO.assign_policy_parameters()

                    inp = [observations, actions, rewards, v_preds_next, gaes]

                    # train
                    for epoch in range(4):
                        # indices are in [low, high), drawn with replacement
                        sample_indices = np.random.randint(
                            low=0, high=observations.shape[0], size=64)
                        sampled_inp = [
                            np.take(a=a, indices=sample_indices, axis=0)
                            for a in inp
                        ]
                        PPO.train(obs=sampled_inp[0],
                                  actions=sampled_inp[1],
                                  rewards=sampled_inp[2],
                                  v_preds_next=sampled_inp[3],
                                  gaes=sampled_inp[4])

                    summary = PPO.get_summary(obs=inp[0],
                                              actions=inp[1],
                                              rewards=inp[2],
                                              v_preds_next=inp[3],
                                              gaes=inp[4])[0]
                    writer.add_summary(summary, iteration)
            finally:
                writer.close()
                # Always release the render window: the previous check on the
                # final loop index skipped the close in most runs and failed
                # outright when the loop never executed.
                env.close()
def main():
    """Train a GRU-based PPO policy on a 5-armed Bernoulli bandit environment.

    Runs up to ``ITERATION`` episodes with the stochastic policy, logs episode
    length and reward to ``./log/train`` and trains for 4 epochs on 64
    randomly sampled steps after each episode. Training stops early, saving
    ``./model/model.ckpt``, after 100 consecutive episodes with summed
    reward >= 195.
    """
    num_arm = 5
    env = BernoulliBanditEnv(num_arm)
    env.seed(0)
    ob_space = env.observation_space
    Policy = PolicyGRUNet('policy', env)
    Old_Policy = PolicyGRUNet('old_policy', env)
    PPO = PPOTrain(Policy, Old_Policy, gamma=GAMMA)
    saver = tf.train.Saver()

    with tf.Session() as sess:
        writer = tf.summary.FileWriter('./log/train', sess.graph)
        try:
            sess.run(tf.global_variables_initializer())
            obs = env.reset()
            # `reward` is appended before env.step(), so rewards[0] is 0 for
            # the first episode and -1 for every later one.
            reward = 0
            success_num = 0

            for iteration in range(ITERATION):  # one episode per iteration
                observations = []
                actions = []
                v_preds = []
                rewards = []
                run_policy_steps = 0

                while True:  # step until the environment reports `done`
                    run_policy_steps += 1
                    # Add a leading batch axis for the Policy.obs placeholder.
                    obs = np.stack([obs]).astype(dtype=np.float32)
                    act, v_pred = Policy.act(obs=obs, stochastic=True)
                    # ndarray.item() replaces np.asscalar, which newer NumPy
                    # releases no longer provide.
                    act = act.item()
                    v_pred = v_pred.item()

                    observations.append(obs)
                    actions.append(act)
                    v_preds.append(v_pred)
                    rewards.append(reward)

                    # An out-of-range action is replaced by a random arm.
                    # NOTE(review): the replacement happens after the action
                    # was recorded, so `actions` keeps the original value --
                    # confirm this is intended.
                    if act not in range(num_arm):
                        act = np.random.randint(num_arm)

                    next_obs, reward, done, info = env.step(act)
                    if done:
                        # The state after a terminal state has value 0.
                        v_preds_next = v_preds[1:] + [0]
                        obs = env.reset()
                        reward = -1
                        break
                    obs = next_obs

                episode_reward = sum(rewards)
                writer.add_summary(
                    tf.Summary(value=[
                        tf.Summary.Value(tag='episode_length',
                                         simple_value=run_policy_steps)
                    ]), iteration)
                writer.add_summary(
                    tf.Summary(value=[
                        tf.Summary.Value(tag='episode_reward',
                                         simple_value=episode_reward)
                    ]), iteration)

                # Stop after 100 successful episodes in a row.
                if episode_reward >= 195:
                    success_num += 1
                    if success_num >= 100:
                        saver.save(sess, './model/model.ckpt')
                        print('Clear!! Model saved.')
                        break
                else:
                    success_num = 0

                gaes = PPO.get_gaes(rewards=rewards,
                                    v_preds=v_preds,
                                    v_preds_next=v_preds_next)

                # Convert lists to numpy arrays for feeding tf.placeholder.
                observations = np.reshape(observations,
                                          newshape=[-1] + list(ob_space.shape))
                actions = np.array(actions).astype(dtype=np.int32)
                rewards = np.array(rewards).astype(dtype=np.float32)
                v_preds_next = np.array(v_preds_next).astype(dtype=np.float32)
                gaes = np.array(gaes).astype(dtype=np.float32)
                # The epsilon keeps a zero spread (e.g. a one-step episode)
                # from turning every value into NaN.
                gaes = (gaes - gaes.mean()) / (gaes.std() + 1e-8)

                PPO.assign_policy_parameters()

                inp = [observations, actions, rewards, v_preds_next, gaes]

                # train
                for epoch in range(4):
                    # indices are in [low, high), drawn with replacement
                    sample_indices = np.random.randint(
                        low=0, high=observations.shape[0], size=64)
                    sampled_inp = [
                        np.take(a=a, indices=sample_indices, axis=0)
                        for a in inp
                    ]
                    PPO.train(obs=sampled_inp[0],
                              actions=sampled_inp[1],
                              rewards=sampled_inp[2],
                              v_preds_next=sampled_inp[3],
                              gaes=sampled_inp[4])

                summary = PPO.get_summary(obs=inp[0],
                                          actions=inp[1],
                                          rewards=inp[2],
                                          v_preds_next=inp[3],
                                          gaes=inp[4])[0]
                writer.add_summary(summary, iteration)
        finally:
            # Flush and close the event file even if an episode raised.
            writer.close()
def __init__(self):
    """Start the 'runPPO' ROS node and run the PPO training loop in place.

    The constructor does not return until ROS shuts down, the episode budget
    (``self.max_episodes``) is exceeded, or 100 consecutive episodes reach
    ``self.stop_bound``. While ``self.stLearning`` is set, each pass of the
    outer loop runs one episode: reset the gripper, wait until it reports
    closed, then repeatedly sample an action, send ``self.A[act]`` to the
    move service and score the result with ``self.transition_reward``. After
    the episode the policy is trained for 4 epochs on 64 randomly sampled
    steps.

    Reads ``self.n_inputs``, ``self.n_outputs``, ``self.g``, ``self.A``,
    ``self.max_steps``, ``self.max_episodes`` and ``self.stop_bound``, which
    must exist before this runs (presumably class attributes -- not visible
    here).
    """
    rospy.init_node('runPPO', anonymous=True)

    Policy = Policy_net('policy', self.n_inputs, self.n_outputs)
    Old_Policy = Policy_net('old_policy', self.n_inputs, self.n_outputs)
    PPO = PPOTrain(Policy, Old_Policy, gamma=GAMMA, c_2=0.1)
    saver = tf.train.Saver()

    rospy.Subscriber('/RL/gripper_status', String, self.callbackGripperStatus)
    rospy.Service('/RL/start_learning', Empty, self.start_learning)
    obs_srv = rospy.ServiceProxy('/RL/observation', observation)
    drop_srv = rospy.ServiceProxy('/RL/IsObjDropped', IsDropped)
    move_srv = rospy.ServiceProxy('/RL/MoveGripper', TargetAngles)
    reset_srv = rospy.ServiceProxy('/RL/ResetGripper', Empty)
    pub_goal = rospy.Publisher('/RL/Goal', Float32MultiArray, queue_size=10)

    gg = Float32MultiArray()
    gg.data = self.g

    with tf.Session() as sess:
        # $ tensorboard --logdir=logs
        # http://0.0.0.0:6006/
        writer = tf.summary.FileWriter(
            '/home/pracsys/catkin_ws/src/rutgers_collab/src/rl_pkg/src/PPO/log/train',
            sess.graph)
        try:
            sess.run(tf.global_variables_initializer())
            success_num = 0
            episode_count = 0
            rate = rospy.Rate(100)  # 100hz

            while not rospy.is_shutdown():
                if self.stLearning:
                    ## Start episode ##
                    episode_count += 1

                    # Reset gripper and wait until it reports closed.
                    reset_srv()
                    while not self.gripper_closed:
                        rate.sleep()

                    # Get observation
                    obs = np.array(obs_srv().state)
                    self.prev_dis2goal = np.linalg.norm(self.g - obs[:2])

                    observations = []
                    actions = []
                    v_preds = []
                    rewards = []
                    run_policy_steps = 0

                    while True:  # one gripper move per pass, until `done`
                        run_policy_steps += 1
                        print(
                            '[RL] Step %d in episode %d, distance to goal: %f.'
                            % (run_policy_steps, episode_count,
                               self.prev_dis2goal))
                        pub_goal.publish(gg)

                        # Add a leading batch axis for the Policy.obs placeholder.
                        obs = np.stack([obs]).astype(dtype=np.float32)

                        # Resample until the action index is below 8.
                        while 1:
                            act, v_pred = Policy.act(obs=obs, stochastic=True)
                            # ndarray.item() replaces np.asscalar, which
                            # newer NumPy releases no longer provide.
                            act = act.item()
                            v_pred = v_pred.item()
                            if act < 8:
                                break

                        # Act
                        # NOTE(review): `suc` is the raw service response; if
                        # it is a message object its truthiness may not
                        # reflect success -- confirm against the srv type.
                        suc = move_srv(self.A[act])
                        rospy.sleep(0.05)
                        rate.sleep()

                        if suc:
                            # Get observation
                            next_obs = np.array(obs_srv().state)
                            # Check if dropped - end of episode
                            fail = drop_srv().dropped
                        else:
                            # End episode if overload or angle limits reached
                            rospy.logerr(
                                '[RL] Failed to move gripper. Episode declared failed.'
                            )
                            # No fresh observation exists; reuse the current
                            # one so a failure on the very first move does not
                            # hit an unbound `next_obs`.
                            next_obs = obs[0]
                            fail = True

                        reward, done = self.transition_reward(next_obs, fail)

                        observations.append(obs)
                        actions.append(act)
                        v_preds.append(v_pred)
                        rewards.append(reward)
                        print(
                            '[RL] Action %d yielded reward %f and position (%f,%f).'
                            % (act, reward, obs[0][0], obs[0][1]))

                        if run_policy_steps > self.max_steps:
                            done = True

                        if done:
                            # The state after a terminal state has value 0:
                            # shift v_preds by one and append a zero.
                            v_preds_next = v_preds[1:] + [0]
                            break
                        obs = next_obs
                        rate.sleep()

                    episode_reward = sum(rewards)
                    print('episode_length', run_policy_steps, 'episode_reward',
                          episode_reward)
                    writer.add_summary(
                        tf.Summary(value=[
                            tf.Summary.Value(tag='episode_length',
                                             simple_value=run_policy_steps)
                        ]), episode_count)
                    writer.add_summary(
                        tf.Summary(value=[
                            tf.Summary.Value(tag='episode_reward',
                                             simple_value=episode_reward)
                        ]), episode_count)

                    # Stop after 100 successful episodes in a row.
                    if episode_reward >= self.stop_bound:
                        success_num += 1
                        if success_num >= 100:
                            saver.save(
                                sess,
                                '/home/pracsys/catkin_ws/src/rutgers_collab/src/rl_pkg/logs/model_ppo.ckpt'
                            )
                            print('Clear!! Model saved.')
                            break
                    else:
                        success_num = 0

                    gaes = PPO.get_gaes(rewards=rewards,
                                        v_preds=v_preds,
                                        v_preds_next=v_preds_next)

                    # Convert lists to numpy arrays for feeding tf.placeholder.
                    observations = np.reshape(observations,
                                              newshape=[-1, self.n_inputs])
                    actions = np.array(actions).astype(dtype=np.int32)
                    rewards = np.array(rewards).astype(dtype=np.float32)
                    v_preds_next = np.array(v_preds_next).astype(
                        dtype=np.float32)
                    gaes = np.array(gaes).astype(dtype=np.float32)
                    # The epsilon keeps a zero spread (e.g. a one-step
                    # episode) from turning every value into NaN.
                    gaes = (gaes - gaes.mean()) / (gaes.std() + 1e-8)

                    PPO.assign_policy_parameters()

                    inp = [observations, actions, rewards, v_preds_next, gaes]

                    # train
                    for epoch in range(4):
                        # indices are in [low, high), drawn with replacement
                        sample_indices = np.random.randint(
                            low=0, high=observations.shape[0], size=64)
                        sampled_inp = [
                            np.take(a=a, indices=sample_indices, axis=0)
                            for a in inp
                        ]
                        PPO.train(obs=sampled_inp[0],
                                  actions=sampled_inp[1],
                                  rewards=sampled_inp[2],
                                  v_preds_next=sampled_inp[3],
                                  gaes=sampled_inp[4])

                    summary = PPO.get_summary(obs=inp[0],
                                              actions=inp[1],
                                              rewards=inp[2],
                                              v_preds_next=inp[3],
                                              gaes=inp[4])[0]
                    writer.add_summary(summary, episode_count)

                    if episode_count > self.max_episodes:
                        break

                rate.sleep()
        finally:
            # Flush and close the event file even if an episode raised.
            writer.close()
def main():
    """Train a PPO policy on the gym environment named by ``ENV``.

    Runs up to ``EPISODES`` episodes with the stochastic policy, logs episode
    length and reward to ``./log/train``, prints the reward of each episode
    and trains for 4 epochs on 64 randomly sampled steps after each one.
    Training stops early, saving ``./model/model.ckpt``, after 100
    consecutive episodes with summed reward >= 195.
    """
    env = gym.make(ENV)  # instantiate the environment
    env.seed(0)
    ob_space = env.observation_space  # describes the format of valid observations
    Policy = Policy_net('policy', env)  # policy network
    Old_Policy = Policy_net('old_policy', env)  # old-policy network
    PPO = PPOTrain(Policy, Old_Policy, gamma=GAMMA)
    saver = tf.train.Saver()

    with tf.Session() as sess:
        writer = tf.summary.FileWriter('./log/train', sess.graph)  # log directory
        try:
            sess.run(tf.global_variables_initializer())  # initialise the networks
            obs = env.reset()  # reset the environment, get the first observation
            # `reward` is appended before env.step(), so rewards[0] is 0 for
            # the first episode and -1 for every later one.
            reward = 0
            success_num = 0  # consecutive-success counter

            for episode in range(EPISODES):
                observations = []
                actions = []
                v_preds = []
                rewards = []
                run_policy_steps = 0  # step counter for this episode
                env.render()

                while True:  # step until the environment reports `done`
                    run_policy_steps += 1
                    # Add a leading batch axis for the Policy.obs placeholder.
                    obs = np.stack([obs]).astype(dtype=np.float32)
                    # Run the network to get an action and the predicted value.
                    act, v_pred = Policy.act(obs=obs, stochastic=True)
                    # Convert the numpy results to Python scalars.
                    act = act.item()
                    v_pred = v_pred.item()

                    observations.append(obs)
                    actions.append(act)
                    v_preds.append(v_pred)
                    rewards.append(reward)

                    # Send the action to the environment; receive the next
                    # observation, the reward and whether the episode ended.
                    next_obs, reward, done, info = env.step(act)

                    if done:
                        # v_preds[1:] drops the first prediction and + [0]
                        # appends a zero: the state after a terminal state
                        # has value 0.
                        v_preds_next = v_preds[1:] + [0]
                        obs = env.reset()
                        reward = -1
                        break
                    obs = next_obs

                # Log for visualisation in TensorBoard.
                episode_reward = sum(rewards)
                writer.add_summary(
                    tf.Summary(value=[
                        tf.Summary.Value(tag='episode_length',
                                         simple_value=run_policy_steps)
                    ]), episode)
                writer.add_summary(
                    tf.Summary(value=[
                        tf.Summary.Value(tag='episode_reward',
                                         simple_value=episode_reward)
                    ]), episode)

                # End condition: 100 successes (summed reward >= 195) in a row.
                if episode_reward >= 195:
                    success_num += 1
                    if success_num >= 100:
                        saver.save(sess, './model/model.ckpt')
                        print('Clear!! Model saved.')
                        break
                else:
                    success_num = 0

                print("EP: ", episode, " Rw: ", episode_reward)

                gaes = PPO.get_gaes(rewards=rewards,
                                    v_preds=v_preds,
                                    v_preds_next=v_preds_next)

                # Convert lists to numpy arrays for feeding tf.placeholder.
                # Each stored observation was its own (1, ...) array; the
                # reshape merges them into one array with a row per step.
                newshape = [-1] + list(ob_space.shape)
                observations = np.reshape(observations, newshape=newshape)
                actions = np.array(actions).astype(dtype=np.int32)
                rewards = np.array(rewards).astype(dtype=np.float32)
                v_preds_next = np.array(v_preds_next).astype(dtype=np.float32)
                gaes = np.array(gaes).astype(dtype=np.float32)
                # Subtract the mean and divide by the standard deviation; the
                # epsilon keeps a zero spread (e.g. a one-step episode) from
                # turning every value into NaN.
                gaes = (gaes - gaes.mean()) / (gaes.std() + 1e-8)

                PPO.assign_policy_parameters()

                inp = [observations, actions, rewards, v_preds_next, gaes]

                # Train
                for epoch in range(4):
                    # indices are in [low, high), drawn with replacement
                    sample_indices = np.random.randint(
                        low=0, high=observations.shape[0], size=64)
                    # sample training data
                    sampled_inp = [
                        np.take(a=a, indices=sample_indices, axis=0)
                        for a in inp
                    ]
                    PPO.train(obs=sampled_inp[0],
                              actions=sampled_inp[1],
                              rewards=sampled_inp[2],
                              v_preds_next=sampled_inp[3],
                              gaes=sampled_inp[4])

                summary = PPO.get_summary(obs=inp[0],
                                          actions=inp[1],
                                          rewards=inp[2],
                                          v_preds_next=inp[3],
                                          gaes=inp[4])[0]
                writer.add_summary(summary, episode)
        finally:
            # Flush and close the event file even if an episode raised.
            writer.close()