示例#1
0
def main():
    """Replay a saved PPO policy on CartPole-v1 and log per-episode statistics.

    Restores the checkpoint ``model/model.ckpt``, runs the policy with
    ``stochastic=False`` for at most ``ITERATION`` episodes and writes the
    episode length, the summed episode reward and the PPO summary to
    ``./log/test``.  The run ends early once 100 consecutive episodes reach
    a summed reward of at least 195.
    """
    env = gym.make('CartPole-v1')
    env.seed(0)
    obs_space = env.observation_space
    policy = Policy_net('policy', env)
    old_policy = Policy_net('old_policy', env)
    ppo = PPOTrain(policy, old_policy, gamma=GAMMA)
    saver = tf.train.Saver()

    def scalar_summary(tag, value):
        """Wrap a single number in a TensorBoard summary proto."""
        return tf.Summary(
            value=[tf.Summary.Value(tag=tag, simple_value=value)])

    with tf.Session() as sess:
        writer = tf.summary.FileWriter('./log/test', sess.graph)
        sess.run(tf.global_variables_initializer())
        saver.restore(sess, 'model/model.ckpt')

        obs = env.reset()
        reward = 0  # reward recorded alongside the very first step
        streak = 0  # consecutive episodes that met the reward threshold

        for iteration in range(ITERATION):  # one pass == one episode
            obs_buf, act_buf, value_buf, reward_buf = [], [], [], []
            steps = 0
            env.render()

            done = False
            while not done:
                steps += 1
                # Add a leading batch axis so the observation can be fed to
                # the policy's placeholder.
                obs = np.stack([obs]).astype(np.float32)
                act, v_pred = policy.act(obs=obs, stochastic=False)
                act, v_pred = act.item(), v_pred.item()

                obs_buf.append(obs)
                act_buf.append(act)
                value_buf.append(v_pred)
                # The stored reward is the one obtained by the previous step.
                reward_buf.append(reward)

                next_obs, reward, done, _ = env.step(act)
                if not done:
                    obs = next_obs

            # Episode finished: the state after the terminal one is worth 0.
            next_values = value_buf[1:] + [0]
            obs = env.reset()
            reward = -1

            episode_reward = sum(reward_buf)
            writer.add_summary(scalar_summary('episode_length', steps),
                               iteration)
            writer.add_summary(scalar_summary('episode_reward',
                                              episode_reward), iteration)

            # End condition of the test run.
            if episode_reward >= 195:
                streak += 1
                if streak >= 100:
                    print('Iteration: ', iteration)
                    print('Clear!!')
                    break
            else:
                streak = 0

            gaes = ppo.get_gaes(rewards=reward_buf,
                                v_preds=value_buf,
                                v_preds_next=next_values)

            # Turn the per-step lists into arrays for the placeholders.
            obs_arr = np.reshape(obs_buf, [-1] + list(obs_space.shape))
            act_arr = np.array(act_buf).astype(np.int32)
            reward_arr = np.array(reward_buf).astype(np.float32)
            next_value_arr = np.array(next_values).astype(np.float32)
            gaes = np.array(gaes).astype(np.float32)
            gae_mean = gaes.mean()
            gae_std = gaes.std()
            gaes = (gaes - gae_mean) / gae_std

            summary = ppo.get_summary(obs=obs_arr,
                                      actions=act_arr,
                                      rewards=reward_arr,
                                      v_preds_next=next_value_arr,
                                      gaes=gaes)[0]
            writer.add_summary(summary, iteration)

        writer.close()
示例#2
0
def main():
    """Train a PPO policy while driving the cart-pole rig over a serial port.

    Each policy action moves an index one step up or down through
    ``pwm_list``; the selected direction is sent with ``PD.writePWM`` and the
    pendulum/cart encoders are read back to build the next observation and
    the reward.  An episode ends when the pendulum angle exceeds
    ``angle_thres_deg`` degrees.  After every episode the policy is trained
    for four mini-batch epochs, and a checkpoint is written every tenth
    iteration.

    NOTE(review): when ``SERIAL_AVAILABLE`` is false, ``ser``, ``angle_deg``,
    ``angle_velocity`` and ``cart_velocity`` are never assigned, so the loop
    below cannot run without the hardware -- confirm whether a simulated
    fallback was intended.
    """
    angle = 0.0
    angle_thres_deg = 15
    cart = 0.0
    t.tic()
    pwm_index = 1
    pwm_list = [("L", 180), ("L", 0), ("R", 180)]
    pwm_list_size = len(pwm_list)
    # Serial port for Arduino
    if SERIAL_AVAILABLE:
        ser = serial.Serial('COM20', 115200)  # Initialize serial port
        print("connected to: " + ser.portstr)  # Confirm connection

    env = gym.make('CartPole-v0')
    env.seed(0)
    ob_space = env.observation_space
    Policy = Policy_net('policy', env)
    Old_Policy = Policy_net('old_policy', env)
    PPO = PPOTrain(Policy, Old_Policy, gamma=GAMMA)
    saver = tf.train.Saver()

    with tf.Session() as sess:
        if LOAD:
            saver.restore(
                sess, "./model/model_iter_{:d}_rewards_{:d}.ckpt".format(
                    load_iteration, load_rewards))
        else:
            sess.run(tf.global_variables_initializer())

        writer = tf.summary.FileWriter('./log/train', sess.graph)
        try:
            obs = env.reset()
            reward = 0
            success_num = 0

            for iteration in range(ITERATION):  # episode
                observations = []
                actions = []
                v_preds = []
                rewards = []
                run_policy_steps = 0
                while True:
                    run_policy_steps += 1
                    # prepare to feed placeholder Policy.obs
                    obs = np.stack([obs]).astype(dtype=np.float32)
                    act, v_pred = Policy.act(obs=obs, stochastic=True)

                    # ndarray.item() replaces np.asscalar, which was removed
                    # from NumPy.
                    act = act.item()
                    v_pred = v_pred.item()

                    observations.append(obs)
                    actions.append(act)
                    v_preds.append(v_pred)
                    rewards.append(reward)

                    # Action 1 steps the PWM index up, anything else steps it
                    # down; the index is clamped to the list bounds.
                    if act == 1:
                        if pwm_index < pwm_list_size - 1:
                            pwm_index += 1
                    else:
                        if pwm_index > 0:
                            pwm_index -= 1

                    direction = pwm_list[pwm_index][0]
                    pwm = pwm_list[pwm_index][1]
                    print(direction)
                    print(pwm)

                    if SERIAL_AVAILABLE:
                        # NOTE(review): the duty value is hard-coded to 180 and
                        # the selected ``pwm`` is only printed -- confirm this
                        # is intentional.
                        PD.writePWM(ser, 180, direction)

                        last_angle = angle
                        # convert encoder counts (1200) to degrees
                        angle_deg = PD.getPEncoderPos(ser) * 360 / 1200
                        # convert degrees to radians
                        angle = angle_deg * 2 * math.pi / 360

                        angle_velocity = (angle - last_angle) / t.tocvalue()

                        last_cart = cart
                        cart = PD.getMEncoderPos(ser)
                        cart_velocity = (cart - last_cart) / t.tocvalue()

                        t.tic()

                    # NOTE(review): the "+ 6" sits outside the first product
                    # while the "+ 5" sits inside the second -- verify the
                    # intended shaping.  Arithmetic is kept exactly as it was.
                    angle_term = (.9 / 7) * min(6 - abs(angle_deg), 1) + 6
                    cart_term = (0.1 / 6) * (min(5 - abs(cart / 1000), 1) + 5)
                    reward = angle_term + cart_term

                    next_obs = [angle, angle_velocity, cart, cart_velocity]
                    print("x: ", PD.getMEncoderPos(ser))
                    if abs(angle_deg) > angle_thres_deg:
                        # next state of terminate state has 0 state value
                        v_preds_next = v_preds[1:] + [0]
                        print("reward: ", sum(rewards))
                        obs = env.reset()
                        reward = -1
                        print("Iteration: ", iteration)
                        print('Waiting to reset')
                        PD.writePWM(ser, 0, direction)
                        if iteration % 10 == 0:
                            # sum(rewards) is a float once the episode has more
                            # than one step, and "{:d}" rejects floats, so the
                            # total is truncated to an int for the file name
                            # (matching the int-formatted name used on load).
                            saver.save(
                                sess,
                                "./model/model_iter_{:d}_rewards_{:d}.ckpt".
                                format(iteration, int(sum(rewards))))
                            print('Scoot scoot!! Model saved.')
                        # Block until the pendulum is back within 1.5 degrees
                        # of its zero position before the next episode.
                        while angle_deg > 1.5 or angle_deg < -1.5:
                            time.sleep(0.1)
                            angle_deg = PD.getPEncoderPos(ser) * 360 / 1200
                        print('Entered iteration {:1f}'.format(iteration + 1))
                        break
                    else:
                        obs = next_obs

                writer.add_summary(
                    tf.Summary(value=[
                        tf.Summary.Value(tag='episode_length',
                                         simple_value=run_policy_steps)
                    ]), iteration)
                writer.add_summary(
                    tf.Summary(value=[
                        tf.Summary.Value(tag='episode_reward',
                                         simple_value=sum(rewards))
                    ]), iteration)

                if sum(rewards) >= 195:
                    success_num += 1
                    if success_num >= 100:
                        saver.save(sess, './model/model.ckpt')
                        print('Clear!! Model saved.')
                        break
                else:
                    success_num = 0

                gaes = PPO.get_gaes(rewards=rewards,
                                    v_preds=v_preds,
                                    v_preds_next=v_preds_next)

                # convert list to numpy array for feeding tf.placeholder
                observations = np.reshape(observations,
                                          newshape=[-1] + list(ob_space.shape))
                actions = np.array(actions).astype(dtype=np.int32)
                rewards = np.array(rewards).astype(dtype=np.float32)
                v_preds_next = np.array(v_preds_next).astype(dtype=np.float32)
                gaes = np.array(gaes).astype(dtype=np.float32)
                if iteration > 0:
                    gaes = (gaes - gaes.mean()) / gaes.std()

                PPO.assign_policy_parameters()

                inp = [observations, actions, rewards, v_preds_next, gaes]

                # train
                for epoch in range(4):
                    # indices are in [low, high)
                    sample_indices = np.random.randint(
                        low=0, high=observations.shape[0], size=64)
                    # sample training data
                    sampled_inp = [
                        np.take(a=a, indices=sample_indices, axis=0)
                        for a in inp
                    ]
                    PPO.train(obs=sampled_inp[0],
                              actions=sampled_inp[1],
                              rewards=sampled_inp[2],
                              v_preds_next=sampled_inp[3],
                              gaes=sampled_inp[4])

                summary = PPO.get_summary(obs=inp[0],
                                          actions=inp[1],
                                          rewards=inp[2],
                                          v_preds_next=inp[3],
                                          gaes=inp[4])[0]

                writer.add_summary(summary, iteration)
        finally:
            # Release the log writer and the serial port even when the loop
            # is interrupted or raises.
            writer.close()
            if SERIAL_AVAILABLE:
                ser.close()
示例#3
0
def main():
    """Train PPO on CartPole-v0 and record each episode's reward in a CSV file.

    A line-buffered CSV named ``data/Model_Noise_<HH_MM_SS>.csv`` receives one
    ``iteration,reward`` row per episode; TensorBoard summaries go to
    ``./log/train``.  Every 500th iteration is rendered.  Training stops
    after ``ITERATION`` episodes, or earlier once 100 consecutive episodes
    reach a summed reward of at least 195, in which case the model is saved
    to ``./model/model.ckpt``.
    """
    env = gym.make('CartPole-v0')
    env.seed(0)
    ob_space = env.observation_space
    Policy = Policy_net('policy', env)
    Old_Policy = Policy_net('old_policy', env)
    PPO = PPOTrain(Policy, Old_Policy, gamma=GAMMA)
    saver = tf.train.Saver()
    name = 'Model_Noise'
    filename = "data/{n}_{ts:%H_%M_%S}.csv".format(n=name, ts=datetime.now())
    with open(filename, "w", 1) as result:
        result.write("Iteration, Reward \n")

        with tf.Session() as sess:
            writer = tf.summary.FileWriter('./log/train', sess.graph)
            try:
                sess.run(tf.global_variables_initializer())
                obs = env.reset()
                reward = 0
                success_num = 0

                for iteration in range(ITERATION):  # episode
                    observations = []
                    actions = []
                    v_preds = []
                    rewards = []
                    run_policy_steps = 0
                    while True:
                        if iteration % 500 == 0:
                            env.render()
                        run_policy_steps += 1
                        # prepare to feed placeholder Policy.obs
                        obs = np.stack([obs]).astype(dtype=np.float32)
                        act, v_pred = Policy.act(obs=obs, stochastic=True)

                        # ndarray.item() replaces np.asscalar, which was
                        # removed from NumPy.
                        act = act.item()
                        v_pred = v_pred.item()

                        observations.append(obs)
                        actions.append(act)
                        v_preds.append(v_pred)
                        rewards.append(reward)

                        next_obs, reward, done, info = env.step(act)

                        if done:
                            # next state of terminate state has 0 state value
                            v_preds_next = v_preds[1:] + [0]
                            obs = env.reset()
                            reward = -1
                            break
                        else:
                            obs = next_obs

                    writer.add_summary(
                        tf.Summary(value=[
                            tf.Summary.Value(tag='episode_length',
                                             simple_value=run_policy_steps)
                        ]), iteration)
                    writer.add_summary(
                        tf.Summary(value=[
                            tf.Summary.Value(tag='episode_reward',
                                             simple_value=sum(rewards))
                        ]), iteration)

                    result.write("{:d},{:2f}\n".format(iteration,
                                                       sum(rewards)))
                    print("Rewards: {:2f}, Iterations: {:d}".format(
                        sum(rewards), iteration))
                    if sum(rewards) >= 195:
                        success_num += 1
                        if success_num >= 100:
                            saver.save(sess, './model/model.ckpt')
                            print('Clear!! Model saved.')
                            break
                    else:
                        success_num = 0

                    gaes = PPO.get_gaes(rewards=rewards,
                                        v_preds=v_preds,
                                        v_preds_next=v_preds_next)

                    # convert list to numpy array for feeding tf.placeholder
                    observations = np.reshape(
                        observations, newshape=[-1] + list(ob_space.shape))
                    actions = np.array(actions).astype(dtype=np.int32)
                    rewards = np.array(rewards).astype(dtype=np.float32)
                    v_preds_next = np.array(v_preds_next).astype(
                        dtype=np.float32)
                    gaes = np.array(gaes).astype(dtype=np.float32)
                    gaes = (gaes - gaes.mean()) / gaes.std()

                    PPO.assign_policy_parameters()

                    inp = [observations, actions, rewards, v_preds_next, gaes]

                    # train
                    for epoch in range(4):
                        # indices are in [low, high)
                        sample_indices = np.random.randint(
                            low=0, high=observations.shape[0], size=64)
                        # sample training data
                        sampled_inp = [
                            np.take(a=a, indices=sample_indices, axis=0)
                            for a in inp
                        ]
                        PPO.train(obs=sampled_inp[0],
                                  actions=sampled_inp[1],
                                  rewards=sampled_inp[2],
                                  v_preds_next=sampled_inp[3],
                                  gaes=sampled_inp[4])

                    summary = PPO.get_summary(obs=inp[0],
                                              actions=inp[1],
                                              rewards=inp[2],
                                              v_preds_next=inp[3],
                                              gaes=inp[4])[0]

                    writer.add_summary(summary, iteration)
            finally:
                writer.close()
                # Always release the environment.  The previous check on the
                # leftover loop variable (iteration % 500) skipped the close
                # for most final iterations and failed when the loop body
                # never ran.
                env.close()
示例#4
0
文件: main.py 项目: BatBate/Meta-RL
def main():
    """Train a GRU-based PPO policy on a five-armed Bernoulli bandit.

    Runs up to ``ITERATION`` episodes of ``BernoulliBanditEnv``, training the
    policy for four mini-batch epochs after each episode and logging
    summaries to ``./log/train``.  Stops early, saving the model to
    ``./model/model.ckpt``, after 100 consecutive episodes whose summed
    reward is at least 195.
    """
    num_arm = 5
    env = BernoulliBanditEnv(num_arm)
    env.seed(0)
    ob_space = env.observation_space
    Policy = PolicyGRUNet('policy', env)
    Old_Policy = PolicyGRUNet('old_policy', env)
    PPO = PPOTrain(Policy, Old_Policy, gamma=GAMMA)
    saver = tf.train.Saver()

    with tf.Session() as sess:
        writer = tf.summary.FileWriter('./log/train', sess.graph)
        try:
            sess.run(tf.global_variables_initializer())
            obs = env.reset()
            reward = 0
            success_num = 0

            for iteration in range(ITERATION):  # episode
                observations = []
                actions = []
                v_preds = []
                rewards = []
                run_policy_steps = 0
                while True:
                    run_policy_steps += 1
                    # prepare to feed placeholder Policy.obs
                    obs = np.stack([obs]).astype(dtype=np.float32)
                    act, v_pred = Policy.act(obs=obs, stochastic=True)

                    # ndarray.item() replaces np.asscalar, which was removed
                    # from NumPy.
                    act = act.item()
                    v_pred = v_pred.item()

                    observations.append(obs)
                    actions.append(act)
                    v_preds.append(v_pred)
                    rewards.append(reward)
                    # NOTE(review): an out-of-range action is swapped for a
                    # random arm only after it was recorded above, so the
                    # stored action can differ from the one executed --
                    # confirm this is intended.
                    if act not in range(num_arm):
                        act = np.random.randint(num_arm)

                    next_obs, reward, done, info = env.step(act)

                    if done:
                        # next state of terminate state has 0 state value
                        v_preds_next = v_preds[1:] + [0]
                        obs = env.reset()
                        reward = -1
                        break
                    else:
                        obs = next_obs

                writer.add_summary(
                    tf.Summary(value=[
                        tf.Summary.Value(tag='episode_length',
                                         simple_value=run_policy_steps)
                    ]), iteration)
                writer.add_summary(
                    tf.Summary(value=[
                        tf.Summary.Value(tag='episode_reward',
                                         simple_value=sum(rewards))
                    ]), iteration)

                if sum(rewards) >= 195:
                    success_num += 1
                    if success_num >= 100:
                        saver.save(sess, './model/model.ckpt')
                        print('Clear!! Model saved.')
                        break
                else:
                    success_num = 0

                gaes = PPO.get_gaes(rewards=rewards,
                                    v_preds=v_preds,
                                    v_preds_next=v_preds_next)

                # convert list to numpy array for feeding tf.placeholder
                observations = np.reshape(observations,
                                          newshape=[-1] + list(ob_space.shape))
                actions = np.array(actions).astype(dtype=np.int32)
                rewards = np.array(rewards).astype(dtype=np.float32)
                v_preds_next = np.array(v_preds_next).astype(dtype=np.float32)
                gaes = np.array(gaes).astype(dtype=np.float32)
                gaes = (gaes - gaes.mean()) / gaes.std()

                PPO.assign_policy_parameters()

                inp = [observations, actions, rewards, v_preds_next, gaes]

                # train
                for epoch in range(4):
                    # indices are in [low, high)
                    sample_indices = np.random.randint(
                        low=0, high=observations.shape[0], size=64)
                    # sample training data
                    sampled_inp = [
                        np.take(a=a, indices=sample_indices, axis=0)
                        for a in inp
                    ]
                    PPO.train(obs=sampled_inp[0],
                              actions=sampled_inp[1],
                              rewards=sampled_inp[2],
                              v_preds_next=sampled_inp[3],
                              gaes=sampled_inp[4])

                summary = PPO.get_summary(obs=inp[0],
                                          actions=inp[1],
                                          rewards=inp[2],
                                          v_preds_next=inp[3],
                                          gaes=inp[4])[0]

                writer.add_summary(summary, iteration)
        finally:
            # Flush and close the event file even if training is interrupted.
            writer.close()
示例#5
0
    def __init__(self):
        """Start the ROS node, build the PPO networks and run the training loop.

        Registers the gripper-status subscriber and the start-learning
        service, creates proxies for the observation / drop / move / reset
        services, then loops until ROS shuts down.  While ``self.stLearning``
        is set, each pass runs one episode on the gripper and trains the
        policy for four mini-batch epochs.  The loop ends after
        ``self.max_episodes`` episodes, or after 100 consecutive episodes
        whose summed reward reaches ``self.stop_bound`` (the model is saved
        in that case).

        Reads ``self.n_inputs``, ``self.n_outputs``, ``self.g``, ``self.A``,
        ``self.max_steps``, ``self.stop_bound`` and ``self.max_episodes``,
        which must already be available on the instance (presumably class
        attributes -- they are not set here).
        """
        rospy.init_node('runPPO', anonymous=True)

        Policy = Policy_net('policy', self.n_inputs, self.n_outputs)
        Old_Policy = Policy_net('old_policy', self.n_inputs, self.n_outputs)
        PPO = PPOTrain(Policy, Old_Policy, gamma=GAMMA, c_2=0.1)
        saver = tf.train.Saver()

        rospy.Subscriber('/RL/gripper_status', String,
                         self.callbackGripperStatus)
        rospy.Service('/RL/start_learning', Empty, self.start_learning)
        obs_srv = rospy.ServiceProxy('/RL/observation', observation)
        drop_srv = rospy.ServiceProxy('/RL/IsObjDropped', IsDropped)
        move_srv = rospy.ServiceProxy('/RL/MoveGripper', TargetAngles)
        reset_srv = rospy.ServiceProxy('/RL/ResetGripper', Empty)
        pub_goal = rospy.Publisher('/RL/Goal',
                                   Float32MultiArray,
                                   queue_size=10)

        gg = Float32MultiArray()
        gg.data = self.g

        with tf.Session() as sess:
            # $ tensorboard --logdir=logs
            # http://0.0.0.0:6006/
            writer = tf.summary.FileWriter(
                '/home/pracsys/catkin_ws/src/rutgers_collab/src/rl_pkg/src/PPO/log/train',
                sess.graph)

            try:
                sess.run(tf.global_variables_initializer())
                success_num = 0

                episode_count = 0
                rate = rospy.Rate(100)  # 100hz
                while not rospy.is_shutdown():

                    if self.stLearning:
                        ## Start episode ##
                        episode_count += 1

                        # Reset gripper
                        reset_srv()
                        while not self.gripper_closed:
                            rate.sleep()

                        # Get observation
                        obs = np.array(obs_srv().state)
                        # Fallback for transition_reward() when the very
                        # first move of the episode fails; without it
                        # next_obs would be unbound (or left over from the
                        # previous episode).
                        next_obs = obs
                        self.prev_dis2goal = np.linalg.norm(self.g - obs[:2])

                        observations = []
                        actions = []
                        v_preds = []
                        rewards = []
                        run_policy_steps = 0
                        while True:
                            run_policy_steps += 1
                            print(
                                '[RL] Step %d in episode %d, distance to goal: %f.'
                                % (run_policy_steps, episode_count,
                                   self.prev_dis2goal))

                            pub_goal.publish(gg)

                            # prepare to feed placeholder Policy.obs
                            obs = np.stack([obs]).astype(dtype=np.float32)

                            # Resample until the action index is below 8
                            # (presumably the number of entries in self.A --
                            # TODO confirm).
                            while True:
                                act, v_pred = Policy.act(obs=obs,
                                                         stochastic=True)
                                # ndarray.item() replaces np.asscalar, which
                                # was removed from NumPy.
                                act = act.item()
                                v_pred = v_pred.item()
                                if act < 8:
                                    break

                            # Act
                            suc = move_srv(self.A[act])
                            rospy.sleep(0.05)
                            rate.sleep()

                            if suc:
                                # Get observation
                                next_obs = np.array(obs_srv().state)
                                # Check if dropped - end of episode
                                fail = drop_srv().dropped
                            else:
                                # End episode if overload or angle limits reached
                                rospy.logerr(
                                    '[RL] Failed to move gripper. Episode declared failed.'
                                )
                                fail = True

                            reward, done = self.transition_reward(
                                next_obs, fail)

                            observations.append(obs)
                            actions.append(act)
                            v_preds.append(v_pred)
                            # Reward of the action that was just executed.
                            rewards.append(reward)

                            print(
                                '[RL] Action %d yielded reward %f and position (%f,%f).'
                                % (act, reward, obs[0][0], obs[0][1]))

                            if run_policy_steps > self.max_steps:
                                done = True

                            if done:
                                # The state after the terminal one has value
                                # 0, so a zero is appended to the shifted
                                # value predictions.
                                v_preds_next = v_preds[1:] + [0]
                                break
                            else:
                                obs = next_obs

                            rate.sleep()

                        print('episode_length', run_policy_steps,
                              'episode_reward', sum(rewards))

                        writer.add_summary(
                            tf.Summary(value=[
                                tf.Summary.Value(tag='episode_length',
                                                 simple_value=run_policy_steps)
                            ]), episode_count)
                        writer.add_summary(
                            tf.Summary(value=[
                                tf.Summary.Value(tag='episode_reward',
                                                 simple_value=sum(rewards))
                            ]), episode_count)

                        if sum(rewards) >= self.stop_bound:
                            success_num += 1
                            if success_num >= 100:
                                saver.save(
                                    sess,
                                    '/home/pracsys/catkin_ws/src/rutgers_collab/src/rl_pkg/logs/model_ppo.ckpt'
                                )
                                print('Clear!! Model saved.')
                                break
                        else:
                            success_num = 0

                        gaes = PPO.get_gaes(rewards=rewards,
                                            v_preds=v_preds,
                                            v_preds_next=v_preds_next)

                        # convert list to numpy array for feeding tf.placeholder
                        observations = np.reshape(observations,
                                                  newshape=[-1] + list(
                                                      (self.n_inputs, )))
                        actions = np.array(actions).astype(dtype=np.int32)
                        rewards = np.array(rewards).astype(dtype=np.float32)
                        v_preds_next = np.array(v_preds_next).astype(
                            dtype=np.float32)
                        gaes = np.array(gaes).astype(dtype=np.float32)
                        gaes = (gaes - gaes.mean()) / gaes.std()

                        PPO.assign_policy_parameters()

                        inp = [
                            observations, actions, rewards, v_preds_next, gaes
                        ]

                        # train
                        for epoch in range(4):
                            # indices are in [low, high)
                            sample_indices = np.random.randint(
                                low=0, high=observations.shape[0], size=64)
                            # sample training data
                            sampled_inp = [
                                np.take(a=a, indices=sample_indices, axis=0)
                                for a in inp
                            ]
                            PPO.train(obs=sampled_inp[0],
                                      actions=sampled_inp[1],
                                      rewards=sampled_inp[2],
                                      v_preds_next=sampled_inp[3],
                                      gaes=sampled_inp[4])

                        summary = PPO.get_summary(obs=inp[0],
                                                  actions=inp[1],
                                                  rewards=inp[2],
                                                  v_preds_next=inp[3],
                                                  gaes=inp[4])[0]

                        writer.add_summary(summary, episode_count)

                    if episode_count > self.max_episodes:
                        break

                    rate.sleep()
            finally:
                # Flush and close the event file even if a service call or
                # training step raises.
                writer.close()
示例#6
0
def main():
    """Train a PPO agent on the gym environment named by ``ENV``.

    Runs up to ``EPISODES`` episodes.  After each episode the collected
    trajectory is turned into generalized advantage estimates and the policy
    is trained for four epochs on random mini-batches of 64 samples.
    Summaries are written to ``./log/train``.  Training stops early, saving
    the model to ``./model/model.ckpt``, after 100 consecutive episodes whose
    summed reward is at least 195.
    """
    env = gym.make(ENV)  # instantiate the environment
    env.seed(0)
    ob_space = env.observation_space  # describes the valid observations
    policy_net = Policy_net('policy', env)  # current policy network
    old_policy_net = Policy_net('old_policy', env)  # previous policy network
    trainer = PPOTrain(policy_net, old_policy_net, gamma=GAMMA)
    saver = tf.train.Saver()

    with tf.Session() as sess:
        writer = tf.summary.FileWriter('./log/train', sess.graph)  # log dir
        sess.run(tf.global_variables_initializer())  # initialise the networks

        obs = env.reset()  # reset the environment, get the first observation
        reward = 0  # reward stored with the first recorded step
        consecutive_wins = 0  # success counter

        for episode in range(EPISODES):
            observations, actions, v_preds, rewards = [], [], [], []
            step_count = 0  # number of steps taken in this episode
            env.render()

            while True:
                step_count += 1
                # Add a batch axis so the observation fits the placeholder
                # Policy.obs.
                obs = np.stack([obs]).astype(np.float32)
                # Run the network to get an action and the predicted value.
                act, v_pred = policy_net.act(obs=obs, stochastic=True)

                # Convert the numpy results into Python scalars.
                act = act.item()
                v_pred = v_pred.item()

                observations.append(obs)
                actions.append(act)
                v_preds.append(v_pred)
                rewards.append(reward)

                # Send the action to the environment; receive the next
                # observation, the reward and whether the episode ended.
                next_obs, reward, done, _ = env.step(act)

                if not done:
                    obs = next_obs
                    continue

                # v_preds[1:] drops the first element and + [0] appends a
                # zero: the state after the terminal state has value 0.
                v_preds_next = v_preds[1:] + [0]
                obs = env.reset()
                reward = -1
                break

            # Log the episode statistics for TensorBoard.
            episode_stats = (('episode_length', step_count),
                             ('episode_reward', sum(rewards)))
            for tag, value in episode_stats:
                writer.add_summary(
                    tf.Summary(value=[
                        tf.Summary.Value(tag=tag, simple_value=value)
                    ]), episode)

            # Stop once 100 consecutive episodes reach a reward of 195.
            if sum(rewards) >= 195:
                consecutive_wins += 1
                if consecutive_wins >= 100:
                    saver.save(sess, './model/model.ckpt')
                    print('Clear!! Model saved.')
                    break
            else:
                consecutive_wins = 0

            # Print the episode number and its reward.
            print("EP: ", episode, " Rw: ", sum(rewards))

            gaes = trainer.get_gaes(rewards=rewards,
                                    v_preds=v_preds,
                                    v_preds_next=v_preds_next)

            # Convert the lists to numpy arrays for the tf.placeholders.
            # Each stored observation is its own array; after the reshape
            # they form a single array with one row per step.
            target_shape = [-1] + list(ob_space.shape)
            observations = np.reshape(observations, target_shape)
            actions = np.array(actions).astype(np.int32)
            rewards = np.array(rewards).astype(np.float32)
            v_preds_next = np.array(v_preds_next).astype(np.float32)
            gaes = np.array(gaes).astype(np.float32)
            # Subtract the mean and divide by the standard deviation.
            gae_mean = gaes.mean()
            gae_std = gaes.std()
            gaes = (gaes - gae_mean) / gae_std

            trainer.assign_policy_parameters()

            # Columns of the training data, in the order used below.
            inp = [observations, actions, rewards, v_preds_next, gaes]

            # Train
            for epoch in range(4):
                # Indices are drawn from [low, high).
                sample_indices = np.random.randint(
                    low=0, high=observations.shape[0], size=64)
                # Sample the training data.
                batch_obs, batch_act, batch_rew, batch_next, batch_gae = [
                    np.take(column, sample_indices, axis=0) for column in inp
                ]
                trainer.train(obs=batch_obs,
                              actions=batch_act,
                              rewards=batch_rew,
                              v_preds_next=batch_next,
                              gaes=batch_gae)

            summary = trainer.get_summary(obs=observations,
                                          actions=actions,
                                          rewards=rewards,
                                          v_preds_next=v_preds_next,
                                          gaes=gaes)[0]

            writer.add_summary(summary, episode)

        writer.close()  # all episodes finished