Python StartOpenAI_ROS_Environment.reset 예제들, openai_ros.openai_ros_common.StartOpenAI_ROS_Environment.reset Python 예제들

예제 #1

0

파일 보기

def main():
    map_dir = "/home/i2rlab/shahil_files/shahil_RL_ws_new/src/turtlebot/turtlebot_gazebo/worlds/"
    #os.rename(map_dir+"house_train.world",map_dir+"maze.world")
    rospy.init_node('turtlebot2_human', anonymous=True, log_level=rospy.WARN)
    env = StartOpenAI_ROS_Environment(
            'MyTurtleBot2HumanModel-v1')
    #os.rename(map_dir+"maze.world",map_dir+"house_train.world")
    s_dim = env.observation_space.shape[0]+4
    a_dim = env.action_space.shape[0]
    a_bound = env.action_space.high
    #print('a_bound test', a_bound)
    sign_talker = rospy.Publisher('/sign', Int64, queue_size=1)
    ####################### load agent #######################################
    #TD3 = td3(a_dim, s_dim, a_bound, GAMMA, TAU, MEMORY_CAPACITY, BATCH_SIZE, LR_A, LR_C)
    #TD3.loader()

    d = [0.5,1,0.5,1]+list(6*np.ones([180]))
    #A = np.array([[1,0,0,0],[0,1,0,0],[0,0,1,0],[0,0,0,1]])
    #TGREEN =  '\033[32m' # Green Text
    #ENDC = '\033[m' # reset to the defaults

    #################### Start ####################
    i=0
    while i <2:
        s = env.reset()
        #print("env printing: ",env)
        t1 = time.time()
        min_distance_ob = s[-2]
        theta_bt_uh_ob = s[-1]
        #a_old = A[0]
        a_old = np.array([1,0,0,0])
        s = np.hstack((a_old,np.array(s[3:-2])/d))
        for j in range(MAX_EP_STEPS):
            ################ Mode #########################################
            #if i%2==0:
                #a = TD3.choose_action(s,1)
                #a = [1,0,0,0]
                #if a[1]<=0.9 and min_distance_ob<=0.7 and abs(theta_bt_uh_ob)<1.3:
                    #print (TGREEN + "Going to Crash!" , ENDC)
                    #a = A[1]
            '''else:
                a = A[0]'''

            #x_test = env.state_msg.pose.position.x
            #print('x_test',env.collect_data)

            a = np.array([1,0,0,0])
            s_, r, done, info = env.step(a)
            sign_talker.publish(int(r))
            if r <-200:
                done = 0
            min_distance_ob = s_[-2]
            theta_bt_uh_ob = s_[-1]
            a_old = a
            s=np.hstack((a_old,np.array(s_[3:-2])/d))
            if done or j == MAX_EP_STEPS-1:
                t = time.time()-t1
                if j>3:
                    i=i+1
                break

예제 #2

0

파일 보기

파일: ddqn_test.py 프로젝트: hgaren/aliengo_reinforcement_learning

    def test(self):
        # Initialize OpenAI_ROS ENV
        LoadYamlFileParamsTest(rospackage_name="learning_ros", rel_path_from_package_to_file="config", yaml_file_name="aliengo_stand.yaml")
        env = StartOpenAI_ROS_Environment('AliengoStand-v0')
        saver = tf.train.Saver()

        time.sleep(3)
        with tf.Session() as sess:
            sess.run(tf.global_variables_initializer())
            saver.restore(sess, 'model_test/model_ddqn.ckpt')
            scores = []
            success_num = 0

            for e in range(self.EPISODES):
                state = env.reset()
                state = np.reshape(state, [1, self.state_size])
                done = False
                i = 0
                rewards = []
                while  not rospy.is_shutdown():  # until ros is not shutdown                #self.env.render()
                    # height prob is chosen as action
                    action = np.argmax(self.model.predict(state))
                    next_state, reward, done, _ = env.step(action)
                    time.sleep(0.01)

                    next_state = np.reshape(next_state, [1, self.state_size])
                    rewards.append(reward)
                    self.remember(state, action, reward, next_state, done)
                    
                    i += 1
                    if done:
                        # every step update target model
                        self.update_target_model()
                        

                        state = env.reset()
                        break 
                    else:
                        state = next_state   
                    self.replay()
                scores.append(sum(rewards))
               
        plt.plot(scores)
        plt.show()

예제 #3

0

파일 보기

파일: start_training.py 프로젝트: rynsy/rl_bot

    initial_epsilon = qlearn.epsilon

    start_time = time.time()
    highest_reward = 0

    # Starts the main training loop: the one about the episodes to do
    for x in range(nepisodes):
        rospy.logdebug("############### WALL START EPISODE=>" + str(x))

        cumulated_reward = 0
        done = False
        if qlearn.epsilon > 0.05:
            qlearn.epsilon *= epsilon_discount

        # Initialize the environment and get first state of the robot
        observation = env.reset()
        state = ''.join(map(str, observation))

        # Show on screen the actual situation of the robot
        # env.render()
        # for each episode, we test the robot for nsteps
        for i in range(nsteps):
            rospy.logwarn("############### Start Step=>" + str(i))
            # Pick an action based on the current state
            action = qlearn.chooseAction(state)
            rospy.logwarn("Next action is:%d", action)
            # Execute the action in the environment and get feedback
            observation, reward, done, info = env.step(action)

            rospy.logwarn(str(observation) + " " + str(reward))
            cumulated_reward += reward

예제 #4

0

파일 보기

파일: start_training.py 프로젝트: or-tal-robotics/dql_sumo

    episode_lens = np.zeros(num_episodes)

    epsilon = 1.0
    epsilon_min = 0.1
    epsilon_change = (epsilon - epsilon_min) / 500000

    rospy.init_node('sumo_dqlearn', anonymous=True, log_level=rospy.WARN)

    episode_counter_pub = rospy.Publisher('/episode_counter', Int16)

    # Init OpenAI_ROS ENV
    task_and_robot_environment_name = rospy.get_param(
        '/turtlebot2/task_and_robot_environment_name')
    env = StartOpenAI_ROS_Environment(task_and_robot_environment_name)
    rospy.loginfo("Gym environment done")
    obs = env.reset()
    print obs.shape

    # Create the Gym environment
    rospy.loginfo("Gym environment done")
    rospy.loginfo("Starting Learning")

    # Set the logging system
    rospack = rospkg.RosPack()
    pkg_path = rospack.get_path('gazebo_sumo')
    #outdir = pkg_path + '/training_results'
    #env = wrappers.Monitor(env, outdir, force=True)
    rospy.loginfo("Monitor Wrapper started")

    last_time_steps = np.ndarray(0)

예제 #5

0

파일 보기

파일: turtlebot2_wall_random.py 프로젝트: PierreExeter/custom_gym_envs

rospy.init_node('turtlebot2_wall_random', anonymous=True, log_level=rospy.WARN)
task_and_robot_environment_name = rospy.get_param(
    '/turtlebot2/task_and_robot_environment_name')
env = StartOpenAI_ROS_Environment(task_and_robot_environment_name)

print("Environment: ", env)
print("Action space: ", env.action_space)
# print(env.action_space.high)
# print(env.action_space.low)
print("Observation space: ", env.observation_space)
print(env.observation_space.high)
print(env.observation_space.low)

for episode in range(20):

    env.reset()

    for t in range(100):

        action = env.action_space.sample()
        obs, reward, done, info = env.step(action)

        print("episode: ", episode)
        print("timestep: ", t)
        print("obs: ", obs)
        print("action:", action)
        print("reward: ", reward)
        print("done: ", done)
        print("info: ", info)

        if done:

예제 #6

0

파일 보기

파일: deepq.py 프로젝트: eldarsilver/DQN_Pytorch_ROS

    basedirpathlogs = os.path.join(basedir, "logs")
    logdir = os.path.join(basedirpathlogs, datetime.datetime.now().strftime("%Y%m%d-%H%M%S"))
    writer = SummaryWriter(log_dir=logdir)
    #tracedir = "$HOME/python3_ws/src/turtle2_openai_ros_example/src/trace"
    tracedir = os.path.join(basedir, "trace")
    ############################################################################

    print("Target reward: {}".format(env.spec.reward_threshold))
    step_count = 0
    ep_rew_history = []
    i_episode, ep_reward = 0, -float('inf')
    while step_count < num_steps:
        rospy.logdebug("############### START EPISODE=>" + str(i_episode))
        # Initialize the environment and state
        # type(state): <class 'list'>
        state, done = env.reset(), False
        state = [round(num, 1) for num in state]
        list_state = state
        #print("\n type(state): ")
        #print(type(state))
        rospy.logwarn("# state we are => " + str(state))
        state = torch.from_numpy(np.array(state)).float().unsqueeze(0).to(device)
        while not done:
            rospy.logwarn("i_episode: " + str(i_episode))
            rospy.logwarn("step_count: " + str(step_count))
            # Select an action
            eps_greedy_threshold = compute_eps_threshold(step_count, eps_start, eps_end, eps_decay)
            action, policy_act, policy_used = select_action(policy_net, state, device, env, eps_greedy_threshold, n_actions)
            rospy.logwarn("Next action is:%d", action)

            # Perform action in env

예제 #7

0

파일 보기

    def run(self):
        # Initialize OpenAI_ROS ENV
        LoadYamlFileParamsTest(rospackage_name="learning_ros",
                               rel_path_from_package_to_file="config",
                               yaml_file_name="aliengo_stand.yaml")
        env = StartOpenAI_ROS_Environment('AliengoStand-v0')
        saver = tf.train.Saver()

        time.sleep(3)
        with tf.Session() as sess:
            sess.run(tf.global_variables_initializer())
            scores = []
            success_num = 0

            for e in range(self.EPISODES):
                state = env.reset()
                state = np.reshape(state, [1, self.state_size])
                done = False
                rewards = []
                while not rospy.is_shutdown(
                ):  # until ros is not shutdown                #self.env.render()
                    action = self.act(state)
                    next_state, reward, done, _ = env.step(action)
                    time.sleep(0.01)

                    next_state = np.reshape(next_state, [1, self.state_size])
                    rewards.append(reward)
                    self.remember(state, action, reward, next_state, done)

                    if done:
                        # every step update target model
                        self.update_target_model()

                        state = env.reset()
                        break
                    else:
                        state = next_state
                    self.replay()
                    #if consectuvely 10 times has high reward end the training
                scores.append(sum(rewards))
                if sum(rewards) >= 400:
                    success_num += 1
                    print("Succes number: " + str(success_num))
                    if success_num >= 5:  #checkpoint
                        if (ddqn):
                            saver.save(sess, 'model_train/model_ddqn.ckpt')
                        else:
                            saver.save(sess, 'model_train/model_dqn.ckpt')

                        print('Clear!! Model saved.')
                    if success_num >= 10:
                        if (ddqn):
                            saver.save(sess, 'model_train/model_ddqn.ckpt')
                        else:
                            saver.save(sess, 'model_train/model_dqn.ckpt')

                        print('Clear!! Model saved. AND Finished! ')
                        break

                else:
                    success_num = 0
        plt.plot(scores)
        plt.show()

예제 #8

0

파일 보기

파일: agent_human_experiment.py 프로젝트: shahils01/Deep-Reinforcement-Learning-Shared-Control

def main():
    name = input('Please input your exp number:')
    folder = "Human_exp_new_" + str(name)
    os.system("mkdir " + folder)

    ####################### inite environment ########################################
    map_dir = "/home/i2rlab/shahil_files/shahil_RL_ws_new/src/turtlebot/turtlebot_gazebo/worlds/"
    #os.rename(map_dir+"house_3.world",map_dir+"maze.world")
    rospy.init_node('turtlebot2_human', anonymous=True, log_level=rospy.WARN)
    env = StartOpenAI_ROS_Environment('MyTurtleBot2HumanModel-v1')
    #os.rename(map_dir+"maze.world",map_dir+"house_3.world")
    s_dim = env.observation_space.shape[0] + 4
    a_dim = env.action_space.shape[0]
    a_bound = env.action_space.high
    sign_talker = rospy.Publisher('/sign', Int64, queue_size=1)
    ####################### load agent #######################################
    TD3 = td3(a_dim, s_dim, a_bound, GAMMA, TAU, MEMORY_CAPACITY, BATCH_SIZE,
              LR_A, LR_C)
    TD3.loader()
    # ddpg = DDPG(a_dim, s_dim, a_bound, GAMMA, TAU, MEMORY_CAPACITY, BATCH_SIZE, LR_A, LR_C)
    # with ddpg.sess.as_default():
    #     with ddpg.g.as_default():
    #         ddpg.loader()
    d = [0.5, 1, 0.5, 1] + list(6 * np.ones([180]))
    A = np.array([[1, 0, 0, 0], [0, 1, 0, 0], [0, 0, 1, 0], [0, 0, 0, 1]])
    TGREEN = '\033[32m'  # Green Text
    ENDC = '\033[m'  # reset to the defaults

    #################### Initialize Data #################################
    n_h = 0  ############## number of eps of pure human control #################
    n_s_TD3 = 0  ############## number of eps of shared control #################
    # n_s_ddpg = 0

    n_crash_h = 0  ############## number of crash in pure human control #################
    n_crash_s_TD3 = 0  ############## number of crash in shared control #################
    # n_crash_s_ddpg = 0

    n_success_h = 0  ############## number of success of pure human control #################
    n_success_s_TD3 = 0  ############## number of success of shared control #################
    # n_success_s_ddpg = 0

    t_h = 0  ############## total time of pure human control #################
    t_s_TD3 = 0  ############## total time of shared control #################
    # t_s_ddpg = 0
    #################### SART #####################
    SA_h = 0
    SA_s_TD3 = 0
    # SA_s_ddpg = 0
    #################### workload #################
    WL_h = 0
    WL_s_TD3 = 0
    # WL_s_ddpg = 0
    #################### Start ####################
    i = 0
    total_ep = 2
    sequence_ep = [0, 1]
    #np.random.shuffle(sequence_ep)
    pureh_ep = sequence_ep[0:int(total_ep / 2)]
    # ddpg_ep = sequence_ep[int(total_ep/3):int(2*total_ep/3)]
    td3_ep = sequence_ep[int(total_ep / 2):]
    n_d = 0
    while i < total_ep:
        s = env.reset()
        print(s)
        t1 = time.time()
        min_distance_ob = s[-2]
        theta_bt_uh_ob = s[-1]
        a_old = A[0]
        s = np.hstack((a_old, np.array(s[3:-2]) / d))
        done = 0
        while not done:
            ################ Mode #########################################
            if i in td3_ep:
                a = TD3.choose_action(s, 1)
                if a[1] <= 0.9 and min_distance_ob <= 0.7 and abs(
                        theta_bt_uh_ob) < 1.3:
                    print(TGREEN + "Going to Crash!", ENDC)
                    a = A[1]
            elif i in pureh_ep:
                a = A[0]

            s_, r, done, info = env.step(a)
            sign_talker.publish(int(r))
            if r < -400 and done:
                done = 0
                if i in td3_ep:
                    n_crash_s_TD3 += 1
                elif i in pureh_ep:
                    n_crash_h += 1
            print("n_crash_s_TD3:", n_crash_s_TD3)
            print("n_crash_h:", n_crash_h)
            print("i:", i)
            print("n_d:", n_d)
            min_distance_ob = s_[-2]
            theta_bt_uh_ob = s_[-1]
            a_old = a
            s = np.hstack((a_old, np.array(s_[3:-2]) / d))
            if done:
                t = 0
                n_d = n_d + 1
                if n_d % 5 == 0:
                    instability = int(
                        input(
                            'How changeable is the situation (1: stable and straightforward -- 7: changing suddenly):'
                        ))
                    variability = int(
                        input(
                            'How many variables are changing within the situation: (1: very few -- 7: large number):'
                        ))
                    complexity = int(
                        input(
                            'How complicated is the situation (1: simple -- 7: complex):'
                        ))
                    arousal = int(
                        input(
                            'How aroused are you in the situation (1: low degree of alertness -- 7: alert and ready for activity):'
                        ))
                    spare = int(
                        input(
                            'How much mental capacity do you have to spare in the situation (1: Nothing to spare -- 7: sufficient):'
                        ))
                    concentration = int(
                        input(
                            'How much are you concentrating on the situation (1: focus on only one thing -- 7: many aspect):'
                        ))
                    attention = int(
                        input(
                            'How much is your attention divide in the situation (1: focus on only one thing -- 7: many aspect):'
                        ))
                    quantity = int(
                        input(
                            'How much information have your gained about the situation (1: little -- 7: much):'
                        ))
                    quality = int(
                        input(
                            'How good information have you been accessible and usable (1: poor -- 7: good):'
                        ))
                    famlilarity = int(
                        input(
                            'How familar are you with the situation (1: New situation -- 7: a great deal of relevant experience):'
                        ))
                    SA = quantity + quality + famlilarity - (
                        (instability + variability + complexity) -
                        (arousal + spare + concentration + attention))
                    if i in td3_ep:
                        SA_s_TD3 += SA
                        WL_s_TD3 += float(
                            input('Please input your workload (from TLX):'))
                        t_s_TD3 += t
                        n_s_TD3 += 1
                        if r > 500:
                            n_success_s_TD3 += 1
                    elif i in pureh_ep:
                        SA_h += SA
                        WL_h += float(
                            input('Please input your workload (from TLX):'))
                        t_h += t
                        n_h += 1
                        if r > 500:
                            n_success_h += 1
                    i = i + 1
                break

    np.savetxt(
        'data.dat',
        np.array([[
            n_s_TD3, t_s_TD3, n_crash_s_TD3, n_success_s_TD3, SA_s_TD3,
            WL_s_TD3
        ], [n_h, t_h, n_crash_h, n_success_h, SA_h, WL_h]]))
    #,[n_s_ddpg,t_s_ddpg,n_crash_s_ddpg,n_success_s_ddpg, SA_s_ddpg, WL_s_ddpg]]))
    ########### shared_TD3: number of eps, total time, time of crash, time of success, situation awareness, workload ###########
    ########### human: number of eps, total time, time of crash, time of success, situation awareness, workload ###########
    ########### shared_TD3_ddpg: number of eps, total time, time of crash, time of success, situation awareness, workload ###########
    os.system('cp -r data.dat ' + folder + '/')

예제 #9

0

파일 보기

class DQNRobotSolver():
    def __init__(self,
                 environment_name,
                 n_observations,
                 n_actions,
                 n_episodes=1000,
                 n_win_ticks=195,
                 min_episodes=100,
                 max_env_steps=None,
                 gamma=1.0,
                 epsilon=1.0,
                 epsilon_min=0.01,
                 epsilon_log_decay=0.995,
                 alpha=0.01,
                 alpha_decay=0.01,
                 batch_size=64,
                 monitor=False,
                 quiet=False):
        self.memory = deque(maxlen=100000)

        # self.env = gym.make(environment_name)
        self.env = StartOpenAI_ROS_Environment(environment_name)

        if monitor:
            self.env = gym.wrappers.Monitor(self.env,
                                            '../data/cartpole-1',
                                            force=True)

        self.input_dim = n_observations
        self.n_actions = n_actions
        self.gamma = gamma
        self.epsilon = epsilon
        self.epsilon_min = epsilon_min
        self.epsilon_decay = epsilon_log_decay
        self.alpha = alpha
        self.alpha_decay = alpha_decay
        self.n_episodes = n_episodes
        self.n_win_ticks = n_win_ticks
        self.min_episodes = min_episodes
        self.batch_size = batch_size
        self.quiet = quiet
        if max_env_steps is not None:
            self.env._max_episode_steps = max_env_steps

        # Init model
        self.model = Sequential()

        self.model.add(Dense(24, input_dim=self.input_dim, activation='tanh'))
        self.model.add(Dense(48, activation='tanh'))
        self.model.add(Dense(self.n_actions, activation='linear'))
        self.model.compile(loss='mse',
                           optimizer=Adam(lr=self.alpha,
                                          decay=self.alpha_decay))

    def remember(self, state, action, reward, next_state, done):
        self.memory.append((state, action, reward, next_state, done))

    def choose_action(self, state, epsilon):
        return self.env.action_space.sample() if (
            np.random.random() <= epsilon) else np.argmax(
                self.model.predict(state))

    def get_epsilon(self, t):
        return max(
            self.epsilon_min,
            min(self.epsilon, 1.0 - math.log10((t + 1) * self.epsilon_decay)))

    def preprocess_state(self, state):
        return np.reshape(state, [1, self.input_dim])

    def replay(self, batch_size):
        x_batch, y_batch = [], []
        minibatch = random.sample(self.memory, min(len(self.memory),
                                                   batch_size))
        for state, action, reward, next_state, done in minibatch:
            y_target = self.model.predict(state)
            y_target[0][
                action] = reward if done else reward + self.gamma * np.max(
                    self.model.predict(next_state)[0])
            x_batch.append(state[0])
            y_batch.append(y_target[0])

        self.model.fit(np.array(x_batch),
                       np.array(y_batch),
                       batch_size=len(x_batch),
                       verbose=0)
        if self.epsilon > self.epsilon_min:
            self.epsilon *= self.epsilon_decay

    def run(self):

        rate = rospy.Rate(30)

        scores = deque(maxlen=100)

        for e in range(self.n_episodes):

            init_state = self.env.reset()

            state = self.preprocess_state(init_state)
            done = False
            i = 0
            while not done:
                # openai_ros doesnt support render for the moment
                #self.env.render()
                action = self.choose_action(state, self.get_epsilon(e))
                next_state, reward, done, _ = self.env.step(action)
                next_state = self.preprocess_state(next_state)
                self.remember(state, action, reward, next_state, done)
                state = next_state
                i += 1

            scores.append(i)
            mean_score = np.mean(scores)
            if mean_score >= self.n_win_ticks and e >= min_episodes:
                if not self.quiet:
                    print('Ran {} episodes. Solved after {} trials'.format(
                        e, e - min_episodes))
                return e - min_episodes
            if e % 1 == 0 and not self.quiet:
                print(
                    '[Episode {}] - Mean survival time over last {} episodes was {} ticks.'
                    .format(e, min_episodes, mean_score))

            self.replay(self.batch_size)

        if not self.quiet: print('Did not solve after {} episodes'.format(e))
        return e

예제 #10

0

파일 보기

def main():
    # Initialize OpenAI_ROS ENV
    LoadYamlFileParamsTest(rospackage_name="learning_ros",
                           rel_path_from_package_to_file="config",
                           yaml_file_name="aliengo_stand.yaml")
    env = StartOpenAI_ROS_Environment('AliengoStand-v0')
    time.sleep(3)
    # Initialize PPO agent
    Policy = Policy_net('policy', ob_space=3, act_space=8)
    Old_Policy = Policy_net('old_policy', ob_space=3, act_space=8)
    PPO = PPOTrain(Policy, Old_Policy, gamma=GAMMA)
    saver = tf.train.Saver()

    with tf.Session() as sess:
        sess.run(tf.global_variables_initializer())
        obs = env.reset()
        reward = 0
        success_num = 0
        scores = []

        for iteration in range(ITERATION):
            observations = []
            actions = []
            v_preds = []
            rewards = []
            while not rospy.is_shutdown():  # until ros is not shutdown
                # prepare to feed placeholder Policy.obs
                obs = np.stack([obs]).astype(dtype=np.float32)
                act, v_pred = Policy.act(obs=obs, stochastic=True)
                #print('act: ',act, 'v_pred: ',v_pred )

                act = np.asscalar(act)
                v_pred = np.asscalar(v_pred)
                observations.append(obs)
                actions.append(act)
                v_preds.append(v_pred)
                rewards.append(reward)
                #execute according to action
                next_obs, reward, done, _ = env.step(act)
                time.sleep(0.01)
                if done:
                    # next state of terminate state has 0 state value
                    v_preds_next = v_preds[1:] + [0]
                    obs = env.reset()
                    break
                else:
                    obs = next_obs
            #scores store for visualization
            scores.append(sum(rewards))
            #if consectuvely 10 times has high reward end the training
            if sum(rewards) >= 400:
                success_num += 1
                print("Succes number: " + str(success_num))
                if success_num >= 5:
                    saver.save(sess, 'model_train/model_ppo.ckpt')
                    print('Clear!! Model saved.')
                if success_num >= 10:
                    saver.save(sess, 'model_train/model_ppo.ckpt')
                    print('Finished! ')
                    break

            else:
                success_num = 0

            gaes = PPO.get_gaes(rewards=rewards,
                                v_preds=v_preds,
                                v_preds_next=v_preds_next)

            # convert list to numpy array for feeding tf.placeholder
            observations = np.reshape(observations, [len(observations), 3])
            actions = np.array(actions).astype(dtype=np.int32)
            rewards = np.array(rewards).astype(dtype=np.float32)
            v_preds_next = np.array(v_preds_next).astype(dtype=np.float32)
            #calculate generative advantage estimator score
            gaes = np.array(gaes).astype(dtype=np.float32)
            gaes = (gaes - gaes.mean())
            print('gaes', gaes)
            #assign current policy params to previous policy params
            PPO.assign_policy_parameters()

            inp = [observations, actions, rewards, v_preds_next, gaes]

            # PPO train
            for epoch in range(4):
                sample_indices = np.random.randint(
                    low=0, high=observations.shape[0],
                    size=64)  # indices are in [low, high)
                sampled_inp = [
                    np.take(a=a, indices=sample_indices, axis=0) for a in inp
                ]  # sample training data
                PPO.train(obs=sampled_inp[0],
                          actions=sampled_inp[1],
                          rewards=sampled_inp[2],
                          v_preds_next=sampled_inp[3],
                          gaes=sampled_inp[4])
        plt.plot(scores)
        plt.show()
        env.stop()