Example No. 1
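GAIL agent constructor: it builds generator and expert replay memories seeded with demo transitions, policy and old-policy networks, a PPO trainer, and a discriminator inside a TensorFlow InteractiveSession, then saves and restores the model.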
    def __init__(
        self,
        config,
        env,
        demo_transitions=None
    ):  # we need another file to provide the definition of the configuration
        self.sess = tf.InteractiveSession()
        self.config = config
        self.generate_memory = Memory(
            capacity=self.config.generate_memory_size, permanent_data=0)
        self.expert_memory = Memory(capacity=self.config.expert_memory_size,
                                    permanent_data=0)
        self.add_data_to_genarte_memory(source=demo_transitions)
        self.add_data_to_expert_memory(source=demo_transitions)
        self.state_dim = env.observation_space.shape[0]
        self.action_dim = env.action_space.n
        self.ob_space = env.observation_space
        self.gamma = 0.95
        self.Policy = Policy_net('policy', env)
        self.Old_Policy = Policy_net('old_policy', env)
        self.PPO = PPOTrain(self.Policy, self.Old_Policy, self.gamma)  # pass Old_Policy as the second argument, as in the other constructors
        self.D = Discriminator(env)
        self.epsilon = self.config.INITIAL_EPSILON
        self.saver = tf.train.Saver()

        self.sess.run(tf.global_variables_initializer())
        print("we have initialized the GAIL")

        self.save_model()
        self.restore_model()
Example No. 2
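Rollout loop for the PySC2 MoveToBeacon mini-game: a saved GAIL checkpoint is restored, the environment is stepped until action 331 becomes available, and per-episode observations, actions, rewards, and value predictions are collected.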
def train():
    FLAGS(sys.argv)
    with sc2_env.SC2Env(
            map_name='MoveToBeacon',
            agent_interface_format=sc2_env.parse_agent_interface_format(
                feature_screen=64,
                feature_minimap=64,
                rgb_screen=None,
                rgb_minimap=None,
                action_space=None,
                use_feature_units=False),
            step_mul=step_mul,
            game_steps_per_episode=None,
            disable_fog=False,
            visualize=True) as env:
        with tf.Session() as sess:
            Policy = Policy_net('policy', 2, 4)
            Old_Policy = Policy_net('old_policy', 2, 4)
            PPO = PPOTrain(Policy, Old_Policy)
            D = Discriminator()
            saver = tf.train.Saver()
            saver.restore(sess, './model/gail.cpkt')
            c = 0
            for episodes in range(100000):
                done = False
                obs = env.reset()
                while 331 not in obs[0].observation.available_actions:
                    actions = actAgent2Pysc2(100, obs)
                    obs = env.step(actions=[actions])
                state = obs2state(obs)
                observations = []
                actions_list = []
                rewards = []
                v_preds = []
                reward = 0
                global_step = 0
                while not done:
                    global_step += 1
                    state = np.stack([state]).astype(dtype=np.float32)
                    act, v_pred = Policy.act(obs=state, stochastic=True)
                    act, v_pred = np.asscalar(act), np.asscalar(v_pred)
                    observations.append(state)
                    actions_list.append(act)
                    rewards.append(reward)
                    v_preds.append(v_pred)
                    actions = actAgent2Pysc2(act, obs)
                    obs = env.step(actions=[actions])
                    next_state = obs2state(obs)
                    distance = obs2distance(obs)
                    if distance < 0.03 or global_step == 100:
                        done = True
                    if done:
                        v_preds_next = v_preds[1:] + [0]
                        break
                    state = next_state
Example No. 3
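Synchronous PPO training across several subprocess environments: transitions are gathered from every worker until all are done, then a few epochs of minibatch PPO updates are run on the stacked memory.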
def train():
    num_process = 4
    sub = SubprocVecEnv(num_process, False)
    state_space = 2
    action_space = 4
    Policy = Policy_net('policy', state_space, action_space)
    Old_Policy = Policy_net('old_policy', state_space, action_space)
    PPO = PPOTrain(Policy, Old_Policy, gamma=0.95)
    with tf.Session() as sess:
        saver = tf.train.Saver()
        tf.set_random_seed(1234)
        #sess.run(tf.global_variables_initializer())
        saver.restore(sess, './synch_ubuntu/model')
        i = 0
        #writer = tf.summary.FileWriter('./board/dqn_per', sess.graph)
        #for i in range(10):
        while True:
            i += 1
            info = sub.reset()
            terminal, each_terminal = False, [False] * num_process
            global_step = 0
            memory = []
            obs_s, state, action, reward, done = trans_data(info, num_process)
            while not terminal:
                time.sleep(0.05)
                global_step += 1
                action, v_pred = get_action(Policy, each_terminal, num_process,
                                            state)
                info = sub.step(action, obs_s, [global_step] * num_process)
                obs_s, next_state, a, reward, done = trans_data(
                    info, num_process)
                each_terminal, terminal = check_done(info, num_process)
                memory.append([state, action, reward, v_pred])

                if terminal:
                    state_, action_, reward_, v_preds_next_, gaes_ = memory_stack(
                        memory, num_process, state_space, PPO)
                    PPO.assign_policy_parameters()
                    inp = [state_, action_, reward_, v_preds_next_, gaes_]
                    for epoch in range(3):
                        sample_indices = np.random.randint(
                            low=0, high=state_.shape[0], size=64)
                        sampled_inp = [
                            np.take(a=t, indices=sample_indices, axis=0)
                            for t in inp
                        ]
                        PPO.train(obs=sampled_inp[0],
                                  actions=sampled_inp[1],
                                  rewards=sampled_inp[2],
                                  v_preds_next=sampled_inp[3],
                                  gaes=sampled_inp[4])
                    #summary = sess.run(merged, feed_dict={r:sum(reward_)/(num_process)})
                    #writer.add_summary(summary, i)
                    #saver.save(sess, './synch_ubuntu/model')
                    if i < 5100:
                        print(sum(reward_) / (num_process), i)
                state = next_state
        sub.close()
Example No. 4
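A variant of the GAIL-style constructor: it creates expert and generator memories plus the policy, old-policy, PPO trainer, and discriminator from an environment and a configuration object.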
    def __init__(self, env, config):
        self.Soulsess = tf.InteractiveSession()
        self.config = config
        self.expert_memory = Memory(capacity=self.config.EXPERT_MEMORY_SIZE,
                                    permanent_data=0)
        self.generate_memory = Memory(
            capacity=self.config.GENERATE_MEMORY_SIZE, permanent_data=0)
        #self.sess.run(tf.global_variables_initializer())

        self.state_dim = env.observation_space.shape[0]
        self.action_dim = env.action_space.n
        self.ob_space = env.observation_space
        self.gamma = 0.95
        self.Policy = Policy_net('policy', env)
        self.Old_Policy = Policy_net('old_policy', env)
        self.PPO = PPOTrain(self.Policy, self.Old_Policy, self.gamma)
        self.D = Discriminator(env)
        self.epsilon = self.config.INITIAL_EPSILON
        self.saver = tf.train.Saver()
Example No. 5
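PPO training loop for a physical cart-pole rig controlled over a serial link to an Arduino: encoder readings are converted into angle and position observations, a shaped reward is computed, and the policy is updated with GAE after each episode.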
def main():
    angle = 0.0
    angle_thres_deg = 15
    cart = 0.0
    t.tic()
    reward_max = 5
    reward_min = -5
    reward_disc = 5
    pwm_index = 1
    pwm_list = [("L", 180), ("L", 170), ("L", 160), ("L", 0), ("R", 160),
                ("R", 170), ("R", 180)]
    pwm_list = [("L", 180), ("L", 0), ("R", 180)]
    pwm_list_size = 3
    # Serial port for Arduino
    if (SERIAL_AVAILABLE):
        ser = serial.Serial('COM20', 115200)  # Initialize serial port
        print("connected to: " + ser.portstr)  # Confirm connection

    env = gym.make('CartPole-v0')
    env.seed(0)
    ob_space = env.observation_space
    Policy = Policy_net('policy', env)
    Old_Policy = Policy_net('old_policy', env)
    PPO = PPOTrain(Policy, Old_Policy, gamma=GAMMA)
    saver = tf.train.Saver()

    with tf.Session() as sess:
        if LOAD:
            saver.restore(
                sess, "./model/model_iter_{:d}_rewards_{:d}.ckpt".format(
                    load_iteration, load_rewards))
        else:
            sess.run(
                tf.global_variables_initializer())  # remove me if loading save

        writer = tf.summary.FileWriter('./log/train', sess.graph)
        obs = env.reset()
        reward = 0
        success_num = 0

        for iteration in range(ITERATION):  # episode
            observations = []
            actions = []
            v_preds = []
            rewards = []
            run_policy_steps = 0
            while True:  # run policy RUN_POLICY_STEPS which is much less than episode length
                run_policy_steps += 1
                obs = np.stack([obs]).astype(
                    dtype=np.float32)  # prepare to feed placeholder Policy.obs
                act, v_pred = Policy.act(obs=obs, stochastic=True)

                act = np.asscalar(act)
                v_pred = np.asscalar(v_pred)

                observations.append(obs)
                actions.append(act)
                v_preds.append(v_pred)
                rewards.append(reward)

                # env.render()

                if (act == 1):
                    if pwm_index < pwm_list_size - 1:
                        pwm_index += 1
                else:
                    if pwm_index > 0:
                        pwm_index -= 1

                dir = pwm_list[pwm_index][0]
                pwm = pwm_list[pwm_index][1]
                print(dir)
                print(pwm)

                if (SERIAL_AVAILABLE):
                    PD.writePWM(ser, 180, dir)

                    last_angle = angle
                    angle_deg = PD.getPEncoderPos(
                        ser
                    ) * 360 / 1200  # convert encoder counts (1200) to degrees
                    angle = angle_deg * 2 * math.pi / 360  # convert degrees to radians

                    angle_velocity = (angle - last_angle) / t.tocvalue()

                    last_cart = cart
                    cart = PD.getMEncoderPos(ser)
                    cart_velocity = (cart - last_cart) / t.tocvalue()

                    #print("Angle {:.1f}, Angle_vel (rad/s) {:.1f}, Position (mm) {:.1f}, Velocity (mm/s) {:.1f}".format(angle, angle_velocity, cart,cart_velocity))

                    t.tic()

                m = (reward_max - reward_min) / (reward_disc - angle_thres_deg)
                # reward = min(m*(abs(angle_deg)-reward_disc) + reward_max, reward_max)
                #reward = 1
                reward = ((.9 / 7) * (min(
                    (6 - abs(angle_deg)), (1))) + 6) + ((0.1 / 6) * (min(
                        (5 - abs((cart / 1000))), (1)) + 5))

                # next_obs = [angle angle_velocity cart cart_velocoty]
                # print(next_obs)

                next_obs = [angle, angle_velocity, cart, cart_velocity]
                #print("angle = ", angle_deg)
                print("x: ", PD.getMEncoderPos(ser))
                if abs(angle_deg) > angle_thres_deg:
                    v_preds_next = v_preds[1:] + [
                        0
                    ]  # next state of terminate state has 0 state value
                    print("reward: ", sum(rewards))
                    obs = env.reset()
                    reward = -1
                    print("Iteration: ", iteration)
                    print('Waiting to reset')
                    PD.writePWM(ser, 0, dir)
                    if iteration % 10 == 0:
                        saver.save(
                            sess,
                            "./model/model_iter_{:d}_rewards_{:d}.ckpt".format(
                                iteration, int(sum(rewards))))  # cast to int so the {:d} format does not fail on float rewards
                        print('Scoot scoot!! Model saved.')
                    while (angle_deg > 1.5 or angle_deg < -1.5):
                        time.sleep(0.1)
                        angle_deg = PD.getPEncoderPos(ser) * 360 / 1200
                    print('Entered iteration {:1f}'.format(iteration + 1))
                    break
                else:
                    obs = next_obs

            writer.add_summary(
                tf.Summary(value=[
                    tf.Summary.Value(tag='episode_length',
                                     simple_value=run_policy_steps)
                ]), iteration)
            writer.add_summary(
                tf.Summary(value=[
                    tf.Summary.Value(tag='episode_reward',
                                     simple_value=sum(rewards))
                ]), iteration)

            if sum(rewards) >= 195:
                success_num += 1
                if success_num >= 100:
                    saver.save(sess, './model/model.ckpt')
                    print('Clear!! Model saved.')
                    break
            else:
                success_num = 0

            gaes = PPO.get_gaes(rewards=rewards,
                                v_preds=v_preds,
                                v_preds_next=v_preds_next)

            # convert list to numpy array for feeding tf.placeholder
            observations = np.reshape(observations,
                                      newshape=[-1] + list(ob_space.shape))
            actions = np.array(actions).astype(dtype=np.int32)
            rewards = np.array(rewards).astype(dtype=np.float32)
            v_preds_next = np.array(v_preds_next).astype(dtype=np.float32)
            gaes = np.array(gaes).astype(dtype=np.float32)
            if iteration > 0:
                gaes = (gaes - gaes.mean()) / gaes.std()

            PPO.assign_policy_parameters()

            inp = [observations, actions, rewards, v_preds_next, gaes]

            # train
            for epoch in range(4):
                sample_indices = np.random.randint(
                    low=0, high=observations.shape[0],
                    size=64)  # indices are in [low, high)
                sampled_inp = [
                    np.take(a=a, indices=sample_indices, axis=0) for a in inp
                ]  # sample training data
                PPO.train(obs=sampled_inp[0],
                          actions=sampled_inp[1],
                          rewards=sampled_inp[2],
                          v_preds_next=sampled_inp[3],
                          gaes=sampled_inp[4])

            summary = PPO.get_summary(obs=inp[0],
                                      actions=inp[1],
                                      rewards=inp[2],
                                      v_preds_next=inp[3],
                                      gaes=inp[4])[0]

            writer.add_summary(summary, iteration)
        writer.close()
        if (SERIAL_AVAILABLE):
            ser.close()
Example No. 6
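PPO training on CartPole-v0 that also logs per-iteration rewards to a timestamped CSV file and renders the environment every 500 iterations.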
def main():
    allrewards = list()
    env = gym.make('CartPole-v0')
    env.seed(0)
    ob_space = env.observation_space
    Policy = Policy_net('policy', env)
    Old_Policy = Policy_net('old_policy', env)
    PPO = PPOTrain(Policy, Old_Policy, gamma=GAMMA)
    saver = tf.train.Saver()
    name = 'Model_Noise'
    filename = "data/{n}_{ts:%H_%M_%S}.csv".format(n=name, ts=datetime.now())
    with open(filename, "w", 1) as result:
        result.write("Iteration, Reward \n")

        with tf.Session() as sess:
            writer = tf.summary.FileWriter('./log/train', sess.graph)
            sess.run(tf.global_variables_initializer())
            obs = env.reset()
            reward = 0
            success_num = 0

            for iteration in range(ITERATION):  # episode
                observations = []
                actions = []
                v_preds = []
                rewards = []
                run_policy_steps = 0
                while True:  # run policy RUN_POLICY_STEPS which is much less than episode length
                    if iteration % 500 == 0:
                        env.render()
                    run_policy_steps += 1
                    obs = np.stack([obs]).astype(
                        dtype=np.float32
                    )  # prepare to feed placeholder Policy.obs
                    act, v_pred = Policy.act(obs=obs, stochastic=True)

                    act = np.asscalar(act)
                    v_pred = np.asscalar(v_pred)

                    observations.append(obs)
                    actions.append(act)
                    v_preds.append(v_pred)
                    rewards.append(reward)

                    next_obs, reward, done, info = env.step(act)

                    if done:
                        v_preds_next = v_preds[1:] + [
                            0
                        ]  # next state of terminate state has 0 state value
                        obs = env.reset()
                        reward = -1
                        break
                    else:
                        obs = next_obs

                writer.add_summary(
                    tf.Summary(value=[
                        tf.Summary.Value(tag='episode_length',
                                         simple_value=run_policy_steps)
                    ]), iteration)
                writer.add_summary(
                    tf.Summary(value=[
                        tf.Summary.Value(tag='episode_reward',
                                         simple_value=sum(rewards))
                    ]), iteration)

                result.write("{:d},{:2f}\n".format(iteration, sum(rewards)))
                print("Rewards: {:2f}, Iterations: {:d}".format(
                    sum(rewards), iteration))
                if sum(rewards) >= 195:
                    success_num += 1
                    if success_num >= 100:
                        saver.save(sess, './model/model.ckpt')
                        print('Clear!! Model saved.')
                        break
                else:
                    success_num = 0

                gaes = PPO.get_gaes(rewards=rewards,
                                    v_preds=v_preds,
                                    v_preds_next=v_preds_next)

                # convert list to numpy array for feeding tf.placeholder
                observations = np.reshape(observations,
                                          newshape=[-1] + list(ob_space.shape))
                actions = np.array(actions).astype(dtype=np.int32)
                rewards = np.array(rewards).astype(dtype=np.float32)
                v_preds_next = np.array(v_preds_next).astype(dtype=np.float32)
                gaes = np.array(gaes).astype(dtype=np.float32)
                gaes = (gaes - gaes.mean()) / gaes.std()

                PPO.assign_policy_parameters()

                inp = [observations, actions, rewards, v_preds_next, gaes]

                # train
                for epoch in range(4):
                    sample_indices = np.random.randint(
                        low=0, high=observations.shape[0],
                        size=64)  # indices are in [low, high)
                    sampled_inp = [
                        np.take(a=a, indices=sample_indices, axis=0)
                        for a in inp
                    ]  # sample training data
                    PPO.train(obs=sampled_inp[0],
                              actions=sampled_inp[1],
                              rewards=sampled_inp[2],
                              v_preds_next=sampled_inp[3],
                              gaes=sampled_inp[4])

                summary = PPO.get_summary(obs=inp[0],
                                          actions=inp[1],
                                          rewards=inp[2],
                                          v_preds_next=inp[3],
                                          gaes=inp[4])[0]

                writer.add_summary(summary, iteration)
            writer.close()
            if iteration % 500 == 0:
                env.close()
Example No. 7
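PPO with a GRU-based policy (PolicyGRUNet) on a Bernoulli multi-armed bandit environment; actions outside the arm range are resampled uniformly.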
def main():
    # env = gym.make('CartPole-v0')
    num_arm = 5
    env = BernoulliBanditEnv(num_arm)
    env.seed(0)
    ob_space = env.observation_space
    # Policy = Policy_net('policy', env)
    # Old_Policy = Policy_net('old_policy', env)
    Policy = PolicyGRUNet('policy', env)
    Old_Policy = PolicyGRUNet('old_policy', env)
    PPO = PPOTrain(Policy, Old_Policy, gamma=GAMMA)
    saver = tf.train.Saver()

    with tf.Session() as sess:
        writer = tf.summary.FileWriter('./log/train', sess.graph)
        sess.run(tf.global_variables_initializer())
        obs = env.reset()
        reward = 0
        success_num = 0

        for iteration in range(ITERATION):  # episode
            observations = []
            actions = []
            v_preds = []
            rewards = []
            run_policy_steps = 0
            while True:  # run policy RUN_POLICY_STEPS which is much less than episode length
                run_policy_steps += 1
                obs = np.stack([obs]).astype(
                    dtype=np.float32)  # prepare to feed placeholder Policy.obs
                # obs = np.expand_dims(obs, axis=0)
                act, v_pred = Policy.act(obs=obs, stochastic=True)

                act = np.asscalar(act)
                v_pred = np.asscalar(v_pred)

                observations.append(obs)
                actions.append(act)
                v_preds.append(v_pred)
                rewards.append(reward)
                if act not in range(num_arm):
                    act = np.random.randint(num_arm)

                next_obs, reward, done, info = env.step(act)

                if done:
                    v_preds_next = v_preds[1:] + [
                        0
                    ]  # next state of terminate state has 0 state value
                    obs = env.reset()
                    reward = -1
                    break
                else:
                    obs = next_obs

            writer.add_summary(
                tf.Summary(value=[
                    tf.Summary.Value(tag='episode_length',
                                     simple_value=run_policy_steps)
                ]), iteration)
            writer.add_summary(
                tf.Summary(value=[
                    tf.Summary.Value(tag='episode_reward',
                                     simple_value=sum(rewards))
                ]), iteration)

            if sum(rewards) >= 195:
                success_num += 1
                if success_num >= 100:
                    saver.save(sess, './model/model.ckpt')
                    print('Clear!! Model saved.')
                    break
            else:
                success_num = 0

            gaes = PPO.get_gaes(rewards=rewards,
                                v_preds=v_preds,
                                v_preds_next=v_preds_next)

            # convert list to numpy array for feeding tf.placeholder
            observations = np.reshape(observations,
                                      newshape=[-1] + list(ob_space.shape))
            # observations = np.expand_dims(observations, axis=0)
            actions = np.array(actions).astype(dtype=np.int32)
            rewards = np.array(rewards).astype(dtype=np.float32)
            v_preds_next = np.array(v_preds_next).astype(dtype=np.float32)
            gaes = np.array(gaes).astype(dtype=np.float32)
            gaes = (gaes - gaes.mean()) / gaes.std()

            PPO.assign_policy_parameters()

            inp = [observations, actions, rewards, v_preds_next, gaes]

            # train
            for epoch in range(4):
                sample_indices = np.random.randint(
                    low=0, high=observations.shape[0],
                    size=64)  # indices are in [low, high)
                sampled_inp = [
                    np.take(a=a, indices=sample_indices, axis=0) for a in inp
                ]  # sample training data
                PPO.train(obs=sampled_inp[0],
                          actions=sampled_inp[1],
                          rewards=sampled_inp[2],
                          v_preds_next=sampled_inp[3],
                          gaes=sampled_inp[4])

            summary = PPO.get_summary(obs=inp[0],
                                      actions=inp[1],
                                      rewards=inp[2],
                                      v_preds_next=inp[3],
                                      gaes=inp[4])[0]

            writer.add_summary(summary, iteration)
        writer.close()
Example No. 8
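GAIL training routine: expert transitions are sampled from the agent's replay or demo memory, the discriminator is trained on expert versus agent data, and its rewards drive the GAE computation for the PPO updates; transitions from high-return episodes are fed back into the agent's memory.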
def run_gail(agent, index_gail, env):
    DG_flag = 1
    #env.seed(0)
    ob_space = env.observation_space
    Policy = Policy_net('policy_' + str(index_gail), env)
    Old_Policy = Policy_net('old_policy' + str(index_gail), env)
    gamma = 0.95
    PPO = PPOTrain(Policy, Old_Policy, gamma)
    D = Discriminator(env, index_gail)

    if DG_flag:
        # with open(Config.DEMO_DATA_PATH, 'rb') as f:
        #     demo_transitions = pickle.load(f)
        #     demo_transitions = deque(itertools.islice(demo_transitions, 0, Config.demo_buffer_size))
        #     assert len(demo_transitions) == Config.demo_buffer_size
        expert_data = agent.replay_memory if agent.replay_memory.full(
        ) else agent.demo_memory
        _, demo_transitions, _ = expert_data.sample(agent.config.BATCH_SIZE)
        expert_observations = [data[0] for data in demo_transitions]
        expert_actions = [data[1] for data in demo_transitions]
    else:
        expert_observations = np.genfromtxt('trajectory/observations.csv')
        expert_actions = np.genfromtxt('trajectory/actions.csv',
                                       dtype=np.int32)

    with tf.Session() as sess:
        # writer = tf.summary.FileWriter(args.logdir, sess.graph)
        #load_path=saver.restore(sess,"trained_models/model.ckpt")
        #sess.run(tf.global_variables_initializer())
        #if index_gail>1:
        #   saver.restore(sess, 'trained_models/model' + str(index_gail-1) + '.ckpt')

        obs = env.reset()
        state_for_memory = obs  # to handle the different data formats used by the two programs
        success_num = 0
        iteration = int(2000)  #0319
        for iteration in range(iteration):
            #print("running policy ")
            observations = []
            #states_for_memory=[]
            actions = []
            # do NOT use rewards to update the policy  # 0319: why?
            rewards = []
            v_preds = []
            run_policy_steps = 0
            score = 0
            if DG_flag:
                t_q = deque(maxlen=Config.trajectory_n)
                done, score, n_step_reward, state_for_memory = False, 0, None, env.reset(
                )
            while True:
                run_policy_steps += 1
                obs = np.stack([obs]).astype(
                    dtype=np.float32)  # prepare to feed placeholder Policy.obs
                act, v_pred = Policy.act(obs=obs, stochastic=True)

                act = np.asscalar(act)
                v_pred = np.asscalar(v_pred)
                next_obs, reward, done, info = env.step(act)
                next_state_for_memory = next_obs
                score += reward
                if DG_flag:
                    reward_to_sub = 0. if len(t_q) < t_q.maxlen else t_q[0][
                        2]  # record the earliest reward for the sub
                    t_q.append([
                        state_for_memory, act, reward, next_state_for_memory,
                        done, 0.0
                    ])
                    if len(t_q) == t_q.maxlen:
                        if n_step_reward is None:  # only compute once when t_q first filled
                            n_step_reward = sum([
                                t[2] * Config.GAMMA**i
                                for i, t in enumerate(t_q)
                            ])
                        else:
                            n_step_reward = (n_step_reward -
                                             reward_to_sub) / Config.GAMMA
                            n_step_reward += reward * Config.GAMMA**(
                                Config.trajectory_n - 1)
                        t_q[0].extend([
                            n_step_reward, next_state_for_memory, done,
                            t_q.maxlen
                        ])  # actual_n is max_len here
                        #agent.perceive(t_q[0])  # perceive when a transition is completed

                env.render()  # 0313
                observations.append(obs)
                actions.append(act)
                rewards.append(reward)
                v_preds.append(v_pred)

                if done:
                    if DG_flag:
                        t_q.popleft(
                        )  # first transition's n-step is already set
                        transitions = set_n_step(t_q, Config.trajectory_n)
                    next_obs = np.stack([next_obs]).astype(
                        dtype=np.float32
                    )  # prepare to feed placeholder Policy.obs
                    _, v_pred = Policy.act(obs=next_obs, stochastic=True)
                    v_preds_next = v_preds[1:] + [np.asscalar(v_pred)]
                    obs = env.reset()
                    print("iteration", iteration, "score", score)
                    break
                else:
                    obs = next_obs
                    state_for_memory = next_state_for_memory
                #print("state_for memory",state_for_memory)
            #writer.add_summary(tf.Summary(value=[tf.Summary.Value(tag='episode_length', simple_value=run_policy_steps)]), iteration)
            #writer.add_summary(tf.Summary(value=[tf.Summary.Value(tag='episode_reward', simple_value=sum(rewards))]), iteration)

            #
            if sum(rewards) >= 100:

                success_num += 1
                # todo
                # store this demo when the episode achieves a good return
                if DG_flag:
                    for t in transitions:
                        agent.perceive(t)
                        agent.replay_memory.memory_len()

                if success_num >= 3:
                    #saver.save(sess, 'trained_models/model.ckpt')
                    #saver.save(sess, 'trained_models/model' + str(index_gail) + '.ckpt')
                    print(success_num)
                    print('Clear!! Model saved.')
                    env.close()
                    break
            else:
                success_num = 0

            # convert list to numpy array for feeding tf.placeholder
            observations = np.reshape(observations,
                                      newshape=[-1] + list(ob_space.shape))
            actions = np.array(actions).astype(dtype=np.int32)

            # train discriminator
            for i in range(2):
                #print("training D")
                D.train(expert_s=expert_observations,
                        expert_a=expert_actions,
                        agent_s=observations,
                        agent_a=actions)

            # output of this discriminator is reward
            d_rewards = D.get_rewards(agent_s=observations, agent_a=actions)
            d_rewards = np.reshape(d_rewards,
                                   newshape=[-1]).astype(dtype=np.float32)

            gaes = PPO.get_gaes(rewards=d_rewards,
                                v_preds=v_preds,
                                v_preds_next=v_preds_next)
            gaes = np.array(gaes).astype(dtype=np.float32)
            # gaes = (gaes - gaes.mean()) / gaes.std()
            v_preds_next = np.array(v_preds_next).astype(dtype=np.float32)

            # train policy
            inp = [observations, actions, gaes, d_rewards, v_preds_next]
            PPO.assign_policy_parameters()
            for epoch in range(6):
                #print("updating PPO ")
                sample_indices = np.random.randint(
                    low=0, high=observations.shape[0],
                    size=32)  # indices are in [low, high)
                sampled_inp = [
                    np.take(a=a, indices=sample_indices, axis=0) for a in inp
                ]  # sample training data
                PPO.train(obs=sampled_inp[0],
                          actions=sampled_inp[1],
                          gaes=sampled_inp[2],
                          rewards=sampled_inp[3],
                          v_preds_next=sampled_inp[4])
Example No. 9
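GAIL on CartPole-v1 with expert transitions loaded from a pickled demo file; discriminator and PPO updates alternate each episode and the episode scores are plotted at the end.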
def main(args):
    env = gym.make('CartPole-v1')
    env.seed(0)
    ob_space = env.observation_space
    Policy = Policy_net('policy', env)
    Old_Policy = Policy_net('old_policy', env)
    PPO = PPOTrain(Policy, Old_Policy, gamma=args.gamma)
    D = Discriminator(env)

    #expert_observations = np.genfromtxt('trajectory/observations.csv')
    #expert_actions = np.genfromtxt('trajectory/actions.csv', dtype=np.int32)
    with open(Config.DEMO_DATA_PATH, 'rb') as f:
        demo_transitions = pickle.load(f)
        demo_transitions = deque(itertools.islice(demo_transitions, 0, Config.DEMO_BUFFER_SIZE))
        print("demo_transitions len: ", len(demo_transitions))
    expert_observations = [data[0] for data in demo_transitions]
    expert_actions = [data[1] for data in demo_transitions]

    saver = tf.train.Saver()

    with tf.Session() as sess:
        # writer = tf.summary.FileWriter(args.logdir, sess.graph)
        sess.run(tf.global_variables_initializer())

        obs = env.reset()
        success_num = 0
        itera = 0
        scores=[]
        for iteration in range(args.iteration):
            observations = []
            actions = []
            # do NOT use rewards to update policy
            rewards = []
            v_preds = []
            run_policy_steps = 0
            score=0

            while True:
                run_policy_steps += 1
                obs = np.stack([obs]).astype(dtype=np.float32)  # prepare to feed placeholder Policy.obs
                act, v_pred = Policy.act(obs=obs, stochastic=True)

                act = np.asscalar(act)
                v_pred = np.asscalar(v_pred)
                next_obs, reward, done, info = env.step(act)
                score = reward+score
                env.render()#0313

                observations.append(obs)
                actions.append(act)
                rewards.append(reward)
                v_preds.append(v_pred)

                if done:
                    itera += 1
                    next_obs = np.stack([next_obs]).astype(dtype=np.float32)  # prepare to feed placeholder Policy.obs
                    _, v_pred = Policy.act(obs=next_obs, stochastic=True)
                    v_preds_next = v_preds[1:] + [np.asscalar(v_pred)]
                    obs = env.reset()
                    print("itera :", itera, "score:{}", score)
                    scores.append(score)
                    break
                else:
                    obs = next_obs

            # writer.add_summary(tf.Summary(value=[tf.Summary.Value(tag='episode_length', simple_value=run_policy_steps)]), iteration)
            # writer.add_summary(tf.Summary(value=[tf.Summary.Value(tag='episode_reward', simple_value=sum(rewards))]), iteration)

            if sum(rewards) >= 195:
                success_num += 1
                if success_num >= 100:
                    saver.save(sess, args.savedir + '/model.ckpt')
                    print('Clear!! Model saved.')
                    break
            else:
                success_num = 0

            # convert list to numpy array for feeding tf.placeholder
            observations = np.reshape(observations, newshape=[-1] + list(ob_space.shape))
            actions = np.array(actions).astype(dtype=np.int32)

            # train discriminator
            for i in range(2):
                D.train(expert_s=expert_observations,
                        expert_a=expert_actions,
                        agent_s=observations,
                        agent_a=actions)

            # output of this discriminator is reward
            d_rewards = D.get_rewards(agent_s=observations, agent_a=actions)
            d_rewards = np.reshape(d_rewards, newshape=[-1]).astype(dtype=np.float32)

            gaes = PPO.get_gaes(rewards=d_rewards, v_preds=v_preds, v_preds_next=v_preds_next)
            gaes = np.array(gaes).astype(dtype=np.float32)
            # gaes = (gaes - gaes.mean()) / gaes.std()
            v_preds_next = np.array(v_preds_next).astype(dtype=np.float32)

            # train policy
            inp = [observations, actions, gaes, d_rewards, v_preds_next]
            PPO.assign_policy_parameters()
            for epoch in range(6):
                sample_indices = np.random.randint(low=0, high=observations.shape[0],
                                                   size=32)  # indices are in [low, high)
                sampled_inp = [np.take(a=a, indices=sample_indices, axis=0) for a in inp]  # sample training data
                PPO.train(obs=sampled_inp[0],
                          actions=sampled_inp[1],
                          gaes=sampled_inp[2],
                          rewards=sampled_inp[3],
                          v_preds_next=sampled_inp[4])

        #     summary = PPO.get_summary(obs=inp[0],
        #                               actions=inp[1],
        #                               gaes=inp[2],
        #                               rewards=inp[3],
        #                               v_preds_next=inp[4])
        #
        #     writer.add_summary(summary, iteration)
        # writer.close()

    plt.plot(scores, 'r')
    plt.show()
Example No. 10
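Evaluation script: a trained PPO model is restored and run deterministically (stochastic=False) on CartPole-v1, with episode length and reward written to TensorBoard summaries.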
def main():
    env = gym.make('CartPole-v1')
    env.seed(0)
    ob_space = env.observation_space
    Policy = Policy_net('policy', env)
    Old_Policy = Policy_net('old_policy', env)
    PPO = PPOTrain(Policy, Old_Policy, gamma=GAMMA)
    saver = tf.train.Saver()

    with tf.Session() as sess:
        writer = tf.summary.FileWriter('./log/test', sess.graph)
        sess.run(tf.global_variables_initializer())
        saver.restore(sess, 'model/model.ckpt')
        obs = env.reset()
        reward = 0
        success_num = 0

        for iteration in range(ITERATION):  # episode
            observations = []
            actions = []
            v_preds = []
            rewards = []
            run_policy_steps = 0
            env.render()
            while True:  # run policy RUN_POLICY_STEPS which is much less than episode length
                run_policy_steps += 1
                obs = np.stack([obs]).astype(
                    dtype=np.float32)  # prepare to feed placeholder Policy.obs
                act, v_pred = Policy.act(obs=obs, stochastic=False)

                act = act.item()
                v_pred = v_pred.item()

                observations.append(obs)
                actions.append(act)
                v_preds.append(v_pred)
                rewards.append(reward)

                next_obs, reward, done, info = env.step(act)

                if done:
                    v_preds_next = v_preds[1:] + [
                        0
                    ]  # next state of terminate state has 0 state value
                    obs = env.reset()
                    reward = -1
                    break
                else:
                    obs = next_obs

            writer.add_summary(
                tf.Summary(value=[
                    tf.Summary.Value(tag='episode_length',
                                     simple_value=run_policy_steps)
                ]), iteration)
            writer.add_summary(
                tf.Summary(value=[
                    tf.Summary.Value(tag='episode_reward',
                                     simple_value=sum(rewards))
                ]), iteration)

            # end condition of test
            if sum(rewards) >= 195:
                success_num += 1
                if success_num >= 100:
                    print('Iteration: ', iteration)
                    print('Clear!!')
                    break
            else:
                success_num = 0

            gaes = PPO.get_gaes(rewards=rewards,
                                v_preds=v_preds,
                                v_preds_next=v_preds_next)

            # convert list to numpy array for feeding tf.placeholder
            observations = np.reshape(observations,
                                      newshape=[-1] + list(ob_space.shape))
            actions = np.array(actions).astype(dtype=np.int32)
            rewards = np.array(rewards).astype(dtype=np.float32)
            v_preds_next = np.array(v_preds_next).astype(dtype=np.float32)
            gaes = np.array(gaes).astype(dtype=np.float32)
            gaes = (gaes - gaes.mean()) / gaes.std()

            inp = [observations, actions, rewards, v_preds_next, gaes]

            summary = PPO.get_summary(obs=inp[0],
                                      actions=inp[1],
                                      rewards=inp[2],
                                      v_preds_next=inp[3],
                                      gaes=inp[4])[0]

            writer.add_summary(summary, iteration)
        writer.close()
Example No. 11
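Module-level setup for a threaded CartPole data-collection experiment; the `add` worker rolls out the shared policy in its own environment instance.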
import tensorflow as tf
from ppo import PPOTrain
from policy_net import Policy_net
import threading
import time
import gym
import operator
import itertools

thread_number = 30

score_list = []
sess = tf.InteractiveSession()
Policy = Policy_net(sess, 'policy')
Old_Policy = Policy_net(sess, 'old_policy')
PPO = PPOTrain(sess, Policy, Old_Policy)
sess.run(tf.global_variables_initializer())


def add(number):
    env = gym.make('CartPole-v0')
    done = False
    observations = []
    actions = []
    v_preds = []
    rewards = []
    state = env.reset()
    global_step = 0
    while not done:
        global_step += 1
        action, value = Policy.act(state)
Example No. 12
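PPO fine-tuning on the PySC2 CollectMineralShards mini-game with shaped rewards for collecting minerals, per-episode PPO updates, checkpoint saving, and CSV logging of the scores.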
def train():
    FLAGS(sys.argv)
    with sc2_env.SC2Env(map_name="CollectMineralShards", step_mul=step_mul,
                        screen_size_px=(32, 32), minimap_size_px=(32, 32)) as env:
        Policy = Policy_net('policy', 32*32*2, 4)
        Old_Policy = Policy_net('old_policy', 32*32*2, 4)
        PPO = PPOTrain(Policy, Old_Policy, gamma=0.95)
        saver = tf.train.Saver()
        with tf.Session() as sess:
            print('a')
            saver.restore(sess, './model/model.ckpt')
            print('a')
            #sess.run(tf.global_variables_initializer())
            for episodes in range(EPISODES):
                done = False
                obs = env.reset()
                while 331 not in obs[0].observation["available_actions"]:
                    actions = actAgent2Pysc2(100, obs)
                    obs = env.step(actions=[actions])
                actions = gather(obs)
                obs = env.step(actions=[actions])
                end_step = 200
                global_step = 0
                score = 0
                reward = 0
                for i in range(100):
                    time.sleep(0.01)
                    actions = no_operation(obs)
                    obs = env.step(actions=[actions])       
                state = obs2state(obs)
                observations = []
                actions_list = []
                v_preds = []
                rewards = []

                print('episode start')
                while not done:
                    global_step += 1
                    time.sleep(0.05)
                    state = np.stack([state]).astype(dtype=np.float32)
                    act, v_pred = Policy.act(obs=state, stochastic=True)
                    act, v_pred = np.asscalar(act), np.asscalar(v_pred)
                    actions = actAgent2Pysc2(act, obs)
                    #while not 331 in obs[0].observation["available_actions"]:
                    #    actions = actAgent2Pysc2(100, obs)
                    #    obs = env.step(actions=[actions])
                    obs = env.step(actions=[actions])
                    
                    if global_step == end_step or obs2done(obs) >= 1900:    # the game ends when the game time runs out or all minerals have been collected
                        done = True
                    
                    next_state = obs2state(obs)
                    reward = obs[0].reward

                    if reward == 0:
                        reward = -0.1

                    if done:
                        if obs2done(obs) >= 1900:   # the game ended with all minerals collected
                            reward = 3
                        else:                       # the game ended without collecting all minerals
                            reward = -3

                    score += reward

                    observations.append(state)
                    actions_list.append(act)
                    v_preds.append(v_pred)
                    rewards.append(reward)

                    if done:   # at the end of the game
                        v_preds_next = v_preds[1:] + [0]
                        gaes = PPO.get_gaes(rewards=rewards, v_preds=v_preds, v_preds_next=v_preds_next)
                        observations = np.reshape(observations, newshape=[-1, 32*32*2])
                        actions = np.array(actions_list).astype(dtype=np.int32)
                        rewards = np.array(rewards).astype(np.float32)
                        v_preds_next = np.array(v_preds_next).astype(dtype=np.float32)
                        gaes = np.array(gaes).astype(dtype=np.float32)
                        gaes = (gaes - gaes.mean())
                        PPO.assign_policy_parameters()
                        inp = [observations, actions, rewards, v_preds_next, gaes]
                        for epoch in range(5):
                            sample_indices = np.random.randint(low=0, high=observations.shape[0], size=64)
                            sampled_inp = [np.take(a=a, indices=sample_indices, axis=0) for a in inp]  # sample training data
                            PPO.train(obs=sampled_inp[0],
                                    actions=sampled_inp[1],
                                    rewards=sampled_inp[2],
                                    v_preds_next=sampled_inp[3],
                                    gaes=sampled_inp[4])
                        print(episodes, score)
                        save_path = saver.save(sess, './model/model.ckpt')
                        if episodes == 0:
                            f = open('test2.csv', 'w', encoding='utf-8', newline='')
                        else:
                            f = open('test2.csv', 'a', encoding='utf-8', newline='')
                        wr = csv.writer(f)
                        wr.writerow([episodes, score])
                        f.close()
                        break
                    state = next_state
Example No. 13
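A ROS node that trains PPO on a real gripper: observations, actions, and resets go through ROS services, and an episode ends when the object is dropped, the move fails, or the step limit is reached.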
    def __init__(self):
        rospy.init_node('runPPO', anonymous=True)

        Policy = Policy_net('policy', self.n_inputs, self.n_outputs)
        Old_Policy = Policy_net('old_policy', self.n_inputs, self.n_outputs)
        PPO = PPOTrain(Policy, Old_Policy, gamma=GAMMA, c_2=0.1)
        saver = tf.train.Saver()

        rospy.Subscriber('/RL/gripper_status', String,
                         self.callbackGripperStatus)
        # rospy.Service('/RL/net', net_eval, self.EvalNet)
        rospy.Service('/RL/start_learning', Empty, self.start_learning)
        obs_srv = rospy.ServiceProxy('/RL/observation', observation)
        drop_srv = rospy.ServiceProxy('/RL/IsObjDropped', IsDropped)
        move_srv = rospy.ServiceProxy('/RL/MoveGripper', TargetAngles)
        reset_srv = rospy.ServiceProxy('/RL/ResetGripper', Empty)
        pub_goal = rospy.Publisher('/RL/Goal',
                                   Float32MultiArray,
                                   queue_size=10)

        gg = Float32MultiArray()
        gg.data = self.g

        with tf.Session() as sess:
            # $ tensorboard --logdir=logs
            # http://0.0.0.0:6006/
            writer = tf.summary.FileWriter(
                '/home/pracsys/catkin_ws/src/rutgers_collab/src/rl_pkg/src/PPO/log/train',
                sess.graph)

            sess.run(tf.global_variables_initializer())
            reward = 0
            success_num = 0

            episode_count = 0
            rate = rospy.Rate(100)  # 100hz
            while not rospy.is_shutdown():

                if self.stLearning:
                    ## Start episode ##
                    episode_count += 1

                    # Reset gripper
                    reset_srv()
                    while not self.gripper_closed:
                        rate.sleep()

                    # Get observation
                    obs = np.array(obs_srv().state)
                    self.prev_dis2goal = np.linalg.norm(self.g - obs[:2])

                    observations = []
                    actions = []
                    v_preds = []
                    rewards = []
                    run_policy_steps = 0
                    while True:  # run policy RUN_POLICY_STEPS which is much less than episode length
                        run_policy_steps += 1
                        print(
                            '[RL] Step %d in episode %d, distance to goal: %f.'
                            % (run_policy_steps, episode_count,
                               self.prev_dis2goal))

                        pub_goal.publish(gg)

                        obs = np.stack([obs]).astype(
                            dtype=np.float32
                        )  # prepare to feed placeholder Policy.obs

                        while 1:
                            act, v_pred = Policy.act(obs=obs, stochastic=True)
                            act = np.asscalar(act)
                            v_pred = np.asscalar(v_pred)
                            if act < 8:
                                break

                        # Act
                        suc = move_srv(self.A[act])
                        rospy.sleep(0.05)
                        rate.sleep()

                        if suc:
                            # Get observation
                            next_obs = np.array(obs_srv().state)
                            fail = drop_srv(
                            ).dropped  # Check if dropped - end of episode
                        else:
                            # End episode if overload or angle limits reached
                            rospy.logerr(
                                '[RL] Failed to move gripper. Episode declared failed.'
                            )
                            fail = True

                        reward, done = self.transition_reward(next_obs, fail)

                        observations.append(obs)
                        actions.append(act)
                        v_preds.append(v_pred)
                        rewards.append(
                            reward
                        )  # Weird that this is before the step - this is the reward of the previous action

                        print(
                            '[RL] Action %d yielded reward %f and position (%f,%f).'
                            % (act, reward, obs[0][0], obs[0][1]))

                        if run_policy_steps > self.max_steps:
                            done = True

                        if done:
                            v_preds_next = v_preds[1:] + [
                                0
                            ]  # next state of terminate state has 0 state value - adds zero in the end of the vector

                            reward = -1
                            break
                        else:
                            obs = next_obs

                        rate.sleep()

                    print('episode_length', run_policy_steps, 'episode_reward',
                          sum(rewards))

                    writer.add_summary(
                        tf.Summary(value=[
                            tf.Summary.Value(tag='episode_length',
                                             simple_value=run_policy_steps)
                        ]), episode_count)
                    writer.add_summary(
                        tf.Summary(value=[
                            tf.Summary.Value(tag='episode_reward',
                                             simple_value=sum(rewards))
                        ]), episode_count)

                    if sum(rewards) >= self.stop_bound:
                        success_num += 1
                        if success_num >= 100:
                            saver.save(
                                sess,
                                '/home/pracsys/catkin_ws/src/rutgers_collab/src/rl_pkg/logs/model_ppo.ckpt'
                            )
                            print('Clear!! Model saved.')
                            break
                    else:
                        success_num = 0

                    gaes = PPO.get_gaes(rewards=rewards,
                                        v_preds=v_preds,
                                        v_preds_next=v_preds_next)

                    # convert list to numpy array for feeding tf.placeholder
                    observations = np.reshape(observations,
                                              newshape=[-1] + list(
                                                  (self.n_inputs, )))
                    actions = np.array(actions).astype(dtype=np.int32)
                    rewards = np.array(rewards).astype(dtype=np.float32)
                    v_preds_next = np.array(v_preds_next).astype(
                        dtype=np.float32)
                    gaes = np.array(gaes).astype(dtype=np.float32)
                    gaes = (gaes - gaes.mean()) / gaes.std()

                    PPO.assign_policy_parameters()

                    inp = [observations, actions, rewards, v_preds_next, gaes]

                    # train
                    for epoch in range(4):
                        sample_indices = np.random.randint(
                            low=0, high=observations.shape[0],
                            size=64)  # indices are in [low, high)
                        sampled_inp = [
                            np.take(a=a, indices=sample_indices, axis=0)
                            for a in inp
                        ]  # sample training data
                        PPO.train(obs=sampled_inp[0],
                                  actions=sampled_inp[1],
                                  rewards=sampled_inp[2],
                                  v_preds_next=sampled_inp[3],
                                  gaes=sampled_inp[4])

                    summary = PPO.get_summary(obs=inp[0],
                                              actions=inp[1],
                                              rewards=inp[2],
                                              v_preds_next=inp[3],
                                              gaes=inp[4])[0]

                    writer.add_summary(summary, episode_count)

                if episode_count > self.max_episodes:
                    break

                rate.sleep()

            writer.close()
Example No. 14
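A heavily commented PPO training loop for the CartPole environment, covering rollout collection, GAE computation, and preparation of the training batches.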
def main():
    env = gym.make(ENV)  # Instantiate the CartPole environment
    env.seed(0)
    ob_space = env.observation_space  # Describes the shape of valid observations in the space
    Policy = Policy_net('policy', env)  # Create the policy network
    Old_Policy = Policy_net('old_policy',
                            env)  # Create the old-policy network
    PPO = PPOTrain(Policy, Old_Policy, gamma=GAMMA)
    saver = tf.train.Saver()

    with tf.Session() as sess:  # Session block
        writer = tf.summary.FileWriter('./log/train',
                                       sess.graph)  # Set the log directory
        sess.run(tf.global_variables_initializer())  # Initialize the networks

        obs = env.reset()  # Reset the environment and get the first observation
        reward = 0  # Stores the rewards
        success_num = 0  # Success counter

        for episode in range(EPISODES):  # Episode loop
            observations = []  # Array to store the observations
            actions = []  # Array to store the actions
            v_preds = []  # Array to store the value predictions
            rewards = []  # Array to store the rewards
            run_policy_steps = 0  # Step counter for each episode
            env.render()  # Render the environment

            while True:  # Run policy RUN_POLICY_STEPS, which is much less than the episode length
                run_policy_steps += 1  # Increment the step counter for this episode
                obs = np.stack([obs]).astype(
                    dtype=np.float32)  # prepare to feed placeholder Policy.obs
                act, v_pred = Policy.act(
                    obs=obs, stochastic=True
                )  # Run the neural network and get an action and the predicted value

                act = act.item()  # Convert the numpy arrays
                v_pred = v_pred.item()  # into Python scalars

                observations.append(
                    obs)  # Append the observation to the observations buffer
                actions.append(act)  # Append the action to the actions buffer
                v_preds.append(v_pred)  # Append v_pred to the v_preds buffer
                rewards.append(
                    reward)  # Append the reward to the rewards buffer

                next_obs, reward, done, info = env.step(
                    act
                )  # send the action to the environment and receive the next observation, the reward, and whether the step is done

                if done:  # If done is true ...

                    v_preds_next = v_preds[1:] + [
                        0
                    ]  # [1:] selects from the second element of the list onward, and + [0] appends an element with value zero at the end
                    # next state of the terminal state has 0 state value
                    obs = env.reset()  # Reset the environment
                    reward = -1  # set the reward to -1 (?)
                    break  # Exit the while loop
                else:  # Otherwise...
                    obs = next_obs  # Store the next observation in obs

            # Log episode statistics for TensorBoard
            writer.add_summary(
                tf.Summary(value=[
                    tf.Summary.Value(tag='episode_length',
                                     simple_value=run_policy_steps)
                ]), episode)
            writer.add_summary(
                tf.Summary(value=[
                    tf.Summary.Value(tag='episode_reward',
                                     simple_value=sum(rewards))
                ]), episode)

            # Stopping condition
            if sum(rewards) >= 195:  # If the total reward is at least 195
                success_num += 1  # Increment the success counter
                if success_num >= 100:  # After 100 consecutive successes
                    saver.save(sess, './model/model.ckpt')  # Save the model
                    print('Clear!! Model saved.')
                    break  # Exit the episode loop
            else:
                success_num = 0  # Otherwise reset the success counter

            print("EP: ", episode, " Rw: ",
                  sum(rewards))  # Print the episode number and its total reward

            gaes = PPO.get_gaes(rewards=rewards,
                                v_preds=v_preds,
                                v_preds_next=v_preds_next)  # Compute generalized advantage estimates (GAE)

            # Convert the lists to NumPy arrays to feed the tf.placeholders
            newshape = [-1] + list(ob_space.shape)  # e.g. [-1, 4] for CartPole
            observations = np.reshape(
                observations, newshape=newshape
            )  # before, each row of observations was an independent array; after the reshape it is a single 2-D array

            actions = np.array(actions).astype(dtype=np.int32)

            rewards = np.array(rewards).astype(dtype=np.float32)
            v_preds_next = np.array(v_preds_next).astype(dtype=np.float32)
            gaes = np.array(gaes).astype(dtype=np.float32)
            gaes = (gaes - gaes.mean()) / gaes.std(
            )  # Standardize the advantages: subtract the mean and divide by the standard deviation

            PPO.assign_policy_parameters()

            inp = [
                observations, actions, rewards, v_preds_next, gaes
            ]  # Training inputs: observations, actions, rewards, next-state value predictions, and advantages

            # Train
            for epoch in range(4):
                sample_indices = np.random.randint(
                    low=0, high=observations.shape[0],
                    size=64)  # indices are in [low, high)
                sampled_inp = []
                for a in inp:
                    sampled_inp.append(
                        np.take(a=a, indices=sample_indices,
                                axis=0))  # sample training data
                PPO.train(obs=sampled_inp[0],
                          actions=sampled_inp[1],
                          rewards=sampled_inp[2],
                          v_preds_next=sampled_inp[3],
                          gaes=sampled_inp[4])

            summary = PPO.get_summary(obs=inp[0],
                                      actions=inp[1],
                                      rewards=inp[2],
                                      v_preds_next=inp[3],
                                      gaes=inp[4])[0]

            writer.add_summary(summary, episode)
        writer.close()  # End of training
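
Policy_net is likewise only used, never defined, in these listings: it is built under a name scope (here with the environment as its second argument), and its act(obs, stochastic) method returns a sampled action together with a value prediction. A rough actor-critic sketch with that interface is shown below purely as an illustration; the layer sizes and the TF1 layer API are assumptions, not the original implementation.

import tensorflow as tf


class Policy_net:
    def __init__(self, name, env):
        ob_dim = env.observation_space.shape[0]
        n_actions = env.action_space.n
        with tf.variable_scope(name):
            self.obs = tf.placeholder(tf.float32, [None, ob_dim], name='obs')
            # Actor head: action probabilities
            h = tf.layers.dense(self.obs, 20, tf.nn.tanh)
            self.act_probs = tf.layers.dense(h, n_actions, tf.nn.softmax)
            # Critic head: state-value estimate
            hv = tf.layers.dense(self.obs, 20, tf.nn.tanh)
            self.v_preds = tf.layers.dense(hv, 1)
            # Stochastic action: sample from the categorical distribution
            self.act_stochastic = tf.multinomial(tf.log(self.act_probs), 1)
            # Deterministic action: argmax of the probabilities
            self.act_deterministic = tf.argmax(self.act_probs, axis=1)
            self.scope = tf.get_variable_scope().name

    def act(self, obs, stochastic=True):
        op = self.act_stochastic if stochastic else self.act_deterministic
        act, v = tf.get_default_session().run([op, self.v_preds],
                                              feed_dict={self.obs: obs})
        return act.reshape(-1), v.reshape(-1)

    def get_trainable_variables(self):
        return tf.get_collection(tf.GraphKeys.TRAINABLE_VARIABLES, self.scope)
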
Exemplo n.º 15
0
def main():
    listener()
    # env = gym.make('CartPole-v0')
    # env.seed(0)
    reset()
    obs = robot_state.robot_state
    ob_space = 4
    Policy = Policy_net('policy')
    Old_Policy = Policy_net('old_policy')
    PPO = PPOTrain(Policy, Old_Policy, gamma=GAMMA)
    saver = tf.train.Saver()

    with tf.Session() as sess:
        writer = tf.summary.FileWriter('./log/test', sess.graph)
        sess.run(tf.global_variables_initializer())
        saver.restore(sess, 'model/model.ckpt')
        reset()
        reward = 0
        success_num = 0

        for iteration in range(ITERATION):  # episode
            observations = []
            actions = []
            v_preds = []
            rewards = []
            run_policy_steps = 0
            while True:  # run policy RUN_POLICY_STEPS which is much less than episode length
                run_policy_steps += 1
                obs = np.stack([obs]).astype(dtype=np.float32)  # prepare to feed placeholder Policy.obs
                act, v_pred = Policy.act(obs=obs, stochastic=False)
                print('act: ', act, 'v_pred: ', v_pred)

                # act = np.asscalar(actions)
                # v_pred = np.asscalar(v_preds)

                observations.append(obs)
                actions.append(act)
                v_preds.append(v_pred)
                rewards.append(reward)

                reward, done = take_action(act)
                time.sleep(0.25)
                next_obs = robot_state.robot_state

                if done:
                    v_preds_next = v_preds[1:] + [0]  # next state of terminate state has value 0
                    reset()
                    obs = robot_state.robot_state
                    reward = -1
                    break
                else:
                    obs = next_obs

            writer.add_summary(tf.Summary(value=[tf.Summary.Value(tag='episode_length', simple_value=run_policy_steps)])
                               , iteration)
            writer.add_summary(tf.Summary(value=[tf.Summary.Value(tag='episode_reward', simple_value=sum(rewards))])
                               , iteration)

            # end condition of test
            if sum(rewards) >= 195:
                success_num += 1
                render = True
                if success_num >= 100:
                    print('Iteration: ', iteration)
                    print('Clear!!')
                    break
            else:
                success_num = 0

            gaes = PPO.get_gaes(rewards=rewards, v_preds=v_preds, v_preds_next=v_preds_next)

            # convert list to numpy array for feeding tf.placeholder
            observations = np.reshape(observations, [len(observations), 4])
            actions = np.array(actions).astype(dtype=np.int32)
            rewards = np.array(rewards).astype(dtype=np.float32)
            v_preds_next = np.array(v_preds_next).astype(dtype=np.float32)
            gaes = np.array(gaes).astype(dtype=np.float32)
            gaes = (gaes - gaes.mean())
            print('gaes', gaes)

            inp = [observations, actions, rewards, v_preds_next, gaes]

            
            # summary = PPO.get_summary(obs=inp[0],
            #                           actions=inp[1],
            #                           rewards=inp[2],
            #                           v_preds_next=inp[3],
            #                           gaes=inp[4])[0]

            # writer.add_summary(summary, iteration)
        writer.close()
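
Note that this test script only centers the advantages (gaes = (gaes - gaes.mean())), while the training script in the previous example also divides by the standard deviation. A tiny NumPy comparison with made-up advantage values illustrates the difference:

import numpy as np

gaes = np.array([0.8, -0.2, 1.5, -1.1], dtype=np.float32)

centered = gaes - gaes.mean()                     # zero mean, original scale kept
standardized = (gaes - gaes.mean()) / gaes.std()  # zero mean, unit variance

print('centered:    ', centered)
print('standardized:', standardized)
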
Exemplo n.º 16
0
env = sc2_env.SC2Env(
    map_name='CollectMineralShards',
    agent_interface_format=sc2_env.parse_agent_interface_format(
        feature_screen=16,
        feature_minimap=16,
        rgb_screen=None,
        rgb_minimap=None,
        action_space=None,
        use_feature_units=False),
    step_mul=4,
    game_steps_per_episode=None,
    disable_fog=False,
    visualize=False)
with tf.Session() as sess:
    Policy = Policy_net('policy')
    Old_Policy = Policy_net('old_policy')
    PPO = PPOTrain(Policy, Old_Policy)
    #sess.run(tf.global_variables_initializer())
    saver = tf.train.Saver()
    saver.restore(sess, "collectMineralShards/tmp/model.ckpt")

    for episodes in range(100000):
        observations = []
        actions_list = []
        v_preds = []
        spatial = []
        rewards = []

        obs = env.reset()

        done = False
        global_step = 0
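
PPOTrain.assign_policy_parameters, called before each round of updates in the other examples, is also never listed. A plausible sketch, assuming the Policy_net objects expose get_trainable_variables() as in the sketch further above, simply copies the current policy's variables into the old policy:

import tensorflow as tf


def build_assign_ops(policy, old_policy):
    # One tf.assign per variable pair; running them syncs old_policy <- policy
    return [tf.assign(old_var, cur_var)
            for old_var, cur_var in zip(old_policy.get_trainable_variables(),
                                        policy.get_trainable_variables())]


def assign_policy_parameters(sess, assign_ops):
    sess.run(assign_ops)
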
Exemplo n.º 17
0
def train():
    FLAGS(sys.argv)
    with sc2_env.SC2Env(
            map_name='MoveToBeacon',
            agent_interface_format=sc2_env.parse_agent_interface_format(
                feature_screen=64,
                feature_minimap=64,
                rgb_screen=None,
                rgb_minimap=None,
                action_space=None,
                use_feature_units=False),
            step_mul=step_mul,
            game_steps_per_episode=None,
            disable_fog=False,
            visualize=False) as env:
        r = tf.placeholder(tf.float32)  # scalar summary input (fed with the episode length below)
        rr = tf.summary.scalar('reward', r)
        merged = tf.summary.merge_all()
        expert_observations = np.genfromtxt('trajectory/observations.csv')
        expert_actions = np.genfromtxt('trajectory/actions.csv',
                                       dtype=np.int32)
        with tf.Session() as sess:
            Policy = Policy_net('policy', 2, 4)
            Old_Policy = Policy_net('old_policy', 2, 4)
            PPO = PPOTrain(Policy, Old_Policy)
            D = Discriminator()
            sess.run(tf.global_variables_initializer())
            saver = tf.train.Saver()
            writer = tf.summary.FileWriter('./board/gail', sess.graph)
            c = 0
            for episodes in range(100000):
                done = False
                obs = env.reset()
                while not 331 in obs[0].observation.available_actions:
                    actions = actAgent2Pysc2(100, obs)
                    obs = env.step(actions=[actions])
                state = obs2state(obs)
                observations = []
                actions_list = []
                rewards = []
                v_preds = []
                reward = 0
                global_step = 0
                while not done:
                    global_step += 1
                    state = np.stack([state]).astype(dtype=np.float32)
                    act, v_pred = Policy.act(obs=state, stochastic=True)
                    act, v_pred = np.asscalar(act), np.asscalar(v_pred)
                    observations.append(state)
                    actions_list.append(act)
                    rewards.append(reward)
                    v_preds.append(v_pred)
                    actions = actAgent2Pysc2(act, obs)
                    obs = env.step(actions=[actions])
                    next_state = obs2state(obs)
                    distance = obs2distance(obs)
                    if distance < 0.03 or global_step == 100:
                        done = True
                    if done:
                        v_preds_next = v_preds[1:] + [0]
                        break
                    state = next_state
                observations = np.reshape(observations, newshape=[-1, 2])
                actions_list = np.array(actions_list).astype(dtype=np.int32)
                for i in range(2):
                    sample_indices = (np.random.randint(
                        expert_observations.shape[0],
                        size=observations.shape[0]))
                    inp = [expert_observations, expert_actions]
                    sampled_inp = [
                        np.take(a=a, indices=sample_indices, axis=0)
                        for a in inp
                    ]  # sample training data
                    D.train(expert_s=sampled_inp[0],
                            expert_a=sampled_inp[1],
                            agent_s=observations,
                            agent_a=actions_list)
                d_rewards = D.get_rewards(agent_s=observations,
                                          agent_a=actions_list)
                d_rewards = np.reshape(d_rewards,
                                       newshape=[-1]).astype(dtype=np.float32)

                gaes = PPO.get_gaes(rewards=d_rewards,
                                    v_preds=v_preds,
                                    v_preds_next=v_preds_next)
                gaes = np.array(gaes).astype(dtype=np.float32)
                v_preds_next = np.array(v_preds_next).astype(dtype=np.float32)

                inp = [
                    observations, actions_list, gaes, d_rewards, v_preds_next
                ]
                PPO.assign_policy_parameters()
                for epoch in range(15):
                    sample_indices = np.random.randint(
                        low=0, high=observations.shape[0],
                        size=32)  # indices are in [low, high)
                    sampled_inp = [
                        np.take(a=a, indices=sample_indices, axis=0)
                        for a in inp
                    ]  # sample training data
                    PPO.train(obs=sampled_inp[0],
                              actions=sampled_inp[1],
                              gaes=sampled_inp[2],
                              rewards=sampled_inp[3],
                              v_preds_next=sampled_inp[4])
                summary = sess.run(merged, feed_dict={r: global_step})  # logs the episode length under the 'reward' tag
                writer.add_summary(summary, episodes)
                if global_step < 50:
                    c += 1
                else:
                    c = 0
                if c > 10:
                    saver.save(sess, './model/gail.cpkt')
                    print('save model')
                    break
                print(episodes, global_step, c)
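
The Discriminator used in this GAIL example (D.train on expert versus agent state-action pairs, D.get_rewards on the agent's pairs) is not listed either. A compact sketch under the usual GAIL formulation, where the surrogate reward is -log(1 - D(s, a)), is given below; the layer sizes, learning rate, one-hot action encoding, and the explicit sess argument are assumptions made for this illustration.

import tensorflow as tf


class Discriminator:
    def __init__(self, ob_dim=2, n_actions=4, lr=1e-4):
        with tf.variable_scope('discriminator'):
            self.expert_s = tf.placeholder(tf.float32, [None, ob_dim])
            self.expert_a = tf.placeholder(tf.int32, [None])
            self.agent_s = tf.placeholder(tf.float32, [None, ob_dim])
            self.agent_a = tf.placeholder(tf.int32, [None])

            def network(s, a, reuse):
                # Shared network scoring P(state-action pair comes from the expert)
                with tf.variable_scope('net', reuse=reuse):
                    sa = tf.concat([s, tf.one_hot(a, n_actions)], axis=1)
                    h = tf.layers.dense(sa, 20, tf.nn.tanh)
                    return tf.layers.dense(h, 1, tf.nn.sigmoid)

            d_expert = network(self.expert_s, self.expert_a, reuse=False)
            d_agent = network(self.agent_s, self.agent_a, reuse=True)

            # Standard GAN-style discriminator loss
            loss = -tf.reduce_mean(tf.log(d_expert + 1e-8)
                                   + tf.log(1.0 - d_agent + 1e-8))
            d_vars = tf.get_collection(tf.GraphKeys.TRAINABLE_VARIABLES,
                                       'discriminator')
            self.train_op = tf.train.AdamOptimizer(lr).minimize(loss,
                                                                var_list=d_vars)
            # Surrogate reward handed to PPO: higher when D thinks "expert"
            self.rewards = -tf.log(1.0 - d_agent + 1e-8)

    def train(self, sess, expert_s, expert_a, agent_s, agent_a):
        sess.run(self.train_op, feed_dict={
            self.expert_s: expert_s, self.expert_a: expert_a,
            self.agent_s: agent_s, self.agent_a: agent_a})

    def get_rewards(self, sess, agent_s, agent_a):
        return sess.run(self.rewards, feed_dict={
            self.agent_s: agent_s, self.agent_a: agent_a})
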
Exemplo n.º 18
0
def train():
    FLAGS(sys.argv)
    with sc2_env.SC2Env(map_name="MoveToBeacon",
                        step_mul=step_mul,
                        screen_size_px=(16, 16),
                        minimap_size_px=(16, 16)) as env:
        Policy = Policy_net('policy', 16 * 16 * 2, 4)
        Old_Policy = Policy_net('old_policy', 16 * 16 * 2, 4)
        PPO = PPOTrain(Policy, Old_Policy, gamma=0.95)
        with tf.Session() as sess:
            sess.run(tf.global_variables_initializer())
            for episodes in range(EPISODES):
                done = False
                obs = env.reset()
                while not 331 in obs[0].observation["available_actions"]:
                    actions = actAgent2Pysc2(100, obs)
                    obs = env.step(actions=[actions])
                state = np.array(obs2state(obs))
                print('episode start')
                global_step = 0
                reward = 0

                observations = []
                actions_list = []
                v_preds = []
                rewards = []

                while not done:
                    global_step += 1
                    time.sleep(0.05)

                    state = np.stack([state]).astype(dtype=np.float32)
                    act, v_pred = Policy.act(obs=state, stochastic=True)
                    act, v_pred = np.asscalar(act), np.asscalar(v_pred)
                    actions = actAgent2Pysc2(act, obs)
                    obs = env.step(actions=[actions])

                    for i in range(1):
                        actions = no_operation(obs)
                        obs = env.step(actions=[actions])
                    distance = obs2distance(obs)
                    if global_step == 1:
                        pre_distance = distance
                    next_state = np.array(obs2state(obs))
                    # Distance-based shaped reward (how much closer the marine got
                    # to the beacon); it is overridden below by a constant step penalty.
                    reward = -10 * (distance - pre_distance)
                    reward = -0.01
                    if distance < 0.03 or global_step == 100:  # end of the episode
                        if distance < 0.03:
                            reward = 1  # reached the beacon
                        elif global_step == 100:
                            reward = -1  # timed out
                        done = True

                    observations.append(state)
                    actions_list.append(act)
                    v_preds.append(v_pred)
                    rewards.append(reward)

                    if distance < 0.03 or global_step == 100:  # end of the episode
                        v_preds_next = v_preds[1:] + [0]
                        gaes = PPO.get_gaes(rewards=rewards,
                                            v_preds=v_preds,
                                            v_preds_next=v_preds_next)
                        observations = np.reshape(observations,
                                                  newshape=[-1, 16 * 16 * 2])
                        actions = np.array(actions_list).astype(dtype=np.int32)
                        rewards = np.array(rewards).astype(dtype=np.float32)
                        v_preds_next = np.array(v_preds_next).astype(
                            dtype=np.float32)
                        gaes = np.array(gaes).astype(dtype=np.float32)
                        gaes = (gaes - gaes.mean())

                        PPO.assign_policy_parameters()

                        inp = [
                            observations, actions, rewards, v_preds_next, gaes
                        ]
                        for epoch in range(5):
                            sample_indices = np.random.randint(
                                low=0, high=observations.shape[0],
                                size=64)  # indices are in [low, high)
                            sampled_inp = [
                                np.take(a=a, indices=sample_indices, axis=0)
                                for a in inp
                            ]  # sample training data
                            PPO.train(obs=sampled_inp[0],
                                      actions=sampled_inp[1],
                                      rewards=sampled_inp[2],
                                      v_preds_next=sampled_inp[3],
                                      gaes=sampled_inp[4])
                        print(episodes, global_step)
                        break
                    state = next_state
                    pre_distance = distance
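
Example 17 above loads expert demonstrations from trajectory/observations.csv and trajectory/actions.csv with np.genfromtxt. Trajectories collected by a rollout loop like the ones in these examples could be dumped in a compatible, whitespace-delimited format with something like the following hypothetical helper (the file paths and the [T, D] observation layout are assumptions):

import numpy as np


def save_expert_trajectory(observations, actions_list, obs_dim,
                           obs_path='trajectory/observations.csv',
                           act_path='trajectory/actions.csv'):
    # observations: T collected states, reshaped to [T, obs_dim]
    # actions_list: T discrete action indices
    np.savetxt(obs_path, np.asarray(observations).reshape(-1, obs_dim))
    np.savetxt(act_path, np.asarray(actions_list, dtype=np.int32), fmt='%d')
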
Exemplo n.º 19
0
def main():
    # Initialize OpenAI_ROS ENV
    LoadYamlFileParamsTest(rospackage_name="learning_ros",
                           rel_path_from_package_to_file="config",
                           yaml_file_name="aliengo_stand.yaml")
    env = StartOpenAI_ROS_Environment('AliengoStand-v0')
    time.sleep(3)
    # Initialize PPO agent
    Policy = Policy_net('policy', ob_space=3, act_space=8)
    Old_Policy = Policy_net('old_policy', ob_space=3, act_space=8)
    PPO = PPOTrain(Policy, Old_Policy, gamma=GAMMA)
    saver = tf.train.Saver()

    with tf.Session() as sess:
        sess.run(tf.global_variables_initializer())
        obs = env.reset()
        reward = 0
        success_num = 0
        scores = []

        for iteration in range(ITERATION):
            observations = []
            actions = []
            v_preds = []
            rewards = []
            while not rospy.is_shutdown():  # loop until ROS is shut down
                # prepare to feed placeholder Policy.obs
                obs = np.stack([obs]).astype(dtype=np.float32)
                act, v_pred = Policy.act(obs=obs, stochastic=True)
                #print('act: ',act, 'v_pred: ',v_pred )

                act = np.asscalar(act)
                v_pred = np.asscalar(v_pred)
                observations.append(obs)
                actions.append(act)
                v_preds.append(v_pred)
                rewards.append(reward)
                # Execute the action in the environment
                next_obs, reward, done, _ = env.step(act)
                time.sleep(0.01)
                if done:
                    # next state of terminate state has 0 state value
                    v_preds_next = v_preds[1:] + [0]
                    obs = env.reset()
                    break
                else:
                    obs = next_obs
            # Store the score for visualization
            scores.append(sum(rewards))
            # End the training after enough consecutive high-reward episodes
            if sum(rewards) >= 400:
                success_num += 1
                print("Success number: " + str(success_num))
                if success_num >= 5:
                    saver.save(sess, 'model_train/model_ppo.ckpt')
                    print('Clear!! Model saved.')
                if success_num >= 10:
                    saver.save(sess, 'model_train/model_ppo.ckpt')
                    print('Finished! ')
                    break

            else:
                success_num = 0

            gaes = PPO.get_gaes(rewards=rewards,
                                v_preds=v_preds,
                                v_preds_next=v_preds_next)

            # convert list to numpy array for feeding tf.placeholder
            observations = np.reshape(observations, [len(observations), 3])
            actions = np.array(actions).astype(dtype=np.int32)
            rewards = np.array(rewards).astype(dtype=np.float32)
            v_preds_next = np.array(v_preds_next).astype(dtype=np.float32)
            # Calculate generalized advantage estimates (GAE)
            gaes = np.array(gaes).astype(dtype=np.float32)
            gaes = (gaes - gaes.mean())
            print('gaes', gaes)
            #assign current policy params to previous policy params
            PPO.assign_policy_parameters()

            inp = [observations, actions, rewards, v_preds_next, gaes]

            # PPO train
            for epoch in range(4):
                sample_indices = np.random.randint(
                    low=0, high=observations.shape[0],
                    size=64)  # indices are in [low, high)
                sampled_inp = [
                    np.take(a=a, indices=sample_indices, axis=0) for a in inp
                ]  # sample training data
                PPO.train(obs=sampled_inp[0],
                          actions=sampled_inp[1],
                          rewards=sampled_inp[2],
                          v_preds_next=sampled_inp[3],
                          gaes=sampled_inp[4])
        plt.plot(scores)
        plt.show()
        env.stop()
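
Finally, PPOTrain itself, whose train call drives every example in this collection, is never shown. A condensed sketch of the clipped-surrogate update, assuming the Policy_net interface sketched earlier and treating the clip value, loss coefficients, and learning rate as assumptions, could look like this:

import tensorflow as tf


class PPOTrain:
    def __init__(self, Policy, Old_Policy, gamma=0.95,
                 clip_value=0.2, c_vf=1.0, c_ent=0.01, lr=1e-4):
        self.Policy, self.Old_Policy, self.gamma = Policy, Old_Policy, gamma

        # Placeholders for one minibatch
        self.actions = tf.placeholder(tf.int32, [None])
        self.rewards = tf.placeholder(tf.float32, [None])
        self.v_preds_next = tf.placeholder(tf.float32, [None])
        self.gaes = tf.placeholder(tf.float32, [None])

        n_actions = Policy.act_probs.get_shape().as_list()[1]
        act_onehot = tf.one_hot(self.actions, depth=n_actions)
        prob = tf.reduce_sum(Policy.act_probs * act_onehot, axis=1)
        prob_old = tf.reduce_sum(Old_Policy.act_probs * act_onehot, axis=1)

        # Probability ratio and clipped surrogate objective
        ratio = prob / (prob_old + 1e-10)
        clipped = tf.clip_by_value(ratio, 1.0 - clip_value, 1.0 + clip_value)
        loss_clip = -tf.reduce_mean(tf.minimum(ratio * self.gaes,
                                               clipped * self.gaes))

        # Value loss against the one-step TD target r + gamma * V(s')
        v = tf.squeeze(Policy.v_preds, axis=1)
        loss_vf = tf.reduce_mean(tf.square(
            self.rewards + self.gamma * self.v_preds_next - v))

        # Entropy bonus to keep the policy exploratory
        entropy = -tf.reduce_mean(tf.reduce_sum(
            Policy.act_probs * tf.log(Policy.act_probs + 1e-10), axis=1))

        loss = loss_clip + c_vf * loss_vf - c_ent * entropy
        self.train_op = tf.train.AdamOptimizer(lr).minimize(
            loss, var_list=Policy.get_trainable_variables())

    def train(self, obs, actions, rewards, v_preds_next, gaes):
        tf.get_default_session().run(self.train_op, feed_dict={
            self.Policy.obs: obs, self.Old_Policy.obs: obs,
            self.actions: actions, self.rewards: rewards,
            self.v_preds_next: v_preds_next, self.gaes: gaes})
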