Example No. 1
    def play_deterministic(self, n_tot):

        self.model.eval()
        env = Env()
        render = args.render

        n_human = 60
        humans_trajectories = iter(self.data)
        reverse_excitation_index = consts.reverse_excitation_index

        for i in range(n_tot):

            env.reset()

            observation = next(humans_trajectories)
            print("Observation %s" % observation)
            trajectory = self.data[observation]

            j = 0

            ims = []
            # fig = plt.figure()
            while not env.t:

                if j < n_human:
                    a = trajectory[j, self.meta['action']]

                else:

                    # im = plt.imshow(np.rollaxis(env.s.numpy().squeeze(0)[:3], 0, 3), animated=True)
                    # ims.append([im])
                    if self.cuda:
                        s = Variable(env.s.cuda(), requires_grad=False)
                    else:
                        s = Variable(env.s, requires_grad=False)
                    _, _, beta, _, _, _ = self.model(s)

                    beta = beta.squeeze(0)
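                    # threshold each excitation into {-1, 0, +1}; the resulting
                    # tuple indexes reverse_excitation_index to obtain a discrete
                    # action id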
                    beta = beta.sign().int() * (beta.abs() > 0.5).int()
                    a = reverse_excitation_index[tuple(beta.data)]

                env.step(a)

                j += 1

            # if render:
            #     ani = animation.ArtistAnimation(fig, ims, interval=10, blit=True,
            #                                     repeat=False)
            #     plt.show()

            yield env.score
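
play_deterministic is a generator that yields one final score per episode. A minimal consumption sketch, assuming player is an instance of the surrounding class:

    scores = list(player.play_deterministic(n_tot=10))  # run 10 evaluation episodes
    print('mean score:', sum(scores) / len(scores))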
Example No. 2
    def play_episode_deterministic(self, n_tot):
        self.model.eval()
        env = Env()

        n_human = 300
        humans_trajectories = iter(self.data)
        reverse_excitation_index = consts.reverse_excitation_index

        for i in range(n_tot):

            env.reset()
            observation = next(humans_trajectories)
            trajectory = self.data[observation]

            j = 0

            while not env.t:

                s = Variable(env.s.cuda(), requires_grad=False)
                v, q, beta, r, p, phi = self.model(s)
                beta = beta.squeeze(0)

                if j < n_human:
                    a = trajectory[j, self.meta['action']]

                else:

                    # threshold each excitation into {-1, 0, +1}, force the first
                    # component non-negative, and decode the tuple to an action id
                    beta_index = (beta.sign().int() *
                                  (beta.abs() > 0.5).int()).data.cpu().numpy()
                    beta_index[0] = abs(beta_index[0])
                    a = reverse_excitation_index[tuple(beta_index)]

                env.step(a)

                # x = phi.squeeze(0).data.cpu().numpy()
                # print(np.mean(abs(x)))
                # yield v, q, beta, r, p, s
                yield {
                    'o': env.s.cpu().numpy(),
                    'v': v.data.cpu().numpy(),
                    's': phi.data.cpu().numpy(),
                    'score': env.score,
                    'beta': beta.data.cpu().numpy(),
                    'phi': phi.squeeze(0).data.cpu().numpy()
                }

                j += 1

        return
Example No. 3
def experiment1_test(
    output_folder,
    word_vectors,
    agent,
    episode_index,
    testset_path='./dataset/conll2003/en/eng.testb',
):
    # initialize the environment
    env = Env(testset_path, word_vectors)
    step = 0
    s = env.reset()
    print('[' + util.now_time() + "] start testing...")
    while True:
        # check task is ended
        if env.end():
            print('[' + util.now_time() + "] testing...done")
            result_file = '%03d_episode_test.txt' % (episode_index + 1)
            env.save_all_newlines_to_file(output_folder, result_file)
            return evaluate.conlleval(output_folder, result_file)

        # Choose Action a
        a = agent.choose_action(s)

        # Execute action
        s_, r = env.step(a)

        # Next status
        step += 1
        s = s_
Example No. 4
def cartpole():
    env = Env('localhost:32822')
    env.make(ENV_NAME)
    score_logger = ScoreLogger(ENV_NAME)
    observation_space = env.observation_space.box.shape[0]
    action_space = env.action_space.discrete.n
    dqn_solver = DQNSolver(observation_space, action_space)
    run = 0
    while True:
        run += 1
        state = env.reset()
        # print(state)
        state = np.reshape(state, [1, observation_space])
        step = 0
        while True:
            step += 1
            # env.render()
            print("acting on state: ", state)
            action = dqn_solver.act(state)
            state_next, reward, terminal, info = env.step(action)
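            # flip the reward sign when the episode terminates, a common CartPole
            # shaping trick that penalizes dropping the pole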
            reward = reward if not terminal else -reward
            state_next = np.reshape(state_next, [1, observation_space])
            dqn_solver.remember(state, action, reward, state_next, terminal)
            state = state_next
            if terminal:
                print("Run: " + str(run) + ", exploration: " + str(dqn_solver.exploration_rate) + ", score: " + str(step))
                score_logger.add_score(step, run)
                plt.plot(dqn_solver.loss)
                plt.title('Model loss')
                plt.ylabel('Loss')
                plt.xlabel('Episode')
                plt.savefig("loss.png")
                break
            dqn_solver.experience_replay()
Example No. 5
    def play_deterministic(self, n_tot):

        self.model.eval()
        env = Env()
        render = args.render

        n_human = 60
        humans_trajectories = iter(self.data)
        reverse_excitation_index = consts.reverse_excitation_index

        for i in range(n_tot):

            env.reset()

            observation = next(humans_trajectories)
            print("Observation %s" % observation)
            trajectory = self.data[observation]

            j = 0

            while not env.t:

                if j < n_human:
                    a = trajectory[j, self.meta['action']]

                else:

                    if self.cuda:
                        s = Variable(env.s.cuda(), requires_grad=False)
                    else:
                        s = Variable(env.s, requires_grad=False)
                    _, _, beta, _, _, _ = self.model(s)

                    beta = beta.squeeze(0)
                    beta = (beta.sign().int() * (beta.abs() > 0.5).int()).data
                    if self.cuda:
                        beta = beta.cpu().numpy()
                    else:
                        beta = beta.numpy()
                    beta[0] = abs(beta[0])
                    a = reverse_excitation_index[tuple(beta)]

                env.step(a)

                j += 1

            yield {'o': env.s.cpu().numpy(), 'score': env.score}
Example No. 6
def experiment1_train(
    output_folder,
    word_vectors,
    n_episodes=300,
    trainset_path='./dataset/conll2003/en/eng.train',
):
    # initialize the environment
    print('[' + util.now_time() + "] init environment...")
    env = Env(trainset_path, word_vectors)
    print('[' + util.now_time() + "] environment initialized")

    # initialize the DQN agent
    print('[' + util.now_time() + "] init agent...")
    agent = DQN(n_actions=env.n_actions,
                status_dim=env.status_dim,
                action_dim=env.action_dim,
                reward_dim=env.reward_dim)
    print('[' + util.now_time() + "] agent initialized")

    # iterate over the training episodes
    for i in range(n_episodes):
        print('[' + util.now_time() + "] start episode %03d of learning..." %
              (i + 1))
        step = 0
        s = env.reset()

        while True:
            # check task is ended
            if env.end():
                print('[' + util.now_time() +
                      "] episode %03d of learning...done" % (i + 1))
                result_file = '%03d_episode_train.txt' % (i + 1)
                env.save_all_newlines_to_file(output_folder, result_file)
                train_eval = evaluate.conlleval(output_folder, result_file)
                test_eval = experiment1_test(output_folder, word_vectors,
                                             agent, i)
                break

            # Choose Action a
            a = agent.choose_action(s)

            # Execute action
            # print('step %d' % step)
            s_, r = env.step(a)

            agent.store_transition(s, a, r, s_)

            step += 1
            s = s_

            if step > 200 and step % 5 == 0:
                agent.learn()

    # plot and compare train and test set TODO
    # plot(train_evals,test_evals)
    agent.eval_network.save(output_folder + os.path.sep + 'ex1_eval_model',
                            overwrite=True)
Example No. 7
class CarEnvironment(Environment):
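    # Wraps the grid-based car simulation in a PyBrain-style Environment
    # interface (getSensors / performAction / indim / outdim)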
    
    def __init__(self):
        self.action = [0.0, 0.0]
        self.delay = False
        self.grid = Grid()
        self.env = Env(self.grid)
        self.reset()

    def step(self):
        # Simulate a step in the environment
        self.agent.brain.stored_action = self.action
        self.env.tick()
        self.env.calculate_moves()
        self.env.do_moves()
        self.env.print_env(self.env.tick_number)
    
    def reset(self):
        self.env.reset()
        self.agent = self.env.cars[0]
        # (x, y, vel, deg, goal_x, goal_y, dist_to_goal, dir_of_goal, direction_diff)
        self.sensors = self.agent.brain.get_state_tuple()
        self.distance_to_goal = self.agent.brain.distance_to_goal()
        
    def getSensors(self):
        # (x, y, vel, deg, goal_x, goal_y, dist_to_goal, dir_of_goal, direction_diff)
        return self.agent.brain.get_state_tuple()
    
    def getCarState(self):
        return self.agent.brain.get_state()
    
    def in_goal_state(self):
        return self.agent.brain.get_state().reached_goal
    
    def performAction(self, action):
        self.action = action
        self.step()
    
    def indim(self):
        return 2
    
    def outdim(self):
        return len(self.getSensors())
Example No. 8
def run_exp(cfg=None):
    logger = Logger(cfg)
    agent = DQNAgent(cfg)
    env = Env(cfg)
    trainer = Trainer(env, agent, cfg)

    cfg = cfg.exp
    n_training_steps = cfg.n_episodes // cfg.train_after
    global_step = 0
    state = env.reset()
    joint_angles = np.empty(cfg.n_episodes)
    for step in range(cfg.n_episodes):
        state = trainer.single_step(state)
        # agent training
        if global_step % cfg.train_after == (cfg.train_after - 1):
            print(f"step: {step}")
            print("Training agents")
            # fw model warmup phase of 2000 steps
            metrics_dict = agent.train(
                cfg.train_iv, cfg.train_fw,
                cfg.train_policy if global_step >= 0 else False)
            logger.log_metrics(metrics_dict, global_step)
            logger.log_all_network_weights(agent.joint_agents[0], step)
            agent.decrease_eps(n_training_steps)

        # video logging
        if global_step % cfg.video_after == 0:
            print("logging video")
            vis, debug0, debug1 = trainer.record_frames(debug_cams=True)
            logger.log_vid_debug_cams(vis, debug0, debug1, global_step)

        # distractor toggling
        if global_step % cfg.toggle_table_after == (cfg.toggle_table_after -
                                                    1):
            env.toggle_table()

        global_step += 1
        pos = env.get_joint_positions()[0]
        joint_angles[step] = pos

    joint_angles = np.degrees(-joint_angles)
    plt.hist(joint_angles, bins=20, range=(0, 170))
    plt.savefig(os.path.join("plots", "explored_angles.png"))
Example No. 9
            for station in range(10):
                target_station_id = np.random.randint(num_stations)
                dests.append(mapo[target_station_id])  # get real station ID

            arriveTimes = [np.random.randint(12) for _ in range(10)]

            # cal. incentive
            preds = np.array(env.calIncentive(dests, arriveTimes, infos))
            max_index = np.random.choice(np.flatnonzero(
                preds == preds.max()))  # randomly pick max value's index
            dest = dests[max_index]

            # rent bike
            # get a starting point
            if env.bikes.count(0) == num_stations:
                env.reset(num_stations, num_bikes_per_station)

            source = None
            while True:
                target_station_id = np.random.randint(num_stations)
                if env.bikes[target_station_id] == 0:
                    continue
                else:
                    source = mapo[target_station_id]
                    break

            res = agent.rentBike(source, dest, i, arriveTimes[max_index], args)
            if res is False:
                bankrupts.append(agent_idx)
            else:
                # cal. number of bikes
Example No. 10
def main():
    rospy.init_node('ddpg_stage_1')
    env = Env(is_training)
    agent = DDPG(env, state_dim, action_dim)

    # import ipdb
    # ipdb.set_trace()

    past_action = np.array([0., 0.])
    print('State Dimensions: ' + str(state_dim))
    print('Action Dimensions: ' + str(action_dim))
    print('Action Max: ' + str(action_linear_max) + ' m/s and ' +
          str(action_angular_max) + ' rad/s')
    print('Action Min: ' + str(action_linear_min) + ' m/s and ' +
          str(action_angular_min) + ' rad/s')

    #########################################################################################
    #                                 Training
    #########################################################################################
    if is_training:
        print('Training mode')
        avg_reward_his = []
        total_reward = 0
        action_var = 0.2
        success_rate = 0

        # Log path setting
        now = datetime.datetime.now()
        logdir = now.strftime('%Y-%m-%d') + '_' + now.strftime('%H-%M')
        logdir = os.path.join(log_dir, logdir)
        # tb_writer = SummaryWriter(logdir)

        # Start training
        start_time = time.time()
        for itr in range(10000):
            state = env.reset()

            # episode_reward = 0.0
            # For each episode
            for cur_step in range(max_episode_length):
                action = agent.action(state)
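                # perturb each action dimension with Gaussian exploration noise,
                # then clip to the linear/angular velocity limits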
                action[0] = np.clip(np.random.normal(action[0], action_var),
                                    action_linear_min, action_linear_max)
                action[1] = np.clip(np.random.normal(action[1], action_var),
                                    action_angular_min, action_angular_max)

                state_, reward, done, arrive = env.step(action, past_action)
                time_step = agent.perceive(state, action, reward, state_, done)

                ########################################################################################
                #                                   debugging environment
                ########################################################################################
                if is_debugging:
                    print('cur_step: {}'.format(cur_step))
                    print('action: {}'.format(action))
                    print('goal position: x:{}, y:{}'.format(
                        env.goal_position.position.x,
                        env.goal_position.position.y))
                    print('r: {}, done: {}, arrive: {}'.format(
                        reward, done, arrive))
                ########################################################################################

                result = 'Success' if arrive else 'Fail'

                if time_step > 0:
                    total_reward += reward

                if time_step % 10000 == 0 and time_step > 0:
                    print(
                        '---------------------------------------------------')
                    avg_reward = total_reward / 10000
                    print('Average_reward: {}'.format(avg_reward))
                    avg_reward_his.append(round(avg_reward, 2))
                    # writer.add_scalar('avg_reward', avg_reward, time_step)
                    print('Overall average Reward: {}'.format(avg_reward_his))
                    total_reward = 0

                if time_step % 5 == 0 and time_step > exploration_decay_start_step:
                    action_var *= 0.9999

                past_action = action
                state = state_

                if arrive or done or cur_step >= max_episode_length:
                    if result == 'Success':
                        success_rate += 1
                    sec = time.time() - start_time
                    elapsed_time = str(
                        datetime.timedelta(seconds=sec)).split('.')[0]
                    print(
                        'Num_episode: {}, Full steps: {}, Result: {}, Elapsed time: {}'
                        .format(itr, cur_step, result, elapsed_time))

                    if itr % 20 == 0 and itr > 0:
                        print('Total: {}/20, Success rate: {}'.format(
                            success_rate, round(success_rate / 20, 2)))
                        success_rate = 0

                    break


#########################################################################################
#                                 Testing
#########################################################################################
    else:
        print('Testing mode')
        while True:
            state = env.reset()
            one_round_step = 0

            while True:
                a = agent.action(state)
                a[0] = np.clip(a[0], 0., 1.)
                a[1] = np.clip(a[1], -0.5, 0.5)
                state_, reward, done, arrive = env.step(a, past_action)
                past_action = a
                state = state_
                one_round_step += 1

                if arrive:
                    print('Step: %3i' % one_round_step, '| Arrive!!!')
                    one_round_step = 0

                if done:
                    print('Step: %3i' % one_round_step, '| Collision!!!')
                    break
Example No. 11
    def play_episode(self, n_tot):

        self.beta_net.eval()
        self.beta_target.eval()

        self.pi_net.eval()
        self.pi_target.eval()

        self.vb_net.eval()
        self.vb_target.eval()

        self.q_net.eval()
        self.q_target.eval()

        self.qb_net.eval()
        self.qb_target.eval()

        env = Env()

        n_human = 120
        humans_trajectories = iter(self.data)
        softmax = torch.nn.Softmax()

        for i in range(n_tot):

            env.reset()
            observation = next(humans_trajectories)
            trajectory = self.data[observation]
            choices = np.arange(self.global_action_space, dtype=np.int)
            mask = Variable(torch.FloatTensor(
                [0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 1, 1, 0, 0, 0, 0, 0]),
                            requires_grad=False).cuda()
            j = 0
            temp = 1

            while not env.t:

                s = Variable(env.s.cuda(), requires_grad=False)

                beta, phi = self.beta_net(s)
                pi, _ = self.pi_net(s)
                q, _ = self.q_net(s)
                vb, _ = self.vb_net(s)

                pi = beta.squeeze(0)
                self.greedy = False

                if j < n_human:
                    a = trajectory[j, self.meta['action']]

                else:
                    # eps = np.random.rand()
                    eps = 1
                    # a = np.random.choice(choices)
                    if self.greedy and eps > 0.01:
                        a = pi.data.cpu().numpy()
                        a = np.argmax(a)
                    else:
                        a = softmax(pi / temp).data.cpu().numpy()
                        a = np.random.choice(choices, p=a)

                q = q[0, a]
                q = q.squeeze(0)

                env.step(a)

                yield {
                    'o': env.s.cpu().numpy(),
                    'v': vb.squeeze(0).data.cpu().numpy(),
                    'vb': vb.squeeze(0).data.cpu().numpy(),
                    'qb': q.squeeze(0).data.cpu().numpy(),
                    # 's': x[0, :512].data.cpu().numpy(),
                    'score': env.score,
                    'beta': pi.data.cpu().numpy(),
                    'phi': phi.squeeze(0).data.cpu().numpy(),
                    'q': q.squeeze(0).data.cpu().numpy()
                }

                j += 1

        return
Example No. 12
    def play(self, n_tot, action_offset, player):

        self.beta_net.eval()
        self.beta_target.eval()

        self.pi_net.eval()
        self.pi_target.eval()

        self.vb_net.eval()
        self.vb_target.eval()

        self.q_net.eval()
        self.q_target.eval()

        self.qb_net.eval()
        self.qb_target.eval()

        env = Env(action_offset)

        n_human = 90

        episodes = list(self.data.keys())
        random.shuffle(episodes)
        humans_trajectories = iter(episodes)

        for i in range(n_tot):

            env.reset()
            trajectory = self.data[next(humans_trajectories)]
            choices = np.arange(self.global_action_space, dtype=np.int)
            random_choices = self.mask_q.data.cpu().numpy()
            random_choices = random_choices / random_choices.sum()

            j = 0

            while not env.t:

                s = Variable(env.s.cuda(), requires_grad=False)

                if player == 'beta':
                    pi, _ = self.beta_net(s)
                    pi = pi.squeeze(0)
                    self.greedy = False

                elif player == 'q_b':
                    pi, _ = self.qb_net(s)
                    pi = pi.squeeze(0)
                    self.greedy = True

                elif player == 'pi':
                    pi, _ = self.pi_net(s)
                    pi = pi.squeeze(0)
                    self.greedy = False

                elif player == 'q_pi':
                    pi, _ = self.q_net(s)
                    pi = pi.squeeze(0)
                    self.greedy = True

                else:
                    raise NotImplementedError

                if j < n_human:
                    a = trajectory[j, self.meta['action']]

                else:
                    eps = np.random.rand()
                    # eps = 1
                    # a = np.random.choice(choices)
                    if self.greedy:
                        if eps > 0.01:
                            a = (pi * self.mask_q).data.cpu().numpy()
                            a = np.argmax(a)
                        else:
                            a = np.random.choice(choices, p=random_choices)
                    else:
                        a = F.softmax(pi + self.mask_beta,
                                      dim=0).data.cpu().numpy()
                        a = np.random.choice(choices, p=a)

                env.step(a)

                j += 1

            yield {'score': env.score, 'frames': j}

        return
Example No. 13
    def play_episode(self, n_tot):

        self.model.eval()
        env = Env()

        n_human = 120
        humans_trajectories = iter(self.data)
        softmax = torch.nn.Softmax()

        mask = torch.FloatTensor(consts.actions_mask[args.game])
        mask = Variable(mask.cuda(), requires_grad=False)
        # self.actions_matrix = torch.FloatTensor([[0, 0, 0], [1, 0, 0],[0, 1, 0], [0, 0, 1]])

        for i in range(n_tot):

            env.reset()
            observation = next(humans_trajectories)
            trajectory = self.data[observation]
            choices = np.arange(self.global_action_space, dtype=np.int)

            j = 0

            while not env.t:

                s = Variable(env.s.cuda(), requires_grad=False)
                v, q, beta, _, _, phi = self.model(s, self.actions_matrix)
                beta = beta.squeeze(0)
                q = q.squeeze(2)
                q = q.squeeze(0)

                q = q * mask
                # beta[0] = 0
                temp = 0.1
                if True:  # self.imitation:

                    # consider only 3 most frequent actions
                    beta_np = beta.data.cpu().numpy()
                    indices = np.argsort(beta_np)

                    # maskb = Variable(torch.FloatTensor([i in indices[14:18] for i in range(18)]), requires_grad=False)
                    # maskb = Variable(torch.FloatTensor([1, 1, 0, 1, 1, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0]), requires_grad=False)
                    # maskb = maskb.cuda()
                    # pi = maskb * (q / q.max())

                    maskb = Variable(torch.FloatTensor(
                        [1, 1, 0, 1, 1, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0,
                         0]),
                                     requires_grad=False)
                    maskb = maskb.cuda()
                    pi = maskb * (beta / beta.max())
                    # pi = maskb * (q / q.max())
                    self.greedy = False

                    # if j%2:
                    #     pi = maskb * (q / q.max())
                    #     self.greedy = True
                    # else:
                    #     self.greedy = False
                    #     pi = maskb * (beta / beta.max())
                    # pi = (beta > 3).float() * (q / q.max())

                    # pi = beta  # (beta > 5).float() * (q / q.max())
                    # pi[0] = 0
                    # beta_prob = softmax(pi)
                    beta_prob = pi
                else:
                    pi = q / q.max()  # q.max() is the temperature
                    beta_prob = q

                if j < n_human:
                    a = trajectory[j, self.meta['action']]

                else:
                    # a = np.random.choice(choices)
                    if self.greedy:
                        a = pi.data.cpu().numpy()
                        a = np.argmax(a)
                    else:
                        a = softmax(pi / temp).data.cpu().numpy()
                        a = np.random.choice(choices, p=a)

                env.step(a)

                # x = phi.squeeze(0).data.cpu().numpy()
                # print(np.mean(abs(x)))
                # yield v, q, beta, r, p, s
                yield {
                    'o': env.s.cpu().numpy(),
                    'v': v.data.cpu().numpy(),
                    's': phi.data.cpu().numpy(),
                    'score': env.score,
                    'beta': beta_prob.data.cpu().numpy(),
                    'phi': phi.squeeze(0).data.cpu().numpy()
                }

                j += 1

        return
Example No. 14
"""
All information on README.md
"""

import tensorflow as tf
from environment import Env
import numpy as np
import time
import model

steps = 1000
env = Env(vision=True)
ob = env.reset(relaunch=True)
print(ob)

###=================== Play the game with the trained model
# while True:
#     env = Env(vision=True)
#     ob = env.reset(relaunch=True)
#     loss = 0.0
#     for i in range(steps):
#         image = scipy.misc.imresize(ob, [66, 200]) / 255.0
#         degrees = model.y.eval(feed_dict={model.x: [image], model.keep_prob: 1.0})[0][0]
#         ob, reward, done, _ = env.step(act)
#         if done is True:
#             break
#         else:
#             ob_list.append(ob)
#
#     print("PLAY WITH THE TRAINED MODEL")
#     print(reward_sum)
        self.optimizer([self.states, self.actions, discounted_rewards])
        self.states, self.actions, self.rewards = [], [], []


if __name__ == "__main__":
    env = Env()
    agent = ReinforceAgent()

    global_step = 0
    scores, episodes = [], []

    for e in range(EPISODES):
        done = False
        score = 0
        # fresh env
        state = env.reset()
        state = np.reshape(state, [1, 15])

        while not done:
            global_step += 1
            # get action for the current state and go one step in environment
            action = agent.get_action(state)
            next_state, reward, done = env.step(action)
            next_state = np.reshape(next_state, [1, 15])

            agent.append_sample(state, action, reward)
            score += reward
            state = copy.deepcopy(next_state)

            if done:
                # update policy neural network for each episode
Example No. 16
def main():
    expert_demo = pickle.load(open('./Expert dataset 1/expert_20x20_1.p', "rb"))
    demonstrations = np.array(expert_demo[0])

    print("demonstrations.shape", demonstrations.shape)

    print(expert_demo[1])
    print(expert_demo[0])
    print(np.array(expert_demo[0]).shape)

    # expert_x = int(expert_demo[1][0])
    # expert_y = int(expert_demo[1][1])

    expert_x = int(expert_demo[0][0])
    expert_y = int(expert_demo[0][1])


    env = Env(expert_x, expert_y)

    # env.seed(args.seed)
    # torch.manual_seed(args.seed)

    num_inputs = 6
    num_actions = 8
    running_state = ZFilter((num_inputs,), clip=5)

    print('state size:', num_inputs) 
    print('action size:', num_actions)

    actor = Actor(num_inputs, num_actions, args)
    critic = Critic(num_inputs, args)
    vdb = VDB(num_inputs + num_actions, args)

    actor_optim = optim.Adam(actor.parameters(), lr=args.learning_rate)
    critic_optim = optim.Adam(critic.parameters(), lr=args.learning_rate, 
                              weight_decay=args.l2_rate) 
    vdb_optim = optim.Adam(vdb.parameters(), lr=args.learning_rate)
    
    # load demonstrations

    k = 1
    writer = SummaryWriter(args.logdir)

    if args.load_model is not None:
        saved_ckpt_path = os.path.join(os.getcwd(), 'save_model', str(args.load_model))
        ckpt = torch.load(saved_ckpt_path)

        actor.load_state_dict(ckpt['actor'])
        critic.load_state_dict(ckpt['critic'])
        vdb.load_state_dict(ckpt['vdb'])

        running_state.rs.n = ckpt['z_filter_n']
        running_state.rs.mean = ckpt['z_filter_m']
        running_state.rs.sum_square = ckpt['z_filter_s']

        print("Loaded OK ex. Zfilter N {}".format(running_state.rs.n))

    
    episodes = 0
    train_discrim_flag = True
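    # the VDB discriminator is updated only while train_discrim_flag is set; it is
    # switched off once expert/learner accuracies pass the suspend thresholds below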



    for iter in range(args.max_iter_num):
        # expert_demo = pickle.load(open('./paper/{}.p'.format((iter+1)%expert_sample_size), "rb"))
        print(iter)
        expert_demo = pickle.load(open('./Expert dataset 1/expert_20x20_{}.p'.format(np.random.randint(1,50)), "rb"))
        tmp = expert_demo.pop(-1)

        demonstrations = np.array(expert_demo)

        print(demonstrations, demonstrations.shape)
        tot_sample_size = len(demonstrations) + 10
        ##########################

        actor.eval(), critic.eval()
        memory = deque()

        steps = 0
        scores = []

        # while steps < args.total_sample_size:

        while steps < tot_sample_size:
            # env.delete_graph()
            state = env.reset()
            # time.sleep(1)

            score = 0

            # state = running_state(state)
            state1 = state
            for _ in range((tot_sample_size+1)*2):
                if args.render:
                    env.render()

                steps += 1

                mu, std = actor(torch.Tensor(state).unsqueeze(0))
                action2 = np.argmax(get_action(mu, std)[0])
                action = get_action(mu, std)[0]
                next_state, reward, done, _ = env.step(action2)

                irl_reward = get_reward(vdb, state, action)

                # ###### for video recording
                # if iter > 11500 :
                #     time.sleep(0.015)
                # #####
                if done:
                    mask = 0
                else:
                    mask = 1

                memory.append([state, action, irl_reward, mask])

                # next_state = running_state(next_state)
                state = next_state

                score += reward

                if done:
                    break
            ##########################
            env.draw_graph()
            env.render()
            ##########################
            episodes += 1
            scores.append(score)
        
        score_avg = np.mean(scores)
        print('{}:: {} episode score is {:.2f}'.format(iter, episodes, score_avg))
        writer.add_scalar('log/score', float(score_avg), iter)

        actor.train(), critic.train(), vdb.train()
        if train_discrim_flag:
            expert_acc, learner_acc = train_vdb(vdb, memory, vdb_optim, demonstrations, 0, args)
            print("Expert: %.2f%% | Learner: %.2f%%" % (expert_acc * 100, learner_acc * 100))
            if expert_acc > args.suspend_accu_exp and learner_acc > args.suspend_accu_gen:
                train_discrim_flag = False
        train_actor_critic(actor, critic, memory, actor_optim, critic_optim, args)

        if iter % 100 == 0:
            score_avg = int(score_avg)

            model_path = os.path.join(os.getcwd(),'save_model')
            if not os.path.isdir(model_path):
                os.makedirs(model_path)

            ckpt_path = os.path.join(model_path, 'ckpt_'+ str(score_avg)+'.pth.tar')

            save_checkpoint({
                'actor': actor.state_dict(),
                'critic': critic.state_dict(),
                'vdb': vdb.state_dict(),
                'z_filter_n':running_state.rs.n,
                'z_filter_m': running_state.rs.mean,
                'z_filter_s': running_state.rs.sum_square,
                'args': args,
                'score': score_avg
            }, filename=ckpt_path)

    ####
    score_avg = int(score_avg)

    model_path = os.path.join(os.getcwd(), 'save_model')
    if not os.path.isdir(model_path):
        os.makedirs(model_path)

    ckpt_path = os.path.join(model_path, 'ckpt_' + 'last_model' + '.pth.tar')

    save_checkpoint({
        'actor': actor.state_dict(),
        'critic': critic.state_dict(),
        'vdb': vdb.state_dict(),
        'z_filter_n': running_state.rs.n,
        'z_filter_m': running_state.rs.mean,
        'z_filter_s': running_state.rs.sum_square,
        'args': args,
        'score': score_avg
    }, filename=ckpt_path)
Example No. 17
                               big_cpus,
                               big_disks,
                               big_mems,
                               big_time,
                               big_lifes,
                               big_profits,
                               small_cpus,
                               small_disks,
                               small_mems,
                               small_time,
                               small_lifes,
                               small_profits,
                               federate=True)
    # Obtain the greedy solution
    print('### GREEDY SOLUTION without federation ###')
    env.reset()
    greedy_profit_no_fed = greedy(env,
                                  big_cpus,
                                  big_disks,
                                  big_mems,
                                  big_time,
                                  big_lifes,
                                  big_profits,
                                  small_cpus,
                                  small_disks,
                                  small_mems,
                                  small_time,
                                  small_lifes,
                                  small_profits,
                                  federate=False)
Example No. 18
class DDPGStage:
    def __init__(self, model, is_training=False, var=1.):
        self.max_step = 200
        self.exploration_decay_start_step = 50000
        state_dim = 366
        action_dim = 2
        self.action_linear_max = 0.25  # m/s
        self.action_angular_max = 0.5  # rad/s
        rospy.init_node('ddpg_stage_1')
        rospy.on_shutdown(self.clear_vel)
        self.is_training = is_training
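        # use the simulated environment when a Gazebo model-states topic is being
        # published; otherwise fall back to the real-world environment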
        if ['/gazebo/model_states', 'gazebo_msgs/ModelStates'] in rospy.get_published_topics():
            self.env = SimEnv(self.is_training)
            print("Gazebo mode")
        else:
            self.env = Env(self.is_training)
            print("Real world mode")

        self.agent = DDPG(model, self.env, state_dim, action_dim)
        self.past_action = np.array([0., 0.])
        print('State Dimensions: ' + str(state_dim))
        print('Action Dimensions: ' + str(action_dim))
        print('Action Max: ' + str(self.action_linear_max) + ' m/s and ' + str(self.action_angular_max) + ' rad/s')

        self.var = var

    def _train(self):
        print('Training mode')
        avg_reward_his = []
        total_reward = 0

        while not rospy.is_shutdown():
            state = self.env.reset()
            one_round_step = 0

            while not rospy.is_shutdown():
                a = self.agent.action(state)
                a[0] = np.clip(np.random.normal(a[0], self.var), 0., 1.)
                a[1] = np.clip(np.random.normal(a[1], self.var), -0.5, 0.5)

                state_, r, collision, arrive = self.env.step(a)
                time_step = self.agent.perceive(state, a, r, state_, collision)

                if time_step > 0:
                    total_reward += r

                if time_step % 10000 == 0 and time_step > 0:
                    print('---------------------------------------------------')
                    avg_reward = total_reward / 10000
                    print('Average_reward = ', avg_reward)
                    avg_reward_his.append(round(avg_reward, 2))
                    print('Average Reward:', avg_reward_his)
                    total_reward = 0

                if time_step % 5 == 0 and time_step > self.exploration_decay_start_step and self.var > 0.1:
                    self.var *= 0.9999

                state = state_
                one_round_step += 1

                plt.title("STEP %d, Reward: %.2f" % (one_round_step, r))
                result = 'Step: %3i | Reward: %.2f | Var: %.2f | Time step: %i |' % (one_round_step, r, self.var, time_step)
                if arrive:
                    print(result, 'Success')
                    one_round_step = 0
                    self.env.common_reset()
                elif collision:
                    print(result, 'Collision')
                    break
                elif one_round_step >= self.max_step:
                    print(result, 'Failed')
                    break

    def _evaluate(self):
        print('Testing mode')
        self.env.goal_range["x"] = [-1, 1]
        self.env.goal_range["y"] = [-1, 1]
        while not rospy.is_shutdown():
            state = self.env.reset()
            one_round_step = 0

            while not rospy.is_shutdown():

                a = self.agent.action(state)
                print("action: %s" % a)
                a[0] = np.clip(a[0], 0., 1.)
                a[1] = np.clip(a[1], -0.5, 0.5)
                state_, r, collision, arrive = self.env.step(a)
                state = state_
                one_round_step += 1

                plt.title("STEP %d, Reward: %.2f" % (one_round_step, r))
                result = 'Step: %3i | Reward: %.2f | Var: %.2f |' % (
                    one_round_step, r, self.var)
                if arrive:
                    print(result, 'Success')
                    one_round_step = 0
                    self.env.common_reset()
                    # input()
                elif collision:
                    print(result, 'Collision')
                    break
                elif one_round_step >= self.max_step:
                    print(result, 'Failed')
                    break

    def run(self):
        # try:
        if self.is_training:
            self._train()
        else:
            self._evaluate()
        self.env.pub_cmd_vel.publish(Twist())

    def clear_vel(self):
        self.env.pub_cmd_vel.publish(Twist())
Example No. 19
    def play_episode(self, n_tot):

        self.beta_net.eval()
        self.pi_net.eval()
        self.vb_net.eval()
        self.q_net.eval()
        self.q_target.eval()
        self.beta_target.eval()
        self.vb_target.eval()
        self.qb_net.eval()

        env = Env()

        n_human = 120
        humans_trajectories = iter(self.data)
        softmax = torch.nn.Softmax()

        for i in range(n_tot):

            env.reset()
            observation = next(humans_trajectories)
            trajectory = self.data[observation]
            choices = np.arange(self.global_action_space, dtype=np.int)
            mask = Variable(torch.FloatTensor([0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 1, 1, 0, 0, 0, 0, 0]),
                             requires_grad=False).cuda()
            j = 0
            temp = 1

            while not env.t:

                s = Variable(env.s.cuda(), requires_grad=False)

                beta, phi = self.beta_net(s)
                pi, _ = self.pi_net(s)
                q, _ = self.q_net(s)
                vb, _ = self.vb_net(s)

                pi = beta.squeeze(0)
                self.greedy = False

                if j < n_human:
                    a = trajectory[j, self.meta['action']]

                else:
                    # eps = np.random.rand()
                    eps = 1
                    # a = np.random.choice(choices)
                    if self.greedy and eps > 0.025:
                        a = pi.data.cpu().numpy()
                        a = np.argmax(a)
                    else:
                        a = softmax(pi/temp).data.cpu().numpy()
                        a = np.random.choice(choices, p=a)

                q = q[0, a]
                q = q.squeeze(0)

                env.step(a)

                yield {'o': env.s.cpu().numpy(),
                       'v': vb.squeeze(0).data.cpu().numpy(),
                       'vb': vb.squeeze(0).data.cpu().numpy(),
                       'qb': q.squeeze(0).data.cpu().numpy(),
                       # 's': x[0, :512].data.cpu().numpy(),
                       'score': env.score,
                       'beta': pi.data.cpu().numpy(),
                       'phi': phi.squeeze(0).data.cpu().numpy(),
                       'q': q.squeeze(0).data.cpu().numpy()}

                j += 1

        return


        # if self.mc:
        #
        #     # MC return to boost the learning of Q^{\pi}
        #     loss_q_pi = self.loss_q_pi(q_pi, r_mc)
        # else:
        #
        #     # evaluate V^{\pi}(s')
        #     # V^{\pi}(s') = \sum_{a} Q^{\pi}(s',a) \pi(a|s')
        #     pi_target_tag, _ = self.pi_target(s_tag)
        #     beta_target_tag, _ = self.beta_target(s_tag)
        #     beta_sfm_tag = F.softmax(beta_target_tag, 1)
        #     pi_sfm_tag = F.softmax(pi_target_tag, 1)
        #     # consider only common actions
        #     mask_b = (beta_sfm_tag > self.behavioral_threshold).float()
        #     q_pi_tag_target, _ = self.q_target(s_tag)
        #
        #     v_tag = (q_pi_tag_target * mask_b * pi_sfm_tag).sum(1)
        #     v_tag = v_tag.unsqueeze(1)
        #     v_tag = v_tag.detach()
        #
        #     loss_q_pi = self.loss_q_pi(q_pi, r + (self.discount ** k) * (v_tag * (1 - t)))
Example No. 20
class Training:
    def __init__(self):
        self.n_episode = []
        self.n_epsilon = []
        self.n_dist = []
        self.avg_err = []
        self.logging_data = []

        # Parameters
        self.n_episodes = rospy.get_param("/n_episodes")
        self.n_step = rospy.get_param("/n_steps")
        self.mode_action = rospy.get_param('/mode_action')
        self.mem_size = rospy.get_param('/mem_size')
        self.batch_size = rospy.get_param('/batch_size')
        self.mode_optimize = rospy.get_param('/mode_optimize')
        self.avg_err_fre = rospy.get_param('/avg_err_fre')
        self.save_fre = rospy.get_param("/save_fre")
        self.load_checkpoint = rospy.get_param("/load_checkpoint")

        # create environment
        self.env = Env()
        self.n_states = self.env.observation_space
        self.n_actions = self.env.action_space.n

        # create Deep Q-Network
        self.dqn = DQN(self.n_states, self.n_actions)
        self.memory = ExperienceReplay(self.mem_size)

        # plot
        self.color1 = 'tab:green'
        self.color2 = 'tab:blue'
        self.color3 = 'tab:orange'
        self.color4 = 'tab:red'

        self.style_plot = random.choice(plt.style.available)
        plt.style.use(self.style_plot)
        plt.ion()

        ###########
        # Figure 1 - Rewards
        self.fig1 = plt.figure(1)
        # fig = plt.figure(figsize=(12,5))
        self.ax1 = self.fig1.add_subplot(1, 1, 1)
        self.ax2 = self.ax1.twinx()

        title_1 = 'Rewards - (Mode: Training)'
        self.ax1.set_title(title_1)
        self.ax1.set_xlabel('Episode')
        self.ax1.set_ylabel('Reward', color=self.color1)
        self.ax2.set_ylabel('Epsilon', color=self.color2)
        self.ax1.tick_params(axis='y', labelcolor=self.color1)
        self.ax2.tick_params(axis='y', labelcolor=self.color2)

        ###########
        # Figure 2 - Error
        self.fig2 = plt.figure(2)
        self.ax3 = self.fig2.add_subplot(1, 1, 1)

        title_2 = 'Error Distance - (Mode: Training)'
        self.ax3.set_title(title_2)
        self.ax3.set_xlabel('Episode')
        self.ax3.set_ylabel('Meter')

        self.init_file()

    def moving_average(self, x, w):
        return np.convolve(x, np.ones(w), 'valid') / w
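        # e.g. moving_average(np.array([1, 2, 3, 4]), 2) -> array([1.5, 2.5, 3.5])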

    def init_file(self):
        rospack = rospkg.RosPack()
        data_path = rospack.get_path("pioneer_dragging") + "/data"
        username = getpass.getuser()
        # n_folder  = len(os.walk(data_path).__next__()[1])
        n_folder = glob("{}/{}*".format(data_path, username))
        n_folder = len(n_folder) + 1

        if self.load_checkpoint:
            n_folder -= 1

        self.data_path = "{}/{}-{}".format(data_path, username, n_folder)
        if not os.path.exists(self.data_path):
            os.mkdir(self.data_path)

        # config file
        if not self.load_checkpoint:
            config_path = rospack.get_path(
                "pioneer_dragging") + "/config/dragging_params.yaml"
            config_log = '{}/{}-params.yaml'.format(self.data_path, n_folder)
            os.system('cp {} {}'.format(config_path, config_log))

            plot_style = {'plot_style': self.style_plot}
            with open(config_log, 'r') as yamlfile:
                cur_yaml = yaml.safe_load(yamlfile)  # Note the safe_load
                cur_yaml.update(plot_style)

            if cur_yaml:
                with open(config_log, 'w') as yamlfile:
                    yaml.safe_dump(cur_yaml,
                                   yamlfile)  # Also note the safe_dump

        # history file
        self.history_log = '{}/{}-log.txt'.format(self.data_path, n_folder)

        # model file
        self.dqn.file_models = '{}/{}-pytorch-RL.tar'.format(
            self.data_path, n_folder)

        # memory file
        self.memory.file_mem = '{}/{}-memory.data'.format(
            self.data_path, n_folder)

        # figures file
        self.figure1 = '{}/{}-Rewards(Training).png'.format(
            self.data_path, n_folder)
        self.figure2 = '{}/{}-Error(Training).png'.format(
            self.data_path, n_folder)

    def plot_result(self,
                    i_episode,
                    cumulated_reward,
                    epsilon,
                    error_dist,
                    loaded=False):
        ### Figure 1
        # plot bar (cumulated reward)
        self.ax1.bar(i_episode, cumulated_reward, color=self.color1)

        # plot line (epsilon decay )
        if loaded:
            self.ax2.plot(i_episode, epsilon, color=self.color2)

            self.n_episode = i_episode.tolist()
            self.n_epsilon = epsilon.tolist()
            self.n_dist = error_dist.tolist()
        else:
            self.n_episode.append(i_episode)
            self.n_epsilon.append(epsilon)
            self.ax2.plot(self.n_episode, self.n_epsilon, color=self.color2)

            self.n_dist.append(error_dist)

        ### Figure 2
        # plot bar (error distance)
        self.ax3.bar(i_episode, error_dist, color=self.color3)

        # window_err = np.array(self.n_dist)
        # window_err = np.mean(window_err)
        # self.avg_err.append(window_err)
        # self.ax3.plot(self.n_episode, self.avg_err, color=self.color4)

        # plot line (average error distance)
        if len(self.n_dist) % self.avg_err_fre == 0:
            avg_err = self.moving_average(np.array(self.n_dist),
                                          self.avg_err_fre)
            self.ax3.plot(avg_err, color=self.color4)

        plt.draw()
        plt.pause(0.1)

    def run(self):
        start_time = time.time()

        if self.load_checkpoint:
            self.memory.load()
            self.dqn.load_model()

            # history log loaded
            self.logging_data = [
                line.rstrip('\n') for line in open(self.history_log)
            ]

            hist_data = pd.read_csv(self.history_log, sep=",")
            i_episode = hist_data['i_episode']
            cumulated_reward = hist_data['cumulated_reward']
            epsilon = hist_data['epsilon']
            error_dist = hist_data['error_dist']

            self.plot_result(i_episode,
                             cumulated_reward,
                             epsilon,
                             error_dist,
                             loaded=True)
            i_episode = hist_data['i_episode'].iloc[-1] + 1
            self.dqn.epsilon = hist_data['epsilon'].iloc[-1]
            rospy.loginfo('[RL] Loaded checkpoint')
        else:
            i_episode = 0

        #########################################
        ###### Reinforcement Training loop ######
        for i_episode in range(i_episode, self.n_episodes):
            state = self.env.reset(i_episode)
            cumulated_reward = 0

            steps = 0
            step_time = time.time()

            while not rospy.is_shutdown():
                steps += 1
                action, epsilon = self.dqn.select_action(state, i_episode)
                # print('num_steps: {}, epsilon: {}, steps_done: {}'.format(steps, epsilon, dqn.steps_done))

                # action = env.action_space.sample()
                rospy.loginfo('[RL] action: {}'.format(action))

                next_state, reward, done, info = self.env.step(action)
                self.memory.push(state, action, next_state, reward, done)
                cumulated_reward += reward

                ################################
                ######### optimize #############

                if self.mode_optimize == 'normal_dqn':
                    # without experience replay memory
                    self.dqn.optimize(state, action, next_state, reward, done)

                elif self.mode_optimize == 'dqn_replay_memory':
                    # with experience replay memory
                    if len(self.memory) > self.batch_size:
                        state_mem, action_mem, next_state_mem, reward_mem, done_mem = self.memory.sample(
                            self.batch_size)
                        self.dqn.optimize_with_replay_memory(
                            state_mem, action_mem, next_state_mem, reward_mem,
                            done_mem)

                elif self.mode_optimize == 'dqn_taget_net':
                    # with a separate target network
                    if len(self.memory) > self.batch_size:
                        state_mem, action_mem, next_state_mem, reward_mem, done_mem = self.memory.sample(
                            self.batch_size)
                        self.dqn.optimize_with_DQN(state_mem, action_mem,
                                                   next_state_mem, reward_mem,
                                                   done_mem)

                elif self.mode_optimize == 'dueling_dqn':
                    # with dueling DQN
                    if len(self.memory) > self.batch_size:
                        state_mem, action_mem, next_state_mem, reward_mem, done_mem = self.memory.sample(
                            self.batch_size)
                        self.dqn.optimize_with_dueling_DQN(
                            state_mem, action_mem, next_state_mem, reward_mem,
                            done_mem)

                if not done:
                    state = next_state
                else:
                    break

            # DQN update param
            self.dqn.update_param(i_episode)

            # Plotting
            error_dist = self.env.calc_dist()
            self.plot_result(i_episode, cumulated_reward, epsilon, error_dist)

            # Save Checkpoint
            temp_data = "{},{},{},{}".format(i_episode, cumulated_reward,
                                             epsilon, error_dist)
            self.logging_data.append(temp_data)

            if i_episode % self.save_fre == 0:
                rospy.loginfo('[RL] Save checkpoint: {}'.format(i_episode))

                self.dqn.save_model()  # save models
                self.memory.save()  # save replay memory

                # logging file
                with open(self.history_log, 'w') as f:
                    if not self.load_checkpoint:
                        f.write(
                            "i_episode,cumulated_reward,epsilon,error_dist\n")

                    for item in self.logging_data:
                        f.write("%s\n" % item)

                # save figures
                self.fig1.savefig(self.figure1, dpi=self.fig1.dpi)
                self.fig2.savefig(self.figure2, dpi=self.fig2.dpi)
                rospy.loginfo('[RL] Save figure1: {}'.format(self.figure1))
                rospy.loginfo('[RL] Save figure2: {}'.format(self.figure2))

            # Timing
            elapsed_time = time.time() - step_time
            total_time = time.time() - start_time
            print('\n********')
            print("Elapsed time: {}".format(
                time.strftime("%H:%M:%S", time.gmtime(elapsed_time))))
            print("Total time: {}".format(
                time.strftime("%H:%M:%S", time.gmtime(total_time))))

        # Finish Training
        self.env.close()
        print()
        rospy.loginfo('[RL] Exit ...')

        total_time = time.time() - start_time
        print('\n*********************')
        print("Total time: ", time.strftime("%H:%M:%S",
                                            time.gmtime(total_time)))

        rospy.loginfo('[RL] Style plot: {}'.format(self.style_plot))
        plt.show(block=True)
Example No. 21
N = 20
env = Env(dt=np.pi / N)

RL = PolicyGradient(
    n_actions=env.n_actions,
    n_features=env.n_states,
    learning_rate=0.002,
    reward_decay=0.99,
)

fid_10 = 0
ep_max = 500
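# run ep_max training episodes; each episode takes at most N environment steps
# before RL.learn() performs the policy-gradient update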
for episode in range(ep_max):

    observation = env.reset()

    for ii in range(N):

        action = RL.choose_action(observation)
        observation_, reward, done, fid = env.step(action)

        RL.store_transition(observation, action, reward)
        observation = observation_
        if done:
            if episode >= ep_max - 11:
                fid_10 = max(fid_10, fid)
            break

    RL.learn()
Example No. 22
    def play_episode(self, n_tot):

        self.model.eval()
        env = Env()

        n_human = 120
        humans_trajectories = iter(self.data)
        softmax = torch.nn.Softmax()

        # mask = torch.FloatTensor(consts.actions_mask[args.game])
        # mask = Variable(mask.cuda(), requires_grad=False)

        vsx = torch.FloatTensor(consts.short_bins[args.game])
        vlx = torch.FloatTensor(consts.long_bins[args.game])

        for i in range(n_tot):

            env.reset()
            observation = next(humans_trajectories)
            trajectory = self.data[observation]
            choices = np.arange(self.global_action_space, dtype=np.int)

            j = 0

            while not env.t:

                s = Variable(env.s.cuda(), requires_grad=False)
                vs, vl, beta, qs, ql, phi, pi_s, pi_l, pi_s_tau, pi_l_tau = self.model(
                    s, self.actions_matrix)
                beta = beta.squeeze(0)
                pi_l = pi_l.squeeze(0)
                pi_s = pi_s.squeeze(0)
                pi_l_tau = pi_l_tau.squeeze(0)
                pi_s_tau = pi_s_tau.squeeze(0)

                temp = 1

                # consider only 3 most frequent actions
                beta_np = beta.data.cpu().numpy()
                indices = np.argsort(beta_np)

                maskb = Variable(torch.FloatTensor(
                    [0, 0, 1, 1, 1, 1, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0]),
                                 requires_grad=False).cuda()

                # pi = maskb * (beta / beta.max())

                pi = beta
                self.greedy = False

                beta_prob = pi

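                # Behaviour-cloning warm-up: replay the human trajectory for
                # the first n_human steps, then sample actions from the
                # model's softmax policy (or act greedily if self.greedy).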
                if j < n_human:
                    a = trajectory[j, self.meta['action']]

                else:
                    eps = np.random.rand()
                    # a = np.random.choice(choices)
                    if self.greedy and eps > 0.1:
                        a = pi.data.cpu().numpy()
                        a = np.argmax(a)
                    else:
                        a = softmax(pi / temp).data.cpu().numpy()
                        a = np.random.choice(choices, p=a)

                env.step(a)

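                # vs/vl are categorical value heads; convert them to scalar
                # estimates by taking the expectation over the bin values
                # (consts.short_bins / consts.long_bins).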
                vs = softmax(vs)
                vl = softmax(vl)
                vs = torch.sum(vsx * vs.data.cpu())
                vl = torch.sum(vlx * vl.data.cpu())

                yield {
                    'o': env.s.cpu().numpy(),
                    'vs': np.array([vs]),
                    'vl': np.array([vl]),
                    's': phi.data.cpu().numpy(),
                    'score': env.score,
                    'beta': beta_prob.data.cpu().numpy(),
                    'phi': phi.squeeze(0).data.cpu().numpy(),
                    'qs': qs.squeeze(0).data.cpu().numpy(),
                    'ql': ql.squeeze(0).data.cpu().numpy(),
                }

                j += 1

        return  # PEP 479: raising StopIteration inside a generator is an error
Exemplo n.º 23
0
decay_factor = 0.999
num_episodes = 10

r_avg_list = []
r_sum_list = []

file = open('diag.txt', 'w')

for i in range(num_episodes):
    print("Episode {} of {}".format(i + 1, num_episodes))
    eps *= decay_factor
    r_sum = 0
    done = False
    diag_action = 0
    diag_reward = 0
    state = env.reset((i, num_episodes))
    while not done:
        env.reset((i, num_episodes))
        rand = np.random.random()
        if rand < eps:
            action = np.random.randint(0, 2)
        else:
            action = np.argmax(model.predict(np.identity(10)[state:state + 1]))
        new_s, r, done, _ = env.step(action=action, num=(i, num_episodes))
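        # Q-learning target: immediate reward plus the discounted maximum
        # predicted Q-value of the next state; states are one-hot encoded
        # via np.identity(10) before being fed to the Keras model.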
        target = r + y * np.max(model.predict(
            np.identity(10)[new_s:new_s + 1]))
        target_vec = model.predict(np.identity(10)[state:state + 1])[0]
        target_vec[action] = target
        model.fit(np.identity(10)[state:state + 1],
                  target_vec.reshape(-1, 2),
                  epochs=1,
Exemplo n.º 24
0
        episode.append((next_state, action, reward, done))

        if len(episode) > 200:
            # stop episode for time saving
            return [], False

        if done:
            break

        current_state = next_state
    return episode, True


# main loop
if __name__ == "__main__":
    env = Env()
    agent = MCAgent(actions=list(range(env.n_actions)))

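    # Monte Carlo control: roll out a full episode with the current policy,
    # then update the agent's value table from the collected transitions.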
    for episode in range(1000):
        print("Episode : ", episode + 1)
        current_state = env.reset()

        # generate episode
        episode, _ = generate_episode(env, agent)

        # update value table according to the episode
        agent.update(episode)

        # for monitoring values
        env.print_values(agent.value_table)
Exemplo n.º 25
0
if __name__ == "__main__":
    env = Env()
    agent = ReinforceAgent()

    global_step = 0
    scores, episodes = [], []

    for e in range(EPISODES):
        done = False
        score = 0
        #agent.model.load_model('qwerty_1.h5')
        # fresh env
        #tar = random.sample(range(0,25),1)
        #target = [int(tar[0]/5),tar[0]%5]
        img, g_map = env.reset()
        cv2.imshow('image', img)
        #state = np.reshape(state, [1, 3])
        #img = state[0]
        #g_map = state[1]
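        # The state is a pair: the rendered observation image and a 5x5 goal
        # map, each reshaped to include a leading batch dimension.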
        img = np.reshape(img, [1, img.shape[0], img.shape[1], img.shape[2]])
        g_map = np.reshape(g_map, [1, 5, 5, 1])
        #state = [img,g_map]

        while not done:
            global_step += 1
            # get action for the current state and go one step in environment
            action = agent.get_action([img, g_map])
            next_state, reward, done = env.step(action)
            img = next_state[0]
            cv2.imshow('image', img)
Exemplo n.º 26
0
import random
import math
import rospy
from gazebo_msgs.msg import *
from geometry_msgs.msg import Twist
import numpy as np
import csv
import rospkg
import matplotlib.pyplot as plt
from matplotlib import cm
import time
from environment import Env

if __name__ == "__main__":
    rospy.init_node("path_controller_node", anonymous=False)

    env = Env()
    state_scan = env.reset()
    action = np.zeros(2)

    pub = rospy.Publisher('/cmd_vel', Twist, queue_size=10)
    r = rospy.Rate(5)  # 5 Hz
    velocity = Twist()
    while not rospy.is_shutdown():
        # WRITE YOUR CODE HERE
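        # Reactive-control skeleton: the check below compares the closest of
        # the first 20 laser readings against a 0.25 threshold; both branches
        # currently command zero velocities and are meant to be filled in.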
        if (min(state_scan[:20]) > 0.25):
            action[0] = .0
            action[1] = 0.
        else:
            action[0] = 0.
            action[1] = 0.0

        state_scan = env.step(action)
Exemplo n.º 27
0
def main():
    rospy.init_node('ddpg_stage_1')
    env = Env(is_training)
    agent = DDPG(env, state_dim, action_dim)
    past_action = np.array([0., 0.])
    print('State Dimensions: ' + str(state_dim))
    print('Action Dimensions: ' + str(action_dim))
    print('Action Max: ' + str(action_linear_max) + ' m/s and ' +
          str(action_angular_max) + ' rad/s')

    if is_training:
        print('Training mode')
        avg_reward_his = []
        total_reward = 0
        var = 1.

        while True:
            state = env.reset()
            one_round_step = 0

            while True:
                a = agent.action(state)
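                # Exploration: perturb the actor's output with Gaussian noise
                # (std = var) and clip to the valid action range; var is
                # decayed later in the loop once exploration_decay_start_step
                # has passed.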
                a[0] = np.clip(np.random.normal(a[0], var), 0., 1.)
                a[1] = np.clip(np.random.normal(a[1], var), -0.5, 0.5)

                state_, r, done, arrive = env.step(a, past_action)
                time_step = agent.perceive(state, a, r, state_, done)

                if arrive:
                    result = 'Success'
                else:
                    result = 'Fail'

                if time_step > 0:
                    total_reward += r

                if time_step % 10000 == 0 and time_step > 0:
                    print(
                        '---------------------------------------------------')
                    avg_reward = total_reward / 10000
                    print('Average_reward = ', avg_reward)
                    avg_reward_his.append(round(avg_reward, 2))
                    print('Average Reward:', avg_reward_his)
                    total_reward = 0

                if time_step % 5 == 0 and time_step > exploration_decay_start_step:
                    var *= 0.9999

                past_action = a
                state = state_
                one_round_step += 1

                if arrive:
                    print('Step: %3i' % one_round_step, '| Var: %.2f' % var,
                          '| Time step: %i' % time_step, '|', result)
                    one_round_step = 0

                if done or one_round_step >= 500:
                    print('Step: %3i' % one_round_step, '| Var: %.2f' % var,
                          '| Time step: %i' % time_step, '|', result)
                    break

    else:
        print('Testing mode')
        while True:
            state = env.reset()
            one_round_step = 0

            while True:
                a = agent.action(state)
                a[0] = np.clip(a[0], 0., 1.)
                a[1] = np.clip(a[1], -0.5, 0.5)
                state_, r, done, arrive = env.step(a, past_action)
                past_action = a
                state = state_
                one_round_step += 1

                if arrive:
                    print('Step: %3i' % one_round_step, '| Arrive!!!')
                    one_round_step = 0

                if done:
                    print('Step: %3i' % one_round_step, '| Collision!!!')
                    break
Exemplo n.º 28
0
else:
	print("ERROR IN TEST MODE!")



# Main loop
if __name__ == '__main__':
    running_reward = None
    reward_sum = 0
    prev_x = None
    filename = './data/evaluation_logs.txt'

    for i_episode in range(default_config["max_iteration"]):
        attack_mode = random.randint(0, 6)
        state_new = env.reset(attack_mode)
        agent.update_current_channel(state_new)
        done = False


        for t in range(default_config["max_episode_length"]):
            # Get current channel
            x = np.zeros(default_config["max_channel"])
            x[agent.cur_channel] = 1
            # Put into the NN
            action_c = agent.c_policy.select_action(x).cpu().detach().numpy()[0]
            action_s = agent.s_policy.select_action(x).cpu().detach().numpy()[0]
            # print(int(action_c), " ", int(action_s))
            state_new, reward, done, info = env.step(int(action_c), int(action_s))
            agent.update_current_channel(state_new)
            reward_sum += reward
Exemplo n.º 29
0
def main():
    expert_demo = pickle.load(open('./Ree1_expert.p', "rb"))
    # Ree1 : action 1
    # Ree2 : action 100
    # Ree3 : action 50
    # Ree4 : action 10
    # Ree5 : action 4
    # Ree6 : action 0.5

    # print('expert_demo_shape : ', np.array(expert_demo).shape)
    expert_x = int(expert_demo[1][0])
    expert_y = int(expert_demo[1][1])
    env = Env(expert_x, expert_y)
    # env = Env(0,0)

    # env.seed(args.seed)
    torch.manual_seed(args.seed)

    num_inputs = 2
    num_actions = 8
    running_state = ZFilter((num_inputs, ), clip=5)

    print('state size:', num_inputs)
    print('action size:', num_actions)

    actor = Actor(num_inputs, num_actions, args)
    critic = Critic(num_inputs, args)
    discrim = Discriminator(num_inputs + num_actions, args)

    actor_optim = optim.Adam(actor.parameters(), lr=args.learning_rate)
    critic_optim = optim.Adam(critic.parameters(),
                              lr=args.learning_rate,
                              weight_decay=args.l2_rate)
    discrim_optim = optim.Adam(discrim.parameters(), lr=args.learning_rate)

    # load demonstrations
    # expert_demo, _ = pickle.load(open('./expert_demo/expert_demo.p', "rb"))

    demonstrations = np.array(expert_demo[0])

    # print("demonstrations.shape", demonstrations.shape)

    writer = SummaryWriter(args.logdir)

    if args.load_model is not None:
        saved_ckpt_path = os.path.join(os.getcwd(), 'save_model',
                                       str(args.load_model))
        ckpt = torch.load(saved_ckpt_path)

        actor.load_state_dict(ckpt['actor'])
        critic.load_state_dict(ckpt['critic'])
        discrim.load_state_dict(ckpt['discrim'])

        running_state.rs.n = ckpt['z_filter_n']
        running_state.rs.mean = ckpt['z_filter_m']
        running_state.rs.sum_square = ckpt['z_filter_s']

        print("Loaded OK ex. Zfilter N {}".format(running_state.rs.n))

    episodes = 0
    train_discrim_flag = True

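    # GAIL training loop: collect rollouts with the current policy, reward
    # them with the discriminator (irl_reward), then alternate discriminator
    # and actor-critic updates.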
    for iter in range(args.max_iter_num):
        actor.eval(), critic.eval()
        memory = deque()

        steps = 0
        scores = []

        while steps < args.total_sample_size:
            state = env.reset()
            score = 0

            state = running_state(state)

            for _ in range(1000):
                if args.render:
                    env.render()

                steps += 1

                mu, std = actor(torch.Tensor(state).unsqueeze(0))
                action2 = np.argmax(get_action(mu, std)[0])
                action = get_action(mu, std)[0]
                next_state, reward, done, _ = env.step(action2)
                # next_state, reward, done, _ = env.step(action)
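                # The policy is trained on the discriminator-based reward
                # (irl_reward); the raw environment reward only accumulates
                # into `score` for logging.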
                irl_reward = get_reward(discrim, state, action)

                if done:
                    mask = 0
                else:
                    mask = 1

                memory.append([state, action, irl_reward, mask])

                next_state = running_state(next_state)
                state = next_state

                score += reward

                if done:
                    break

            episodes += 1
            scores.append(score)

        score_avg = np.mean(scores)
        print('{}:: {} episode score is {:.2f}'.format(iter, episodes,
                                                       score_avg))
        writer.add_scalar('log/score', float(score_avg), iter)

        actor.train(), critic.train(), discrim.train()
        if train_discrim_flag:
            expert_acc, learner_acc = train_discrim(discrim, memory,
                                                    discrim_optim,
                                                    demonstrations, args)
            print("Expert: %.2f%% | Learner: %.2f%%" %
                  (expert_acc * 100, learner_acc * 100))

            temp_learner.append(learner_acc * 100)
            temp_expert.append(expert_acc * 100)

            if ((expert_acc > args.suspend_accu_exp
                 and learner_acc > args.suspend_accu_gen and iter % 55 == 0)
                    or iter % 50 == 0):
                # train_discrim_flag = False
                plt.plot(temp_learner, label='learner')
                plt.plot(temp_expert, label='expert')
                plt.xlabel('Episode')
                plt.ylabel('Accuracy')
                plt.xticks([])
                plt.legend()
                plt.savefig('accuracy{}.png'.format(iter))
                # plt.show()

                model_path = 'C:/Users/USER/9 GAIL/lets-do-irl/mujoco/gail'
                ckpt_path = os.path.join(model_path,
                                         'ckpt_' + str(score_avg) + '.pth.tar')

                print("check path", ckpt_path)
                save_checkpoint(
                    {
                        'actor': actor.state_dict(),
                        'critic': critic.state_dict(),
                        'discrim': discrim.state_dict(),
                        'z_filter_n': running_state.rs.n,
                        'z_filter_m': running_state.rs.mean,
                        'z_filter_s': running_state.rs.sum_square,
                        'args': args,
                        'score': score_avg
                    },
                    filename=ckpt_path)

        train_actor_critic(actor, critic, memory, actor_optim, critic_optim,
                           args)

        if iter % 100 == 0:  # save a checkpoint every 100 iterations
            score_avg = int(score_avg)

            model_path = os.path.join(os.getcwd(), 'save_model')
            if not os.path.isdir(model_path):
                os.makedirs(model_path)

            model_path = 'C:/Users/USER/9 GAIL/lets-do-irl/mujoco/gail'
            ckpt_path = os.path.join(model_path,
                                     'ckpt_' + str(score_avg) + '.pth.tar')

            save_checkpoint(
                {
                    'actor': actor.state_dict(),
                    'critic': critic.state_dict(),
                    'discrim': discrim.state_dict(),
                    'z_filter_n': running_state.rs.n,
                    'z_filter_m': running_state.rs.mean,
                    'z_filter_s': running_state.rs.sum_square,
                    'args': args,
                    'score': score_avg
                },
                filename=ckpt_path)
    plt.plot(temp_learner)
    plt.plot(temp_expert)
    plt.xlabel('Episode')
    plt.ylabel('Accuracy')
    plt.xticks([])
    plt.savefig('accuracy.png')
Exemplo n.º 30
0
    def play_episode(self, n_tot):

        self.model.eval()
        self.model_b.eval()
        env = Env()

        n_human = 120
        humans_trajectories = iter(self.data)
        softmax = torch.nn.Softmax()

        for i in range(n_tot):

            env.reset()
            observation = next(humans_trajectories)
            trajectory = self.data[observation]
            choices = np.arange(self.global_action_space, dtype=int)  # np.int is removed in newer NumPy
            mask = Variable(torch.FloatTensor([0, 1, 0, 1, 1, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0]),
                             requires_grad=False).cuda()
            j = 0
            temp = 1

            while not env.t:

                s = Variable(env.s.cuda(), requires_grad=False)

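                # Two-network setup: model_b provides the behavioural prior
                # beta, which is fed (detached) into the main policy network.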
                beta, vb, qb, _, _ = self.model_b(s, self.actions_matrix)
                pi, v, q, adv, x = self.model(s, self.actions_matrix, beta.detach())

                pi = pi.squeeze(0)
                self.greedy = False

                if j < n_human:
                    a = trajectory[j, self.meta['action']]

                else:
                    eps = np.random.rand()
                    # a = np.random.choice(choices)
                    if self.greedy and eps > 0.1:
                        a = pi.data.cpu().numpy()
                        a = np.argmax(a)
                    else:
                        a = softmax(pi/temp).data.cpu().numpy()
                        a = np.random.choice(choices, p=a)

                q = q[0, a, 0]
                q = q.squeeze(0)

                qb = qb[0, a, 0]
                qb = qb.squeeze(0)

                env.step(a)

                yield {'o': env.s.cpu().numpy(),
                       'v': v.squeeze(0).data.cpu().numpy(),
                       'vb': vb.squeeze(0).data.cpu().numpy(),
                       'qb': qb.squeeze(0).data.cpu().numpy(),
                       's': x[0, :512].data.cpu().numpy(),
                       'score': env.score,
                       'beta': pi.data.cpu().numpy(),
                       'phi': x[0, :512].data.cpu().numpy(),
                       'q': q.squeeze(0).data.cpu().numpy()}

                j += 1

        return  # PEP 479: raising StopIteration inside a generator is an error
Exemplo n.º 31
0

if __name__ == "__main__":
    # maze game
    # env = Maze()
    env = Env()
    agent = DQNAgent()

    global_step = 0
    # agent.load_model("./save_model/10by10")
    scores, episodes = [], []

    for e in range(EPISODES):
        done = False
        score = 0
        state = env.reset()
        state = np.reshape(state, [1, 20])

        while not done:
            # fresh env
            if agent.render:
                env.render()
            global_step += 1

            # get action for the current state and go one step in environment
            action = agent.get_action(state)
            next_state, reward, done = env.step(action)
            next_state = np.reshape(next_state, [1, 20])

            agent.replay_memory(state, action, reward, next_state, done)
            # every time step we do training
Exemplo n.º 32
0
    plt.ion()
    plt.figure(figsize=(100, 5))  # set the figure size
    ax1 = plt.subplot(211)
    ax2 = plt.subplot(212)

    success = 0
    totally = 0
    zongzhou = []


    while True:
        # main1.rl.restore_net()
        # main2.rl.restore_net()

        dic_state = env.reset(tools)
        for episodes in range(1000):
            dic_action = {}
            suss = 0
            total = 0

            for x in dic_state:
                if x not in dic_action:
                    dic_action[x] = []

                if x == 1:
                    for num in range(len(dic_state[1])):
                        # temp_state = tools.get_list(dic_state[1][num])  # combined state of all cars in the group
                        # temp = main1.rl.real_choose_action(temp_state)  # learned action combination for the car group
                        dic_action[1].append([int(env.cars_posit[dic_state[1][num][0][3]])])