Example #1
    def __init__(self,
                 state_dim,
                 action_dim,
                 learning_rate=0.001,
                 reward_decay=0.99,
                 e_greedy=0.9):
        self.action_dim = action_dim
        self.state_dim = state_dim
        self.lr = learning_rate
        self.gamma = reward_decay  # discount factor, named gamma to match the formulation.
        self.epsilon = e_greedy
        self.EPS_START = 0.9
        self.EPS_END = 0.05
        self.EPS_DECAY = 30000  # this decay is too slow. TODO: figure out the relationship between the decay and the total number of steps,
        # and try a better annealing strategy for this problem.
        use_cuda = torch.cuda.is_available()
        self.LongTensor = torch.cuda.LongTensor if use_cuda else torch.LongTensor
        self.FloatTensor = torch.cuda.FloatTensor if use_cuda else torch.FloatTensor
        self.model = QNet(self.state_dim, self.action_dim)
        if use_cuda:
            self.model = self.model.cuda()

        self.optimizer = optim.Adam(self.model.parameters(), lr=self.lr)
        # self.scheduler = optim.lr_scheduler.StepLR(self.optimizer, step_size=10000, gamma=0.5)  # the learning rate decreases by a factor of gamma every step_size (10000) steps.

        util.weights_init(self.model)
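
A minimal sketch (an assumption; the example does not show its action-selection code) of the exponential schedule that EPS_START, EPS_END and EPS_DECAY typically drive, with epsilon decaying close to EPS_END after roughly 3 * EPS_DECAY steps:

import math

def epsilon_at(step, eps_start=0.9, eps_end=0.05, eps_decay=30000):
    # hypothetical helper, not part of the example above
    return eps_end + (eps_start - eps_end) * math.exp(-step / eps_decay)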
Example #2
    def __init__(self, env):
        super(DDPG, self).__init__()

        pi_net = PiNet(self.ns, self.na)
        self.pi_net = pi_net.to(self.device)

        pi_target = PiNet(self.ns, self.na)
        self.pi_target = pi_target.to(self.device)
        self.load_state_dict(self.pi_target, self.pi_net.state_dict())

        q_net = QNet(self.ns, self.na)
        self.q_net = q_net.to(self.device)

        q_target = QNet(self.ns, self.na)
        self.q_target = q_target.to(self.device)
        self.load_state_dict(self.q_target, self.q_net.state_dict())

        self.optimizer_q = torch.optim.Adam(self.q_net.parameters(),
                                            lr=self.lr_q,
                                            betas=(0.9, 0.999),
                                            weight_decay=1e-2)

        self.optimizer_p = torch.optim.Adam(self.pi_net.parameters(),
                                            lr=self.lr_p,
                                            betas=(0.9, 0.999),
                                            weight_decay=0)

        self.noise = OrnsteinUhlenbeckActionNoise(
            torch.zeros(1, self.na).to(self.device),
            self.epsilon * torch.ones(1, self.na).to(self.device))
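
A hypothetical helper (not part of the example) showing the Polyak/soft target update that usually complements the hard copy done above via load_state_dict; the tau value is an assumption:

import torch

def soft_update(target_net, online_net, tau=0.005):
    # blend the target parameters toward the online parameters
    with torch.no_grad():
        for t_param, o_param in zip(target_net.parameters(), online_net.parameters()):
            t_param.mul_(1.0 - tau).add_(tau * o_param)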
Example #3
    def __init__(self, args, state_size, action_size, seed):
        self.state_size = state_size
        self.action_size = action_size
        self.seed = random.seed(seed)
        self.per = args.per
        self.dueling = args.dueling
        self.buffer_size = args.buffer_size
        self.batch_size = args.batch_size
        self.gamma = args.gamma
        self.tau = args.tau
        self.lr = args.learning_rate
        self.update_freq = args.update_every
        # Q-Network
        if self.dueling:
            self.local_qnet = DuelingQNet(state_size, action_size,
                                          seed).to(device)
            self.target_qnet = DuelingQNet(state_size, action_size,
                                           seed).to(device)
        else:
            self.local_qnet = QNet(state_size, action_size, seed).to(device)
            self.target_qnet = QNet(state_size, action_size, seed).to(device)

        self.optimizer = optim.Adam(self.local_qnet.parameters(), lr=self.lr)

        # Replay Memory
        if self.per:
            self.memory = PrioritizedReplayMemory(args, self.buffer_size)
        else:
            self.memory = ReplayMemory(action_size, self.buffer_size,
                                       self.batch_size, seed)
        self.t_step = 0  # init time step, used to learn only every update_freq steps
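
A minimal sketch (assumed; memory.add, memory.sample and learn are hypothetical helpers not shown in the example) of how t_step and update_freq typically gate learning in an agent like this:

def step(agent, state, action, reward, next_state, done):
    # store the transition, then learn only on every update_freq-th call
    agent.memory.add(state, action, reward, next_state, done)
    agent.t_step = (agent.t_step + 1) % agent.update_freq
    if agent.t_step == 0 and len(agent.memory) > agent.batch_size:
        agent.learn(agent.memory.sample(), agent.gamma)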
Example #4
    def __init__(self, state_size, action_size, seed):
        self.state_size = state_size
        self.action_size = action_size
        self.seed = random.seed(seed)

        self.qnetwork_local = QNet(state_size, action_size, seed).to(device)
        self.qnetwork_target = QNet(state_size, action_size, seed).to(device)
        self.optimizer = optim.Adam(self.qnetwork_local.parameters(), lr=LR)

        # Replay Memory
        self.memory = ReplayBuffer(action_size, BUFFER_SIZE, BATCH_SIZE, seed)
        self.t_step = 0
Example #5
    def __init__(self):
        #Creating environment
        self.env = gym.make(settings.env_name)
        self.env.seed(settings.seed)
        self.env.action_space.seed(settings.seed)

        self.state_space = self.env.observation_space.shape[0]
        self.action_space = self.env.action_space.shape[0]

        self.obs_normalizer = Normalizer(self.state_space)

        self.device = torch.device(settings.device)
        self.writer = SummaryWriter(
            'runs/' + settings.env_name + "_" + settings.algo +
            '_{}_{}_{}'.format(p.alpha, p.ex_alpha, settings.seed))

        #Initializing common networks and their optimizers
        self.exploitory_policy = GaussianPolicy(
            self.state_space, self.action_space).to(self.device)
        self.exploitory_Q = QNet(self.state_space,
                                 self.action_space).to(self.device)
        self.exploitory_Q_target = QNet(self.state_space,
                                        self.action_space).to(self.device)
        self.exploitory_policy_optim = Adam(
            self.exploitory_policy.parameters(), lr=p.lr)
        self.exploitory_Q_optim = Adam(self.exploitory_Q.parameters(), lr=p.lr)

        self.target_update(self.exploitory_Q_target, self.exploitory_Q, 1.0)

        p.alpha = torch.Tensor([p.alpha]).to(self.device)
        if settings.automatic_entropy_tuning:
            self.target_entropy = -torch.prod(
                torch.Tensor(self.env.action_space.shape).to(
                    self.device)).item()
            self.log_alpha = torch.zeros(1,
                                         requires_grad=True,
                                         device=self.device)
            self.alpha_optim = Adam([self.log_alpha], lr=p.lr)

        if settings.automatic_ex_entropy_tuning:
            self.ex_target_entropy = -torch.prod(
                torch.Tensor(self.env.action_space.shape).to(
                    self.device)).item()
            self.ex_log_alpha = torch.zeros(1,
                                            requires_grad=True,
                                            device=self.device)
            self.ex_alpha_optim = Adam([self.ex_log_alpha], lr=p.lr)  # optimize the exploratory temperature, not the exploitory one

        if settings.reward_model == 'novelty':
            self.ex_reward_model = Novelty(self.state_space, self.device)
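
A hypothetical helper (assumed, not shown in the example) for the standard SAC temperature update that log_alpha and target_entropy feed into; log_pi is the log-probability of actions sampled from the current policy:

import torch

def update_alpha(log_alpha, alpha_optim, log_pi, target_entropy):
    alpha_loss = -(log_alpha * (log_pi + target_entropy).detach()).mean()
    alpha_optim.zero_grad()
    alpha_loss.backward()
    alpha_optim.step()
    return log_alpha.exp().item()  # new temperature value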
Example #6
    def __init__(self, *largs, **kwargs):
        super(SSPG, self).__init__(*largs, **kwargs)

        pi_net = PiNet(self.ns, self.na, distribution='Normal')
        self.pi_net = pi_net.to(self.device)

        pi_target = PiNet(self.ns, self.na, distribution='Normal')
        self.pi_target = pi_target.to(self.device)
        self.load_state_dict(self.pi_target, self.pi_net.state_dict())

        q_net_1 = QNet(self.ns, self.na)
        self.q_net_1 = q_net_1.to(self.device)

        q_target_1 = QNet(self.ns, self.na)
        self.q_target_1 = q_target_1.to(self.device)
        self.load_state_dict(self.q_target_1, self.q_net_1.state_dict())

        q_net_2 = QNet(self.ns, self.na)
        self.q_net_2 = q_net_2.to(self.device)

        q_target_2 = QNet(self.ns, self.na)
        self.q_target_2 = q_target_2.to(self.device)
        self.load_state_dict(self.q_target_2, self.q_net_2.state_dict())

        self.optimizer_q_1 = torch.optim.Adam(self.q_net_1.parameters(),
                                              lr=self.lr_q,
                                              betas=(0.9, 0.999),
                                              weight_decay=self.weight_decay_q)

        self.optimizer_q_2 = torch.optim.Adam(self.q_net_2.parameters(),
                                              lr=self.lr_q,
                                              betas=(0.9, 0.999),
                                              weight_decay=self.weight_decay_q)

        self.optimizer_p = torch.optim.Adam(self.pi_net.parameters(),
                                            lr=self.lr_p,
                                            betas=(0.9, 0.999),
                                            weight_decay=self.weight_decay_p)

        if self.entropy_tunning:
            self.target_entropy = -torch.prod(
                torch.Tensor(self.na).to(self.device)).item()
            self.log_alpha = torch.tensor([0.],
                                          requires_grad=True,
                                          device=self.device)
            self.optimizer_alpha = torch.optim.Adam([self.log_alpha],
                                                    lr=self.lr_q)
            self.alpha = float(self.log_alpha.exp())
Example #7
def test(L, mouse_initial_indices, rewardlist, actions_list):
    online_net = QNet(3, 4).to(device)
    online_net.load_state_dict(
        torch.load("./qlearning_model", map_location=device))
    env = deepcopy(L)

    done = False
    eaubue = 0.
    steps = 0
    score = 0
    if mouse_initial_indices is None:
        all_possible_starting_positions = np.array([*np.where(L == 1)]).T
        mouse_initial_indices = all_possible_starting_positions[
            np.random.choice(range(len(all_possible_starting_positions)))]
    state = np.array(mouse_initial_indices)
    state = torch.Tensor(state).to(device)
    state = state.unsqueeze(0)

    def progress_loop(done, steps, state, score, eaubue):
        steps += 1

        action = get_action(state, online_net, 1, env, True, eaubue)
        displacement = np.array(actions_list[action])
        newstate = state + torch.Tensor(displacement).to(device)
        row, col = int(newstate[0][0].item()), int(newstate[0][1].item())
        if env[row, col] != 0:
            next_state = newstate

            displayer.main_canva.move(displayer.mouse,
                                      *(displacement * displayer.square_size))

            reward = rewardlist[env[row, col]]
            if env[row, col] == 2:
                done = True
            if env[row, col] == 4:  # the mouse is in the water
                env[row, col] = 5  # there is no more water
                eaubue = 1.
        else:
            next_state = state
            reward = rewardlist[0]

        score += reward
        state = next_state
        print('position : ', state.tolist()[0], score)

        if done is False:
            displayer.window.after(
                800, lambda: progress_loop(done, steps, state, score, eaubue))

    displayer = Displayer()

    displayer.create_labyrinth(L, mouse_initial_indices)
    progress_loop(done, steps, state, score, 0.)

    displayer.window.mainloop()
Example #8
def main():
    env = gym.make(env_name)
    env.seed(500)
    torch.manual_seed(500)

    num_inputs = env.observation_space.shape[0]
    num_actions = env.action_space.n
    print('state size:', num_inputs)
    print('action size:', num_actions)

    net = QNet(num_inputs, num_actions)

    optimizer = optim.Adam(net.parameters(), lr=lr)
    writer = SummaryWriter('logs')

    net.to(device)
    net.train()
    running_score = 0
    steps = 0
    loss = 0

    for e in range(3000):
        done = False
        memory = Memory()

        score = 0
        state = env.reset()
        state = torch.Tensor(state).to(device)
        state = state.unsqueeze(0)

        while not done:
            steps += 1

            action = net.get_action(state)
            next_state, reward, done, _ = env.step(action)

            next_state = torch.Tensor(next_state)
            next_state = next_state.unsqueeze(0)

            mask = 0 if done else 1
            reward = reward if not done or score == 499 else -1

            action_one_hot = torch.zeros(2)
            action_one_hot[action] = 1
            memory.push(state, next_state, action_one_hot, reward, mask)

            score += reward
            state = next_state

        loss = QNet.train_model(net, memory.sample(), optimizer)

        score = score if score == 500.0 else score + 1
        running_score = 0.99 * running_score + 0.01 * score
        if e % log_interval == 0:
            print('{} episode | score: {:.2f}'.format(e, running_score))
            writer.add_scalar('log/score', float(running_score), e)
            writer.add_scalar('log/loss', float(loss), e)

        if running_score > goal_score:
            break
Example #9
def main():

    if not (os.path.isdir("logs")):
        os.makedirs("logs")

    working_dir = "logs/" + args.dir
    if not (os.path.isdir(working_dir)):
        raise NameError(args.dir + " does not exist in dir logs")

    print(args)

    env = QubeSwingupEnv(use_simulator=args.sim, batch_size= 2048*4)

    num_inputs = env.observation_space.shape[0]
    num_actions = NUMBER_OF_ACTIONS
    print('state size:', num_inputs)
    print('action size:', num_actions)

    net = QNet(num_inputs, num_actions) if not args.new_net else QNet_more_layers(num_inputs, num_actions)
    net.load_state_dict(torch.load(working_dir + "/best_model.pth", map_location=torch.device(device)))
    net.to(device)
    net.eval()
    running_score = 0
    epsilon = 1.0
    steps = 0
    beta = beta_start
    loss = 0

    best_running_score = -1000

    for e in range(1):
        done = False

        score = 0
        state = env.reset()
        state = torch.Tensor(state).to(device)
        state = state.unsqueeze(0)
        
        while not done:
            steps += 1
            action = get_continuous_action(get_action(state, net))
            if np.abs(state[0][1].item()) < deg2rad(25):
                action = pd_control_policy(state.cpu().numpy()[0])[0]
            next_state, reward, done, info = env.step(action)
            reward = give_me_reward(info["alpha"], info["theta"])
            if args.sim: env.render()
            if done:
                print(info)
                print("theta:" , info["theta"] * 180/np.pi)
            next_state = torch.Tensor(next_state).to(device)
            next_state = next_state.unsqueeze(0)

            score += reward
            state = next_state

        running_score = 0.99 * running_score + 0.01 * score
        print('{} episode | running_score: {:.2f} | score: {:.2f} | steps: {} '.format(e, running_score, score, steps))
    env.close()
Example #10
    def __init__(self, env):
        super(SACV, self).__init__()

        self.env = env
        n_a = env.action_space.shape[0]
        n_s = env.observation_space.shape[0]

        pi_net = PiNet(n_s, n_a, distribution='Normal')
        self.pi_net = pi_net.to(self.device)

        q_net_1 = QNet(n_s, n_a)
        self.q_net_1 = q_net_1.to(self.device)

        q_net_2 = QNet(n_s, n_a)
        self.q_net_2 = q_net_2.to(self.device)

        v_net = QNet(n_s, 0)
        self.v_net = v_net.to(self.device)

        v_target = QNet(n_s, 0)
        self.v_target = v_target.to(self.device)
        self.load_state_dict(self.v_target, self.v_net.state_dict())

        self.optimizer_q_1 = torch.optim.Adam(self.q_net_1.parameters(),
                                              lr=self.lr_q,
                                              betas=(0.9, 0.999),
                                              weight_decay=1e-2)

        self.optimizer_q_2 = torch.optim.Adam(self.q_net_2.parameters(),
                                              lr=self.lr_q,
                                              betas=(0.9, 0.999),
                                              weight_decay=1e-2)

        self.optimizer_v = torch.optim.Adam(self.v_net.parameters(),
                                            lr=self.lr_q,
                                            betas=(0.9, 0.999),
                                            weight_decay=1e-2)

        # eps = 1e-04,
        self.optimizer_p = torch.optim.Adam(self.pi_net.parameters(),
                                            lr=self.lr_p,
                                            betas=(0.9, 0.999),
                                            weight_decay=0)

        self.sample = self.actor_rb
Example #11
    def __init__(self, state_size, action_size, seed):
        """
        Params
        ======
            state_size (int): state dimension
            action_size (int): action dimension
            seed (int): random seed for replicating experiment
        """
        self.state_size = state_size
        self.action_size = action_size
        self.seed = random.seed(seed)

        self.QNet_local = QNet(state_size, action_size, seed).to(device)
        self.QNet_target = QNet(state_size, action_size, seed).to(device)
        self.optimizer = optim.Adam(self.QNet_local.parameters(), lr=LR)

        self.memory = ReplayBuffer(action_size, BUFFER_SIZE, BATCH_SIZE, seed)
        self.t_step = 0
Example #12
    def __init__(self, actor_id, n_actors, shared_dict, device='cpu'):
        # params
        self.gamma = 0.99
        self.epsilon = 0.4 ** (1 + actor_id * 7 / (n_actors - 1))
        self.bootstrap_steps = 3
        self.alpha = 0.6
        self.priority_epsilon = 1e-6
        self.device = device
        self.actor_id = actor_id

        # path
        self.memory_path = os.path.join(
            './', 'logs', 'memory')

        # memory
        self.memory_size = 50000
        self.batch_size = 32
        self.action_repeat = 4
        self.n_stacks = 4
        self.burn_in_length = 10
        self.learning_length = 10
        self.overlap_length = 10
        self.eta = 0.9
        self.sequence_length = self.burn_in_length + self.learning_length
        self.stack_count = self.n_stacks // self.action_repeat
        self.memory_save_interval = 5
        self.episode_start_index = 0
        self.n_steps_memory = NStepMemory(self.bootstrap_steps, self.gamma)
        self.replay_memory = ReplayMemory(self.memory_size, self.batch_size, self.bootstrap_steps)

        # net
        self.shared_dict = shared_dict
        self.net_load_interval = 5
        self.net = QNet(self.device).to(self.device)
        self.target_net = QNet(self.device).to(self.device)
        self.target_net.load_state_dict(self.net.state_dict())

        # env
        self.env = PongEnv(self.action_repeat, self.n_stacks)
        self.episode_reward = 0
        self.n_episodes = 0
        self.n_steps = 0
        self.memory_count = 0
        self.state = self.env.reset()
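
For illustration only (an assumption based on the Ape-X-style epsilon line above): with n_actors = 8, the per-actor exploration rates span 0.4 for actor 0 down to 0.4 ** 8 (about 0.00066) for actor 7.

epsilons = [0.4 ** (1 + i * 7 / (8 - 1)) for i in range(8)]  # [0.4, ..., ~0.00066]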
Example #13
def main():
    env = gym.make(env_name)
    env.seed(500)
    torch.manual_seed(500)

    num_inputs = env.observation_space.shape[0]
    num_actions = env.action_space.n
    print('state size:', num_inputs)
    print('action size:', num_actions)

    online_net = QNet(num_inputs, num_actions)
    target_net = QNet(num_inputs, num_actions)
    target_net.load_state_dict(online_net.state_dict())
    online_net.share_memory()
    target_net.share_memory()

    optimizer = SharedAdam(online_net.parameters(), lr=lr)
    global_ep, global_ep_r, res_queue = mp.Value('i', 0), mp.Value('d', 0.), mp.Queue()

    writer = SummaryWriter('logs')

    online_net.to(device)
    target_net.to(device)
    online_net.train()
    target_net.train()

    workers = [
        Worker(online_net, target_net, optimizer, global_ep, global_ep_r,
               res_queue, i) for i in range(mp.cpu_count())
    ]
    [w.start() for w in workers]
    res = []
    while True:
        r = res_queue.get()
        if r is not None:
            res.append(r)
            [ep, ep_r, loss] = r
            writer.add_scalar('log/score', float(ep_r), ep)
            writer.add_scalar('log/loss', float(loss), ep)
        else:
            break
    [w.join() for w in workers]
Example #14
    def __init__(self):
        super(Off_policy, self).__init__()
        self.memory = Replay_buffer(capacity=p.exploitory_policy_memory_size)
        self.exploratory_policy = GaussianPolicy(
            self.state_space, self.action_space).to(self.device)
        self.exploratory_Q = QNet(self.state_space,
                                  self.action_space).to(self.device)
        self.exploratory_Q_target = QNet(self.state_space,
                                         self.action_space).to(self.device)
        self.exploratory_policy_optim = Adam(
            self.exploratory_policy.parameters(), lr=p.lr)
        self.exploratory_Q_optim = Adam(self.exploratory_Q.parameters(),
                                        lr=p.lr)

        self.target_update(self.exploratory_policy, self.exploitory_policy,
                           1.0)

        self.kl_normalizer = Normalizer(1)
        self.ex_rewards_normalizer = Normalizer(1)
Example #15
def main():
    env = gym.make(args.env_name)
    env.seed(500)
    torch.manual_seed(500)

    img_shape = env.observation_space.shape
    num_actions = 3
    print('image size:', img_shape)
    print('action size:', num_actions)

    net = QNet(num_actions)
    net.load_state_dict(torch.load(args.save_path + 'model.pth'))

    net.to(device)
    net.eval()

    epsilon = 0
    steps = 0  # step counter used in the loop below

    for e in range(5):
        done = False

        score = 0
        state = env.reset()

        state = pre_process(state)
        state = torch.Tensor(state).to(device)
        history = torch.stack((state, state, state, state))

        for i in range(3):
            action = env.action_space.sample()
            state, reward, done, info = env.step(action)
            state = pre_process(state)
            state = torch.Tensor(state).to(device)
            state = state.unsqueeze(0)
            history = torch.cat((state, history[:-1]), dim=0)

        while not done:
            if args.render:
                env.render()

            steps += 1
            qvalue = net(history.unsqueeze(0))
            action = get_action(0, qvalue, num_actions)

            next_state, reward, done, info = env.step(action + 1)

            next_state = pre_process(next_state)
            next_state = torch.Tensor(next_state).to(device)
            next_state = next_state.unsqueeze(0)
            next_history = torch.cat((next_state, history[:-1]), dim=0)

            score += reward
            history = next_history

        print('{} episode | score: {:.2f}'.format(e, score))
Example #16
    def __init__(self, *largs, **kwargs):
        super(RBI, self).__init__(*largs, **kwargs)

        pi_net = PiNet(self.ns, self.na, distribution='Normal')
        self.pi_net = pi_net.to(self.device)

        pi_target = PiNet(self.ns, self.na, distribution='Normal')
        self.pi_target = pi_target.to(self.device)
        self.load_state_dict(self.pi_target, self.pi_net.state_dict())

        q_net_1 = QNet(self.ns, self.na)
        self.q_net_1 = q_net_1.to(self.device)

        q_target_1 = QNet(self.ns, self.na)
        self.q_target_1 = q_target_1.to(self.device)
        self.load_state_dict(self.q_target_1, self.q_net_1.state_dict())

        q_net_2 = QNet(self.ns, self.na)
        self.q_net_2 = q_net_2.to(self.device)

        q_target_2 = QNet(self.ns, self.na)
        self.q_target_2 = q_target_2.to(self.device)
        self.load_state_dict(self.q_target_2, self.q_net_2.state_dict())

        self.optimizer_q_1 = torch.optim.Adam(self.q_net_1.parameters(),
                                              lr=self.lr_q,
                                              betas=(0.9, 0.999),
                                              weight_decay=self.weight_decay_q)

        self.optimizer_q_2 = torch.optim.Adam(self.q_net_2.parameters(),
                                              lr=self.lr_q,
                                              betas=(0.9, 0.999),
                                              weight_decay=self.weight_decay_q)

        self.optimizer_p = torch.optim.Adam(self.pi_net.parameters(),
                                            lr=self.lr_p,
                                            betas=(0.9, 0.999),
                                            weight_decay=self.weight_decay_p)

        self.alpha = self.rbi_alpha
        if self.entropy_tunning:
            # self.target_entropy = -float(self.na)
            std_target = 0.3 / math.sqrt(self.na)
            self.target_entropy = self.na * 0.5 * math.log(2 * math.pi * math.e * (std_target ** 2))
            print(f'target entropy: {self.target_entropy}')
            self.lr_alpha = 0.01
Example #17
    def __init__(self, n_actors, device='cuda:0'):
        # params
        self.gamma = 0.99
        self.alpha = 0.6
        self.bootstrap_steps = 3
        self.initial_exploration = 50000
        self.priority_epsilon = 1e-6
        self.device = device
        self.n_epochs = 0
        self.n_actors = n_actors

        # path
        self.memory_path = os.path.join('./', 'logs', 'memory')
        self.net_path = os.path.join('./', 'logs', 'model', 'net.pt')
        self.target_net_path = os.path.join('./', 'logs', 'model',
                                            'target_net.pt')

        # memory
        self.memory_size = 500000
        self.batch_size = 128
        self.memory_load_interval = 10
        self.replay_memory = ReplayMemory(self.memory_size, self.batch_size,
                                          self.bootstrap_steps)

        # net
        self.net_save_interval = 50
        self.target_update_interval = 1000
        self.net = QNet(self.net_path, self.device).to(self.device)
        self.target_net = QNet(self.target_net_path,
                               self.device).to(self.device)
        self.target_net.load_state_dict(self.net.state_dict())
        self.net.save()
        self.target_net.save()
        self.optim = optim.RMSprop(self.net.parameters(),
                                   lr=0.00025 / 4.0,
                                   alpha=0.95,
                                   eps=1.5e-7,
                                   centered=True)
Example #18
    def __init__(self, *largs, **kwargs):
        super(TD3, self).__init__(*largs, **kwargs)

        pi_net = PiNet(self.ns, self.na)
        self.pi_net = pi_net.to(self.device)

        pi_target = PiNet(self.ns, self.na)
        self.pi_target = pi_target.to(self.device)
        self.load_state_dict(self.pi_target, self.pi_net.state_dict())

        q_net_1 = QNet(self.ns, self.na)
        self.q_net_1 = q_net_1.to(self.device)

        q_target_1 = QNet(self.ns, self.na)
        self.q_target_1 = q_target_1.to(self.device)
        self.load_state_dict(self.q_target_1, self.q_net_1.state_dict())

        q_net_2 = QNet(self.ns, self.na)
        self.q_net_2 = q_net_2.to(self.device)

        q_target_2 = QNet(self.ns, self.na)
        self.q_target_2 = q_target_2.to(self.device)
        self.load_state_dict(self.q_target_2, self.q_net_2.state_dict())

        self.optimizer_q_1 = torch.optim.Adam(self.q_net_1.parameters(),
                                              lr=self.lr_q,
                                              betas=(0.9, 0.999))

        self.optimizer_q_2 = torch.optim.Adam(self.q_net_2.parameters(),
                                              lr=self.lr_q,
                                              betas=(0.9, 0.999))

        self.optimizer_p = torch.optim.Adam(self.pi_net.parameters(),
                                            lr=self.lr_p,
                                            betas=(0.9, 0.999))

        self.noise = RandomNoise(
            torch.zeros(1, self.na).to(self.device), self.epsilon)
Example #19
    def __init__(self, run):
        self.run = run
        ckpt_dir = os.path.join(run, 'ckpt')    
        ckpts = glob2.glob(os.path.join(ckpt_dir, '*.pth'))
        assert ckpts, "No checkpoints to resume from!"

        def get_epoch(ckpt_url):
            s = re.findall(r"ckpt_e(\d+)\.pth", ckpt_url)
            epoch = int(s[0]) if s else -1
            return epoch, ckpt_url

        start_epoch, ckpt = max(get_epoch(c) for c in ckpts)
        print('Checkpoint:', ckpt)
        
        if torch.cuda.is_available():
            model = QNet().cuda()
        else:
            model = QNet()
        
        ckpt = torch.load(ckpt)
        model.load_state_dict(ckpt['model'])
        model.eval()
        
        self.model = model
Example #20
File: sac.py  Project: km01/myrl
    def __init__(self, input_size, action_size, gamma, tau, alpha, hidden_size,
                 lr, device):

        self.gamma, self.tau, self.alpha = gamma, tau, alpha
        self.lr, self.device = lr, device

        self.policy = Actor(input_size, hidden_size,
                            action_size).to(self.device)
        self.critic = QNet(input_size, hidden_size,
                           action_size).to(self.device)

        self.policy_optim = torch.optim.Adam(self.policy.parameters(),
                                             lr=self.lr)
        self.critic_optim = torch.optim.Adam(self.critic.parameters(),
                                             lr=self.lr)

        self.critic_target = copy.deepcopy(self.critic)
        self.critic_target.requires_grad_(False)
Example #21
    def setup(self, obs_shape, nb_action):
        self.lr_coef = 1
        self.epsilon = 1
        self.nb_action = nb_action
        model_args = Singleton_arger()['model']

        qnet = QNet(obs_shape, nb_action)

        self.qnet = copy.deepcopy(qnet)
        self.target_qnet = copy.deepcopy(qnet)

        self.memory = Memory(self.buffer_size, nb_action, self.with_cuda)

        if self.with_cuda:
            self.qnet.cuda()
            self.target_qnet.cuda()

        self.qnet_optim = Adam(self.qnet.parameters(), lr=self.critic_lr)
Example #22
    def __init__(self, *largs, **kwargs):
        super(PPO, self).__init__(*largs, **kwargs)

        self.pi_net = PiNet(self.ns,
                            self.na,
                            distribution='Normal',
                            bounded=False,
                            agent='ppo').to(self.device)
        self.v_net = QNet(self.ns, 0, agent='ppo').to(self.device)

        self.optimizer_v = torch.optim.Adam(self.v_net.parameters(),
                                            lr=self.lr_q,
                                            betas=(0.9, 0.999),
                                            weight_decay=self.weight_decay_q)

        self.optimizer_p = torch.optim.Adam(self.pi_net.parameters(),
                                            lr=self.lr_p,
                                            betas=(0.9, 0.999),
                                            weight_decay=self.weight_decay_p)
Example #23
def main():
    env = gym.make(args.env_name)
    env.seed(500)
    torch.manual_seed(500)

    num_inputs = env.observation_space.shape[0]
    num_actions = env.action_space.n
    print('state size:', num_inputs)
    print('action size:', num_actions)

    net = QNet(num_inputs, num_actions)
    net.load_state_dict(torch.load(args.save_path + 'model.pth'))

    net.to(device)
    net.eval()
    running_score = 0
    steps = 0

    for e in range(5):
        done = False

        score = 0
        state = env.reset()
        state = torch.Tensor(state).to(device)
        state = state.unsqueeze(0)

        while not done:
            env.render()

            steps += 1
            qvalue = net(state)
            action = get_action(qvalue)
            next_state, reward, done, _ = env.step(action)

            next_state = torch.Tensor(next_state).to(device)
            next_state = next_state.unsqueeze(0)

            score += reward
            state = next_state

        print('{} episode | score: {:.2f}'.format(e, score))
Example #24
def main():
    net = QNet().cuda().train()
    #    print(net)
    optimizer = optim.SGD([
        {'params': [param for name, param in net.named_parameters()
                    if name[-4:] == 'bias'],
         'lr': 2 * args['lr']},
        {'params': [param for name, param in net.named_parameters()
                    if name[-4:] != 'bias'],
         'lr': args['lr'],
         'weight_decay': args['weight_decay']}
    ], momentum=args['momentum'])

    if len(args['snapshot']) > 0:
        print('training resumes from ' + args['snapshot'])
        net.load_state_dict(
            torch.load(
                os.path.join(ckpt_path, exp_name, args['snapshot'] + '.pth')))
        optimizer.load_state_dict(
            torch.load(
                os.path.join(ckpt_path, exp_name,
                             args['snapshot'] + '_optim.pth')))
        optimizer.param_groups[0]['lr'] = 2 * args['lr']
        optimizer.param_groups[1]['lr'] = args['lr']

    check_mkdir(ckpt_path)
    check_mkdir(os.path.join(ckpt_path, exp_name))
    open(log_path, 'w').write(str(args) + '\n\n')
    train(net, optimizer)
Example #25
def main():
    env = gym.make(args.env_name)
    env.seed(500)
    torch.manual_seed(500)

    state_size = env.observation_space.shape[0]
    action_size = env.action_space.n
    print('state size:', state_size)
    print('action size:', action_size)

    q_net = QNet(state_size, action_size, args)
    target_q_net = QNet(state_size, action_size, args)
    optimizer = optim.Adam(q_net.parameters(), lr=0.001)

    update_target_model(q_net, target_q_net)

    writer = SummaryWriter(args.logdir)

    replay_buffer = deque(maxlen=10000)
    running_score = 0
    steps = 0

    for episode in range(args.max_iter_num):
        done = False
        score = 0

        state = env.reset()
        state = np.reshape(state, [1, state_size])

        while not done:
            if args.render:
                env.render()

            steps += 1

            q_values = q_net(torch.Tensor(state))
            action = get_action(q_values, action_size, args.epsilon)

            next_state, reward, done, _ = env.step(action)

            next_state = np.reshape(next_state, [1, state_size])
            reward = reward if not done or score == 499 else -1
            mask = 0 if done else 1

            replay_buffer.append((state, action, reward, next_state, mask))

            state = next_state
            score += reward

            if steps > args.initial_exploration:
                args.epsilon -= args.epsilon_decay
                args.epsilon = max(args.epsilon, 0.1)

                mini_batch = random.sample(replay_buffer, args.batch_size)

                q_net.train(), target_q_net.train()
                train_model(q_net, target_q_net, optimizer, mini_batch)

                if steps % args.update_target == 0:
                    update_target_model(q_net, target_q_net)

        score = score if score == 500.0 else score + 1
        running_score = 0.99 * running_score + 0.01 * score

        if episode % args.log_interval == 0:
            print(
                '{} episode | running_score: {:.2f} | epsilon: {:.2f}'.format(
                    episode, running_score, args.epsilon))
            writer.add_scalar('log/score', float(score), episode)

        if running_score > args.goal_score:
            if not os.path.isdir(args.save_path):
                os.makedirs(args.save_path)

            ckpt_path = args.save_path + 'model.pth.tar'
            torch.save(q_net.state_dict(), ckpt_path)
            print('Running score exceeded the goal score, so training ends')
            break
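
A minimal stand-in (assumed; the real get_action is defined elsewhere) for the epsilon-greedy selection used in the loop above: explore with probability epsilon, otherwise act greedily on the Q-values.

import random
import torch

def epsilon_greedy_action(q_values, action_size, epsilon):
    if random.random() <= epsilon:
        return random.randrange(action_size)
    return int(torch.argmax(q_values).item())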
Example #26
if __name__ == "__main__":
    env = env.MinecraftEnv()
    env.init(allowContinuousMovement=["move", "turn"],
             videoResolution=[800, 600])
    env.seed(500)
    torch.manual_seed(500)
    render_map = False

    num_inputs = env.observation_space.shape
    num_actions = len(env.action_names[0])

    print('state size:', num_inputs)
    print('action size:', num_actions)

    model = QNet(num_actions)
    model.apply(weights_init)
    target_model = QNet(num_actions)
    update_target_model(model, target_model)
    model.train()
    target_model.train()

    optimizer = optim.Adam(model.parameters(),
                           lr=hp.lr,
                           weight_decay=hp.l2_rate)

    memory = Memory(100000)
    if render_map:
        root, canvas = init_map()

    steps = 0
Example #27
def main():

    if not (os.path.isdir("logs")):
        os.makedirs("logs")

    if (args.entropy and args.boltzmann):
        raise ValueError("Entropy as well as Boltzmann set.")

    print(args)

    working_dir = "logs/" + args.dir
    if not (os.path.isdir(working_dir)):
        os.mkdir(working_dir)

    env = QubeSwingupEnv(use_simulator=True)

    num_inputs = env.observation_space.shape[0]
    num_actions = NUMBER_OF_ACTIONS
    print('state size:', num_inputs)
    print('action size:', num_actions)

    online_net = QNet(num_inputs, num_actions)
    target_net = QNet(num_inputs, num_actions)

    update_target_model(online_net, target_net)

    optimizer = optim.Adam(online_net.parameters(), lr=lr)
    writer = SummaryWriter(working_dir)

    online_net.to(device)
    target_net.to(device)
    online_net.train()
    target_net.train()
    memory = Memory_With_TDError(replay_memory_capacity)
    running_score = 0
    epsilon = 1.0
    steps = 0
    beta = beta_start
    loss = 0
    training_started = False

    best_running_score = -1000

    for e in range(args.e):
        done = False

        score = 0
        state = env.reset()
        state = torch.Tensor(state).to(device)
        state = state.unsqueeze(0)
        start_time = time.time()

        while not done:
            steps += 1
            action = get_action(state,
                                target_net,
                                epsilon,
                                use_entropy=args.entropy,
                                use_boltzmann=args.boltzmann)
            next_state, reward, done, info = env.step(
                get_continuous_action(action))

            reward = give_me_reward(info["alpha"], info["theta"])

            next_state = torch.Tensor(next_state).to(device)
            next_state = next_state.unsqueeze(0)

            mask = 0 if done else 1
            action_one_hot = np.zeros(NUMBER_OF_ACTIONS)
            action_one_hot[action] = 1
            memory.push(state, next_state, action_one_hot, reward, mask)

            score += reward
            state = next_state

            if steps > initial_exploration:
                if not training_started:
                    print("---------------- training started ---------------")
                    training_started = True
                epsilon -= 0.000005
                epsilon = max(epsilon, 0.1)
                beta += 0.000005
                beta = min(1, beta)

                batch, weights = memory.sample(batch_size, online_net,
                                               target_net, beta)
                loss = QNet.train_model(online_net, target_net, optimizer,
                                        batch, weights, device)

                if steps % update_target == 0:
                    update_target_model(online_net, target_net)

        end_time = time.time()
        running_score = 0.99 * running_score + 0.01 * score
        if e % log_interval == 0:
            print(
                '{} episode | score: {:.2f} | epsilon: {:.2f} | beta: {:.2f}'.
                format(e, running_score, epsilon, beta))
            writer.add_scalar('log/score', float(running_score), e)
            writer.add_scalar('log/loss', float(loss), e)

        if running_score > best_running_score and args.save:
            torch.save(online_net.state_dict(),
                       working_dir + "/best_model.pth")
            best_running_score = running_score
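
A hypothetical sketch (assumed; Memory_With_TDError is defined elsewhere) of the importance-sampling weights that beta anneals in prioritized replay: w_i = (N * P(i)) ** (-beta), normalized by the largest weight in the batch.

import numpy as np

def importance_weights(priorities, beta, alpha=0.6):
    probs = priorities ** alpha
    probs = probs / probs.sum()
    weights = (len(priorities) * probs) ** (-beta)
    return weights / weights.max()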
Example #28
def main():
    # cartpole test
    if (cartpole_test):
        envs_fun = [lambda: gym.make('CartPole-v0')]
        envs_fun = np.tile(envs_fun, 3)
        envs = ShmemVecEnv(envs_fun)
        dummy_env = envs_fun[0]()
    else:
        INPUT_FILE = '../data/05f2a901.json'
        with open(INPUT_FILE, 'r') as f:
            puzzle = json.load(f)

        envs_fun = [
            lambda: gym.make('arc-v0',
                             input=task['input'],
                             output=task['output'],
                             need_ui=need_ui) for task in puzzle['train']
        ]
        #pdb.set_trace()
        envs_fun = envs_fun[0:1]
        envs = ShmemVecEnv(envs_fun)
        dummy_env = envs_fun[0]()

    env_num = len(envs_fun)
    torch.manual_seed(500)

    num_inputs = dummy_env.observation_space.shape[0]
    num_actions = dummy_env.action_space.n
    print('state size:', num_inputs)
    print('action size:', num_actions)

    online_net = QNet(num_inputs, num_actions, cartpole_test, evalution_mode)
    target_net = QNet(num_inputs, num_actions, cartpole_test, evalution_mode)

    if (evalution_mode):
        online_net = torch.load('../result/arc0.model')
        target_net = torch.load('../result/arc0.model')

    update_target_model(online_net, target_net)

    optimizer = optim.Adam(online_net.parameters(), lr=lr)
    writer = SummaryWriter('logs')

    online_net.to(device)
    target_net.to(device)
    online_net.train()
    target_net.train()
    memory = Memory(replay_memory_capacity)

    score = 0
    epsilon = 1.0
    steps = 0
    loss = 0

    states = envs.reset()

    try:
        while True:
            if (need_ui):
                envs.render()
            steps += 1

            global initial_exploration
            if (initial_exploration > 0):
                initial_exploration -= 1

            actions = []

            for state in states:
                state = torch.Tensor(state).to(device)
                state = state.unsqueeze(0)
                action = get_action(state, target_net,
                                    0 if evalution_mode else epsilon,
                                    dummy_env)
                if (evalution_mode):
                    print(action)
                actions.append(action)

            next_states, rewards, dones, info = envs.step(actions)
            #print(rewards)

            masks = np.zeros(envs.num_envs)
            for i in range(envs.num_envs):
                masks[i] = 0 if dones[i] else 1

            for i in range(envs.num_envs):
                #print(rewards[i])
                action_one_hot = np.zeros(dummy_env.action_space.n)
                action_one_hot[actions[i]] = 1
                memory.push(states[i], next_states[i], action_one_hot,
                            rewards[i], masks[i])

            #score += reward
            states = next_states

            if not evalution_mode and steps > initial_exploration:
                epsilon -= 0.00003
                epsilon = max(epsilon, 0.1)

                batch = memory.sample(batch_size)
                loss = QNet.train_model(online_net, target_net, optimizer,
                                        batch, device)

                if steps % update_target == 0:
                    update_target_model(online_net, target_net)

            if (steps > 1028):
                states = envs.reset()
                steps = 0
                print(
                    'new episode ------------------------------------------')

    except KeyboardInterrupt:
        print('save model')
        torch.save(target_net, '../result/arc.model')
        sys.exit(0)
Example #29
def main():
    env = gym.make(env_name)
    env.seed(500)
    torch.manual_seed(500)

    num_inputs = env.observation_space.shape[0]
    num_actions = env.action_space.n
    print('state size:', num_inputs)
    print('action size:', num_actions)

    online_net = QNet(num_inputs, num_actions)
    target_net = QNet(num_inputs, num_actions)
    update_target_model(online_net, target_net)

    optimizer = optim.Adam(online_net.parameters(), lr=lr)
    writer = SummaryWriter('logs')

    online_net.to(device)
    target_net.to(device)
    online_net.train()
    target_net.train()
    memory = Memory_With_TDError(replay_memory_capacity)
    running_score = 0
    epsilon = 1.0
    steps = 0
    beta = beta_start
    loss = 0

    for e in range(3000):
        done = False

        score = 0
        state = env.reset()
        state = torch.Tensor(state).to(device)
        state = state.unsqueeze(0)

        while not done:
            steps += 1
            action = get_action(state, target_net, epsilon, env)
            next_state, reward, done, _ = env.step(action)

            next_state = torch.Tensor(next_state)
            next_state = next_state.unsqueeze(0)

            mask = 0 if done else 1
            reward = reward if not done or score == 499 else -1
            action_one_hot = np.zeros(2)
            action_one_hot[action] = 1
            memory.push(state, next_state, action_one_hot, reward, mask)

            score += reward
            state = next_state

            if steps > initial_exploration:
                epsilon -= 0.00005
                epsilon = max(epsilon, 0.1)
                beta += 0.00005
                beta = min(1, beta)

                batch, weights = memory.sample(batch_size, online_net,
                                               target_net, beta)
                loss = QNet.train_model(online_net, target_net, optimizer,
                                        batch, weights)

                if steps % update_target == 0:
                    update_target_model(online_net, target_net)

        score = score if score == 500.0 else score + 1
        running_score = 0.99 * running_score + 0.01 * score
        if e % log_interval == 0:
            print(
                '{} episode | score: {:.2f} | epsilon: {:.2f} | beta: {:.2f}'.
                format(e, running_score, epsilon, beta))
            writer.add_scalar('log/score', float(running_score), e)
            writer.add_scalar('log/loss', float(loss), e)

        if running_score > goal_score:
            break
Example #30
def main():
    env = gym.make(env_name)
    env.seed(500)
    torch.manual_seed(500)

    ### the NN's input/output sizes depend on the environment
    num_inputs = env.observation_space.shape[0]
    num_actions = env.action_space.n
    print('state size:', num_inputs)
    print('action size:', num_actions)

    ### create and initialize the two networks
    online_net = QNet(num_inputs, num_actions)
    target_net = QNet(num_inputs, num_actions)
    update_target_model(online_net, target_net)

    optimizer = optim.Adam(online_net.parameters(), lr=lr)

    ### network setup: move to CPU / GPU
    online_net.to(device)
    target_net.to(device)
    ### network setup: start in training mode
    online_net.train()
    target_net.train()

    ### initial setup before training
    memory = Memory(replay_memory_capacity)
    running_score = 0
    epsilon = 1.0
    steps = 0
    loss = 0
    steps_before = 0

    for e in range(3000):
        done = False

        score = 0
        state = env.reset()
        state = torch.Tensor(state).to(device)
        state = state.unsqueeze(0)

        while not done:
            steps += 1

            ### actions are chosen with target_net
            action = get_action(state, target_net, epsilon, env)

            ### observe the next state and receive the reward
            next_state, reward, done, _ = env.step(action)
            next_state = torch.Tensor(next_state)
            next_state = next_state.unsqueeze(0)
            if e % 10 == 0:
                print(next_state, action, reward)

            ### written out explicitly because the terse form was hard to read
            if done:
                mask = 0
            else:
                mask = 1

            ### record the transition in memory
            action_one_hot = np.zeros(num_actions)
            action_one_hot[action] = 1
            memory.push(state, next_state, action_one_hot, reward, mask)

            ### the reward is basically -1
            score += reward  ### only used to record how many steps this episode lasted

            state = next_state

            if steps > initial_exploration:
                epsilon -= 0.00005
                epsilon = max(epsilon, 0.1)

                ### train online_net
                batch = memory.sample(batch_size)
                loss = QNet.train_model(online_net, target_net, optimizer,
                                        batch)

                ### occasionally overwrite target_net with online_net
                if steps % update_target == 0:
                    update_target_model(online_net, target_net)

        print("Ep {0:04d}: {1} step".format(e, steps - steps_before))
        steps_before = steps

        score = score if score == 200.0 else score + 1
        running_score = 0.99 * running_score + 0.01 * score
        if e % log_interval == 0:
            print('{} episode | score: {:.2f} | epsilon: {:.2f}'.format(
                e, running_score, epsilon))

        if running_score > goal_score:
            break