def main():
    env = gym.make(env_name)
    env.seed(500)
    torch.manual_seed(500)

    num_inputs = env.observation_space.shape[0]
    num_actions = env.action_space.n
    print('state size:', num_inputs)
    print('action size:', num_actions)

    net = QNet(num_inputs, num_actions)

    optimizer = optim.Adam(net.parameters(), lr=lr)
    writer = SummaryWriter('logs')

    net.to(device)
    net.train()
    running_score = 0
    steps = 0
    loss = 0

    for e in range(3000):
        done = False
        memory = Memory()

        score = 0
        state = env.reset()
        state = torch.Tensor(state).to(device)
        state = state.unsqueeze(0)

        while not done:
            steps += 1

            action = net.get_action(state)
            next_state, reward, done, _ = env.step(action)

            next_state = torch.Tensor(next_state)
            next_state = next_state.unsqueeze(0)

            mask = 0 if done else 1
            reward = reward if not done or score == 499 else -1

            action_one_hot = torch.zeros(num_actions)
            action_one_hot[action] = 1
            memory.push(state, next_state, action_one_hot, reward, mask)

            score += reward
            state = next_state

        loss = QNet.train_model(net, memory.sample(), optimizer)

        score = score if score == 500.0 else score + 1
        running_score = 0.99 * running_score + 0.01 * score
        if e % log_interval == 0:
            print('{} episode | score: {:.2f}'.format(e, running_score))
            writer.add_scalar('log/score', float(running_score), e)
            writer.add_scalar('log/loss', float(loss), e)

        if running_score > goal_score:
            break
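
### Hedged sketch: the Memory class used above is not shown. A minimal
### per-episode buffer matching the push()/sample() calls might look like the
### following (an assumption, not necessarily the project's implementation).
from collections import namedtuple

Transition = namedtuple('Transition',
                        ('state', 'next_state', 'action', 'reward', 'mask'))

class Memory(object):
    def __init__(self):
        self.memory = []

    def push(self, state, next_state, action, reward, mask):
        # Store one transition; the episode is kept in time order.
        self.memory.append(Transition(state, next_state, action, reward, mask))

    def sample(self):
        # Return the whole episode as a Transition of batched fields.
        return Transition(*zip(*self.memory))

    def __len__(self):
        return len(self.memory)
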
class Agent():
    def __init__(self, state_size, action_size, seed):
        self.state_size = state_size
        self.action_size = action_size
        self.seed = random.seed(seed)

        self.qnetwork_local = QNet(state_size, action_size, seed).to(device)
        self.qnetwork_target = QNet(state_size, action_size, seed).to(device)
        self.optimizer = optim.Adam(self.qnetwork_local.parameters(), lr=LR)

        # Replay Memory
        self.memory = ReplayBuffer(action_size, BUFFER_SIZE, BATCH_SIZE, seed)
        self.t_step = 0

    def step(self, state, action, reward, next_state, done):
        self.memory.add(state, action, reward, next_state, done)

        self.t_step = (self.t_step + 1) % UPDATE_EVERY
        if self.t_step == 0:
            if len(self.memory) > BATCH_SIZE:
                experiences = self.memory.sample()
                self.learn(experiences, GAMMA)

    def act(self, state, eps=0.1):
        state = torch.from_numpy(state).float().unsqueeze(0).to(device)
        self.qnetwork_local.eval()
        with torch.no_grad():
            action_values = self.qnetwork_local(state)
        self.qnetwork_local.train()

        if random.random() > eps:
            return np.argmax(action_values.cpu().data.numpy())
        else:
            return random.choice(np.arange(self.action_size))

    def learn(self, experiences, gamma):
        states, actions, rewards, next_states, dones = experiences
        
        # For normal DQN
        #Q_targets_next = self.qnetwork_target(next_states).detach().max(1)[0].unsqueeze(1)

        # For double DQN: select the action with the local net, evaluate it with the target net
        Q_targets_next = self.qnetwork_local(next_states).detach().argmax(dim=1).unsqueeze(1)
        Q_targets_next = self.qnetwork_target(next_states).gather(1, Q_targets_next)
        
        Q_targets = rewards + (gamma * Q_targets_next * (1 - dones))
        Q_expected = self.qnetwork_local(states).gather(1, actions)

        loss = F.mse_loss(Q_expected, Q_targets)
        self.optimizer.zero_grad()
        loss.backward()
        self.optimizer.step()

        self.soft_update(self.qnetwork_local, self.qnetwork_target, TAU)

    def soft_update(self, local_model, target_model, tau):
        for target_param, local_param in zip(target_model.parameters(), local_model.parameters()):
            target_param.data.copy_(tau * local_param.data + (1.0 - tau) * target_param.data)
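
### Hedged sketch: a typical way to drive the Agent above in a Gym environment.
### The environment name, episode count and epsilon schedule are illustrative
### assumptions, not taken from the original code.
import gym
import numpy as np

def train_agent(env_name='CartPole-v1', n_episodes=500,
                eps_start=1.0, eps_end=0.01, eps_decay=0.995):
    env = gym.make(env_name)
    agent = Agent(state_size=env.observation_space.shape[0],
                  action_size=env.action_space.n, seed=0)
    eps = eps_start
    for episode in range(n_episodes):
        state = env.reset()
        done = False
        while not done:
            action = agent.act(state, eps)
            next_state, reward, done, _ = env.step(action)
            agent.step(state, action, reward, next_state, done)  # store and (maybe) learn
            state = next_state
        eps = max(eps_end, eps_decay * eps)  # decay exploration over episodes
    env.close()
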
Example #3
def main():
    env = gym.make(env_name)
    env.seed(500)
    torch.manual_seed(500)

    num_inputs = env.observation_space.shape[0]
    num_actions = env.action_space.n
    print('state size:', num_inputs)
    print('action size:', num_actions)

    online_net = QNet(num_inputs, num_actions)
    target_net = QNet(num_inputs, num_actions)
    target_net.load_state_dict(online_net.state_dict())
    online_net.share_memory()
    target_net.share_memory()

    optimizer = SharedAdam(online_net.parameters(), lr=lr)
    global_ep = mp.Value('i', 0)
    global_ep_r = mp.Value('d', 0.)
    res_queue = mp.Queue()

    writer = SummaryWriter('logs')

    online_net.to(device)
    target_net.to(device)
    online_net.train()
    target_net.train()

    workers = [
        Worker(online_net, target_net, optimizer, global_ep, global_ep_r,
               res_queue, i) for i in range(mp.cpu_count())
    ]
    [w.start() for w in workers]
    res = []
    while True:
        r = res_queue.get()
        if r is not None:
            res.append(r)
            [ep, ep_r, loss] = r
            writer.add_scalar('log/score', float(ep_r), ep)
            writer.add_scalar('log/loss', float(loss), ep)
        else:
            break
    [w.join() for w in workers]
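
### Hedged sketch: SharedAdam is not defined in this snippet. In A3C-style code
### it is usually an Adam optimizer whose per-parameter state is created up
### front and placed in shared memory so worker processes can update the shared
### model. This is an assumed minimal version, not the project's implementation.
import math
import torch
import torch.optim as optim

class SharedAdam(optim.Adam):
    def __init__(self, params, lr=1e-3, betas=(0.9, 0.999), eps=1e-8,
                 weight_decay=0):
        super(SharedAdam, self).__init__(params, lr=lr, betas=betas, eps=eps,
                                         weight_decay=weight_decay)
        # Pre-build the optimizer state and move it into shared memory.
        for group in self.param_groups:
            for p in group['params']:
                state = self.state[p]
                state['step'] = torch.zeros(1).share_memory_()
                state['exp_avg'] = torch.zeros_like(p.data).share_memory_()
                state['exp_avg_sq'] = torch.zeros_like(p.data).share_memory_()

    def step(self, closure=None):
        # Plain Adam update written against the shared state tensors.
        loss = closure() if closure is not None else None
        for group in self.param_groups:
            for p in group['params']:
                if p.grad is None:
                    continue
                grad = p.grad.data
                state = self.state[p]
                exp_avg, exp_avg_sq = state['exp_avg'], state['exp_avg_sq']
                beta1, beta2 = group['betas']
                state['step'] += 1
                if group['weight_decay'] != 0:
                    grad = grad.add(p.data, alpha=group['weight_decay'])
                exp_avg.mul_(beta1).add_(grad, alpha=1 - beta1)
                exp_avg_sq.mul_(beta2).addcmul_(grad, grad, value=1 - beta2)
                denom = exp_avg_sq.sqrt().add_(group['eps'])
                bias_c1 = 1 - beta1 ** state['step'].item()
                bias_c2 = 1 - beta2 ** state['step'].item()
                step_size = group['lr'] * math.sqrt(bias_c2) / bias_c1
                p.data.addcdiv_(exp_avg, denom, value=-step_size)
        return loss
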
Example #4
def main():
    env = gym.make(env_name)
    env.seed(500)
    torch.manual_seed(500)

    num_inputs = env.observation_space.shape[0]
    num_actions = env.action_space.n
    print('state size:', num_inputs)
    print('action size:', num_actions)

    ### Build the policy network
    ### Given an input it outputs pi(a|s) and V(s)
    ### V has a single output; the advantage is computed at training time
    net = QNet(num_inputs, num_actions)
    optimizer = optim.Adam(net.parameters(), lr=lr)

    net.to(device)
    net.train()

    ### Miscellaneous initialization
    running_score = 0
    steps = 0
    loss = 0
    steps_before = 0

    df = pd.DataFrame(index=range(10000),
                      columns=["steps", "loss_policy", "loss_value"])

    memory = Memory()

    for e in range(10000):
        done = False
        ### Transitions are accumulated in a shared Memory across episodes and trained on in batches (see below)

        ### Reset the environment to its initial state
        score = 0
        state = env.reset()
        state = torch.Tensor(state).to(device)
        state = state.unsqueeze(0)

        while not done:
            steps += 1

            ### No epsilon: the action values are converted directly into probabilities to pick an action
            action = net.get_action(state)
            next_state, reward, done, _ = env.step(action)

            next_state = torch.Tensor(next_state)
            next_state = next_state.unsqueeze(0)

            mask = 0 if done else 1
            reward = reward if not done or score == 499 else -1

            action_one_hot = torch.zeros(num_actions)
            action_one_hot[action] = 1

            transition = [state, next_state, action, reward, mask]

            memory.push(state, next_state, action_one_hot, reward, mask)

            score += reward
            state = next_state

        steps_before = steps

        score = score if score == 500.0 else score + 1
        running_score = 0.99 * running_score + 0.01 * score

        if e % 16 == 0:
            ### Every 16 episodes, train on the accumulated memory in one batch
            loss, loss_policy, loss_value = QNet.train_model(
                net, optimizer, memory.sample())
            ### Reset the memory
            memory = Memory()

            df.loc[e, "steps"] = running_score
            df.loc[e, "loss_policy"] = loss_policy
            df.loc[e, "loss_value"] = loss_value

            print(
                "Ep {0:04d}: score: {1:02d}, loss_policy: {2}, loss_value: {3}"
                .format(e, int(running_score), loss_policy, loss_value))

        if running_score > goal_score:
            break
    df.to_csv("loss.csv")
def train(render):
    online_net = QNet(h=84, w=84, outputs=36)
    online_net.load_state_dict(torch.load('saved/online_net.pt'))
    target_net = QNet(h=84, w=84, outputs=36)
    update_target_model(online_net, target_net)

    optimizer = optim.Adam(online_net.parameters(), lr=lr)

    online_net.to(device)
    target_net.to(device)
    online_net.train()
    target_net.train()
    memory = Memory(replay_memory_capacity)
    memory = torch.load('saved/model_memory.pt')
    epsilon = 0.1
    steps = 0
    beta = beta_start
    loss = 0

    for e in range(100000):
        #level = random.choice(LEVEL_SET)
        level = 'Level01'
        env = make_retro(game=env_name,
                         state=level,
                         use_restricted_actions=retro.Actions.DISCRETE)

        done = False

        total_reward = 0.0
        state = env.reset()
        state = torch.Tensor(state).to(device).permute(2, 0, 1)
        #state = state.view(state.size()[0], -1)
        state = state.unsqueeze(0)

        while not done:
            steps += 1
            action = get_action(state.to(device), target_net, epsilon, env)

            if render:
                env.render()

            next_state, reward, done, info = env.step(action)

            next_state = torch.Tensor(next_state).permute(2, 0, 1)
            #next_state = next_state.view(next_state.size()[0], -1)
            next_state = next_state.unsqueeze(0)

            total_reward += reward

            mask = 0 if done else 1
            action_one_hot = torch.zeros(36)
            action_one_hot[action] = 1

            reward = torch.tensor([info['score']]).to(device)
            memory.push(state, next_state, action_one_hot, reward, mask)

            state = next_state

            if len(memory) > initial_exploration:
                epsilon -= 0.00005
                epsilon = max(epsilon, 0.02)
                beta += 0.00005
                beta = min(1, beta)

                batch, weights = memory.sample(batch_size, online_net,
                                               target_net, beta)
                loss = QNet.train_model(online_net, target_net, optimizer,
                                        batch, weights)

                if steps % update_target == 0:
                    update_target_model(online_net, target_net)

        if e % 1 == 0:
            print('{} episode | Total Reward: {}'.format(e, total_reward))
            torch.save(online_net.state_dict(), 'saved/online_net.pt')
            torch.save(memory, 'saved/model_memory.pt')
        env.close()
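
### Hedged sketch: memory.sample() above returns importance-sampling weights
### driven by beta. For proportional prioritized replay the usual formula is
### w_i = (N * P(i)) ** (-beta), normalized by the largest weight. The alpha
### default here is an assumption.
import numpy as np

def importance_weights(priorities, sampled_idx, beta, alpha=0.6):
    priorities = np.asarray(priorities, dtype=np.float64)
    probs = priorities ** alpha
    probs /= probs.sum()                       # sampling probabilities P(i)
    n = len(priorities)
    weights = (n * probs[sampled_idx]) ** (-beta)
    return weights / weights.max()             # normalize for stability
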
Example #6
def main(L, mouse_initial_indices, rewardlist, actions_list):
    if mouse_initial_indices is None:
        all_possible_starting_positions = np.array([*np.where(L == 1)]).T
    scores = [0]
    best_scores = [0]
    env = deepcopy(L)
    torch.manual_seed(2020)

    num_inputs = 2 + 1
    num_actions = 4
    print('state size:', num_inputs)
    print('action size:', num_actions)

    online_net = QNet(num_inputs, num_actions)
    target_net = QNet(num_inputs, num_actions)
    update_target_model(online_net, target_net)

    optimizer = optim.Adam(online_net.parameters(), lr=lr)
    # writer = SummaryWriter('logs')

    online_net.to(device)
    target_net.to(device)
    online_net.train()
    target_net.train()
    memory = Memory(replay_memory_capacity)
    running_score = 0
    epsilon = 1.0
    steps = 0
    loss = 0
    inint = mouse_initial_indices
    best_score = 0
    number_episode = 1000
    for e in range(number_episode):
        if inint is None:
            mouse_initial_indices = all_possible_starting_positions[
                np.random.choice(range(len(all_possible_starting_positions)))]

        done = False
        env = deepcopy(L)
        eaubue = 0.
        score = 0
        state = np.array(mouse_initial_indices)
        state = torch.Tensor(state).to(device)
        state = state.unsqueeze(0)

        while not done:
            steps += 1

            action = get_action(state, target_net, epsilon, env, eaubue=eaubue)
            newstate = state + torch.Tensor(np.array(
                actions_list[action])).to(device)
            if env[int(newstate[0][0].tolist()),
                   int(newstate[0][1].tolist())] != 0:
                next_state = newstate
                new_eaubue = eaubue
                reward = rewardlist[env[int(newstate[0][0].tolist()),
                                        int(newstate[0][1].tolist())]]
                if env[int(newstate[0][0].tolist()),
                       int(newstate[0][1].tolist())] == 2:
                    done = True
                if env[int(newstate[0][0].tolist()),
                       int(newstate[0][1].tolist()
                           )] == 4:  #if the mouse is in the water
                    env[int(newstate[0][0].tolist()),
                        int(newstate[0][1].tolist()
                            )] = 5  #there is no more water
                    new_eaubue = 1.
            else:
                next_state = state
                reward = rewardlist[0]
                new_eaubue = eaubue

            mask = 0 if done else 1
            action_one_hot = np.zeros(4)
            action_one_hot[action] = 1
            memory.push(
                torch.cat((
                    state,
                    torch.tensor(eaubue).unsqueeze(0).unsqueeze(0).to(device)),
                          1),
                torch.cat((next_state, torch.tensor(new_eaubue).unsqueeze(
                    0).unsqueeze(0).to(device)), 1), action_one_hot, reward,
                mask)

            score += reward
            state = next_state
            eaubue = new_eaubue

            if steps > initial_exploration:
                epsilon -= 0.00005
                epsilon = max(epsilon, 0.1)

                batch = memory.sample(batch_size)
                loss = QNet.train_model(online_net, target_net, optimizer,
                                        batch)

                if steps % update_target == 0:
                    update_target_model(online_net, target_net)

        # print("OK")
        if score > 35:
            print(score)
        running_score = 0.99 * running_score + 0.01 * score
        # running_score=score
        scores.append(running_score)
        best_scores.append(
            score if score > best_scores[-1] else best_scores[-1])
        if e % log_interval == 0:
            print(
                '{} episode | score: {:.2f} | best score: {:.2f} | epsilon: {:.2f}'
                .format(e, running_score, best_score, epsilon))
            # writer.add_scalar('log/score', float(running_score), e)
            # writer.add_scalar('log/loss', float(loss), e)
            if score > best_score:
                best_score = score
            torch.save(online_net.state_dict(), "./qlearning_model")

        if running_score > goal_score:
            break

    return number_episode, scores, best_scores
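
### Hedged sketch: update_target_model is called throughout these examples but
### never shown. A hard update that copies the online weights into the target
### network (as Example #3 does via load_state_dict) is the usual choice.
def update_target_model(online_net, target_net):
    # Overwrite the target network's parameters with the online network's.
    target_net.load_state_dict(online_net.state_dict())
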
Example #7
class Agent():
    def __init__(self, args, state_size, action_size, seed):
        self.state_size = state_size
        self.action_size = action_size
        self.seed = random.seed(seed)
        self.per = args.per
        self.dueling = args.dueling
        self.buffer_size = args.buffer_size
        self.batch_size = args.batch_size
        self.gamma = args.gamma
        self.tau = args.tau
        self.lr = args.learning_rate
        self.update_freq = args.update_every
        # Q-Network
        if self.dueling:
            self.local_qnet = DuelingQNet(state_size, action_size,
                                          seed).to(device)
            self.target_qnet = DuelingQNet(state_size, action_size,
                                           seed).to(device)
        else:
            self.local_qnet = QNet(state_size, action_size, seed).to(device)
            self.target_qnet = QNet(state_size, action_size, seed).to(device)

        self.optimizer = optim.Adam(self.local_qnet.parameters(), lr=self.lr)

        # Replay Memory
        if self.per:
            self.memory = PrioritizedReplayMemory(args, self.buffer_size)
        else:
            self.memory = ReplayMemory(action_size, self.buffer_size,
                                       self.batch_size, seed)
        self.t_step = 0  # init time step for updating every UPDATE_EVERY steps

    def step(self, state, action, reward, next_state, done):
        if self.per:
            self.memory.append(state, action, reward, next_state, done)
        else:
            self.memory.add(state, action, reward, next_state,
                            done)  # save experience to replay memory.
        # Learn every UPDATE_EVERY time steps.
        self.t_step = (self.t_step + 1) % self.update_freq
        if self.t_step == 0:
            # If enough samples are available in memory, get random subset and learn
            if len(self.memory) > self.batch_size:
                if self.dueling:
                    self.learn_DDQN(self.gamma)
                else:
                    self.learn(self.gamma)

    def act(self, state, eps=0.):
        state = torch.from_numpy(state).float().unsqueeze(0).to(device)
        self.local_qnet.eval()
        with torch.no_grad():
            action_values = self.local_qnet(state)
        self.local_qnet.train()

        # Epsilon-greedy action selection
        if random.random() > eps:
            return np.argmax(action_values.cpu().data.numpy())
        else:
            return random.choice(np.arange(self.action_size))

    def learn(self, gamma):
        if self.per:
            idxs, states, actions, rewards, next_states, dones, weights = self.memory.sample(
                self.batch_size)
        else:
            states, actions, rewards, next_states, dones = self.memory.sample()
        # Get max predicted Q values for next states from target model
        Q_targets_next = self.target_qnet(next_states).detach().max(
            1)[0].unsqueeze(1)
        # Compute Q targets for current states
        Q_targets = rewards + (gamma * Q_targets_next * (1 - dones))
        Q_expected = self.local_qnet(states).gather(1, actions)

        # Compute loss - element-wise mean squared error
        # Now loss is a Tensor of shape (1,)
        # loss.item() gets the scalar value held in the loss.
        loss = F.mse_loss(Q_expected, Q_targets)
        # Minimize loss
        self.optimizer.zero_grad()
        if self.per:
            (weights * loss).mean().backward(
            )  # Backpropagate importance-weighted minibatch loss
        else:
            loss.backward()
        self.optimizer.step()

        if self.per:
            errors = np.abs((Q_expected - Q_targets).detach().cpu().numpy())
            self.memory.update_priorities(idxs, errors)
        # Update target network
        self.soft_update(self.local_qnet, self.target_qnet, self.tau)

    def learn_DDQN(self, gamma):
        if self.per:
            idxs, states, actions, rewards, next_states, dones, weights = self.memory.sample(
                self.batch_size)
        else:
            states, actions, rewards, next_states, dones = self.memory.sample()
        # Get index of maximum value for next state from Q_expected
        Q_argmax = self.local_qnet(next_states).detach()
        _, a_prime = Q_argmax.max(1)
        # Get max predicted Q values for next states from target model
        Q_targets_next = self.target_qnet(next_states).detach().gather(
            1, a_prime.unsqueeze(1))
        # Compute Q targets for current states
        Q_targets = rewards + (gamma * Q_targets_next * (1 - dones))
        # Get expected Q values from local model
        Q_expected = self.local_qnet(states).gather(1, actions)

        # Compute loss
        # Now loss is a Tensor of shape (1,)
        # loss.item() gets the scalar value held in the loss.
        loss = F.mse_loss(Q_expected, Q_targets)
        # Minimize loss
        self.optimizer.zero_grad()
        if self.per:
            (weights * loss).mean().backward(
            )  # Backpropagate importance-weighted minibatch loss
        else:
            loss.backward()
        self.optimizer.step()

        if self.per:
            errors = np.abs((Q_expected - Q_targets).detach().cpu().numpy())
            self.memory.update_priorities(idxs, errors)
        # Update target network
        self.soft_update(self.local_qnet, self.target_qnet, self.tau)

    def soft_update(self, local_model, target_model, tau):
        # θ_target = τ*θ_local + (1 - τ)*θ_target
        for target_param, local_param in zip(target_model.parameters(),
                                             local_model.parameters()):
            target_param.data.copy_(tau * local_param.data +
                                    (1.0 - tau) * target_param.data)
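
### Hedged sketch: with importance-sampling weights the TD errors are usually
### kept per-sample (reduction='none') before weighting. A minimal version of
### that pattern, assuming (batch, 1) shaped Q tensors:
import torch
import torch.nn.functional as F

def weighted_td_loss(q_expected, q_targets, weights):
    elementwise = F.mse_loss(q_expected, q_targets, reduction='none')
    return (weights.view(-1, 1) * elementwise).mean()
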
Example #8
    render_map = False

    num_inputs = env.observation_space.shape
    num_actions = len(env.action_names[0])

    print('state size:', num_inputs)
    print('action size:', num_actions)

    model = QNet(num_actions)
    model.apply(weights_init)
    target_model = QNet(num_actions)
    update_target_model(model, target_model)
    model.train()
    target_model.train()

    optimizer = optim.Adam(model.parameters(),
                           lr=hp.lr,
                           weight_decay=hp.l2_rate)

    memory = Memory(100000)
    if render_map:
        root, canvas = init_map()

    steps = 0
    scores = []
    epsilon = 1.0
    for episode in range(hp.num_episodes):
        state = env.reset()
        state = pre_process(state)
        history = np.stack((state, state, state, state), axis=2)
        history = np.reshape([history], (84, 84, 4))
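
### Hedged sketch: pre_process is not shown in this snippet. For 84x84 frame
### stacking it is typically a grayscale + resize step, for example:
import cv2
import numpy as np

def pre_process(frame):
    # frame: raw RGB observation (H, W, 3); return an 84x84 grayscale image in [0, 1].
    gray = cv2.cvtColor(frame, cv2.COLOR_RGB2GRAY)
    resized = cv2.resize(gray, (84, 84), interpolation=cv2.INTER_AREA)
    return resized.astype(np.float32) / 255.0
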
Example #9
def main():

    if not (os.path.isdir("logs")):
        os.makedirs("logs")

    if (args.entropy and args.boltzmann):
        raise ValueError("Entropy as well as Boltzmann set.")

    print(args)

    working_dir = "logs/" + args.dir
    if not (os.path.isdir(working_dir)):
        os.mkdir(working_dir)

    env = QubeSwingupEnv(use_simulator=True)

    num_inputs = env.observation_space.shape[0]
    num_actions = NUMBER_OF_ACTIONS
    print('state size:', num_inputs)
    print('action size:', num_actions)

    online_net = QNet(num_inputs, num_actions)
    target_net = QNet(num_inputs, num_actions)

    update_target_model(online_net, target_net)

    optimizer = optim.Adam(online_net.parameters(), lr=lr)
    writer = SummaryWriter(working_dir)

    online_net.to(device)
    target_net.to(device)
    online_net.train()
    target_net.train()
    memory = Memory_With_TDError(replay_memory_capacity)
    running_score = 0
    epsilon = 1.0
    steps = 0
    beta = beta_start
    loss = 0
    training_started = False

    best_running_score = -1000

    for e in range(args.e):
        done = False

        score = 0
        state = env.reset()
        state = torch.Tensor(state).to(device)
        state = state.unsqueeze(0)
        start_time = time.time()

        while not done:
            steps += 1
            action = get_action(state,
                                target_net,
                                epsilon,
                                use_entropy=args.entropy,
                                use_boltzmann=args.boltzmann)
            next_state, reward, done, info = env.step(
                get_continuous_action(action))

            reward = give_me_reward(info["alpha"], info["theta"])

            next_state = torch.Tensor(next_state).to(device)
            next_state = next_state.unsqueeze(0)

            mask = 0 if done else 1
            action_one_hot = np.zeros(NUMBER_OF_ACTIONS)
            action_one_hot[action] = 1
            memory.push(state, next_state, action_one_hot, reward, mask)

            score += reward
            state = next_state

            if steps > initial_exploration:
                if not training_started:
                    print("---------------- training started ---------------")
                    training_started = True
                epsilon -= 0.000005
                epsilon = max(epsilon, 0.1)
                beta += 0.000005
                beta = min(1, beta)

                batch, weights = memory.sample(batch_size, online_net,
                                               target_net, beta)
                loss = QNet.train_model(online_net, target_net, optimizer,
                                        batch, weights, device)

                if steps % update_target == 0:
                    update_target_model(online_net, target_net)

        end_time = time.time()
        running_score = 0.99 * running_score + 0.01 * score
        if e % log_interval == 0:
            print(
                '{} episode | score: {:.2f} | epsilon: {:.2f} | beta: {:.2f}'.
                format(e, running_score, epsilon, beta))
            writer.add_scalar('log/score', float(running_score), e)
            writer.add_scalar('log/loss', float(loss), e)

        if running_score > best_running_score and args.save:
            torch.save(online_net.state_dict(),
                       working_dir + "/best_model.pth")
            best_running_score = running_score
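
### Hedged sketch: get_action above takes use_entropy / use_boltzmann flags
### that are not defined here. A Boltzmann (softmax) selection over Q-values
### could look like this; the temperature value is an illustrative assumption.
import torch
import torch.nn.functional as F

def boltzmann_action(q_values, temperature=1.0):
    # q_values: tensor of shape (1, num_actions)
    probs = F.softmax(q_values / temperature, dim=-1)
    return torch.multinomial(probs, num_samples=1).item()
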
Example #10
File: train.py  Project: kaznyan/temp
def main():
    env = gym.make(env_name)
    env.seed(500)
    torch.manual_seed(500)

    ### The network's input/output sizes depend on the environment
    num_inputs = env.observation_space.shape[0]
    num_actions = env.action_space.n
    print('state size:', num_inputs)
    print('action size:', num_actions)

    ### Create and initialize the two networks
    online_net = QNet(num_inputs, num_actions)
    target_net = QNet(num_inputs, num_actions)
    update_target_model(online_net, target_net)

    optimizer = optim.Adam(online_net.parameters(), lr=lr)

    ### Network setup: CPU / GPU
    online_net.to(device)
    target_net.to(device)
    ### Network setup: start in training mode
    online_net.train()
    target_net.train()

    ### Initial setup before training
    memory = Memory(replay_memory_capacity)
    running_score = 0
    epsilon = 1.0
    steps = 0
    loss = 0
    steps_before = 0

    for e in range(3000):
        done = False

        score = 0
        state = env.reset()
        state = torch.Tensor(state).to(device)
        state = state.unsqueeze(0)

        while not done:
            steps += 1

            ### Action selection is done with target_net
            action = get_action(state, target_net, epsilon, env)

            ### Observe the next state and receive the reward
            next_state, reward, done, _ = env.step(action)
            next_state = torch.Tensor(next_state)
            next_state = next_state.unsqueeze(0)
            if e % 10 == 0:
                print(next_state, action, reward)

            ### Rewritten from the original one-liner for readability
            if done:
                mask = 0
            else:
                mask = 1

            ### Record in memory
            action_one_hot = np.zeros(num_actions)
            action_one_hot[action] = 1
            memory.push(state, next_state, action_one_hot, reward, mask)

            ### reward is basically -1
            score += reward  ### only used to record how many steps this episode lasted

            state = next_state

            if steps > initial_exploration:
                epsilon -= 0.00005
                epsilon = max(epsilon, 0.1)

                ### Train online_net
                batch = memory.sample(batch_size)
                loss = QNet.train_model(online_net, target_net, optimizer,
                                        batch)

                ### Occasionally overwrite target_net with online_net
                if steps % update_target == 0:
                    update_target_model(online_net, target_net)

        print("Ep {0:04d}: {1} step".format(e, steps - steps_before))
        steps_before = steps

        score = score if score == 200.0 else score + 1
        running_score = 0.99 * running_score + 0.01 * score
        if e % log_interval == 0:
            print('{} episode | score: {:.2f} | epsilon: {:.2f}'.format(
                e, running_score, epsilon))

        if running_score > goal_score:
            break
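
### Hedged sketch: get_action(state, target_net, epsilon, env) is not defined
### in this snippet. An epsilon-greedy version consistent with how it is called
### here might be (assuming the QNet returns a (1, num_actions) tensor):
import random
import torch

def get_action(state, target_net, epsilon, env):
    if random.random() <= epsilon:
        return env.action_space.sample()            # explore
    with torch.no_grad():
        q_values = target_net(state)                # exploit: greedy w.r.t. Q
    return int(q_values.argmax(dim=-1).item())
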
Example #11
def main():
    env = gym.make(env_name)
    env.seed(500)
    torch.manual_seed(500)

    ### The network's input/output sizes depend on the environment
    # num_inputs = env.observation_space.shape[0]
    num_inputs = 1024
    num_actions = env.action_space.n
    print('state size:', num_inputs)
    print('action size:', num_actions)

    ### Create and initialize the two networks
    online_net = QNet(num_inputs, num_actions)
    target_net = QNet(num_inputs, num_actions)
    update_target_model(online_net, target_net)

    optimizer = optim.Adam(online_net.parameters(), lr=lr)

    ### Network setup: CPU / GPU
    online_net.to(device)
    target_net.to(device)
    ### Network setup: start in training mode
    online_net.train()
    target_net.train()

    ### Pretrained model used for feature extraction
    # pre_model = models.resnet50(pretrained=True)
    # pre_model.fc = nn.Identity()
    pre_model = models.squeezenet1_0(pretrained=True)
    pre_model.classifier = nn.AdaptiveAvgPool2d((1, 1))
    pre_model.to(device)

    def state_to_feature(state):
        state_img = render_cv2img(state[0], state[2])
        state_img = cv2.resize(state_img, (224, 224))[:, :, 0]
        state_img = state_img.reshape((1, 224, 224))
        state_img_rgb = np.zeros((1, 3, 224, 224))
        state_img_rgb[:, 0] = state_img
        state_img_rgb[:, 1] = state_img
        state_img_rgb[:, 2] = state_img
        state_img_rgb_tensor = torch.Tensor(state_img_rgb).to(device)

        state_feature = pre_model(state_img_rgb_tensor)
        return state_feature

    ### Where the memory is saved (work in progress)
    memory_dir = "memory/"
    memory = Memory(replay_memory_capacity, memory_dir)

    ### Initial setup before training
    running_score = 0
    epsilon = 1.0
    steps = 0
    loss = 0
    steps_before = 0

    for e in range(3000):
        done = False

        score = 0

        ### state = [position, velocity, angle, angular velocity]
        state = env.reset()  ### e.g. [-0.01517264  0.02423424  0.02480018 -0.04009749]
        ### state = [[2048-dimensional feature vector]]
        state = state_to_feature(state)

        ### Included because having no information from the previous timestep makes this hard; starting it equal to state seems fine
        previous_state = state

        while not done:
            steps += 1

            ### Action selection is done with target_net
            previous_present_state = torch.cat((previous_state, state), 1)
            action = get_action(previous_present_state, target_net, epsilon,
                                env)

            ### Observe the next state and receive the reward
            next_state, reward, done, _ = env.step(action)
            next_state = state_to_feature(next_state)
            present_next_state = torch.cat((state, next_state), 1)

            ### Rewritten from the original one-liner for readability
            if done:
                mask = 0
            else:
                mask = 1
            if (done and (score != 499)):  ### done before reaching step 499
                reward = -1
            else:
                pass  ### reward is basically 1

            ### Record in memory
            action_one_hot = np.zeros(2)
            action_one_hot[action] = 1
            memory.push(previous_present_state, present_next_state,
                        action_one_hot, reward, mask)

            ### reward is basically 1
            score += reward  ### only used to record how many steps this episode lasted

            if steps > initial_exploration:
                epsilon -= 0.00005
                epsilon = max(epsilon, 0.1)

                ### Train online_net
                batch = memory.sample(batch_size)
                loss = QNet.train_model(online_net, target_net, optimizer,
                                        batch)

                ### Occasionally overwrite target_net with online_net
                if steps % update_target == 0:
                    update_target_model(online_net, target_net)

            ### Move to the next step
            previous_state = state
            state = next_state

        print("Ep {0:04d}: {1} step".format(e, steps - steps_before))
        steps_before = steps

        score = score if score == 500.0 else score + 1
        running_score = 0.99 * running_score + 0.01 * score
        if e % log_interval == 0:
            print('{} episode | score: {:.2f} | epsilon: {:.2f}'.format(
                e, running_score, epsilon))

        if running_score > goal_score:
            break
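
### Hedged sketch: QNet.train_model(online_net, target_net, optimizer, batch)
### is not shown. Assuming the batch exposes the same fields that memory.push
### receives above (state, next_state, one-hot action, scalar reward, mask), a
### standard DQN update would look roughly like this illustrative helper:
import numpy as np
import torch
import torch.nn.functional as F

def dqn_train_step(online_net, target_net, optimizer, batch, gamma=0.99):
    states = torch.cat(list(batch.state))
    next_states = torch.cat(list(batch.next_state))
    actions = torch.Tensor(np.array(list(batch.action)))   # one-hot, (B, num_actions)
    rewards = torch.Tensor(list(batch.reward))
    masks = torch.Tensor(list(batch.mask))

    pred = (online_net(states) * actions).sum(dim=1)        # Q(s, a) of the taken action
    with torch.no_grad():
        next_q = target_net(next_states).max(dim=1)[0]      # max_a' Q_target(s', a')
        target = rewards + masks * gamma * next_q           # mask stops bootstrap at episode end

    loss = F.mse_loss(pred, target)
    optimizer.zero_grad()
    loss.backward()
    optimizer.step()
    return loss.item()
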
Example #12
    train_loader = DataLoader(train_data, shuffle=True,  batch_size=args.batch, num_workers=8, pin_memory=True)
    #create the loader for the validation set
    val_data = HdrVdpDataset(val_data, args.data, args.group, bPrecompGroup = args.groupprecomp)
    val_loader = DataLoader(val_data, shuffle=False, batch_size=args.batch, num_workers=8, pin_memory=True)
    #create the loader for the testing set
    test_data = HdrVdpDataset(test_data, args.data, args.group, bPrecompGroup = args.groupprecomp)
    test_loader = DataLoader(test_data, shuffle=False, batch_size=args.batch, num_workers=8, pin_memory=True)

    #create the model
    if(torch.cuda.is_available()):
        model = QNet().cuda()
    else:
        model = QNet()

    #create the optimizer
    optimizer = Adam(model.parameters(), lr=args.lr)
    scheduler = ReduceLROnPlateau(optimizer, patience=15, factor=0.5, verbose=True)

    log = pd.DataFrame()
    
    #training loop
    best_mse = None
    a_t = []
    a_v = []
    a_te = []
    
    start_epoch = 1
    if args.resume:
       ckpt_dir_r = os.path.join(args.resume, 'ckpt')
       ckpts = glob2.glob(os.path.join(ckpt_dir_r, '*.pth'))
       assert ckpts, "No checkpoints to resume from!"
Example #13

# Build environment
env = make_atari('PongNoFrameskip-v4', stack=2)
env = wrap_pytorch(env)

number_actions = env.action_space.n
replay_buffer = ReplayBuffer(replay_memory_size)

# Separate target net & policy net
input_shape = env.reset().shape
current_net = QNet(input_shape, number_actions).to(device)
target_net = QNet(input_shape, number_actions).to(device)  # with older weights
target_net.load_state_dict(current_net.state_dict())
target_net.eval()
optimizer = opt_algorithm(current_net.parameters(), lr=learning_rate)

n_episode = 1
episode_return = 0
best_return = 0
returns = []
state = env.reset()
for i in count():
    # env.render()
    eps = get_epsilon(i)
    action = select_action(state,
                           current_net,
                           eps,
                           number_action=number_actions)
    next_state, reward, done, _ = env.step(action)
    replay_buffer.push(state, action, reward, next_state, done)
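
### Hedged sketch: get_epsilon(i) is not defined in this snippet. An
### exponentially decaying schedule is a common choice; the constants here are
### illustrative assumptions.
import math

def get_epsilon(frame_idx, eps_start=1.0, eps_final=0.01, eps_decay=30000):
    return eps_final + (eps_start - eps_final) * math.exp(-frame_idx / eps_decay)
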
Example #14
File: ppo.py  Project: eladsar/contact
class PPO(Algorithm):
    def __init__(self, *largs, **kwargs):
        super(PPO, self).__init__(*largs, **kwargs)

        self.pi_net = PiNet(self.ns,
                            self.na,
                            distribution='Normal',
                            bounded=False,
                            agent='ppo').to(self.device)
        self.v_net = QNet(self.ns, 0, agent='ppo').to(self.device)

        self.optimizer_v = torch.optim.Adam(self.v_net.parameters(),
                                            lr=self.lr_q,
                                            betas=(0.9, 0.999),
                                            weight_decay=self.weight_decay_q)

        self.optimizer_p = torch.optim.Adam(self.pi_net.parameters(),
                                            lr=self.lr_p,
                                            betas=(0.9, 0.999),
                                            weight_decay=self.weight_decay_p)

    def play(self, env, evaluate=False):

        with torch.no_grad():
            a = self.pi_net(env.s, evaluate=evaluate)

        if not (self.env_steps >= self.warmup_steps or evaluate):
            a = None

        state = env(a)
        state['logp'] = self.pi_net.log_prob(state['a']).detach()

        return state

    def episodic_training(self, train_results, tail):

        episode = self.replay_buffer.get_tail(tail)

        sl = episode['s']
        sl = list(torch.chunk(sl, int((len(sl) / self.batch) + 1)))

        s, r, t, e = [episode[k] for k in ['s', 'r', 't', 'e']]

        v = []
        for s in sl:
            v.append(self.v_net(s))

        v.append(torch.zeros_like(v[0][:1]))
        v = torch.cat(v).detach()
        v1, v2 = v[:-1], v[1:]

        adv, v_target = generalized_advantage_estimation(
            r,
            t,
            e,
            v1,
            v2,
            self.gamma,
            self.lambda_gae,
            norm=self.norm_rewards)

        episode['adv'] = adv
        episode['v_target'] = v_target

        if self.batch_ppo:
            n = self.steps_per_episode * self.batch
            indices = torch.randperm(tail * max(1, n // tail + 1)) % tail
            indices = indices[:n].unsqueeze(1).view(self.steps_per_episode,
                                                    self.batch)

            samples = {k: v[indices] for k, v in episode.items()}
            iterator_pi = iter_dict(samples)
            iterator_v = iter_dict(samples)
        else:
            iterator_pi = itertools.repeat(episode, self.steps_per_episode)
            iterator_v = itertools.repeat(episode, self.steps_per_episode)

        for i, sample in enumerate(iterator_pi):
            s, a, r, t, stag, adv, v_target, log_pi_old = [
                sample[k] for k in
                ['s', 'a', 'r', 't', 'stag', 'adv', 'v_target', 'logp']
            ]
            self.pi_net(s)
            log_pi = self.pi_net.log_prob(a)
            ratio = torch.exp((log_pi - log_pi_old).sum(dim=1))

            clip_adv = torch.clamp(ratio, 1 - self.eps_ppo,
                                   1 + self.eps_ppo) * adv
            loss_p = -(torch.min(ratio * adv, clip_adv)).mean()

            approx_kl = -float((log_pi - log_pi_old).sum(dim=1).mean())
            ent = float(self.pi_net.entropy().sum(dim=1).mean())

            if approx_kl > self.target_kl:
                train_results['scalar']['pi_opt_rounds'].append(i)
                break

            clipped = ratio.gt(1 + self.eps_ppo) | ratio.lt(1 - self.eps_ppo)
            clipfrac = float(
                torch.as_tensor(clipped, dtype=torch.float32).mean())

            self.optimizer_p.zero_grad()
            loss_p.backward()
            if self.clip_p:
                nn.utils.clip_grad_norm_(self.pi_net.parameters(), self.clip_p)
            self.optimizer_p.step()

            train_results['scalar']['loss_p'].append(float(loss_p))
            train_results['scalar']['approx_kl'].append(approx_kl)
            train_results['scalar']['ent'].append(ent)
            train_results['scalar']['clipfrac'].append(clipfrac)

        for sample in iterator_v:
            s, a, r, t, stag, adv, v_target, log_pi_old = [
                sample[k] for k in
                ['s', 'a', 'r', 't', 'stag', 'adv', 'v_target', 'logp']
            ]

            v = self.v_net(s)
            loss_v = F.mse_loss(v, v_target, reduction='mean')

            self.optimizer_v.zero_grad()
            loss_v.backward()
            if self.clip_q:
                nn.utils.clip_grad_norm_(self.v_net.parameters(), self.clip_q)
            self.optimizer_v.step()

            train_results['scalar']['loss_v'].append(float(loss_v))

        return train_results
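
### Hedged sketch: generalized_advantage_estimation is called above but not
### shown. The standard GAE(lambda) recursion over a trajectory is sketched
### below; the exact meaning of t and e in the project may differ (here t marks
### terminal states and e marks rollout boundaries), and 1-D tensors are assumed.
import torch

def generalized_advantage_estimation(r, t, e, v1, v2, gamma, lam, norm=False):
    # r: rewards; v1: V(s_t); v2: V(s_{t+1}); all 1-D tensors of equal length.
    adv = torch.zeros_like(r)
    gae = torch.zeros(1)
    for i in reversed(range(len(r))):
        cont = (1.0 - t[i]) * (1.0 - e[i])      # stop carrying GAE across boundaries
        delta = r[i] + gamma * v2[i] * (1.0 - t[i]) - v1[i]
        gae = delta + gamma * lam * cont * gae
        adv[i] = gae
    v_target = adv + v1
    if norm:
        adv = (adv - adv.mean()) / (adv.std() + 1e-8)
    return adv, v_target
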
Example #15
def main():
    ### Initialize the environment
    env = gym.make(env_name)
    env.seed(500)
    torch.manual_seed(500)

    num_inputs = env.observation_space.shape[0]
    num_actions = env.action_space.n
    print('state size:', num_inputs)
    print('action size:', num_actions)

    ### Build the policy network
    net = QNet(num_inputs, num_actions)
    optimizer = optim.Adam(net.parameters(), lr=lr)

    net.to(device)
    net.train()

    ### Miscellaneous initialization
    running_score = 0
    steps = 0
    loss = 0
    steps_before = 0

    for e in range(10000):
        done = False
        ### The Memory is cleared every episode (so there is effectively no experience replay)
        memory = Memory()

        ### Reset the environment to its initial state
        score = 0
        state = env.reset()
        state = torch.Tensor(state).to(device)
        state = state.unsqueeze(0)

        while not done:
            steps += 1

            ### No epsilon: the action values are converted directly into probabilities to pick an action
            action = net.get_action(state)
            next_state, reward, done, _ = env.step(action)

            next_state = torch.Tensor(next_state)
            next_state = next_state.unsqueeze(0)

            mask = 0 if done else 1
            reward = reward if not done or score == 499 else -1

            action_one_hot = torch.zeros(num_actions)
            action_one_hot[action] = 1
            memory.push(state, next_state, action_one_hot, reward, mask)

            score += reward
            state = next_state

        ### Train on the whole episode at once
        ### memory.sample does not pick at random; it returns the episode's entire memory
        loss = QNet.train_model(net, optimizer, memory.sample())

        print("Ep {0:04d}: {1} step".format(e, steps - steps_before))
        steps_before = steps

        score = score if score == 500.0 else score + 1
        running_score = 0.99 * running_score + 0.01 * score
        if e % log_interval == 0:
            print('{} episode | score: {:.2f}'.format(e, running_score))

        if running_score > goal_score:
            break
Example #16
class Agent():
    """Agent definition for interacting with environment"""
    def __init__(self, state_size, action_size, seed):
        """
        Params
        ======
            state_size (int): state dimension
            action_size (int): action dimension
            seed (int): random seed for replicating experiment
        """
        self.state_size = state_size
        self.action_size = action_size
        self.seed = random.seed(seed)

        self.QNet_local = QNet(state_size, action_size, seed).to(device)
        self.QNet_target = QNet(state_size, action_size, seed).to(device)
        self.optimizer = optim.Adam(self.QNet_local.parameters(), lr=LR)

        self.memory = ReplayBuffer(action_size, BUFFER_SIZE, BATCH_SIZE, seed)
        self.t_step = 0

    def step(self, state, action, reward, next_state, done):
        # Add current experience to replay memory
        self.memory.add(state, action, reward, next_state, done)

        self.t_step = (self.t_step + 1) % UPDATE_EVERY
        if self.t_step == 0:
            if len(self.memory) > BATCH_SIZE:
                experiences = self.memory.sample()
                self.learn(experiences, GAMMA)

    def act(self, state, eps=0.):
        """Get favored action

        Params
        ======
            state (array_like): current state
            eps (float): epsilon, for epsilon-greedy action selection
        """
        state = torch.from_numpy(state).float().unsqueeze(0).to(device)
        self.QNet_local.eval()
        with torch.no_grad():
            action_values = self.QNet_local(state)
        self.QNet_local.train()

        # Epsilon-greedy action selection
        if random.random() > eps:
            return np.argmax(action_values.cpu().data.numpy())
        else:
            return random.choice(np.arange(self.action_size))

    def learn(self, experiences, gamma):
        """Perform learning on experiences

        Params
        ======
            experiences (Tuple[torch.Tensor]): tuple of (s, a, r, s', done) tuples
            gamma (float): discount factor
        """
        states, actions, rewards, next_states, dones = experiences

        Q_targets_next = self.QNet_target(next_states).detach().max(
            1)[0].unsqueeze(1)
        Q_targets = rewards + (gamma * Q_targets_next * (1 - dones))

        Q_expected = self.QNet_local(states).gather(1, actions)

        loss = F.mse_loss(Q_expected, Q_targets)
        self.optimizer.zero_grad()
        loss.backward()
        self.optimizer.step()

        self.soft_update(self.QNet_local, self.QNet_target, TAU)

    def soft_update(self, local_model, target_model, tau):
        """ θ_target = τ*θ_local + (1 - τ)*θ_target

        Params
        ======
            local_model (PyTorch model): model to copy weights from
            target_model (PyTorch model): copy to
            tau (float): interpolation parameter
        """
        for target_param, local_param in zip(target_model.parameters(),
                                             local_model.parameters()):
            target_param.data.copy_(tau * local_param.data +
                                    (1.0 - tau) * target_param.data)
def main():
    env = gym.make(args.env_name)
    env.seed(500)
    torch.manual_seed(500)

    num_inputs = env.observation_space.shape[0]
    num_actions = env.action_space.n
    print('state size:', num_inputs)
    print('action size:', num_actions)

    net = QNet(num_inputs, num_actions)
    target_net = QNet(num_inputs, num_actions)
    update_target_model(net, target_net)

    optimizer = optim.Adam(net.parameters(), lr=0.001)
    writer = SummaryWriter('logs')

    if not os.path.isdir(args.save_path):
        os.makedirs(args.save_path)
    
    net.to(device)
    target_net.to(device)
    net.train()
    target_net.train()
    memory = Memory(10000)
    running_score = 0
    epsilon = 1.0
    steps = 0
    
    for e in range(3000):
        done = False
        
        score = 0
        state = env.reset()
        state = torch.Tensor(state).to(device)
        state = state.unsqueeze(0)

        while not done:
            if args.render:
                env.render()

            steps += 1
            qvalue = net(state)
            action = get_action(epsilon, qvalue, num_actions)
            next_state, reward, done, _ = env.step(action)
            
            next_state = torch.Tensor(next_state)
            next_state = next_state.unsqueeze(0)
            
            mask = 0 if done else 1
            reward = reward if not done or score == 499 else -1
            memory.push(state, next_state, action, reward, mask)

            score += reward
            state = next_state

            if steps > args.initial_exploration:
                epsilon -= 0.00005
                epsilon = max(epsilon, 0.1)

                batch = memory.sample(args.batch_size)
                train_model(net, target_net, optimizer, batch, args.batch_size)

                if steps % args.update_target == 0:
                    update_target_model(net, target_net)

        score = score if score == 500.0 else score + 1
        running_score = 0.99 * running_score + 0.01 * score
        if e % args.log_interval == 0:
            print('{} episode | score: {:.2f} | epsilon: {:.2f}'.format(
                e, running_score, epsilon))
            writer.add_scalar('log/score', float(running_score), e)

        if running_score > args.goal_score:
            ckpt_path = args.save_path + 'model.pth'
            torch.save(net.state_dict(), ckpt_path)
            print('running score exceeds the goal score, stopping')
            break   
Example #18
File: train.py  Project: kaznyan/temp
def main():
    env = gym.make(env_name)
    env.seed(500)
    torch.manual_seed(500)

    num_inputs = env.observation_space.shape[0]
    num_actions = env.action_space.n
    print('state size:', num_inputs)
    print('action size:', num_actions)

    ### Build the policy network
    ### Given an input it outputs pi(a|s) and Q(s, a)
    ### Both heads have the same dimensions and number of units
    net = QNet(num_inputs, num_actions)
    optimizer = optim.Adam(net.parameters(), lr=lr)

    net.to(device)
    net.train()

    ### Miscellaneous initialization
    running_score = 0
    steps = 0
    loss = 0
    steps_before = 0

    df = pd.DataFrame(index=range(10000), columns=["steps", "loss_policy", "loss_value"])

    for e in range(10000):
        done = False
        ### Learn one step at a time, without keeping even one episode's worth of memory

        ### Reset the environment to its initial state
        score = 0
        state = env.reset()
        state = torch.Tensor(state).to(device)
        state = state.unsqueeze(0)

        lp = []
        lv = []
        while not done:
            steps += 1

            ### No epsilon: the action values are converted directly into probabilities to pick an action
            action = net.get_action(state)
            next_state, reward, done, _ = env.step(action)

            next_state = torch.Tensor(next_state)
            next_state = next_state.unsqueeze(0)

            mask = 0 if done else 1
            reward = reward if not done or score == 499 else -1
            transition = [state, next_state, action, reward, mask]

            score += reward
            state = next_state

            ### At every step, train only on that step's result
            loss, loss_policy, loss_value = QNet.train_model(net, optimizer, transition)
            # loss = QNet.train_model(net, optimizer, transition)
            lp.append(loss_policy.item())
            lv.append(loss_value.item())

        lp = np.asarray(lp[:-1]).sum() / (len(lp) - 1)
        lv = np.asarray(lv[:-1]).sum() / (len(lv) - 1)
        print("Ep {0:04d}: {1} step, loss_policy: {2}, loss_value: {3}".format(e, steps - steps_before, lp, lv))
        # print("Ep {0:04d}: {1} step".format(e, steps - steps_before))
        df.loc[e, "steps"]       = steps - steps_before
        df.loc[e, "loss_policy"] = lp
        df.loc[e, "loss_value"]  = lv
        steps_before = steps

        score = score if score == 500.0 else score + 1
        running_score = 0.99 * running_score + 0.01 * score
        if e % log_interval == 0:
            print('{} episode | score: {:.2f}'.format(e, running_score))

        if running_score > goal_score:
            break
    df.to_csv("loss.csv")
Example #19
class Learner:
    def __init__(self, n_actors, shared_dict, device='cuda:0'):
        # params
        self.gamma = 0.99
        self.alpha = 0.6
        self.bootstrap_steps = 3
        self.initial_exploration = 50000
        self.priority_epsilon = 1e-6
        self.device = device
        self.n_epochs = 0
        self.n_actors = n_actors

        # path
        self.memory_path = os.path.join('./', 'logs', 'memory')

        # memory
        self.burn_in_length = 10
        self.learning_length = 10
        self.sequence_length = self.burn_in_length + self.learning_length
        self.memory_size = 500000
        self.batch_size = 8
        self.memory_load_interval = 20
        self.replay_memory = ReplayMemory(self.memory_size, self.batch_size,
                                          self.bootstrap_steps)

        # net
        self.shared_dict = shared_dict
        self.net_save_interval = 100
        self.target_update_interval = 1000
        self.net = QNet(self.device).to(self.device)
        self.target_net = QNet(self.device).to(self.device)
        self.target_net.load_state_dict(self.net.state_dict())
        self.save_model()
        self.optim = optim.RMSprop(self.net.parameters(),
                                   lr=0.00025 / 4.0,
                                   alpha=0.95,
                                   eps=1.5e-7,
                                   centered=True)

    def run(self):
        while True:
            if self.replay_memory.size > self.initial_exploration:
                self.train()
                if self.n_epochs % 100 == 0:
                    print('trained', self.n_epochs, 'epochs')
            self.interval()

    def train(self):
        batch, seq_index, index = self.replay_memory.sample(self.device)

        self.net.set_state(batch['hs'], batch['cs'])
        self.target_net.set_state(batch['target_hs'], batch['target_cs'])

        ### burn-in step ###
        state = batch['state'][:self.burn_in_length]
        next_state = batch['next_state'][:self.burn_in_length]
        with torch.no_grad():
            _ = self.net(state)
            _ = self.target_net(next_state)

        ### learning step ###
        state = batch['state'][self.burn_in_length:]
        next_state = batch['next_state'][self.burn_in_length:]

        # q_value
        q_value = self.net(state).gather(1, batch['action'].view(-1, 1))

        # target q_value
        with torch.no_grad():
            next_action = torch.argmax(self.net(next_state), 1).view(-1, 1)
            next_q_value = self.target_net(next_state).gather(1, next_action)
            target_q_value = batch["reward"].view(
                -1, 1) + (self.gamma**self.bootstrap_steps) * next_q_value * (
                    1 - batch['done'].view(-1, 1))

        # update
        self.optim.zero_grad()
        loss = torch.mean(0.5 * (q_value - target_q_value)**2)
        loss.backward()
        self.optim.step()

        priority = (np.abs(
            (q_value - target_q_value).detach().cpu().numpy()).reshape(-1) +
                    self.priority_epsilon)**self.alpha
        self.replay_memory.update_priority(
            index[self.burn_in_length:].reshape(-1), priority)
        self.replay_memory.update_sequence_priority(seq_index, True)

    def interval(self):
        self.n_epochs += 1
        if self.n_epochs % self.target_update_interval == 0:
            self.target_net.load_state_dict(self.net.state_dict())
        if self.n_epochs % self.net_save_interval == 0:
            self.save_model()
        if self.n_epochs % self.memory_load_interval == 0:
            for i in range(self.n_actors):
                self.replay_memory.load(self.memory_path, i)

    def save_model(self):
        self.shared_dict['net_state'] = deepcopy(self.net).cpu().state_dict()
        self.shared_dict['target_net_state'] = deepcopy(
            self.target_net).cpu().state_dict()
Example #20
def main():
    # cartpole test
    if (cartpole_test):
        envs_fun = [lambda: gym.make('CartPole-v0')]
        envs_fun = np.tile(envs_fun, 3)
        envs = ShmemVecEnv(envs_fun)
        dummy_env = envs_fun[0]()
    else:
        INPUT_FILE = '../data/05f2a901.json'
        with open(INPUT_FILE, 'r') as f:
            puzzle = json.load(f)

        envs_fun = [
            lambda: gym.make('arc-v0',
                             input=task['input'],
                             output=task['output'],
                             need_ui=need_ui) for task in puzzle['train']
        ]
        #pdb.set_trace()
        envs_fun = envs_fun[0:1]
        envs = ShmemVecEnv(envs_fun)
        dummy_env = envs_fun[0]()

    env_num = len(envs_fun)
    torch.manual_seed(500)

    num_inputs = dummy_env.observation_space.shape[0]
    num_actions = dummy_env.action_space.n
    print('state size:', num_inputs)
    print('action size:', num_actions)

    online_net = QNet(num_inputs, num_actions, cartpole_test, evalution_mode)
    target_net = QNet(num_inputs, num_actions, cartpole_test, evalution_mode)

    if (evalution_mode):
        online_net = torch.load('../result/arc0.model')
        target_net = torch.load('../result/arc0.model')

    update_target_model(online_net, target_net)

    optimizer = optim.Adam(online_net.parameters(), lr=lr)
    writer = SummaryWriter('logs')

    online_net.to(device)
    target_net.to(device)
    online_net.train()
    target_net.train()
    memory = Memory(replay_memory_capacity)

    score = 0
    epsilon = 1.0
    steps = 0
    loss = 0

    states = envs.reset()

    try:
        while True:
            if (need_ui):
                envs.render()
            steps += 1

            global initial_exploration
            if (initial_exploration > 0):
                initial_exploration -= 1

            actions = []

            for state in states:
                state = torch.Tensor(state).to(device)
                state = state.unsqueeze(0)
                action = get_action(state, target_net,
                                    0 if evalution_mode else epsilon,
                                    dummy_env)
                if (evalution_mode):
                    print(action)
                actions.append(action)

            next_states, rewards, dones, info = envs.step(actions)
            #print(rewards)

            masks = np.zeros(envs.num_envs)
            for i in range(envs.num_envs):
                masks[i] = 0 if dones[i] else 1

            for i in range(envs.num_envs):
                #print(rewards[i])
                action_one_hot = np.zeros(dummy_env.action_space.n)
                action_one_hot[actions[i]] = 1
                memory.push(states[i], next_states[i], action_one_hot,
                            rewards[i], masks[i])

            #score += reward
            states = next_states

            if not evalution_mode and steps > initial_exploration:
                epsilon -= 0.00003
                epsilon = max(epsilon, 0.1)

                batch = memory.sample(batch_size)
                loss = QNet.train_model(online_net, target_net, optimizer,
                                        batch, device)

                if steps % update_target == 0:
                    update_target_model(online_net, target_net)

            if (steps > 1028):
                states = envs.reset()
                steps = 0
                print(
                    'new episode ------------------------------------------')

    except KeyboardInterrupt:
        print('save model')
        torch.save(target_net, '../result/arc.model')
        sys.exit(0)
def main():
    env = gym.make(args.env_name)
    env.seed(500)
    torch.manual_seed(500)

    img_shape = env.observation_space.shape
    num_actions = 3
    print('image size:', img_shape)
    print('action size:', num_actions)

    net = QNet(num_actions)
    target_net = QNet(num_actions)
    update_target_model(net, target_net)

    optimizer = optim.RMSprop(net.parameters(), lr=0.00025, eps=0.01)
    writer = SummaryWriter('logs')

    if not os.path.isdir(args.save_path):
        os.makedirs(args.save_path)

    net.to(device)
    target_net.to(device)
    net.train()
    target_net.train()
    memory = Memory(100000)
    running_score = 0
    epsilon = 1.0
    steps = 0

    for e in range(10000):
        done = False
        dead = False

        score = 0
        avg_loss = []
        start_life = 5
        state = env.reset()

        state = pre_process(state)
        state = torch.Tensor(state).to(device)
        history = torch.stack((state, state, state, state))

        for i in range(3):
            action = env.action_space.sample()
            state, reward, done, info = env.step(action)
            state = pre_process(state)
            state = torch.Tensor(state).to(device)
            state = state.unsqueeze(0)
            history = torch.cat((state, history[:-1]), dim=0)

        while not done:
            if args.render:
                env.render()

            steps += 1
            qvalue = net(history.unsqueeze(0))
            action = get_action(epsilon, qvalue, num_actions)

            next_state, reward, done, info = env.step(action + 1)

            next_state = pre_process(next_state)
            next_state = torch.Tensor(next_state).to(device)
            next_state = next_state.unsqueeze(0)
            next_history = torch.cat((next_state, history[:-1]), dim=0)

            if start_life > info['ale.lives']:
                dead = True
                start_life = info['ale.lives']

            score += reward
            reward = np.clip(reward, -1, 1)

            mask = 0 if dead else 1
            memory.push(history.cpu(), next_history.cpu(), action, reward,
                        mask)

            if dead:
                dead = False

            if steps > args.initial_exploration:
                epsilon -= 1e-6
                epsilon = max(epsilon, 0.1)

                batch = memory.sample(args.batch_size)
                loss = train_model(net, target_net, optimizer, batch)

                if steps % args.update_target == 0:
                    update_target_model(net, target_net)
            else:
                loss = 0

            avg_loss.append(loss)
            history = next_history

        if e % args.log_interval == 0:
            print(
                '{} episode | score: {:.2f} | epsilon: {:.4f} | steps: {} | loss: {:.4f}'
                .format(e, score, epsilon, steps, np.mean(avg_loss)))
            writer.add_scalar('log/score', float(score), steps)
            writer.add_scalar('log/loss', np.mean(avg_loss), steps)

        if score > args.goal_score:
            ckpt_path = args.save_path + 'model.pth'
            torch.save(net.state_dict(), ckpt_path)
            print('score exceeds the goal score, so training ends')
            break
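
get_action(epsilon, qvalue, num_actions) is used above but defined elsewhere; the call site suggests a standard epsilon-greedy rule over the predicted Q-values. A sketch, assuming that behaviour:

import random

import torch


def get_action(epsilon, qvalue, num_actions):
    # Epsilon-greedy: explore with probability epsilon, otherwise act greedily
    # on the (1, num_actions) Q-value tensor produced by the network.
    if random.random() <= epsilon:
        return random.randrange(num_actions)
    return int(torch.argmax(qvalue, dim=1).item())
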
Example #22
class QTDAgent(object):
    def __init__(self,
                 state_dim,
                 action_dim,
                 learning_rate=0.001,
                 reward_decay=0.99,
                 e_greedy=0.9):
        self.action_dim = action_dim
        self.state_dim = state_dim
        self.lr = learning_rate
        self.gamma = reward_decay  # discount factor, matching the gamma in the Q-learning formulation.
        self.epsilon = e_greedy
        self.EPS_START = 0.9
        self.EPS_END = 0.05
        self.EPS_DECAY = 30000  # This decay is too slow. TODO: work out the relationship between the decay rate and the total number of steps
        # and use it to pick a better schedule.
        use_cuda = torch.cuda.is_available()
        self.LongTensor = torch.cuda.LongTensor if use_cuda else torch.LongTensor
        self.FloatTensor = torch.cuda.FloatTensor if use_cuda else torch.FloatTensor
        self.model = QNet(self.state_dim,
                          self.action_dim).cuda() if use_cuda else QNet(
                              self.state_dim, self.action_dim)

        self.optimizer = optim.Adam(self.model.parameters(), lr=self.lr)
        # self.scheduler = optim.StepLR(self.optimizer, step_size=10000, gamma=0.5) # the learning rate decrease by a factor gamma every 10000 step_size.

        util.weights_init(self.model)

    def sbc(self, v, volatile=False):
        return Variable(self.FloatTensor((np.expand_dims(v, 0).tolist())),
                        volatile=volatile)

    def get_actions(self, state):
        action = self.model(self.sbc(state, volatile=True))
        return action

    def select_action(self, state, steps_done):
        util.adjust_learning_rate(self.optimizer,
                                  self.lr,
                                  steps_done,
                                  10000,
                                  lr_decay=0.2)  # global steps_done
        sample = random.random()
        eps_threshold = self.EPS_END + (self.EPS_START - self.EPS_END) * \
                np.exp(-1. * steps_done / self.EPS_DECAY)

        if sample > eps_threshold:
            actions = self.get_actions(state)
            action = actions.data.max(1)[1].view(1, 1)
            return action
        else:
            return self.LongTensor([[random.randrange(self.action_dim)]])

    def update(self, pending):  #	def update(self, s, a, r, s_, a_,done=False):
        pending_len = len(pending)
        loss = 0
        while (pending_len):
            pending_len = pending_len - 1
            [s, a, r, s_, a_, done] = pending[pending_len]
            if (done == True):
                expect_state_action_value = r
            else:
                non_final_next_states = self.model(self.sbc(s_, volatile=True))
                expect_state_action_value = r + self.gamma * non_final_next_states.max(
                    1)[0]
                expect_state_action_value.volatile = False
            # expect_state_action_value = r + self.gamma*self.model(Variable(torch.from_numpy(np.expand_dims(s_,0).astype('float32')))).max(1)[0]
            state_action_value = self.model(self.sbc(s))[0, a]
            loss += 0.5 * (state_action_value -
                           expect_state_action_value).pow(2)
        self.optimizer.zero_grad()
        loss.backward()
        # loss.backward()
        # for param in self.model.parameters():
        # 	param.grad.data.clamp_(-1,1)
        self.optimizer.step()

    def save_model(self, path):
        torch.save(self.model.state_dict(), '{}QTDAgent.pt'.format(path))
        # torch.save(self.target_critic.state_dict(), '{}/critic.pt'.format(path))
        print('Models saved successfully')

    def load_model(self, name):
        self.model.load_state_dict(name)
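
The update method above implements a one-step TD(0) loss with the legacy Variable/volatile API. A modernized sketch of the same target computation (a hypothetical helper, not part of the original class), using torch.no_grad() instead:

import torch


def td_loss(model, pending, gamma):
    # Sum of 0.5 * (Q(s, a) - y)^2 over the pending transitions, where
    # y = r for terminal steps and y = r + gamma * max_a' Q(s', a') otherwise.
    # `a` may be an int index or a one-element tensor, as in select_action.
    loss = 0.0
    for s, a, r, s_, a_, done in pending:
        s = torch.as_tensor(s, dtype=torch.float32).unsqueeze(0)
        q_sa = model(s)[0, a]
        if done:
            target = torch.tensor(float(r))
        else:
            with torch.no_grad():
                s_next = torch.as_tensor(s_, dtype=torch.float32).unsqueeze(0)
                target = r + gamma * model(s_next).max(1)[0].squeeze()
        loss = loss + 0.5 * (q_sa - target).pow(2)
    return loss
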
Example #23
File: sac.py Project: km01/myrl
class SAC(object):
    def __init__(self, input_size, action_size, gamma, tau, alpha, hidden_size,
                 lr, device):

        self.gamma, self.tau, self.alpha = gamma, tau, alpha
        self.lr, self.device = lr, device

        self.policy = Actor(input_size, hidden_size,
                            action_size).to(self.device)
        self.critic = QNet(input_size, hidden_size,
                           action_size).to(self.device)

        self.policy_optim = torch.optim.Adam(self.policy.parameters(),
                                             lr=self.lr)
        self.critic_optim = torch.optim.Adam(self.critic.parameters(),
                                             lr=self.lr)

        self.critic_target = copy.deepcopy(self.critic)
        self.critic_target.requires_grad_(False)

    @torch.no_grad()
    def select_action(self, obs, sample=True):
        obs = torch.FloatTensor(obs).to(self.device).unsqueeze(0)
        policy = self.policy(obs)
        action = Cat(raw_base=policy).sample(onehot=False, sample=sample)
        action = action.cpu().numpy()[0]
        return action

    def update_parameters(self, batch):
        obs, act, rew, done, obs_next = batch
        obs = torch.FloatTensor(obs).to(self.device)
        act = torch.LongTensor(act).unsqueeze(-1).to(self.device)
        rew = torch.FloatTensor(rew).unsqueeze(-1).to(self.device)
        done = torch.BoolTensor(done).unsqueeze(-1).to(self.device)
        obs_next = torch.FloatTensor(obs_next).to(self.device)

        with torch.no_grad():
            next_policy = Cat(raw_base=self.policy(obs_next))
            next_q = torch.min(*self.critic_target(obs_next))
            next_eval = (next_policy.probs * next_q).sum(dim=-1, keepdim=True)
            next_entr = -(next_policy.probs * next_policy.logits).sum(
                dim=-1, keepdim=True)
            next_v = (next_eval + self.alpha * next_entr).masked_fill(done, 0.)
            q_targ = rew + self.gamma * next_v

        self.critic_optim.zero_grad()
        q1, q2 = self.critic(obs)

        q_pred = torch.min(q1, q2).detach()
        q1, q2 = q1.gather(dim=-1, index=act), q2.gather(dim=-1, index=act)
        critic_loss = (q1 - q_targ).pow(2.).mul(0.5) + (
            q2 - q_targ).pow(2.).mul(0.5)
        critic_loss = critic_loss.mean()
        critic_loss.backward()
        self.critic_optim.step()

        with torch.no_grad():
            critic_loss = (torch.min(q1, q2) - q_targ).pow(2.).mul(0.5).mean()

        self.policy_optim.zero_grad()
        policy = Cat(raw_base=self.policy(obs))
        policy_entr = -(policy.probs.detach() *
                        policy.logits).sum(dim=-1).mean()
        policy_eval = (policy.probs * q_pred).sum(dim=-1).mean()
        policy_loss = self.alpha * policy_entr - policy_eval
        policy_loss.backward()
        self.policy_optim.step()

        soft_update(self.critic_target, self.critic, self.tau)

        loss_info = {
            'critic_loss': critic_loss.item(),
            'policy_loss': policy_loss.item(),
            'policy_entr': policy_entr.item()
        }

        return loss_info
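
soft_update(self.critic_target, self.critic, self.tau) is called above but not shown. With that argument order it is presumably the usual Polyak averaging step; a minimal sketch under that assumption:

import torch


@torch.no_grad()
def soft_update(target_net, source_net, tau):
    # Polyak averaging: target <- (1 - tau) * target + tau * source.
    for t_param, s_param in zip(target_net.parameters(),
                                source_net.parameters()):
        t_param.mul_(1.0 - tau).add_(tau * s_param)
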
Example #24
def main():
    env = gym.make(args.env_name)
    env.seed(500)
    torch.manual_seed(500)

    state_size = env.observation_space.shape[0]
    action_size = env.action_space.n
    print('state size:', state_size)
    print('action size:', action_size)

    q_net = QNet(state_size, action_size, args)
    target_q_net = QNet(state_size, action_size, args)
    optimizer = optim.Adam(q_net.parameters(), lr=0.001)

    update_target_model(q_net, target_q_net)

    writer = SummaryWriter(args.logdir)

    replay_buffer = deque(maxlen=10000)
    running_score = 0
    steps = 0

    for episode in range(args.max_iter_num):
        done = False
        score = 0

        state = env.reset()
        state = np.reshape(state, [1, state_size])

        while not done:
            if args.render:
                env.render()

            steps += 1

            q_values = q_net(torch.Tensor(state))
            action = get_action(q_values, action_size, args.epsilon)

            next_state, reward, done, _ = env.step(action)

            next_state = np.reshape(next_state, [1, state_size])
            reward = reward if not done or score == 499 else -1
            mask = 0 if done else 1

            replay_buffer.append((state, action, reward, next_state, mask))

            state = next_state
            score += reward

            if steps > args.initial_exploration:
                args.epsilon -= args.epsilon_decay
                args.epsilon = max(args.epsilon, 0.1)

                mini_batch = random.sample(replay_buffer, args.batch_size)

                q_net.train()
                target_q_net.train()
                train_model(q_net, target_q_net, optimizer, mini_batch)

                if steps % args.update_target == 0:
                    update_target_model(q_net, target_q_net)

        score = score if score == 500.0 else score + 1
        running_score = 0.99 * running_score + 0.01 * score

        if episode % args.log_interval == 0:
            print(
                '{} episode | running_score: {:.2f} | epsilon: {:.2f}'.format(
                    episode, running_score, args.epsilon))
            writer.add_scalar('log/score', float(score), episode)

        if running_score > args.goal_score:
            if not os.path.isdir(args.save_path):
                os.makedirs(args.save_path)

            ckpt_path = args.save_path + 'model.pth.tar'
            torch.save(q_net.state_dict(), ckpt_path)
            print('Running score exceeds the goal score, so training ends')
            break
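
train_model(q_net, target_q_net, optimizer, mini_batch) is defined elsewhere in the script; given the (state, action, reward, next_state, mask) tuples pushed above, it is presumably a standard one-step DQN update. A sketch under that assumption (the discount factor here is a hypothetical default):

import numpy as np
import torch
import torch.nn.functional as F


def train_model(q_net, target_q_net, optimizer, mini_batch, gamma=0.99):
    states, actions, rewards, next_states, masks = zip(*mini_batch)
    states = torch.Tensor(np.vstack(states))
    next_states = torch.Tensor(np.vstack(next_states))
    actions = torch.LongTensor(actions).unsqueeze(1)
    rewards = torch.Tensor(rewards).unsqueeze(1)
    masks = torch.Tensor(masks).unsqueeze(1)

    # Q(s, a) for the actions actually taken.
    q_values = q_net(states).gather(1, actions)
    # Bootstrapped target from the frozen target network.
    with torch.no_grad():
        next_q = target_q_net(next_states).max(1, keepdim=True)[0]
        targets = rewards + gamma * masks * next_q

    loss = F.mse_loss(q_values, targets)
    optimizer.zero_grad()
    loss.backward()
    optimizer.step()
    return loss.item()
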
Example #25
class Off_policy(Algo):
    def __init__(self):
        super(Off_policy, self).__init__()
        self.memory = Replay_buffer(capacity=p.exploitory_policy_memory_size)
        self.exploratory_policy = GaussianPolicy(
            self.state_space, self.action_space).to(self.device)
        self.exploratory_Q = QNet(self.state_space,
                                  self.action_space).to(self.device)
        self.exploratory_Q_target = QNet(self.state_space,
                                         self.action_space).to(self.device)
        self.exploratory_policy_optim = Adam(
            self.exploratory_policy.parameters(), lr=p.lr)
        self.exploratory_Q_optim = Adam(self.exploratory_Q.parameters(),
                                        lr=p.lr)

        self.target_update(self.exploratory_policy, self.exploitory_policy,
                           1.0)

        self.kl_normalizer = Normalizer(1)
        self.ex_rewards_normalizer = Normalizer(1)

    def start(self):
        total_numsteps = 0

        for episode in itertools.count(1):
            episode_rewards = 0.0
            episode_steps = 0
            done = False
            state = self.env.reset()

            while not done:
                episode_steps += 1
                if p.random_steps > total_numsteps:
                    action = self.env.action_space.sample()
                else:
                    norm_state = self.obs_normalizer.normalize(state)
                    action = self.select_action(norm_state,
                                                self.exploratory_policy)

                if len(self.memory) > p.exploitory_batch_size and len(
                        self.memory) > p.exploratory_batch_size:
                    for i in range(p.exploitory_policy_updates_per_steps):
                        qf1_loss, qf2_loss, policy_loss, alpha_loss, alpha, ex_reward_model_loss = self.update_exploitory_policy(
                            self.memory)
                        if episode % p.exploitory_target_update_interval == 0:
                            self.target_update(self.exploitory_Q_target,
                                               self.exploitory_Q, p.tau)

                    for i in range(p.exploratory_policy_updates_per_steps):
                        ex_qf1_loss, ex_qf2_loss, ex_policy_loss, divergence_loss = self.update_exploratory_policy(
                            self.memory)
                        if episode % p.exploratory_target_update_interval == 0:
                            self.target_update(self.exploratory_Q_target,
                                               self.exploratory_Q, p.tau)

                next_state, reward, done, _ = self.env.step(action)
                total_numsteps += 1
                episode_rewards += reward

                # Ignore the done signal if it comes from hitting the time horizon.
                mask = 1.0 if episode_steps == self.env._max_episode_steps else float(
                    not done)

                self.memory.push((state, action, reward, next_state, mask))
                self.obs_normalizer.update(state)
                state = next_state

            if episode % p.test_freq == 0:
                average_rewards, average_episode_steps = self.test_current_policy(
                )
                try:

                    data = {
                        'average_rewards': average_rewards,
                        'total_numsteps': total_numsteps,
                        'average_episode_steps': average_episode_steps,
                        'qf1_loss': qf1_loss,
                        'qf2_loss': qf2_loss,
                        'exploitory_policy_loss': policy_loss,
                        'alpha_loss': alpha_loss,
                        'alpha_value': alpha,
                        'ex_qf1_loss': ex_qf1_loss,
                        'ex_qf2_loss': ex_qf2_loss,
                        'ex_policy_loss': ex_policy_loss,
                        'ex_reward_model_loss': ex_reward_model_loss,
                        'divergence_loss': divergence_loss
                    }

                    self.log(data)
                except UnboundLocalError:
                    pass

            if total_numsteps > p.max_numsteps:
                self.env.close()
                self.writer.close()
                break

    def update_exploratory_policy(self, memory):
        state_batch, action_batch, reward_batch, next_state_batch, mask_batch = memory.sample(
            p.exploitory_batch_size)
        state_batch, next_state_batch = self.obs_normalizer.normalize(
            state_batch), self.obs_normalizer.normalize(next_state_batch)

        state_batch = torch.FloatTensor(state_batch).to(self.device)
        action_batch = torch.FloatTensor(action_batch).to(self.device)
        reward_batch = torch.FloatTensor(reward_batch).to(
            self.device).unsqueeze(1)
        next_state_batch = torch.FloatTensor(next_state_batch).to(self.device)
        mask_batch = torch.FloatTensor(mask_batch).to(self.device).unsqueeze(1)

        with torch.no_grad():
            ex_rewards = self.ex_reward_model.get_reward(
                state_batch, next_state_batch)
            ex_rewards = ex_rewards.unsqueeze(1).cpu().numpy()
            ex_reward_batch = self.ex_rewards_normalizer.normalize(ex_rewards)
            self.ex_rewards_normalizer.update(ex_rewards)
            ex_reward_batch = torch.FloatTensor(ex_reward_batch).to(
                self.device)

            ex_next_state_action, ex_next_state_log_pi, _ = self.exploratory_policy.sample(
                next_state_batch)
            qf1_next_target, qf2_next_target = self.exploratory_Q_target(
                next_state_batch, ex_next_state_action)
            '''
			ex_mean_actions, ex_log_std = self.exploratory_policy(next_state_batch)
			mean_actions, log_std = self.exploitory_policy(next_state_batch)
			ex_normal = Normal(ex_mean_actions, ex_log_std.exp())
			normal = Normal(mean_actions, log_std.exp())
			kl_div = torch.distributions.kl_divergence(ex_normal, normal).mean(1).unsqueeze(1)
			'''

            ex_next_state_log_prob = torch.clamp(
                self.exploratory_policy.get_logprob(next_state_batch,
                                                    ex_next_state_action),
                min=p.log_std_min,
                max=p.log_std_max)
            next_state_log_prob = torch.clamp(
                self.exploitory_policy.get_logprob(next_state_batch,
                                                   ex_next_state_action),
                min=p.log_std_min,
                max=p.log_std_max)

            kl_div = (ex_next_state_log_prob -
                      next_state_log_prob).mean(1).unsqueeze(1)

            min_qf_next_target = p.ex_alpha * (
                torch.min(qf1_next_target, qf2_next_target) -
                (p.alpha * ex_next_state_log_pi)) - kl_div
            next_q_value = ex_reward_batch + mask_batch * p.gamma * (
                min_qf_next_target)

        qf1, qf2 = self.exploratory_Q(state_batch, action_batch)

        qf1_loss = F.mse_loss(qf1, next_q_value)
        qf2_loss = F.mse_loss(qf2, next_q_value)
        qf_loss = qf1_loss + qf2_loss

        self.exploratory_Q_optim.zero_grad()
        qf_loss.backward()
        self.exploratory_Q_optim.step()

        ex_pi, ex_log_pi, _ = self.exploratory_policy.sample(state_batch)

        qf1_pi, qf2_pi = self.exploratory_Q(state_batch, ex_pi)
        min_qf_pi = torch.min(qf1_pi, qf2_pi)
        '''
		ex_mean_actions, ex_log_std = self.exploratory_policy(state_batch)
		mean_actions, log_std = self.exploitory_policy(state_batch)
		ex_normal = Normal(ex_mean_actions, ex_log_std.exp())
		normal = Normal(mean_actions, log_std.exp())
		kl_div = torch.distributions.kl_divergence(ex_normal, normal).mean(1).unsqueeze(1)
		'''

        ex_state_log_prob = torch.clamp(self.exploratory_policy.get_logprob(
            state_batch, ex_pi),
                                        min=p.log_std_min,
                                        max=p.log_std_max)
        with torch.no_grad():
            state_log_prob = torch.clamp(self.exploitory_policy.get_logprob(
                state_batch, ex_pi),
                                         min=p.log_std_min,
                                         max=p.log_std_max)
        kl_div = (ex_state_log_prob - state_log_prob).mean(1).unsqueeze(1)

        policy_loss = (p.ex_alpha * ((p.alpha * ex_log_pi) - min_qf_pi) +
                       kl_div).mean()

        self.exploratory_policy_optim.zero_grad()
        policy_loss.backward()
        self.exploratory_policy_optim.step()

        ex_alpha_loss = torch.Tensor([0.0])

        if settings.automatic_ex_entropy_tuning:
            ex_alpha_loss = -(
                self.ex_log_alpha *
                (ex_log_pi + self.ex_target_entropy).detach()).mean()
            self.ex_alpha_optim.zero_grad()
            ex_alpha_loss.backward()
            self.ex_alpha_optim.step()

            p.ex_alpha = self.ex_log_alpha.exp().item()

        return qf1_loss.item(), qf2_loss.item(), policy_loss.item(
        ), kl_div.mean().item()
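
Both policies above normalize observations through a Normalizer object that is not shown. A sketch of a running mean/variance normalizer with the update/normalize interface used here (an assumption; the project's own implementation may differ):

import numpy as np


class Normalizer:
    def __init__(self, size, eps=1e-8):
        self.size = size
        self.n = 0
        self.mean = np.zeros(size)
        self.m2 = np.zeros(size)  # running sum of squared deviations (Welford)
        self.eps = eps

    def update(self, x):
        # Accept a single observation or a batch of observations.
        x = np.asarray(x, dtype=np.float64).reshape(-1, self.size)
        for row in x:
            self.n += 1
            delta = row - self.mean
            self.mean += delta / self.n
            self.m2 += delta * (row - self.mean)

    def normalize(self, x):
        std = np.sqrt(self.m2 / max(self.n, 1)) + self.eps
        return (np.asarray(x) - self.mean) / std
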
Example #26
def main():
    env = gym.make(env_name)
    env.seed(500)
    torch.manual_seed(500)

    num_inputs = 2
    num_actions = env.action_space.n
    print('state size:', num_inputs)
    print('action size:', num_actions)

    online_net = QNet(num_inputs, num_actions)
    target_net = QNet(num_inputs, num_actions)
    update_target_model(online_net, target_net)

    optimizer = optim.Adam(online_net.parameters(), lr=lr)
    writer = SummaryWriter('logs')

    online_net.to(device)
    target_net.to(device)
    online_net.train()
    target_net.train()
    memory = Memory(replay_memory_capacity)
    running_score = 0
    epsilon = 1.0
    steps = 0
    loss = 0

    for e in range(30000):
        done = False

        state_series = deque(maxlen=sequence_length)
        next_state_series = deque(maxlen=sequence_length)
        score = 0
        state = env.reset()

        state = state_to_partial_observability(state)
        state = torch.Tensor(state).to(device)

        next_state_series.append(state)
        while not done:
            steps += 1
            state_series.append(state)
            action = get_action(state_series, target_net, epsilon, env)
            next_state, reward, done, _ = env.step(action)

            next_state = state_to_partial_observability(next_state)
            next_state = torch.Tensor(next_state)
            # Keep the next-state sequence aligned with state_series.
            next_state_series.append(next_state)

            mask = 0 if done else 1
            reward = reward if not done or score == 499 else -1
            action_one_hot = np.zeros(2)
            action_one_hot[action] = 1
            if len(state_series) >= sequence_length:
                memory.push(state_series, next_state_series, action_one_hot,
                            reward, mask)

            score += reward
            state = next_state

            if steps > initial_exploration:
                epsilon -= 0.000005
                epsilon = max(epsilon, 0.1)

                batch = memory.sample(batch_size)
                loss = QNet.train_model(online_net, target_net, optimizer,
                                        batch)

                if steps % update_target == 0:
                    update_target_model(online_net, target_net)

        score = score if score == 500.0 else score + 1
        if running_score == 0:
            running_score = score
        else:
            running_score = 0.99 * running_score + 0.01 * score
        if e % log_interval == 0:
            print('{} episode | score: {:.2f} | epsilon: {:.2f}'.format(
                e, running_score, epsilon))
            writer.add_scalar('log/score', float(running_score), e)
            writer.add_scalar('log/loss', float(loss), e)

        if running_score > goal_score:
            break
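
state_to_partial_observability is not shown; since num_inputs is fixed to 2 for CartPole, it presumably drops the two velocity components and keeps only cart position and pole angle. A sketch under that assumption:

import numpy as np


def state_to_partial_observability(state):
    # Keep cart position (index 0) and pole angle (index 2), discarding the
    # velocity terms, so the task becomes partially observable.
    return np.asarray(state)[[0, 2]]
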
Example #27
class Learner:
    def __init__(self, n_actors, device='cuda:0'):
        # params
        self.gamma = 0.99
        self.alpha = 0.6
        self.bootstrap_steps = 3
        self.initial_exploration = 50000
        self.priority_epsilon = 1e-6
        self.device = device
        self.n_epochs = 0
        self.n_actors = n_actors

        # path
        self.memory_path = os.path.join('./', 'logs', 'memory')
        self.net_path = os.path.join('./', 'logs', 'model', 'net.pt')
        self.target_net_path = os.path.join('./', 'logs', 'model',
                                            'target_net.pt')

        # memory
        self.memory_size = 500000
        self.batch_size = 128
        self.memory_load_interval = 10
        self.replay_memory = ReplayMemory(self.memory_size, self.batch_size,
                                          self.bootstrap_steps)

        # net
        self.net_save_interval = 50
        self.target_update_interval = 1000
        self.net = QNet(self.net_path, self.device).to(self.device)
        self.target_net = QNet(self.target_net_path,
                               self.device).to(self.device)
        self.target_net.load_state_dict(self.net.state_dict())
        self.net.save()
        self.target_net.save()
        self.optim = optim.RMSprop(self.net.parameters(),
                                   lr=0.00025 / 4.0,
                                   alpha=0.95,
                                   eps=1.5e-7,
                                   centered=True)

    def run(self):
        while True:
            if self.replay_memory.size > self.initial_exploration:
                self.train()
            self.interval()

    def train(self):
        batch, index, weights = self.replay_memory.sample(self.device)

        # q_value
        q_value = self.net(batch['state'])
        q_value = q_value.gather(1, batch['action'])

        # target q_value
        with torch.no_grad():
            next_action = torch.argmax(self.net(batch["next_state"]),
                                       1).view(-1, 1)
            next_q_value = self.target_net(batch["next_state"]).gather(
                1, next_action)
            target_q_value = batch["reward"] + (
                self.gamma**
                self.bootstrap_steps) * next_q_value * (1 - batch['done'])

        # update
        self.optim.zero_grad()
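        # Note: the importance-sampling weights returned by sample() are not
        # applied here; a weighted variant would multiply the squared TD error
        # by `weights` before averaging.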
        loss = torch.mean(0.5 * (q_value - target_q_value)**2)
        loss.backward()
        self.optim.step()

        priority = (np.abs(
            (q_value - target_q_value).detach().cpu().numpy()).reshape(-1) +
                    self.priority_epsilon)**self.alpha
        self.replay_memory.update_priority(index, priority)

    def interval(self):
        self.n_epochs += 1
        if self.n_epochs % self.target_update_interval == 0:
            self.target_net.load_state_dict(self.net.state_dict())
        if self.n_epochs % self.net_save_interval == 0:
            self.net.save()
            self.target_net.save()
        if self.n_epochs % self.memory_load_interval == 0:
            for i in range(self.n_actors):
                self.replay_memory.load(self.memory_path, i)
def main():
    env = gym.make(env_name)
    env.seed(500)
    torch.manual_seed(500)

    num_inputs = env.observation_space.shape[0]
    num_actions = env.action_space.n
    print('state size:', num_inputs)
    print('action size:', num_actions)

    online_net = QNet(num_inputs, num_actions)
    target_net = QNet(num_inputs, num_actions)
    update_target_model(online_net, target_net)

    optimizer = optim.Adam(online_net.parameters(), lr=lr)
    writer = SummaryWriter('logs')

    online_net.to(device)
    target_net.to(device)
    online_net.train()
    target_net.train()
    memory = Memory_With_TDError(replay_memory_capacity)
    running_score = 0
    epsilon = 1.0
    steps = 0
    beta = beta_start
    loss = 0

    for e in range(3000):
        done = False

        score = 0
        state = env.reset()
        state = torch.Tensor(state).to(device)
        state = state.unsqueeze(0)

        while not done:
            steps += 1
            action = get_action(state, target_net, epsilon, env)
            next_state, reward, done, _ = env.step(action)

            next_state = torch.Tensor(next_state)
            next_state = next_state.unsqueeze(0)

            mask = 0 if done else 1
            reward = reward if not done or score == 499 else -1
            action_one_hot = np.zeros(2)
            action_one_hot[action] = 1
            memory.push(state, next_state, action_one_hot, reward, mask)

            score += reward
            state = next_state

            if steps > initial_exploration:
                epsilon -= 0.00005
                epsilon = max(epsilon, 0.1)
                beta += 0.00005
                beta = min(1, beta)

                batch, weights = memory.sample(batch_size, online_net,
                                               target_net, beta)
                loss = QNet.train_model(online_net, target_net, optimizer,
                                        batch, weights)

                if steps % update_target == 0:
                    update_target_model(online_net, target_net)

        score = score if score == 500.0 else score + 1
        running_score = 0.99 * running_score + 0.01 * score
        if e % log_interval == 0:
            print(
                '{} episode | score: {:.2f} | epsilon: {:.2f} | beta: {:.2f}'.
                format(e, running_score, epsilon, beta))
            writer.add_scalar('log/score', float(running_score), e)
            writer.add_scalar('log/loss', float(loss), e)

        if running_score > goal_score:
            break
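
Memory_With_TDError.sample takes the annealed beta above and returns per-sample weights; in prioritized replay these are typically importance-sampling corrections. A sketch of the usual computation (an assumption about this particular memory class):

import numpy as np


def importance_weights(priorities, beta):
    # Prioritized-replay correction: w_i = (N * P(i))^(-beta), normalized by
    # the largest weight so that updates are only scaled downwards.
    priorities = np.asarray(priorities, dtype=np.float64)
    probs = priorities / priorities.sum()
    weights = (len(priorities) * probs) ** (-beta)
    return weights / weights.max()
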
Example #29
class Algo:
    def __init__(self):
        #Creating environment
        self.env = gym.make(settings.env_name)
        self.env.seed(settings.seed)
        self.env.action_space.seed(settings.seed)

        self.state_space = self.env.observation_space.shape[0]
        self.action_space = self.env.action_space.shape[0]

        self.obs_normalizer = Normalizer(self.state_space)

        self.device = torch.device(settings.device)
        self.writer = SummaryWriter(
            'runs/' + settings.env_name + "_" + settings.algo +
            '_{}_{}_{}'.format(p.alpha, p.ex_alpha, settings.seed))

        #Initializing common networks and their optimizers
        self.exploitory_policy = GaussianPolicy(
            self.state_space, self.action_space).to(self.device)
        self.exploitory_Q = QNet(self.state_space,
                                 self.action_space).to(self.device)
        self.exploitory_Q_target = QNet(self.state_space,
                                        self.action_space).to(self.device)
        self.exploitory_policy_optim = Adam(
            self.exploitory_policy.parameters(), lr=p.lr)
        self.exploitory_Q_optim = Adam(self.exploitory_Q.parameters(), lr=p.lr)

        self.target_update(self.exploitory_Q_target, self.exploitory_Q, 1.0)

        p.alpha = torch.Tensor([p.alpha]).to(self.device)
        if settings.automatic_entropy_tuning:
            self.target_entropy = -torch.prod(
                torch.Tensor(self.env.action_space.shape).to(
                    self.device)).item()
            self.log_alpha = torch.zeros(1,
                                         requires_grad=True,
                                         device=self.device)
            self.alpha_optim = Adam([self.log_alpha], lr=p.lr)

        if settings.automatic_ex_entropy_tuning:
            self.ex_target_entropy = -torch.prod(
                torch.Tensor(self.env.action_space.shape).to(
                    self.device)).item()
            self.ex_log_alpha = torch.zeros(1,
                                            requires_grad=True,
                                            device=self.device)
            self.ex_alpha_optim = Adam([self.ex_log_alpha], lr=p.lr)

        if settings.reward_model == 'novelty':
            self.ex_reward_model = Novelty(self.state_space, self.device)

    def target_update(self, target, source, tau=p.tau):
        for target_param, param in zip(target.parameters(),
                                       source.parameters()):
            target_param.data.copy_(target_param.data * (1.0 - tau) +
                                    param.data * tau)

    def update_exploitory_policy(self, memory):
        state_batch, action_batch, reward_batch, next_state_batch, mask_batch = memory.sample(
            p.exploitory_batch_size)
        state_batch, next_state_batch = self.obs_normalizer.normalize(
            state_batch), self.obs_normalizer.normalize(next_state_batch)

        state_batch = torch.FloatTensor(state_batch).to(self.device)
        action_batch = torch.FloatTensor(action_batch).to(self.device)
        reward_batch = torch.FloatTensor(reward_batch).to(
            self.device).unsqueeze(1)
        next_state_batch = torch.FloatTensor(next_state_batch).to(self.device)
        mask_batch = torch.FloatTensor(mask_batch).to(self.device).unsqueeze(1)

        with torch.no_grad():
            next_state_action, next_state_log_pi, _ = self.exploitory_policy.sample(
                next_state_batch)
            qf1_next_target, qf2_next_target = self.exploitory_Q_target(
                next_state_batch, next_state_action)
            min_qf_next_target = torch.min(
                qf1_next_target, qf2_next_target) - p.alpha * next_state_log_pi
            next_q_value = reward_batch + mask_batch * p.gamma * (
                min_qf_next_target)

        qf1, qf2 = self.exploitory_Q(state_batch, action_batch)

        qf1_loss = F.mse_loss(qf1, next_q_value)
        qf2_loss = F.mse_loss(qf2, next_q_value)
        qf_loss = qf1_loss + qf2_loss

        self.exploitory_Q_optim.zero_grad()
        qf_loss.backward()
        self.exploitory_Q_optim.step()

        pi, log_pi, _ = self.exploitory_policy.sample(state_batch)
        qf1_pi, qf2_pi = self.exploitory_Q(state_batch, pi)
        min_qf_pi = torch.min(qf1_pi, qf2_pi)
        policy_loss = ((p.alpha * log_pi) - min_qf_pi).mean()

        self.exploitory_policy_optim.zero_grad()
        policy_loss.backward()
        self.exploitory_policy_optim.step()

        alpha_loss = torch.Tensor([0.0])

        if settings.automatic_entropy_tuning:
            alpha_loss = -(self.log_alpha *
                           (log_pi + self.target_entropy).detach()).mean()
            self.alpha_optim.zero_grad()
            alpha_loss.backward()
            self.alpha_optim.step()

            p.alpha = self.log_alpha.exp().item()

        ex_reward_model_loss = self.ex_reward_model.update(memory)

        return qf1_loss.item(), qf2_loss.item(), policy_loss.item(
        ), alpha_loss.item(), p.alpha, ex_reward_model_loss

    def test_current_policy(self):
        avg_reward = 0
        avg_steps = 0
        avg_ex_rewards = 0

        for episode in range(p.testing_episodes):
            episode_steps = 0
            state = self.env.reset()
            episode_rewards = 0
            episode_ex_rewards = 0
            done = False

            while not done:
                episode_steps += 1
                norm_state = self.obs_normalizer.normalize(state)
                action = self.select_action(norm_state,
                                            self.exploitory_policy,
                                            evaluate=True)
                next_state, reward, done, _ = self.env.step(action)
                episode_rewards += reward

                state = next_state

            avg_reward += episode_rewards
            avg_ex_rewards += episode_ex_rewards
            avg_steps += episode_steps

        avg_reward = avg_reward / p.testing_episodes
        avg_ex_rewards = avg_ex_rewards / p.testing_episodes
        avg_steps = avg_steps / p.testing_episodes

        return avg_reward, avg_steps

    def select_action(self, state, policy, evaluate=False):
        with torch.no_grad():
            try:
                state = torch.FloatTensor(state).to(self.device).unsqueeze(0)
                if evaluate is False:
                    action, log_prob, _ = policy.sample(state)
                else:
                    _, log_prob, action = policy.sample(state)

                return action.cpu().numpy()[0]

            except Exception:
                # Fall back when state is already a tensor.
                state = state.unsqueeze(0)
                if evaluate is False:
                    action, log_prob, _ = policy.sample(state)
                else:
                    _, log_prob, action = policy.sample(state)

                return action

    def log(self, data):
        for key in data.keys():
            if key != "total_numsteps":
                self.writer.add_scalar(
                    key.split('_')[-1] + "/" + key, data[key],
                    data['total_numsteps'])
        print("Total number of Steps: {} \t Average reward per episode: {}".
              format(data['total_numsteps'], round(data['average_rewards'],
                                                   1)))

    def start(self):
        raise NotImplementedError
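
For reference, the critic target assembled in update_exploitory_policy above is the standard soft (entropy-regularized) Bellman target:

    y = r + gamma * mask * ( min(Q1_target(s', a'), Q2_target(s', a')) - alpha * log pi(a' | s') ),  with a' ~ pi(. | s').
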