Пример #1
0
def load_checkpoint(file_dir, i_epoch, layer_sizes, input_size, device='cuda'):
    """Restore the policy/value networks and their Adam optimizers from disk.

    The checkpoint is read from ``<file_dir>/ckpt_eps<i_epoch>.pt`` and its
    tensors are mapped onto ``device``.

    Args:
        file_dir: Directory that contains the checkpoint file.
        i_epoch: Epoch number embedded in the checkpoint file name.
        layer_sizes: Forwarded to the ``PolicyNet`` constructor.
        input_size: Forwarded to the ``ValueNet`` constructor.
        device: Device the networks are moved to and the checkpoint is
            mapped onto.

    Returns:
        ``(policy_net, value_net, policynet_optim, valuenet_optim,
        checkpoint)``. Both networks are in training mode, and ``checkpoint``
        contains only the entries left after the consumed keys were removed.

    Raises:
        KeyError: If one of the expected keys is missing from the checkpoint.
    """
    ckpt_path = os.path.join(file_dir, "ckpt_eps%d.pt" % i_epoch)
    checkpoint = torch.load(ckpt_path, map_location=device)

    # Rebuild both networks first, then restore their weights and put them
    # into training mode (policy network first, value network second).
    policy_net = PolicyNet(layer_sizes).to(device)
    value_net = ValueNet(input_size).to(device)
    for network, state_key in ((policy_net, "policy_net"),
                               (value_net, "value_net")):
        network.load_state_dict(checkpoint[state_key])
        network.train()

    policy_lr, valuenet_lr = checkpoint["policy_lr"], checkpoint["valuenet_lr"]

    # Recreate the optimizers, then restore their saved internal state.
    policynet_optim = optim.Adam(policy_net.parameters(), lr=policy_lr)
    policynet_optim.load_state_dict(checkpoint["policynet_optim"])
    valuenet_optim = optim.Adam(value_net.parameters(), lr=valuenet_lr)
    valuenet_optim.load_state_dict(checkpoint["valuenet_optim"])

    # Strip everything that has been consumed; the remainder is handed back
    # to the caller as-is.
    consumed_keys = (
        "policy_net",
        "value_net",
        "policynet_optim",
        "valuenet_optim",
        "i_epoch",
        "policy_lr",
        "valuenet_lr",
    )
    for consumed in consumed_keys:
        del checkpoint[consumed]

    return policy_net, value_net, policynet_optim, valuenet_optim, checkpoint
Пример #2
0
def load_checkpoint(file_dir, i_epoch, layer_sizes, device='cuda'):
    """Restore the policy network and its optimizer from a saved checkpoint.

    The checkpoint is read from ``<file_dir>/ckpt_eps<i_epoch>.pt``.

    Args:
        file_dir: Directory that contains the checkpoint file.
        i_epoch: Epoch number embedded in the checkpoint file name.
        layer_sizes: Forwarded to the ``PolicyNet`` constructor.
        device: Device the network is moved to and the checkpoint tensors
            are mapped onto.

    Returns:
        ``(policy_net, optimizer, checkpoint)``. The network is in training
        mode, and ``checkpoint`` contains only the entries left after the
        consumed keys were removed.

    Raises:
        KeyError: If one of the expected keys is missing from the checkpoint.
    """
    # map_location lets a checkpoint written on one device (e.g. a GPU) be
    # restored on another (e.g. a CPU-only machine); without it torch.load
    # tries to put every tensor back on the device it was saved from.
    checkpoint = torch.load(os.path.join(file_dir, "ckpt_eps%d.pt" % i_epoch),
                            map_location=device)

    policy_net = PolicyNet(layer_sizes).to(device)
    policy_net.load_state_dict(checkpoint["policy_net"])
    policy_net.train()

    # The optimizer is created with default hyperparameters; load_state_dict
    # then restores the saved parameter groups (learning rate included).
    # NOTE: the optimizer class must match the one that wrote the checkpoint.
    optimizer = optim.Adam(policy_net.parameters())
    optimizer.load_state_dict(checkpoint["optimizer"])

    # Strip everything that has been consumed; the remainder is handed back
    # to the caller as-is.
    for consumed in ("policy_net", "optimizer", "i_epoch", "learning_rate"):
        checkpoint.pop(consumed)

    return policy_net, optimizer, checkpoint
Пример #3
0
# Turn on interactive mode of the plotting module (presumably
# matplotlib.pyplot - confirm against the file's imports).
plt.ion()

# Create OpenAI gym environment
env = gym.make(env_name)
if is_unwrapped:
    # Replace the object returned by gym.make with its `.unwrapped` attribute.
    env = env.unwrapped

# Get device: CUDA when torch reports it as available, otherwise the CPU
device = torch.device('cuda' if torch.cuda.is_available() else 'cpu')
print("Current usable device is: ", device)

# Create the model and move it to the selected device
policy_net = PolicyNet(layer_sizes).to(device)

# Set up optimizer - Minimal (Adam with its default hyperparameters)
optimizer = optim.Adam(policy_net.parameters())
# optimizer = optim.SGD(policy_net.parameters(), lr=learning_rate)

###################################################################
# Start training

# Dictionary for extra training information to save to checkpoints.
# The last key embeds the value of `num_avg_epoch` in its name.
training_info = {"epoch mean durations" : [],
                 "epoch mean rewards" : [],
                 "max reward achieved": 0,
                 "past %d epochs mean reward" %  (num_avg_epoch): 0,}

# Batch that records trajectories; both lists start empty here
batch_log_prob = []
batch_rewards = []
Пример #4
0
class A3CGlobal:
    """Actor-critic learner owning a policy network, a value network and
    one Adam optimizer for each.

    ``config`` is read for ``n_state``, ``n_action``, ``actor_lr``,
    ``critic_lr``, ``discount_factor`` and ``device``.
    """

    def __init__(self, config):
        """Build both networks on ``config.device`` and create their optimizers."""
        self.config = config

        # Policy network (actor). It is placed on config.device, the same
        # device every tensor built in train_model()/get_returns() uses.
        self.actor = PolicyNet(self.config.n_state, self.config.n_action)
        self.actor.to(self.config.device)
        self.actor_optimizer = torch.optim.Adam(self.actor.parameters(),
                                                lr=self.config.actor_lr)

        # Value network (critic) with a single output.
        self.critic = ValueNet(self.config.n_state, 1)
        self.critic.to(self.config.device)
        self.critic_optimizer = torch.optim.Adam(self.critic.parameters(),
                                                 lr=self.config.critic_lr)

    def get_returns(self, rewards, done, next_value):
        """Compute discounted returns for a sequence of rewards.

        Args:
            rewards: Sequence of scalar rewards in time order.
            done: If true the episode ended and nothing is bootstrapped.
            next_value: Scalar (number or one-element tensor) value estimate
                of the state following the last reward; used as the
                bootstrap when ``done`` is false.

        Returns:
            1-D float tensor on ``config.device`` with one return per reward.
            The tensor never carries autograd history: returns are
            regression targets, so no gradient may flow into the critic
            through the bootstrap value.
        """
        # float() both detaches a tensor bootstrap and accepts plain numbers.
        running = 0.0 if done else float(next_value)
        discounted = []
        for reward in reversed(rewards):
            running = float(reward) + self.config.discount_factor * running
            discounted.append(running)
        discounted.reverse()
        # Build the tensor in one go instead of writing element by element.
        return torch.tensor(discounted, dtype=torch.float,
                            device=self.config.device)

    def train_model(self, states, actions, rewards, next_states, done):
        """Update the critic and then the actor on one batch of transitions.

        Args:
            states: Batch of states accepted by ``torch.tensor``.
            actions: Batch of per-action weights (multiplied element-wise
                with the actor output in train_actor).
            rewards: Sequence of scalar rewards in time order.
            next_states: Batch of successor states; only the value of the
                last one is used, as the bootstrap.
            done: Whether the last transition ended the episode.

        Returns:
            ``(actor_loss, critic_loss)`` as Python floats.
        """
        device = self.config.device
        states = torch.tensor(states, dtype=torch.float, device=device)
        actions = torch.tensor(actions, dtype=torch.float, device=device)
        next_states = torch.tensor(next_states, dtype=torch.float,
                                   device=device)

        # The bootstrap value is a constant target, so skip graph building.
        with torch.no_grad():
            next_value = self.critic(next_states).view(-1)[-1]

        returns = self.get_returns(rewards, done, next_value)
        values = self.critic(states).view(-1)

        # Advantages are taken from the pre-update value estimates and are
        # constants for the actor.
        advantages = (returns - values).detach()

        critic_loss = self.train_critic(values, returns)
        actor_loss = self.train_actor(states, actions, advantages)

        return actor_loss, critic_loss

    def train_actor(self, states, actions, advantages):
        """Take one policy-gradient step and return the actor loss as a float."""
        policy = self.actor(states)
        # Weighted sum of the actor output per row, using `actions` as weights.
        action_prob = torch.sum(actions * policy, dim=1)
        # The small constant keeps log() finite when the sum is zero.
        weighted_log_prob = (torch.log(action_prob + 1.e-7)
                             * advantages.detach())
        actor_loss = -torch.mean(weighted_log_prob)

        self.actor_optimizer.zero_grad()
        actor_loss.backward()
        self.actor_optimizer.step()

        return actor_loss.item()

    def train_critic(self, values, targets):
        """Take one mean-squared-error step on the critic; return the loss as a float."""
        critic_loss = torch.mean(torch.pow(targets - values, 2))

        self.critic_optimizer.zero_grad()
        critic_loss.backward()
        self.critic_optimizer.step()

        return critic_loss.item()

    def close(self):
        """Drop the references to both networks so their memory can be reclaimed."""
        del self.actor
        del self.critic
Пример #5
0
# Environment used by this script.
env = gym.make("CartPole-v1")
# observation = env.reset()
# print(observation)
# print(env.observation_space)

# Hyperparameters. BATCHSIZE is the number of iterations of the outer loop in
# train_step(); the use of the other three is not visible in this part of the
# file (presumably: step cap per episode, number of epochs and discount
# factor - confirm against the rest of the script).
MAXSTEP = 100
BATCHSIZE = 16
EPOCH = 1000
GAMMA = 0.99

# Both networks are built with their default constructor arguments.
policy_net = PolicyNet()
value_net = ValueNet()

# Move both networks to the GPU before creating the optimizers
# (train_step() moves them back to the CPU when it starts).
policy_net.cuda()
value_net.cuda()
# One Adam optimizer per network, both with learning rate 1e-3.
opt1 = optim.Adam(policy_net.parameters(), lr=1e-3)
opt2 = optim.Adam(value_net.parameters(), lr=1e-3)


# train one epoch
def train_step():

    observ_batch = []
    reward_batch = []
    action_batch = []
    mask_batch = []

    policy_net.cpu()
    value_net.cpu()
    for _ in range(BATCHSIZE):
        observ = []
Пример #6
0
class PGAgent:
    """Policy-gradient agent: a single policy network trained on normalized
    discounted returns of the transitions collected in a replay buffer.

    ``config`` is read for ``n_replay_memory``, ``n_state``, ``n_action``,
    ``learning_rate``, ``discount_factor``, ``device`` and ``save_file``.
    """

    def __init__(self, config):
        """Create the replay buffer, the policy network and its optimizer."""
        self.config = config

        # Bounded buffer of (state, one-hot action, reward, next_state).
        self.replay_memory = deque(maxlen=self.config.n_replay_memory)

        # Policy network, placed on config.device - the same device every
        # tensor built by the other methods uses.
        self.model = PolicyNet(self.config.n_state, self.config.n_action)
        self.model.to(self.config.device)
        self.model_optimizer = torch.optim.Adam(self.model.parameters(),
                                                lr=self.config.learning_rate)

    def get_action(self, state):
        """Sample an action index from the policy output for ``state``.

        The first row of the network output is used as the probability
        vector, so ``state`` is expected to be a batch of one.
        """
        state = torch.tensor(state, dtype=torch.float,
                             device=self.config.device)
        # Inference only: no autograd graph is needed.
        with torch.no_grad():
            policy = self.model(state)
        policy = policy.cpu().numpy()[0]
        return np.random.choice(self.config.n_action, 1, p=policy)[0]

    def append_replay(self, state, action, reward, next_state):
        """Store one transition, encoding ``action`` as a one-hot vector."""
        act = np.zeros(self.config.n_action)
        act[action] = 1
        self.replay_memory.append((state, act, reward, next_state))

    def get_returns(self, rewards):
        """Compute discounted returns, standardized when there are several.

        Args:
            rewards: Sequence of scalar rewards in time order.

        Returns:
            1-D float tensor on ``config.device``. With more than one entry
            the returns are shifted to zero mean and divided by their
            standard deviation (plus a small constant against division by
            zero); a single return is left as-is.
        """
        running = 0.0
        discounted = []
        for reward in reversed(rewards):
            running = float(reward) + self.config.discount_factor * running
            discounted.append(running)
        discounted.reverse()
        # Build the tensor in one go instead of writing element by element.
        returns = torch.tensor(discounted, dtype=torch.float,
                               device=self.config.device)
        if len(returns) > 1:
            returns = (returns - returns.mean()) / (returns.std() + 1.e-7)
        return returns

    def train_model(self):
        """Train the policy on every stored transition and empty the buffer.

        Returns:
            The policy loss as a Python float.

        Raises:
            ValueError: If the replay memory is empty.
        """
        if not self.replay_memory:
            raise ValueError("train_model() called with an empty replay memory")

        # Split the buffer column-wise. Packing the tuples into one
        # np.array would create a ragged array (states, one-hot actions and
        # scalar rewards have different shapes), which NumPy >= 1.24 rejects
        # with a ValueError.
        states, actions, rewards, _next_states = zip(*self.replay_memory)
        self.replay_memory.clear()

        device = self.config.device
        states = torch.tensor(np.vstack(states), dtype=torch.float,
                              device=device)
        actions = torch.tensor(np.array(actions), dtype=torch.float,
                               device=device)

        returns = self.get_returns(rewards)

        return self.train_policy(states, actions, returns)

    def train_policy(self, states, actions, returns):
        """Take one policy-gradient step and return the loss as a float."""
        policy = self.model(states)
        # Probability the policy assigns to the action actually taken.
        action_prob = torch.sum(actions * policy, dim=1)
        # The small constant keeps log() finite for zero probabilities.
        weighted_log_prob = torch.log(action_prob + 1.e-7) * returns
        loss = -torch.mean(weighted_log_prob)

        self.model_optimizer.zero_grad()
        loss.backward()
        self.model_optimizer.step()

        return loss.item()

    def save(self):
        """Write the policy weights to ``config.save_file``."""
        torch.save(self.model.state_dict(), self.config.save_file)

    def load(self):
        """Read the policy weights from ``config.save_file``."""
        # map_location allows weights saved on another device to be loaded.
        self.model.load_state_dict(
            torch.load(self.config.save_file,
                       map_location=self.config.device))

    def close(self):
        """Drop the reference to the network so its memory can be reclaimed."""
        del self.model
Пример #7
0
def main():
    """Train a policy by behavior cloning on recorded expert trajectories.

    Command-line options:
        --env: gym environment id (default ``CartPole-v0``).
        --conti: flag selecting a continuous action space.

    Expert data is read from ``../save/<env>_traj.pkl``. Training resumes
    from ``./save/<env>.pt`` when that checkpoint exists, and the checkpoint
    is rewritten every ``save_step`` iterations. The process exits with
    status 1 when the expert file is missing.
    """
    # Parse arguments
    # ----------------------------
    parser = argparse.ArgumentParser()
    parser.add_argument("--env", default="CartPole-v0")
    parser.add_argument("--conti", action="store_true")
    args = parser.parse_args()

    # Parameters
    # ----------------------------
    env_id = args.env
    mb_size = 256
    lr = 1e-5
    n_iter = 100000
    disp_step = 1000
    save_step = 10000
    save_dir = "./save"
    # Fall back to the CPU instead of crashing on hosts without CUDA.
    device = "cuda:0" if torch.cuda.is_available() else "cpu"
    expert_path = "../save/{}_traj.pkl".format(env_id)
    ckpt_path = os.path.join(save_dir, "{}.pt".format(env_id))

    # Create environment
    # ----------------------------
    # The environment is only queried for its dimensions, so it is closed
    # as soon as they are known (also when the query fails).
    env = gym.make(env_id)
    try:
        s_dim = env.observation_space.shape[0]
        if args.conti:
            a_dim = env.action_space.shape[0]
        else:
            a_dim = env.action_space.n
    finally:
        env.close()

    # Load expert trajectories
    # ----------------------------
    if not os.path.exists(expert_path):
        print("ERROR: No expert trajectory file found")
        sys.exit(1)

    # NOTE: unpickling can execute arbitrary code - only load trajectory
    # files that come from a trusted source.
    with open(expert_path, "rb") as traj_file:
        s_traj, a_traj = pkl.load(traj_file)
    s_traj = np.concatenate(s_traj, 0)
    a_traj = np.concatenate(a_traj, 0)

    # Create model
    # ----------------------------
    policy_net = PolicyNet(s_dim, a_dim, conti=args.conti).to(device)
    opt = torch.optim.Adam(policy_net.parameters(), lr)

    # Load model
    # ----------------------------
    os.makedirs(save_dir, exist_ok=True)

    if os.path.exists(ckpt_path):
        print("Loading the model ... ", end="")
        # map_location lets a checkpoint saved on another device be restored.
        checkpoint = torch.load(ckpt_path, map_location=device)
        policy_net.load_state_dict(checkpoint["PolicyNet"])
        start_it = checkpoint["it"]
        print("Done.")
    else:
        start_it = 0

    # Start training
    # ----------------------------
    t_start = time.time()
    policy_net.train()

    for it in range(start_it, n_iter + 1):
        # Train: maximize the log-probability of the expert actions.
        mb_obs, mb_actions = sample_batch(s_traj, a_traj, mb_size)
        mb_a_logps, _ = policy_net.evaluate(
            torch.from_numpy(mb_obs).to(device),
            torch.from_numpy(mb_actions).to(device))
        loss = -mb_a_logps.mean()

        opt.zero_grad()
        loss.backward()
        opt.step()

        # Print the result
        if it % disp_step == 0:
            print("[{:5d} / {:5d}] Elapsed time = {:.2f}, actor loss = {:.6f}".
                  format(it, n_iter,
                         time.time() - t_start, loss.item()))

        # Save model
        if it % save_step == 0:
            print("Saving the model ... ", end="")
            torch.save({
                "it": it,
                "PolicyNet": policy_net.state_dict()
            }, ckpt_path)
            print("Done.")
            print()
Пример #8
0
class A2CAgent:
    """Advantage actor-critic agent with a policy network, a value network
    and a replay buffer holding the transitions of the current rollout.

    ``config`` is read for ``n_replay_memory``, ``n_state``, ``n_action``,
    ``actor_lr``, ``critic_lr``, ``discount_factor``, ``device`` and
    ``save_file``.
    """

    def __init__(self, config):
        """Create the replay buffer, both networks and their optimizers."""
        self.config = config

        # Bounded buffer of (state, one-hot action, reward, next_state).
        self.replay_memory = deque(maxlen=self.config.n_replay_memory)

        # Policy network (actor), placed on config.device - the same device
        # every tensor built by the other methods uses.
        self.actor = PolicyNet(self.config.n_state, self.config.n_action)
        self.actor.to(self.config.device)
        self.actor_optimizer = torch.optim.Adam(self.actor.parameters(),
                                                lr=self.config.actor_lr)

        # Value network (critic) with a single output.
        self.critic = ValueNet(self.config.n_state, 1)
        self.critic.to(self.config.device)
        self.critic_optimizer = torch.optim.Adam(self.critic.parameters(),
                                                 lr=self.config.critic_lr)

    def get_action(self, state):
        """Sample an action index from the actor output for ``state``.

        The first row of the network output is used as the probability
        vector, so ``state`` is expected to be a batch of one.
        """
        state = torch.tensor(state, dtype=torch.float,
                             device=self.config.device)
        # Inference only: no autograd graph is needed.
        with torch.no_grad():
            policy = self.actor(state)
        policy = policy.cpu().numpy()[0]
        return np.random.choice(self.config.n_action, 1, p=policy)[0]

    def append_replay(self, state, action, reward, next_state):
        """Store one transition, encoding ``action`` as a one-hot vector."""
        act = np.zeros(self.config.n_action)
        act[action] = 1
        self.replay_memory.append((state, act, reward, next_state))

    def get_returns(self, rewards, done, next_value):
        """Compute discounted returns for a sequence of rewards.

        Args:
            rewards: Sequence of scalar rewards in time order.
            done: If true the episode ended and nothing is bootstrapped.
            next_value: Scalar (number or one-element tensor) value estimate
                of the state following the last reward; used as the
                bootstrap when ``done`` is false.

        Returns:
            1-D float tensor on ``config.device`` with one return per reward.
            The tensor never carries autograd history: returns are
            regression targets, so no gradient may flow into the critic
            through the bootstrap value.
        """
        # float() both detaches a tensor bootstrap and accepts plain numbers.
        running = 0.0 if done else float(next_value)
        discounted = []
        for reward in reversed(rewards):
            running = float(reward) + self.config.discount_factor * running
            discounted.append(running)
        discounted.reverse()
        # Build the tensor in one go instead of writing element by element.
        return torch.tensor(discounted, dtype=torch.float,
                            device=self.config.device)

    def train_model(self, done):
        """Train critic and actor on every stored transition; empty the buffer.

        Args:
            done: Whether the last stored transition ended the episode.

        Returns:
            ``(actor_loss, critic_loss)`` as Python floats.

        Raises:
            ValueError: If the replay memory is empty.
        """
        if not self.replay_memory:
            raise ValueError("train_model() called with an empty replay memory")

        # Split the buffer column-wise. Packing the tuples into one
        # np.array would create a ragged array (states, one-hot actions and
        # scalar rewards have different shapes), which NumPy >= 1.24 rejects
        # with a ValueError.
        states, actions, rewards, next_states = zip(*self.replay_memory)
        self.replay_memory.clear()

        device = self.config.device
        states = torch.tensor(np.vstack(states), dtype=torch.float,
                              device=device)
        actions = torch.tensor(np.array(actions), dtype=torch.float,
                               device=device)
        next_states = torch.tensor(np.array(next_states), dtype=torch.float,
                                   device=device)

        # Only the value of the last successor state is needed, as the
        # bootstrap; it is a constant target, so skip graph building.
        with torch.no_grad():
            next_value = self.critic(next_states).view(-1)[-1]

        returns = self.get_returns(rewards, done, next_value)
        values = self.critic(states).view(-1)

        # Advantages are taken from the pre-update value estimates and are
        # constants for the actor.
        advantages = (returns - values).detach()

        critic_loss = self.train_critic(values, returns)
        actor_loss = self.train_actor(states, actions, advantages)

        return actor_loss, critic_loss

    def train_actor(self, states, actions, advantages):
        """Take one policy-gradient step and return the actor loss as a float."""
        policy = self.actor(states)
        # Probability the policy assigns to the action actually taken.
        action_prob = torch.sum(actions * policy, dim=1)
        # The small constant keeps log() finite for zero probabilities.
        weighted_log_prob = (torch.log(action_prob + 1.e-7)
                             * advantages.detach())
        actor_loss = -torch.mean(weighted_log_prob)

        self.actor_optimizer.zero_grad()
        actor_loss.backward()
        self.actor_optimizer.step()

        return actor_loss.item()

    def train_critic(self, values, targets):
        """Take one mean-squared-error step on the critic; return the loss as a float."""
        critic_loss = torch.mean(torch.pow(targets - values, 2))

        self.critic_optimizer.zero_grad()
        critic_loss.backward()
        self.critic_optimizer.step()

        return critic_loss.item()

    def save(self):
        """Write actor and critic weights to ``<save_file>.actor`` / ``.critic``."""
        torch.save(self.actor.state_dict(), self.config.save_file + ".actor")
        torch.save(self.critic.state_dict(), self.config.save_file + ".critic")

    def load(self):
        """Read actor and critic weights from ``<save_file>.actor`` / ``.critic``."""
        # map_location allows weights saved on another device to be loaded.
        device = self.config.device
        self.actor.load_state_dict(
            torch.load(self.config.save_file + ".actor", map_location=device))
        self.critic.load_state_dict(
            torch.load(self.config.save_file + ".critic", map_location=device))

    def close(self):
        """Drop the references to both networks so their memory can be reclaimed."""
        del self.actor
        del self.critic
Пример #9
0
    writer = SummaryWriter("./log")

    env = gym.make("Pong-v0")
    MAXSTEP = 6
    NWORKERS = 4
    EPOCHSTEP = 4000 * 1024 // (MAXSTEP * BATCHSIZE * NWORKERS
                                )  # around ~4000 1 EPOCH in A3C paper
    print("1 epoch contains {} steps".format(EPOCHSTEP))
    NEPOCH = 100 * EPOCHSTEP
    GAMMA = 0.99
    NFRAMES = 4

    policy_net = PolicyNet(NFRAMES)
    policy_net.cuda()
    policy_net.share_memory()  # make it store in shared memory
    opt = optim.RMSprop(policy_net.parameters(), lr=5e-4, alpha=0.99, eps=1e-5)

    samplers = [
        EnvSampler(env, policy_net, NFRAMES, MAXSTEP, GAMMA)
        for _ in range(NWORKERS)
    ]
    global_step = 0

    ctx = mp.get_context('spawn')
    queue = ctx.Queue()
    event = ctx.Event()

    workers = []
    for i in range(NWORKERS):
        worker = ctx.Process(target=sample,
                             args=(samplers[i], queue, event),
Пример #10
0
if is_unwrapped:
    # Replace the environment object with its `.unwrapped` attribute.
    env = env.unwrapped

# Get device: CUDA when torch reports it as available, otherwise the CPU
device = torch.device('cuda' if torch.cuda.is_available() else 'cpu')
print("Current usable device is: ", device)

# Create the model; both networks are moved to the selected device
policy_net = PolicyNet(layer_sizes, action_lim).to(device)  # Policy network
value_net = ValueNet(input_size).to(device)  # Value network

# Set up memory (constructed from a capacity and the selected device)
memory = Memory(capacity, device)

# Set up optimizer: one Adam optimizer per network, each with its own
# learning rate
policynet_optimizer = optim.Adam(policy_net.parameters(), lr=policy_lr)
valuenet_optimizer = optim.Adam(value_net.parameters(), lr=valuenet_lr)

###################################################################
# Start training

# Dictionary for extra training information to save to checkpoints.
# The fourth key embeds the value of `num_avg_epoch` in its name.
training_info = {
    "epoch mean durations": [],
    "epoch mean rewards": [],
    "max reward achieved": 0,
    "past %d epochs mean reward" % num_avg_epoch: 0,
    "value net loss": []
}

# Batch that records trajectories
Пример #11
0
    env = env.unwrapped

# Get device: CUDA when torch reports it as available, otherwise the CPU
device = torch.device('cuda' if torch.cuda.is_available() else 'cpu')
print("Current usable device is: ", device)

# Create the model; both networks are moved to the selected device
policy_net = PolicyNet(layer_sizes).to(device)  # Policy network
value_net = ValueNet(input_size).to(device)  # Value network

# Set up memory (constructed from a capacity and the selected device)
memory = Memory(capacity, device)

# Set up optimizer: RMSprop for the policy network (the Adam variant is kept
# below, commented out) and Adam for the value network
# policynet_optimizer = optim.Adam(policy_net.parameters(), lr=policy_lr)
policynet_optimizer = optim.RMSprop(policy_net.parameters(), lr=policy_lr)
valuenet_optimizer = optim.Adam(value_net.parameters(), lr=valuenet_lr)

###################################################################
# Start training

# Dictionary for extra training information to save to checkpoints.
# The fourth key embeds the value of `num_avg_epoch` in its name.
training_info = {
    "epoch mean durations": [],
    "epoch mean rewards": [],
    "max reward achieved": 0,
    "past %d epochs mean reward" % num_avg_epoch: 0,
    "value net loss": []
}

# Batch that records trajectories
Пример #12
0
class SAC:
    def __init__(self, env, gamma, tau, buffer_maxlen, value_lr, q_lr, policy_lr):
        """Build the networks, optimizers and replay buffer of the agent.

        Args:
            env: Environment whose ``observation_space`` and ``action_space``
                define the state/action dimensions and the action bounds.
            gamma: Discount factor used in the soft-Q target.
            tau: Interpolation factor of the target-network soft update.
            buffer_maxlen: Passed to the replay buffer constructor.
            value_lr: Learning rate of the value network optimizer.
            q_lr: Learning rate shared by both soft-Q network optimizers.
            policy_lr: Learning rate of the policy network optimizer.
        """
        self.env = env
        self.state_dim = env.observation_space.shape[0]
        self.action_dim = env.action_space.shape[0]
        # [low, high] bounds used by get_action() to rescale policy outputs.
        self.action_range = [env.action_space.low, env.action_space.high]

        # hyperparameters
        self.gamma = gamma
        self.tau = tau

        # initialize networks
        self.value_net = ValueNet(self.state_dim).to(device)
        self.target_value_net = ValueNet(self.state_dim).to(device)
        self.q1_net = SoftQNet(self.state_dim, self.action_dim).to(device)
        self.q2_net = SoftQNet(self.state_dim, self.action_dim).to(device)
        self.policy_net = PolicyNet(self.state_dim, self.action_dim).to(device)

        # Start the target value network as an exact copy of the value
        # network. A tau-weighted blend here would leave the target mostly
        # at its own, unrelated random initialization.
        with torch.no_grad():
            for target_param, param in zip(self.target_value_net.parameters(),
                                           self.value_net.parameters()):
                target_param.copy_(param)

        # Initialize the optimizers
        self.value_optimizer = optim.Adam(self.value_net.parameters(), lr=value_lr)
        self.q1_optimizer = optim.Adam(self.q1_net.parameters(), lr=q_lr)
        self.q2_optimizer = optim.Adam(self.q2_net.parameters(), lr=q_lr)
        self.policy_optimizer = optim.Adam(self.policy_net.parameters(), lr=policy_lr)

        # Initialize the buffer
        self.buffer = ReplayBeffer(buffer_maxlen)

    def get_action(self, state):
        action = self.policy_net.action(state)
        action = action * (self.action_range[1] - self.action_range[0]) / 2.0 + \
                 (self.action_range[1] + self.action_range[0]) / 2.0

        return action

    def update(self, batch_size):
        state, action, reward, next_state, done = self.buffer.sample(batch_size)
        new_action, log_prob = self.policy_net.evaluate(state)

        # V value loss
        value = self.value_net(state)
        new_q1_value = self.q1_net(state, new_action)
        new_q2_value = self.q2_net(state, new_action)
        next_value = torch.min(new_q1_value, new_q2_value) - log_prob
        value_loss = F.mse_loss(value, next_value.detach())

        # Soft q  loss
        q1_value = self.q1_net(state, action)
        q2_value = self.q2_net(state, action)
        target_value = self.target_value_net(next_state)
        target_q_value = reward + done * self.gamma * target_value
        q1_value_loss = F.mse_loss(q1_value, target_q_value.detach())
        q2_value_loss = F.mse_loss(q2_value, target_q_value.detach())

        # Policy loss
        policy_loss = (log_prob - torch.min(new_q1_value, new_q2_value)).mean()

        # Update v
        self.value_optimizer.zero_grad()
        value_loss.backward()
        self.value_optimizer.step()

        # Update Soft q
        self.q1_optimizer.zero_grad()
        self.q2_optimizer.zero_grad()
        q1_value_loss.backward()
        q2_value_loss.backward()
        self.q1_optimizer.step()
        self.q2_optimizer.step()

        # Update Policy
        self.policy_optimizer.zero_grad()
        policy_loss.backward()
        self.policy_optimizer.step()

        # Update target networks
        for target_param, param in zip(self.target_value_net.parameters(), self.value_net.parameters()):
            target_param.data.copy_(self.tau * param + (1 - self.tau) * target_param)