def run_batch_episode_exp(total_eps: int, update_every: int, wandb_project: str, wandb_group: str):
    # NOTE:
    # This code doesn't run properly on Windows 10.
    # The result can be reproduced on Ubuntu and Mac OS.
    config = dict()
    config['update_every'] = update_every

    wandb.init(project=wandb_project, entity='junyoung-park', reinit=True, group=wandb_group, config=config)

    env = gym.make('CartPole-v1')
    s_dim = env.observation_space.shape[0]
    a_dim = env.action_space.n

    policy_net = MLP(s_dim, a_dim, [128])
    value_net = MLP(s_dim, 1, [128])
    agent = TDActorCritic(policy_net, value_net)
    memory = EpisodicMemory(max_size=100, gamma=1.0)
    n_update = 0

    wandb.watch(agent)

    for ep in range(total_eps):
        s = env.reset()
        cum_r = 0

        while True:
            s = to_tensor(s, size=(1, 4))
            a = agent.get_action(s)
            ns, r, done, info = env.step(a.item())

            # preprocess data
            r = torch.ones(1, 1) * r
            done = torch.ones(1, 1) * done
            memory.push(s, a.view(-1, 1), r, to_tensor(ns, size=(1, 4)), done)

            s = ns
            cum_r += r
            if done:
                break

        if ep % update_every == 0:
            s, a, r, ns, done, _ = memory.get_samples()
            agent.update(state=s.float(), action=a.float(), reward=r.float(),
                         next_state=ns.float(), done=done)
            memory.reset()
            n_update += 1

        wandb.log({"episode return": cum_r, "num_update": n_update})
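`EpisodicMemory` comes from the accompanying repository and its implementation is not shown in this excerpt. As a rough reference, here is a minimal sketch consistent with the interface used above (`push`, `get_samples` returning `(s, a, r, ns, done, g)`, and `reset`); the class name, the assumption that the last returned element is the discounted return-to-go, and the capacity handling are all assumptions, not the repository's code.

```python
import torch


class EpisodicMemorySketch:
    """Illustrative stand-in for EpisodicMemory; not the repository's implementation."""

    def __init__(self, max_size: int, gamma: float):
        self.max_size = max_size  # kept for interface parity; capacity handling omitted in this sketch
        self.gamma = gamma
        self.transitions = []  # list of (s, a, r, ns, done) tensor tuples

    def push(self, s, a, r, ns, done):
        self.transitions.append((s, a, r, ns, done))

    def get_samples(self):
        s, a, r, ns, done = map(torch.cat, zip(*self.transitions))

        # discounted return-to-go, computed backwards; the (1 - done) factor
        # restarts the running sum at episode boundaries
        g = torch.zeros_like(r)
        running = 0.0
        for i in reversed(range(r.shape[0])):
            running = r[i] + self.gamma * running * (1.0 - done[i].float())
            g[i] = running
        return s, a, r, ns, done, g

    def reset(self):
        self.transitions = []
```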
def __init__(self):
    super(Critic, self).__init__()
    self.state_encoder = MLP(3, 64, num_neurons=[], out_act='ReLU')    # single-layer model
    self.action_encoder = MLP(1, 64, num_neurons=[], out_act='ReLU')   # single-layer model
    self.q_estimator = MLP(128, 1, num_neurons=[32], hidden_act='ReLU', out_act='Identity')
def __init__(self):
    super(Actor, self).__init__()
    self.mlp = MLP(3, 1, num_neurons=[128, 64], hidden_act='ReLU', out_act='Identity')
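The 3-d state and 1-d action dimensions above are consistent with Pendulum-v0. The forward passes are not shown in this excerpt; the following self-contained sketch (with `nn.Sequential` standing in for the repository's `MLP`) assumes the critic concatenates the two 64-d encodings into the 128-d input of `q_estimator`, and that the actor emits an unbounded 1-d action. These assumptions are ours, not the repository's code.

```python
import torch
import torch.nn as nn


class CriticSketch(nn.Module):
    def __init__(self):
        super().__init__()
        self.state_encoder = nn.Sequential(nn.Linear(3, 64), nn.ReLU())
        self.action_encoder = nn.Sequential(nn.Linear(1, 64), nn.ReLU())
        self.q_estimator = nn.Sequential(nn.Linear(128, 32), nn.ReLU(), nn.Linear(32, 1))

    def forward(self, state, action):
        # concatenate the 64-d state and action embeddings -> 128-d input
        emb = torch.cat([self.state_encoder(state), self.action_encoder(action)], dim=-1)
        return self.q_estimator(emb)


class ActorSketch(nn.Module):
    def __init__(self):
        super().__init__()
        self.mlp = nn.Sequential(nn.Linear(3, 128), nn.ReLU(),
                                 nn.Linear(128, 64), nn.ReLU(),
                                 nn.Linear(64, 1))

    def forward(self, state):
        return self.mlp(state)


# shape check with a batch of 8 Pendulum-like states and actions
actor, critic = ActorSketch(), CriticSketch()
s, a = torch.randn(8, 3), torch.randn(8, 1)
print(actor(s).shape, critic(s, a).shape)  # torch.Size([8, 1]) torch.Size([8, 1])
```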
def run_minibatch_fullbatch(num_reps: int, n_samples: int, batch_size: int, epoch: int):
    criteria = torch.nn.MSELoss()

    sgd_losses = []
    gd_losses = []

    for _ in range(num_reps):
        mlp = MLP(input_dim=1, output_dim=1, num_neurons=[64],
                  hidden_act='Identity', out_act='Identity')
        opt = torch.optim.Adam(params=mlp.parameters(), lr=1e-3)

        mlp2 = MLP(input_dim=1, output_dim=1, num_neurons=[64],
                   hidden_act='Identity', out_act='Identity')
        mlp2.load_state_dict(mlp.state_dict())
        opt2 = torch.optim.Adam(params=mlp2.parameters(), lr=1e-3)

        xs, ys = generate_samples(n_samples)
        ds = torch.utils.data.TensorDataset(xs, ys)
        data_loader = torch.utils.data.DataLoader(ds, batch_size=batch_size)
        full_loader = torch.utils.data.DataLoader(ds, batch_size=n_samples)

        # SGD - mini batch
        sgd_loss = train_model(mlp, opt, data_loader, epoch, criteria, xs, ys)
        sgd_losses.append(sgd_loss)

        # GD - full batch
        gd_loss = train_model(mlp2, opt2, full_loader, epoch, criteria, xs, ys)
        gd_losses.append(gd_loss)

    sgd_losses = np.stack(sgd_losses)
    gd_losses = np.stack(gd_losses)
    return sgd_losses, gd_losses
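`generate_samples` and `train_model` are helpers assumed to be defined elsewhere in the accompanying code. A sketch of what `train_model` might do, consistent with the call signature used above (model, optimizer, data loader, number of epochs, loss criterion, and the full dataset for evaluation) and returning per-epoch losses that can be stacked with `np.stack`, is shown below; the exact behavior is an assumption.

```python
import numpy as np
import torch


def train_model_sketch(model, opt, data_loader, epoch, criteria, xs, ys):
    """Illustrative stand-in for train_model: returns the per-epoch loss on the full dataset."""
    epoch_losses = []
    for _ in range(epoch):
        # one pass over the loader: mini-batch (or full-batch) gradient steps
        for x, y in data_loader:
            loss = criteria(model(x), y)
            opt.zero_grad()
            loss.backward()
            opt.step()

        # evaluate on the entire dataset after each epoch
        with torch.no_grad():
            epoch_losses.append(criteria(model(xs), ys).item())
    return np.array(epoch_losses)
```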
def run_exp(total_eps: int, wandb_project: str, wandb_group: str):
    # NOTE:
    # This code doesn't run properly on Windows 10.
    # The result can be reproduced on Ubuntu and Mac OS.
    config = dict()
    config['sample_update'] = True

    wandb.init(project=wandb_project, entity='junyoung-park', reinit=True, group=wandb_group, config=config)

    env = gym.make('CartPole-v1')
    s_dim = env.observation_space.shape[0]
    a_dim = env.action_space.n

    policy_net = MLP(s_dim, a_dim, [128])
    value_net = MLP(s_dim, 1, [128])
    agent = TDActorCritic(policy_net, value_net)
    n_update = 0

    for ep in range(total_eps):
        s = env.reset()
        cum_r = 0

        while True:
            s = to_tensor(s, size=(1, 4))
            a = agent.get_action(s)
            ns, r, done, info = env.step(a.item())
            ns = to_tensor(ns, size=(1, 4))
            agent.update(s, a.view(-1, 1), r, ns, done)

            s = ns.numpy()
            cum_r += r
            n_update += 1
            if done:
                break

        wandb.log({"episode return": cum_r, "num_update": n_update})
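`TDActorCritic.update` is implemented in `src.part4.ActorCritic` and is only partially visible in this excerpt. As a reference for what a one-step TD actor-critic update typically computes, here is a sketch: a TD target from the value network, a policy-gradient term weighted by the (detached) TD error, and a value regression toward the TD target. It assumes batched float tensors for `r` and `done` and a single optimizer over both networks for brevity; it is not necessarily how the repository's `update` is written.

```python
import torch
import torch.nn.functional as F
from torch.distributions import Categorical


def td_actor_critic_update_sketch(policy_net, value_net, optimizer, s, a, r, ns, done, gamma=1.0):
    """One-step TD actor-critic update (illustrative sketch)."""
    # TD target and TD error; neither is back-propagated through
    with torch.no_grad():
        td_target = r + gamma * value_net(ns) * (1 - done)
        td_error = td_target - value_net(s)

    # policy gradient weighted by the TD error
    dist = Categorical(logits=policy_net(s))
    log_prob = dist.log_prob(a.view(-1)).view(-1, 1)
    policy_loss = -(log_prob * td_error).mean()

    # value regression toward the TD target
    value_loss = F.mse_loss(value_net(s), td_target)

    loss = policy_loss + value_loss
    optimizer.zero_grad()
    loss.backward()
    optimizer.step()
```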
        self.s = None
        self.alpha = alpha

    def update(self, y):
        if self.s is None:
            self.s = y
        else:
            self.s = self.alpha * y + (1 - self.alpha) * self.s


env = gym.make('CartPole-v1')
s_dim = env.observation_space.shape[0]
a_dim = env.action_space.n

qnet = MLP(input_dim=s_dim, output_dim=a_dim, num_neurons=[128],
           hidden_act='ReLU', out_act='Identity')
agent = NaiveDQN(state_dim=s_dim, action_dim=a_dim, qnet=qnet,
                 lr=1e-4, gamma=1.0, epsilon=1.0)

n_eps = 10000
print_every = 500

ema_factor = 0.5
ema = EMAMeter(ema_factor)
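As a quick sanity check of the EMA update rule `s ← α·y + (1 − α)·s` implemented above (the positional constructor argument is assumed to be the smoothing factor α, as in `EMAMeter(ema_factor)`):

```python
ema_demo = EMAMeter(0.5)
ema_demo.update(0.0)    # first value initializes the average: s = 0.0
ema_demo.update(10.0)   # s = 0.5 * 10.0 + 0.5 * 0.0 = 5.0
print(ema_demo.s)       # 5.0
```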
dist = Categorical(logits=self.policy(s))
prob = dist.probs[a]

# Don't forget the minus sign in front of pg_loss!
# PyTorch optimizers minimize their targets, but the policy-gradient
# objective must be maximized, so we negate the loss.
# 'self._eps' is added to prevent numerical problems with the logarithm.
pg_loss = - torch.log(prob + self._eps) * g

self.opt.zero_grad()
pg_loss.backward()
self.opt.step()
```
'''

net = MLP(s_dim, a_dim, [128])
agent = REINFORCE(net)
ema = EMAMeter()

n_eps = 10000
print_every = 500

for ep in range(n_eps):
    s = env.reset()
    cum_r = 0

    states = []
    actions = []
    rewards = []

    while True:
        self.optimizer.step()


if __name__ == '__main__':
    import gym
    import torch

    from src.part3.MLP import MultiLayerPerceptron as MLP
    from src.part4.ActorCritic import TDActorCritic
    from src.common.train_utils import EMAMeter, to_tensor

    env = gym.make('CartPole-v1')
    s_dim = env.observation_space.shape[0]
    a_dim = env.action_space.n

    policy_net = MLP(s_dim, a_dim, [128])
    value_net = MLP(s_dim, 1, [128])
    agent = TDActorCritic(policy_net, value_net)
    ema = EMAMeter()

    n_eps = 10000
    print_every = 500

    for ep in range(n_eps):
        s = env.reset()
        cum_r = 0

        while True:
            s = to_tensor(s, size=(1, 4))
            a = agent.get_action(s).view(-1, 1)
The hyperparameters used in this exercise follow the values proposed in
https://github.com/seungeunrho/minimalRL/blob/master/dqn.py.
'''

lr = 1e-4 * 5
batch_size = 256
gamma = 1.0
memory_size = 50000
total_eps = 3000

# epsilon is decayed over time
eps_max = 0.08
eps_min = 0.01

# collect 2000 samples before training starts (to avoid instability early on)
sampling_only_until = 2000
target_update_interval = 10

# implement the Q-network and the target Q-network as multilayer perceptrons
qnet = MLP(4, 2, num_neurons=[128])
qnet_target = MLP(4, 2, num_neurons=[128])

# initialize the target network to be identical to the main network
qnet_target.load_state_dict(qnet.state_dict())

agent = DQN(4, 1, qnet=qnet, qnet_target=qnet_target,
            lr=lr, gamma=gamma, epsilon=1.0)
env = gym.make('CartPole-v1')
memory = ReplayMemory(memory_size)
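`ReplayMemory` is defined in the accompanying repository. A minimal sketch matching the interface used in the training loop below (`push`, `sample`, `__len__`) is a fixed-size FIFO buffer with uniform random sampling; the class name and the uniform-sampling behavior are assumptions.

```python
import random
from collections import deque


class ReplayMemorySketch:
    """Illustrative stand-in for ReplayMemory: a fixed-size FIFO buffer with uniform sampling."""

    def __init__(self, max_size: int):
        self.buffer = deque(maxlen=max_size)  # oldest experiences are dropped when full

    def push(self, experience):
        self.buffer.append(experience)

    def sample(self, batch_size: int):
        return random.sample(list(self.buffer), batch_size)

    def __len__(self):
        return len(self.buffer)
```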
def run_DQN(batch_size: int, target_update_interval: int, wandb_project: str):
    # The hyperparameters are taken from the 'minimalRL' implementation
    # https://github.com/seungeunrho/minimalRL/blob/master/dqn.py
    # and are used with the original author's agreement.
    lr = 1e-4 * 5
    gamma = 1.0
    memory_size = 50000
    total_eps = 3000
    eps_max = 0.08
    eps_min = 0.01
    sampling_only_until = 2000

    config = dict()
    config['lr'] = lr
    config['batch_size'] = batch_size
    config['target_update_interval'] = target_update_interval
    config['total_eps'] = total_eps
    config['eps_max'] = eps_max
    config['eps_min'] = eps_min
    config['sampling_only_until'] = sampling_only_until

    wandb.init(project=wandb_project, entity='junyoung-park', reinit=True, config=config)

    qnet = MLP(4, 2, num_neurons=[128])
    qnet_target = MLP(4, 2, num_neurons=[128])

    # initialize target network same as the main network
    qnet_target.load_state_dict(qnet.state_dict())
    agent = DQN(4, 1, qnet=qnet, qnet_target=qnet_target, lr=lr, gamma=gamma, epsilon=1.0)
    wandb.watch(agent)

    env = gym.make('CartPole-v1')
    memory = ReplayMemory(memory_size)

    for n_epi in range(total_eps):
        # epsilon scheduling: slowly decaying epsilon
        epsilon = max(eps_min, eps_max - eps_min * (n_epi / 200))
        agent.epsilon = torch.tensor(epsilon)
        s = env.reset()
        cum_r = 0

        while True:
            s = to_tensor(s, size=(1, 4))
            a = agent.get_action(s)
            ns, r, done, info = env.step(a)

            experience = (s,
                          torch.tensor(a).view(1, 1),
                          torch.tensor(r / 100.0).view(1, 1),
                          torch.tensor(ns).view(1, 4),
                          torch.tensor(done).view(1, 1))
            memory.push(experience)

            s = ns
            cum_r += r
            if done:
                break

        if len(memory) >= sampling_only_until:
            # train agent
            sampled_exps = memory.sample(batch_size)
            sampled_exps = prepare_training_inputs(sampled_exps)
            agent.update(*sampled_exps)

        if n_epi % target_update_interval == 0:
            qnet_target.load_state_dict(qnet.state_dict())

        log_dict = dict()
        log_dict['cum_r'] = cum_r
        log_dict['epsilon'] = epsilon
        wandb.log(log_dict)

    torch.save(agent.state_dict(), join(wandb.run.dir, "agent.pt"))
    wandb.join()
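`prepare_training_inputs` is another helper from the repository. Given that `memory.sample` returns a list of `(s, a, r, ns, done)` tuples and that `agent.update(*sampled_exps)` expects batched tensors, a plausible sketch is the following; the exact stacking order and dtype handling are assumptions.

```python
import torch


def prepare_training_inputs_sketch(sampled_exps):
    """Collate a list of (s, a, r, ns, done) tuples into batched tensors (illustrative)."""
    states, actions, rewards, next_states, dones = zip(*sampled_exps)
    return (torch.cat(states, dim=0).float(),
            torch.cat(actions, dim=0),
            torch.cat(rewards, dim=0).float(),
            torch.cat(next_states, dim=0).float(),
            torch.cat(dones, dim=0).float())
```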
> Switch the model to train mode

```python
model.train()
```

> Switch the model to eval mode

```python
model.eval()
```
'''

## Building a multilayer perceptron model

mlp = MLP(input_dim=1, output_dim=1, num_neurons=[64, 32, 32],
          hidden_act='ReLU', out_act='Identity')
mlp

## model.state_dict()

'''
`model.state_dict()` is a dictionary that stores the current values of the model's parameters and buffers.
We will use it frequently later, in <5. Deep Reinforcement Learning>.
In most cases, it is used to save a model's parameters to disk, either during or after training.
Here, shall we use it to create two identical models?
'''

mlp.state_dict()

mlp2 = MLP(input_dim=1,