def __init__(self, policy_model, input_shape, action_size, pixel_control=True, RP=1.0, PC=1.0, VR=1.0,
             entropy_coeff=0.001, value_coeff=0.5, lr=1e-3, lr_final=1e-4, decay_steps=50e6, grad_clip=0.5,
             policy_args={}, optim=torch.optim.RMSprop, device='cuda', optim_args={}):
    super(UnrealA2C2, self).__init__()
    self.RP, self.PC, self.VR = RP, PC, VR  # reward-prediction, pixel-control and value-replay loss weights
    self.lr = lr
    self.entropy_coeff, self.value_coeff = entropy_coeff, value_coeff
    self.pixel_control = pixel_control
    self.grad_clip = grad_clip
    self.action_size = action_size
    self.device = device

    # wrap a scalar input_shape in a tuple (the original assigned the wrapped shape
    # to the unused name input_size, so it never reached ActorCritic)
    try:
        iter(input_shape)
    except TypeError:
        input_shape = (input_shape,)

    self.policy = ActorCritic(policy_model, input_shape, action_size, entropy_coeff=entropy_coeff,
                              value_coeff=value_coeff, build_optimiser=False, device=device, **policy_args)

    if pixel_control:
        # deconvolutional pixel-control head: dense features -> 32x8x8 map -> auxiliary Q maps
        self.feat_map = torch.nn.Sequential(torch.nn.Linear(self.policy.dense_size, 32 * 8 * 8),
                                            torch.nn.ReLU()).to(device)
        self.deconv1 = torch.nn.Sequential(torch.nn.ConvTranspose2d(32, 32, kernel_size=[3, 3], stride=[1, 1]),
                                           torch.nn.ReLU()).to(device)
        self.deconv_advantage = torch.nn.ConvTranspose2d(32, action_size, kernel_size=[3, 3], stride=[2, 2]).to(device)
        self.deconv_value = torch.nn.ConvTranspose2d(32, 1, kernel_size=[3, 3], stride=[2, 2]).to(device)

    # reward-prediction model: 3-way classification of the upcoming reward (negative/zero/positive)
    self.r1 = torch.nn.Sequential(torch.nn.Linear(self.policy.dense_size, 128), torch.nn.ReLU()).to(device)
    self.r2 = torch.nn.Linear(128, 3).to(device)

    self.optimiser = optim(self.parameters(), lr, **optim_args)
    self.scheduler = polynomial_sheduler(self.optimiser, lr_final, decay_steps, power=1)
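
# A minimal, self-contained sketch of the pixel-control head above, tracing the shape
# arithmetic only. dense_size=512, the batch size and the dueling aggregation at the
# end are assumptions for illustration, not taken from this file.
import torch

dense_size, action_size, batch = 512, 6, 4
feat_map = torch.nn.Sequential(torch.nn.Linear(dense_size, 32 * 8 * 8), torch.nn.ReLU())
deconv1 = torch.nn.Sequential(torch.nn.ConvTranspose2d(32, 32, kernel_size=3, stride=1), torch.nn.ReLU())
deconv_advantage = torch.nn.ConvTranspose2d(32, action_size, kernel_size=3, stride=2)
deconv_value = torch.nn.ConvTranspose2d(32, 1, kernel_size=3, stride=2)

x = torch.randn(batch, dense_size)                 # dense features from the policy trunk
h = feat_map(x).reshape(batch, 32, 8, 8)           # (B, 32, 8, 8)
h = deconv1(h)                                     # (B, 32, 10, 10): (8-1)*1 + 3
adv = deconv_advantage(h)                          # (B, A, 21, 21): (10-1)*2 + 3
val = deconv_value(h)                              # (B, 1, 21, 21)
q_aux = val + adv - adv.mean(dim=1, keepdim=True)  # dueling combination (assumed)
print(q_aux.shape)                                 # torch.Size([4, 6, 21, 21])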
def __init__(self, policy_model, target_model, input_size, action_size, entropy_coeff=0.001, intr_coeff=0.5,
             extr_coeff=1.0, lr=1e-4, lr_final=0, decay_steps=1e5, grad_clip=0.5, policy_clip=0.1,
             policy_args={}, RND_args={}, optim=torch.optim.Adam, optim_args={}, device='cuda'):
    super(RND, self).__init__()
    self.intr_coeff = intr_coeff
    self.extr_coeff = extr_coeff
    self.entropy_coeff = entropy_coeff
    self.lr = lr
    self.grad_clip = grad_clip
    self.action_size = action_size
    self.device = device

    # only use the last frame of the frame-stack for the RND convolutions
    target_size = (1, input_size[1], input_size[2]) if len(input_size) == 3 else input_size

    self.policy = PPOIntrinsic(policy_model, input_size, action_size, lr, lr_final, decay_steps, grad_clip,
                               entropy_coeff=entropy_coeff, policy_clip=policy_clip, extr_coeff=extr_coeff,
                               intr_coeff=intr_coeff, device=device, build_optimiser=False, **policy_args)

    # randomly weighted and fixed network; acts as a random ID for each state
    self.target_model = target_model(target_size, trainable=False).to(device)
    # learns to predict the target model, i.e. rewards are based on the ability to predict
    # a fixed random function, so the prediction error behaves as a density map of explored areas
    self.predictor_model = target_model(target_size, trainable=True).to(device)

    self.optimiser = optim(self.parameters(), lr, **optim_args)
    self.scheduler = polynomial_sheduler(self.optimiser, lr_final, decay_steps, power=1)
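
# Hedged sketch of the RND intrinsic reward implied by target_model/predictor_model
# above: the prediction error against a frozen, randomly initialised network is the
# exploration bonus. The tiny conv net is a stand-in, not the repo's target_model.
import torch

def make_net():
    # Conv2d(1, 8, 3, stride=2) on 84x84 -> 41x41 feature map
    return torch.nn.Sequential(torch.nn.Conv2d(1, 8, 3, stride=2),
                               torch.nn.Flatten(),
                               torch.nn.Linear(8 * 41 * 41, 32))

target, predictor = make_net(), make_net()
for p in target.parameters():          # the target stays fixed (trainable=False)
    p.requires_grad_(False)

obs = torch.randn(4, 1, 84, 84)        # last frame of the stack, as in target_size
error = (predictor(obs) - target(obs)).pow(2).mean(dim=-1)
intr_reward = error.detach()           # per-state bonus: high where rarely visited
error.mean().backward()                # training the predictor shrinks the bonus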
def __init__(self, model, input_shape, action_size, lr=1e-3, lr_final=0, decay_steps=6e5, grad_clip=0.5,
             build_optimiser=True, optim=torch.optim.Adam, optim_args={}, device='cuda', **model_args):
    super(ValueModel, self).__init__()
    self.lr = lr
    self.lr_final = lr_final
    self.action_size = action_size
    self.decay_steps = decay_steps
    self.grad_clip = grad_clip
    self.device = device

    self.model = model(input_shape, **model_args).to(self.device)
    dense_size = self.model.dense_size
    self.V = torch.nn.Linear(dense_size, 1).to(self.device)

    if build_optimiser:
        self.optimiser = optim(self.parameters(), lr, **optim_args)
        self.scheduler = polynomial_sheduler(self.optimiser, lr_final, decay_steps, power=1)
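
# Sketch of the power-1 (linear) decay that polynomial_sheduler(optimiser, lr_final,
# decay_steps, power=1) appears to provide, matching L1's defaults lr=1e-3, lr_final=1e-4.
# LambdaLR is an assumed stand-in for the repo's helper, not its actual implementation.
import torch

lr, lr_final, decay_steps = 1e-3, 1e-4, 50e6
params = [torch.nn.Parameter(torch.zeros(1))]
optimiser = torch.optim.Adam(params, lr)

def linear_decay(step):
    # multiplier on the base lr: 1.0 at step 0, lr_final/lr after decay_steps
    frac = min(step, decay_steps) / decay_steps
    return (lr_final + (lr - lr_final) * (1 - frac)) / lr

scheduler = torch.optim.lr_scheduler.LambdaLR(optimiser, linear_decay)
optimiser.step()
scheduler.step()                       # called once per update in training loops
print(scheduler.get_last_lr())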
def __init__(self, policy_model, ICM_model, input_size, action_size, forward_coeff, policy_importance,
             reward_scale, entropy_coeff, value_coeff=0.5, lr=1e-3, lr_final=1e-3, decay_steps=6e5,
             grad_clip=0.5, policy_args={}, ICM_args={}, device='cuda'):
    super(Curiosity, self).__init__()
    self.reward_scale, self.forward_coeff, self.policy_importance, self.entropy_coeff = reward_scale, forward_coeff, policy_importance, entropy_coeff
    self.lr, self.lr_final, self.decay_steps = lr, lr_final, decay_steps
    self.grad_clip = grad_clip
    self.action_size = action_size
    self.device = device

    # wrap a scalar input_size in a tuple
    try:
        iter(input_size)
    except TypeError:
        input_size = (input_size,)

    self.ICM = ICM(ICM_model, input_size, action_size, forward_coeff, device=device, **ICM_args)
    self.AC = ActorCritic(policy_model, input_size, action_size, entropy_coeff, value_coeff, lr, lr_final,
                          decay_steps, grad_clip, build_optimiser=False, device=device, **policy_args)

    self.optimiser = torch.optim.RMSprop(self.parameters(), lr=lr)
    self.scheduler = polynomial_sheduler(self.optimiser, lr_final, decay_steps, power=1)
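
# Minimal sketch of the ICM objective the ICM module above wraps: an inverse model
# classifies the action from (phi(s), phi(s')) and a forward model predicts phi(s');
# the forward error is the curiosity reward. The linear encoder, the feature sizes
# and the 0.2 weighting standing in for forward_coeff are all assumptions.
import torch
import torch.nn.functional as F

feat, action_size, batch = 32, 4, 8
phi = torch.nn.Linear(64, feat)                       # stand-in state encoder
inverse = torch.nn.Linear(2 * feat, action_size)      # predicts the action taken
forward_model = torch.nn.Linear(feat + action_size, feat)

s, s_next = torch.randn(batch, 64), torch.randn(batch, 64)
a = torch.randint(action_size, (batch,))
f_s, f_next = phi(s), phi(s_next)

inv_loss = F.cross_entropy(inverse(torch.cat([f_s, f_next], dim=1)), a)
pred_next = forward_model(torch.cat([f_s, F.one_hot(a, action_size).float()], dim=1))
fwd_err = 0.5 * (pred_next - f_next).pow(2).sum(dim=1)
intr_reward = fwd_err.detach()                        # scaled by reward_scale in use
icm_loss = (1 - 0.2) * inv_loss + 0.2 * fwd_err.mean()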
def __init__(self, model, input_size, action_size, lr=1e-3, lr_final=0, decay_steps=6e5, grad_clip=0.5,
             entropy_coeff=0.01, policy_clip=0.1, extr_coeff=2.0, intr_coeff=1.0, build_optimiser=True,
             optim=torch.optim.Adam, optim_args={}, device='cuda', **model_args):
    super(PPOIntrinsic, self).__init__()
    self.action_size = action_size
    self.input_size = input_size
    self.lr = lr
    self.lr_final = lr_final
    self.decay_steps = decay_steps
    self.grad_clip = grad_clip
    self.entropy_coeff = entropy_coeff
    self.policy_clip = policy_clip
    self.extr_coeff = extr_coeff
    self.intr_coeff = intr_coeff
    self.device = device

    self.model = model(input_size, **model_args).to(self.device)
    self.dense_size = dense_size = self.model.dense_size
    self.policy = torch.nn.Sequential(torch.nn.Linear(dense_size, action_size),
                                      torch.nn.Softmax(dim=-1)).to(self.device)  # Actor
    self.Ve = torch.nn.Linear(dense_size, 1).to(self.device)  # Critic (extrinsic value)
    self.Vi = torch.nn.Linear(dense_size, 1).to(self.device)  # Critic (intrinsic value), i.e. expected intrinsic return of the state

    if build_optimiser:
        self.optimiser = optim(self.parameters(), lr, **optim_args)
        self.scheduler = polynomial_sheduler(self.optimiser, lr_final, decay_steps, power=1)
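
# Hedged sketch of the clipped PPO surrogate these heads feed. Combining advantages
# as extr_coeff * A_e + intr_coeff * A_i mirrors the two critics Ve/Vi above; the
# random tensors are illustrative, not the repo's loss function.
import torch

policy_clip, extr_coeff, intr_coeff = 0.1, 2.0, 1.0
old_logp = torch.randn(8)                          # log pi_old(a|s), from rollout
new_logp = old_logp + 0.05 * torch.randn(8)        # log pi_new(a|s), current policy
adv_e, adv_i = torch.randn(8), torch.randn(8)      # extrinsic / intrinsic advantages

advantage = extr_coeff * adv_e + intr_coeff * adv_i
ratio = torch.exp(new_logp - old_logp)
unclipped = ratio * advantage
clipped = torch.clamp(ratio, 1 - policy_clip, 1 + policy_clip) * advantage
policy_loss = -torch.min(unclipped, clipped).mean()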
def __init__(self, model, input_size, action_size, cell_size, entropy_coeff=0.01, value_coeff=0.5, lr=1e-3,
             lr_final=1e-6, decay_steps=6e5, grad_clip=0.5, build_optimiser=True, optim=torch.optim.RMSprop,
             optim_args={}, device='cuda', **model_args):
    super(ActorCritic_LSTM, self).__init__()
    self.lr = lr
    self.lr_final = lr_final
    self.input_size = input_size
    self.entropy_coeff = entropy_coeff
    self.value_coeff = value_coeff
    self.decay_steps = decay_steps
    self.grad_clip = grad_clip
    self.cell_size = cell_size
    self.action_size = action_size
    self.device = device

    self.model = model(input_size, **model_args).to(self.device)
    self.dense_size = self.model.dense_size
    # self.lstm = MaskedRNN(MaskedLSTMCell(cell_size, self.dense_size), time_major=True)
    self.lstm = MaskedLSTMBlock(self.dense_size, cell_size, time_major=True).to(self.device)
    self.policy_distrib = torch.nn.Linear(cell_size, action_size, device=self.device)  # Actor
    self.V = torch.nn.Linear(cell_size, 1, device=self.device)  # Critic

    if build_optimiser:
        self.optimiser = optim(self.parameters(), lr, **optim_args)
        self.scheduler = polynomial_sheduler(self.optimiser, lr_final, decay_steps, power=1)
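
# Sketch of the time-major, done-masked recurrence MaskedLSTMBlock appears to
# implement, written with a plain LSTMCell; resetting the hidden state wherever
# done=1 is the assumed masking behaviour, and all sizes are illustrative.
import torch

dense_size, cell_size, T, B = 32, 16, 5, 3
cell = torch.nn.LSTMCell(dense_size, cell_size)
x = torch.randn(T, B, dense_size)             # time-major features from the trunk
dones = torch.zeros(T, B)
dones[2, 1] = 1.0                             # episode boundary in env 1 at t=2
h = c = torch.zeros(B, cell_size)

outputs = []
for t in range(T):
    mask = (1.0 - dones[t]).unsqueeze(1)      # zero the state at episode starts
    h, c = cell(x[t], (h * mask, c * mask))
    outputs.append(h)
out = torch.stack(outputs)                    # (T, B, cell_size) -> policy/V heads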
def __init__(self, model, input_shape, action_size, lr=1e-3, lr_final=0, decay_steps=6e5, grad_clip=0.5,
             entropy_coeff=0.01, policy_clip=0.1, adv_coeff=0.25, build_optimiser=True,
             optim=torch.optim.Adam, optim_args={}, device='cuda', **model_args):
    super(PolicyModel, self).__init__()
    self.lr = lr
    self.lr_final = lr_final
    self.action_size = action_size
    self.entropy_coeff = entropy_coeff
    self.decay_steps = decay_steps
    self.grad_clip = grad_clip
    self.policy_clip = policy_clip
    self.adv_coeff = adv_coeff
    self.device = device

    self.model = model(input_shape, **model_args).to(self.device)
    dense_size = self.model.dense_size
    self.policy = torch.nn.Sequential(torch.nn.Linear(dense_size, action_size),
                                      torch.nn.Softmax(dim=-1)).to(self.device)  # Actor
    self.Adv = torch.nn.Linear(dense_size, 1).to(self.device)  # scalar advantage head

    if build_optimiser:
        self.optimiser = optim(self.parameters(), lr, **optim_args)
        self.scheduler = polynomial_sheduler(self.optimiser, lr_final, decay_steps, power=1)
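
# Minimal usage sketch of heads like the ones above: a softmax policy and a scalar
# head over shared dense features. The two-layer trunk is a stand-in for
# model(input_shape), and the sizes are assumptions.
import torch

dense_size, action_size = 64, 5
trunk = torch.nn.Sequential(torch.nn.Linear(16, dense_size), torch.nn.ReLU())
policy = torch.nn.Sequential(torch.nn.Linear(dense_size, action_size), torch.nn.Softmax(dim=-1))
Adv = torch.nn.Linear(dense_size, 1)

h = trunk(torch.randn(2, 16))
probs = policy(h)                                      # (2, A), rows sum to 1
action = torch.distributions.Categorical(probs).sample()
adv_estimate = Adv(h).squeeze(-1)                      # one scalar per state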
def __init__(self, policy_model, target_model, input_size, action_size, pixel_control=True, intr_coeff=0.5,
             extr_coeff=1.0, entropy_coeff=0.001, policy_clip=0.1, lr=1e-4, lr_final=1e-5, decay_steps=6e5,
             grad_clip=0.5, RP=1, VR=1, PC=1, policy_args={}, RND_args={}, optim=torch.optim.Adam,
             optim_args={}, device='cuda'):
    super(RANDAL, self).__init__()
    self.lr = lr
    self.entropy_coeff = entropy_coeff
    self.intr_coeff = intr_coeff
    self.extr_coeff = extr_coeff
    self.pixel_control = pixel_control
    self.grad_clip = grad_clip
    self.action_size = action_size
    self.device = device
    self.RP = RP  # reward prediction
    self.VR = VR  # value replay
    self.PC = PC  # pixel control

    self.policy = PPOIntrinsic(policy_model, input_size, action_size, lr=lr, lr_final=lr_final,
                               decay_steps=decay_steps, grad_clip=grad_clip, entropy_coeff=entropy_coeff,
                               policy_clip=policy_clip, extr_coeff=extr_coeff, intr_coeff=intr_coeff,
                               build_optimiser=False, device=device, **policy_args)

    # only use the last frame of the frame-stack for the RND convolutions
    target_size = (1, input_size[1], input_size[2]) if len(input_size) == 3 else input_size

    # randomly weighted and fixed network; acts as a random ID for each state
    self.target_model = target_model(target_size, trainable=False, **RND_args).to(device)
    # learns to predict the target model, i.e. rewards are based on the ability to predict
    # a fixed random function, so the prediction error behaves as a density map of explored areas
    self.predictor_model = target_model(target_size, trainable=True, **RND_args).to(device)

    if pixel_control:
        # deconvolutional pixel-control head, as in UnrealA2C2
        self.feat_map = torch.nn.Sequential(torch.nn.Linear(self.policy.dense_size, 32 * 8 * 8),
                                            torch.nn.ReLU()).to(device)
        self.deconv1 = torch.nn.Sequential(torch.nn.ConvTranspose2d(32, 32, kernel_size=[3, 3], stride=[1, 1]),
                                           torch.nn.ReLU()).to(device)
        self.deconv_advantage = torch.nn.ConvTranspose2d(32, action_size, kernel_size=[3, 3], stride=[2, 2]).to(device)
        self.deconv_value = torch.nn.ConvTranspose2d(32, 1, kernel_size=[3, 3], stride=[2, 2]).to(device)

    # reward-prediction model
    self.r1 = torch.nn.Sequential(torch.nn.Linear(self.policy.dense_size, 128), torch.nn.ReLU()).to(device)
    self.r2 = torch.nn.Linear(128, 3).to(device)

    # build the optimiser once, after all submodules are registered (the original built it
    # twice; the first instance was dead and missed the pixel-control and reward heads)
    self.optimiser = optim(self.parameters(), lr, **optim_args)
    self.scheduler = polynomial_sheduler(self.optimiser, lr_final, decay_steps, power=1)
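
# Why the single optimiser construction at the end matters: an optimiser built before
# the pixel-control heads are registered silently skips their parameters. Tiny
# demonstration with a stand-in module (M, a and b are hypothetical names).
import torch

class M(torch.nn.Module):
    def __init__(self):
        super().__init__()
        self.a = torch.nn.Linear(2, 2)
        early = torch.optim.Adam(self.parameters())        # built too early
        self.b = torch.nn.Linear(2, 2)                     # registered afterwards
        print(len(early.param_groups[0]['params']))        # 2 tensors: a only
        late = torch.optim.Adam(self.parameters())
        print(len(late.param_groups[0]['params']))         # 4 tensors: a and b

M()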