def select_action(state):
    state = torch.from_numpy(state).float().unsqueeze(0)
    probs = policy(state)
    m = Categorical(probs)
    action = m.sample()
    policy.saved_log_probs.append(m.log_prob(action))
    return action.item()
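# A minimal driver sketch for the helper above (an assumption, not part of the
# original source): it presumes a Gym-style `env` and the global `policy`
# module with the `saved_log_probs` list used by select_action.
state = env.reset()
for t in range(1000):
    action = select_action(state)
    state, reward, done, _ = env.step(action)
    if done:
        break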
def select_action(state):
    state = torch.from_numpy(state).float()
    probs, state_value = model(state)
    m = Categorical(probs)
    action = m.sample()
    model.saved_actions.append(SavedAction(m.log_prob(action), state_value))
    return action.item()
def select_action(state):
    state = torch.from_numpy(state).float().unsqueeze(0)
    # Variable is deprecated since PyTorch 0.4; plain tensors carry autograd state
    probs, state_value = model(state)
    m = Categorical(probs)
    action = m.sample()
    model.saved_actions.append(SavedAction(m.log_prob(action), state_value))
    return action.item()  # replaces the deprecated action.data[0]
def select_action(state):
    state = torch.from_numpy(state).float().unsqueeze(0)
    state = state.cuda()
    probs = policy(state)  # Variable wrapper dropped; deprecated since PyTorch 0.4
    m = Categorical(probs)
    action = m.sample()
    policy.saved_log_probs.append(m.log_prob(action))
    return action.item()  # replaces the deprecated action.data[0]
def run(self, x):
    p = self(x)  # Variable wrapper dropped; deprecated since PyTorch 0.4
    if self.original_output:
        d = Categorical(logits=p)
    else:
        # Assume the output activation yields probabilities (i.e. a softmax).
        # This assumption might be false.
        d = Categorical(probs=p)
    action = d.sample()
    log_prob = d.log_prob(action)
    return action, log_prob
def run(self, x):
    x = torch.as_tensor(x, dtype=torch.float)  # Variable(Tensor(x)) is deprecated
    p = self(x)
    if self.original_output:
        d = Categorical(logits=p)
    else:
        # Assume the output activation yields probabilities (i.e. a softmax).
        # This assumption might be false.
        d = Categorical(probs=p)
    action = d.sample()
    self.history_of_log_probs.append(d.log_prob(action))
    return action  # the type of action (0-dim LongTensor) has not been verified
def ctrl_fn(state):
    state_feats = torch.from_numpy(state.board.flatten()).float().unsqueeze(0)
    probs, value = net(state_feats)
    mask = torch.zeros_like(probs).index_fill_(1, torch.from_numpy(state.valid_actions), 1)
    probs = probs * mask
    if train:
        m = Categorical(probs)
        action = m.sample()
        net.log_prob = m.log_prob(action)
        net.value = value
        return action.item()
    else:
        action = probs.argmax(dim=-1)
        return action.item()
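# ctrl_fn above masks by multiplying probabilities and relies on Categorical
# renormalizing its `probs` argument. A logits-based alternative (a sketch
# under that assumption, with hypothetical `logits` and `valid` values) sends
# invalid entries to -inf instead, which yields exactly zero probability after
# the softmax:
import torch
from torch.distributions import Categorical

logits = torch.randn(1, 5)                       # hypothetical action scores
valid = torch.tensor([0, 2, 3])                  # hypothetical valid actions
mask = torch.zeros_like(logits).index_fill_(1, valid, 1)
masked_logits = logits.masked_fill(mask == 0, float('-inf'))
m = Categorical(logits=masked_logits)            # mass only on valid actions
action = m.sample()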
def __init__(self, slen = 68, padding = 14,
             data_dir = '../mnist_data/',
             propn_sample = 1.0,
             indices = None,
             train_set = True):
    # slen is the side length of the image on which an mnist digit (28 x 28)
    # is placed. Padding is the width of the border of the full image.
    super(MovingMNISTDataSet, self).__init__()

    # Load MNIST dataset
    assert os.path.exists(data_dir)

    # This is the full dataset
    self.mnist_data_set = load_mnist_data(data_dir = data_dir, train = train_set)
    if train_set:
        n_image_full = len(self.mnist_data_set.train_labels)
    else:
        n_image_full = len(self.mnist_data_set.test_labels)

    # we may wish to subset
    if indices is None:
        self.num_images = round(n_image_full * propn_sample)
        self.sample_indx = np.random.choice(n_image_full, self.num_images, replace = False)
    else:
        self.num_images = len(indices)
        self.sample_indx = indices

    # set up parameters for moving MNIST
    # original mnist side length
    self.mnist_slen = self.mnist_data_set[0][0].shape[-1]
    # padded side-length
    self.slen = slen
    self.padding = padding
    # number of possible pixel locations
    self.n_pixel_1d = (slen - 2 * padding) ** 2

    # define uniform categorical variable over pixels
    unif_probs = torch.ones(self.n_pixel_1d) / self.n_pixel_1d
    unif_probs = unif_probs.view(-1, self.n_pixel_1d)
    self.categorical = Categorical(unif_probs)

    # for padding the image, we cache this grid
    r0 = (slen - 1) / 2
    self.grid_out = torch.FloatTensor(np.mgrid[0:slen, 0:slen].transpose() - r0)
def forward(self, jobs, machines, allocable_jobs=None, allocable_machines=None, argmax=False):
    job_input_size = jobs.size(0)
    machine_input_size = machines.size(0)
    E_j, E_m = self.get_embedding(jobs, machines)
    g_j1 = self.get_job_attention(self.last_j, E_j)
    g_m1 = self.get_node_attention(self.last_j, E_m)
    E_j = torch.cat([E_j, self.no_select_job.unsqueeze(0)], dim=0)
    E_m = torch.cat([E_m, self.no_select_machine.unsqueeze(0)], dim=0)
    g_1 = torch.cat([self.last_j, g_j1, g_m1])
    j_logits = self.j_att(g_1, E_j)

    ### selecting processes
    if allocable_jobs is not None:
        x = [j for j in range(job_input_size) if j not in allocable_jobs]
        if len(x) > 0:
            mask = torch.from_numpy(np.array(x, dtype=int))
            j_logits[mask] = -1e8  # mask out non-allocable jobs
    job_softmax = torch.softmax(j_logits, 0)
    job_sampler = Categorical(job_softmax)
    if argmax:
        selected_job = torch.argmax(job_softmax)
    else:
        # was wrapped in a bare try/except that re-raised UnboundLocalError,
        # which masked the real failure; let the original exception propagate
        selected_job = job_sampler.sample()
    if selected_job == job_input_size:
        return (-1, -1), job_sampler.log_prob(selected_job)

    as_i = int(selected_job.detach().numpy())
    e_js = E_j[selected_job]
    g_j2 = self.get_job_attention(e_js, E_j)
    g_m2 = self.get_node_attention(e_js, E_m)
    g_2 = torch.cat([e_js, g_j2, g_m2])
    m_logits = self.m_att(g_2, E_m)
    self.last_j = e_js.detach()

    ### selecting process
    if allocable_machines is not None:
        x = [m for m in range(machine_input_size) if m not in allocable_machines[as_i]]
        mask = torch.from_numpy(np.array(x, dtype=int))
        m_logits[mask] = -1e8
    machine_softmax = torch.softmax(m_logits, 0)
    machine_sampler = Categorical(machine_softmax)
    if argmax:
        selected_machine = torch.argmax(machine_softmax, -1)
    else:
        selected_machine = machine_sampler.sample()
    logpas = machine_sampler.log_prob(selected_machine) + job_sampler.log_prob(selected_job)
    if selected_machine == machine_input_size:
        return (-1, -1), logpas
    return (int(selected_job.detach().numpy()), int(selected_machine.detach().numpy())), logpas
def get_action(self, state):
    state = u.t_from_np_to_float32(state)
    probs = self.actor(state)
    return Categorical(probs).sample().item()
def learn_multi(model, update_timestep, env, max_reward=-2):
    log_interval = update_timestep
    total_correct_moves = 0
    correct_moves = 0
    my_rewards = [0, 0, 0, 0]  # used for correct reward adding (if round is finished)
    max_reward = 1
    jjj = 0
    for i_episode in range(100000000):
        h_out = (torch.zeros([1, 1, 32], dtype=torch.float),
                 torch.zeros([1, 1, 32], dtype=torch.float))
        s = env.reset()
        done = False
        while not done:
            for t in range(T_horizon):
                i = env.my_game.active_player
                h_in = h_out
                prob, h_out = model[i].pi(torch.from_numpy(s).float(), h_in)
                prob = prob.view(-1)
                m = Categorical(prob)
                a = m.sample().item()
                s_prime, r, done, info = env.step(a)
                if r["ai_reward"] is None:  # illegal move
                    rr = -1
                else:  # shift round -> 0, or legal play move
                    rr = 0
                model[i].put_data((s, a, rr, s_prime, prob[a].item(), h_in, h_out, done))
                # note: the original tested `int(r["ai_reward"]) is not None`,
                # which is always true; test the value itself instead
                if info["round_finished"] and r["state"] == "play" and r["ai_reward"] is not None:
                    for u in range(4):
                        last_transition = model[u].data
                        if len(model[u].data) > 1:
                            last_transition = model[u].data[:-1]
                        last_transition_list = list(last_transition[0])
                        last_transition_list[2] = (int(r["final_rewards"][u]) + 60) / 40
                        last_transition[0] = last_transition_list
                        model[u].data[:-1] = last_transition
                    # win_player = r["player_win_idx"]
                    # last_transition = model[win_player].data
                    # if len(model[win_player].data) > 1:
                    #     last_transition = model[win_player].data[:-1]
                    # last_transition_list = list(last_transition[0])
                    # last_transition_list[2] = int(r["final_rewards"][win_player])
                    # last_transition[0] = last_transition_list
                    # model[win_player].data[:-1] = last_transition
                s = s_prime
                if done:
                    break
            # for lll in range(4):
            model[i].train_net()
        total_correct_moves += info["correct_moves"]
        if i_episode % log_interval == 0:
            jjj += 1
            total_correct_moves = total_correct_moves / log_interval
            # test play against random
            corr_moves, mean_reward, finished_games = test_with_random(model[0], env, jjj)
            aaa = ('Game ,{:07d}, reward per game in {} g. ,{:0.5}, corr_moves ,{:4.4}, Time ,{},\n'
                   .format(i_episode, finished_games, float(mean_reward), float(corr_moves),
                           datetime.datetime.now() - start_time))
            print(aaa)
            # max correct moves: 61
            if mean_reward > max_reward and corr_moves > 2.0:
                path = 'PPO_{}_{}_{}'.format(i_episode, finished_games, mean_reward)
                torch.save(model[0].state_dict(), path + ".pth")
                max_reward = mean_reward
                print("exported path \n")
            total_correct_moves = 0
            with open(log_path, "a") as myfile:
                myfile.write(aaa)
def sample_class_weights(class_weights):
    # draw a sample from a Categorical variable with
    # probabilities class_weights
    cat_rv = Categorical(probs = class_weights)
    return cat_rv.sample().detach()
def lfl(n_run, tmax, kmax, trajectories=None, computeTrajectories=False):
    # set hyperparameters
    gride_size = 5
    n_states = gride_size**2
    n_actions = 4
    mu = np.zeros(n_states)
    mu[0] = 1
    gamma = 0.96
    alpha = 0.3
    alpha_model = 0.7
    entropy_coef = 0.01
    n_epoch = 10

    # generate a deterministic gridworld:
    g = Grid(gride_size, stochastic=False)
    # we just need the reward and dynamic of the MDP:
    r_gpomdp, p_gpomdp, _ = g.make_tables_gpomdp()
    r, p = g.make_tables()
    learner_score = []
    observer_score = []
    weights = []
    trajectories_spi = []
    for run in range(n_run):
        print('run', run)
        torch.manual_seed(run)
        # init first policy
        pi = np.ones((n_states, n_actions)) / n_actions
        # sample initial trajectory:
        if trajectories is None:
            np.random.seed(run)
            trajectory = sample_sa_trajectory(p, pi, tmax)
        else:
            trajectory = trajectories[run, 0]
            print(trajectory.shape)
            trajectory = trajectory.tolist()
        # transition estimation:
        p_ = np.ones((n_states, n_actions, n_states)) * 1e-15
        count = np.ones((n_states, n_actions, n_states)) * n_states * 1e-15
        for (s, a), (s_, _) in zip(trajectory[:-1], trajectory[1:]):
            p_[int(s), int(a), int(s_)] += 1
            count[int(s), int(a), :] += 1
        p_ /= count
        demos = [trajectory]
        policies = [torch.Tensor(pi)]
        # policy iterations
        for k in range(kmax):
            if trajectories is None:
                q = np.random.rand(n_states, n_actions)
                for _ in range(100):
                    v = np.zeros(n_states)
                    for state in range(n_states):
                        for action_ in range(n_actions):
                            v[state] += pi[state, action_] * \
                                (q[state, action_] - alpha * np.log(pi[state, action_]))
                    q *= 0
                    for state in range(n_states):
                        for action in range(n_actions):
                            q[state, action] = r[state, action]
                            for state_ in range(n_states):
                                q[state, action] += gamma * p[state, action, state_] * v[state_]
                pi = np.zeros((n_states, n_actions))
                for state in range(n_states):
                    pi[state, :] = softmax(q[state, :] / alpha)
                # sample trajectory with new policy:
                trajectory = sample_sa_trajectory(p, pi, tmax)
            else:
                trajectory = trajectories[run, k + 1]
                trajectory = trajectory.tolist()
            demos.append(trajectory)
            policies.append(torch.Tensor(pi))
        if not computeTrajectories:
            # learner score
            mdp_to_evaluate = MDP(n_states, n_actions, p_gpomdp, r_gpomdp, mu, gamma)
            j_pi_learner = mdp_to_evaluate.policy_evaluation(pi.T)[0]
            learner_score.append(j_pi_learner)
            # estimate learner policies
            torch_p = torch.from_numpy(p_).float()
            logpi_ = tuple(nn.Parameter(torch.rand(n_states, n_actions,
                                                   requires_grad=True))
                           for _ in range(kmax + 1))
            optimizer_pi = torch.optim.Adam(logpi_, lr=5e-1)
            for epoch in range(n_epoch):
                loss_pi = 0
                for k, demo in enumerate(demos):
                    demo_sas = [(s, a, s_) for (s, a), (s_, _) in zip(demo[:-1], demo[1:])]
                    for s, a, s_ in demo_sas:
                        dist = Categorical(torch.exp(logpi_[k][int(s), :]))
                        log_prob_demo = torch.log(dist.probs[int(a)])
                        loss_pi -= (log_prob_demo + entropy_coef * dist.entropy())
                optimizer_pi.zero_grad()
                loss_pi.backward()
                optimizer_pi.step()
            # create target reward functions:
            targets = []
            for k, demo in enumerate(demos[:-1]):
                dist_2 = torch.exp(logpi_[k + 1]) \
                    / torch.exp(logpi_[k + 1]).sum(1, keepdim=True)
                dist_1 = torch.exp(logpi_[k]) / torch.exp(logpi_[k]).sum(1, keepdim=True)
                kl = torch.log(dist_2) - torch.log(dist_1)
                r_shape = torch.zeros(n_states, n_actions)
                for state in range(n_states):
                    for action in range(n_actions):
                        r_shape[state, action] = alpha_model \
                            * torch.log(dist_2[state, action])
                        for state_ in range(n_states):
                            for action_ in range(n_actions):
                                r_shape[state, action] -= alpha_model * gamma \
                                    * (kl[state_, action_]) * torch_p[state, action, state_] \
                                    * dist_1[state_, action_]
                targets.append(r_shape)
            # recover state-action reward and shaping
            r_ = nn.Parameter(torch.zeros(n_states, n_actions, requires_grad=True))
            r_sh = (r_,) + tuple(nn.Parameter(torch.zeros(n_states, requires_grad=True))
                                 for _ in range(kmax))
            optimizer = torch.optim.Adam(r_sh, lr=1)
            for epoch in range(200):
                loss = 0
                for k, target in enumerate(targets):
                    loss += ((r_sh[0] + r_sh[k + 1].repeat(n_actions, 1).t() - gamma *
                              torch.sum(torch_p * r_sh[k + 1].repeat(n_states, n_actions, 1), 2)
                              - target.detach()) ** 2).sum()
                optimizer.zero_grad()
                loss.backward()
                optimizer.step()
            r_ = r_.detach().numpy()
            # solve with r_:
            mdp = MDP(n_states, n_actions, p_gpomdp, r_.T, mu, gamma)
            pi_observer = mdp.get_best_policy()
            # observer score with true reward:
            mdp_to_evaluate = MDP(n_states, n_actions, p_gpomdp, r_gpomdp, mu, gamma)
            j_pi_observer = mdp_to_evaluate.policy_evaluation(pi_observer)[0]
            observer_score.append(j_pi_observer)
            weights.append(r_)
        else:
            trajectories_spi.append(demos)
    np.save('../results/comparison_learn/lfl_SPI-v3/lfl_svi_' + str(kmax + 1),
            observer_score)
    np.save('../results/comparison_learn/lfl_SPI-v3/weights_svi_' + str(kmax + 1),
            weights)
def act(self, obs):
    with torch.no_grad():
        prob = self.policy_net(obs)
        m = Categorical(prob)
        return m.sample().item()
def local_train(process, global_model, optimizer):
    env = CreateBreakout()
    local_model = ActorCriticNet()
    local_model.load_state_dict(global_model.state_dict())
    total_reward = 0
    max_score = 0
    for T in range(max_T):
        state = env.reset()
        done = False
        score = 0
        while not done:
            log_probs, values, entropys, rewards = [], [], [], []
            for t in range(max_t):
                prob, value = local_model(torch.FloatTensor([state]))
                m = Categorical(prob)
                action = m.sample()
                log_prob = m.log_prob(action)
                entropy = m.entropy()
                next_state, reward, done, _ = env.step(action.item())
                score += reward
                log_probs.append(log_prob)
                values.append(value)
                entropys.append(entropy)
                rewards.append(reward)
                state = next_state
                if done:
                    break
            state_final = torch.FloatTensor([next_state])
            R = 0.0
            if not done:
                _, R = local_model(state_final)
                R = R.item()
            td_target_lst = []
            for reward in rewards[::-1]:
                R = reward + R * gamma
                td_target_lst.append([R])
            td_target_lst.reverse()

            log_probs = torch.stack(log_probs)
            values = torch.cat(values)
            entropys = torch.stack(entropys)
            td_targets = torch.FloatTensor(td_target_lst)
            advantages = (td_targets - values).detach()

            actor_loss = -torch.mean(log_probs * advantages)
            critic_loss = F.smooth_l1_loss(values, td_targets.detach())
            entropy_loss = torch.mean(entropys)
            total_loss = actor_loss + critic_loss - beta * entropy_loss

            optimizer.zero_grad()
            local_model.zero_grad()
            total_loss.backward()
            torch.nn.utils.clip_grad_norm_(local_model.parameters(), 5)
            # share gradients with the global model (the standard A3C idiom:
            # stop once the shared grads are already hooked up)
            for local_param, global_param in zip(local_model.parameters(),
                                                 global_model.parameters()):
                if global_param.grad is not None:
                    break
                global_param._grad = local_param.grad
            optimizer.step()
            local_model.load_state_dict(global_model.state_dict())
        total_reward += score
        if score > max_score:
            max_score = score
        if (T + 1) % 10 == 0:
            print('Process {} of episode {}, avg score : {}, max score : {}'.format(
                process, T + 1, total_reward / 10, max_score))
            total_reward = 0
    env.close()
def select_action(state):
    action_prb = teacher_model(state.detach())
    m = Categorical(action_prb)
    action = m.sample()
    teacher_model.saved_log_probs.append(m.log_prob(action))
    return action
class TwoPlayerGANModel(BaseModel):
    @staticmethod
    def modify_commandline_options(parser, is_train=True):
        """Add new model-specific options and rewrite default values for existing options.

        Parameters:
            parser -- the option parser
            is_train -- whether it is the training phase or the test phase. You can use
                        this flag to add training-specific or test-specific options.

        Returns:
            the modified parser.
        """
        # parser.set_defaults(dataset_mode='aligned')  # You can rewrite default values for this model.
        if is_train:
            parser.add_argument('--g_loss_mode', type=str, default='lsgan',
                                help='lsgan | nsgan | vanilla | wgan | hinge | rsgan')
            parser.add_argument('--d_loss_mode', type=str, default='lsgan',
                                help='lsgan | nsgan | vanilla | wgan | hinge | rsgan')
            parser.add_argument('--which_D', type=str, default='S',
                                help='Standard(S) | Relativistic_average (Ra)')
        return parser

    def __init__(self, opt):
        """Initialize this model class.

        Parameters:
            opt -- training/test options

        A few things can be done here.
        - (required) call the initialization function of BaseModel
        - define loss function, visualization images, model names, and optimizers
        """
        BaseModel.__init__(self, opt)  # call the initialization method of BaseModel
        self.opt = opt
        if opt.d_loss_mode == 'wgan' and not opt.use_gp:
            raise NotImplementedError('using wgan on D must be with use_gp = True.')
        self.loss_names = ['G_real', 'G_fake', 'D_real', 'D_fake', 'D_gp', 'G', 'D']
        self.visual_names = ['real_visual', 'gen_visual']
        if self.isTrain:  # only defined during training time
            self.model_names = ['G', 'D']
        else:
            self.model_names = ['G']

        if self.opt.cgan:
            probs = np.ones(self.opt.cat_num) / self.opt.cat_num
            self.CatDis = Categorical(torch.tensor(probs))

        # define networks
        self.netG = networks.define_G(opt.z_dim, opt.output_nc, opt.ngf, opt.netG,
                                      opt.g_norm, opt.cgan, opt.cat_num,
                                      not opt.no_dropout, opt.init_type,
                                      opt.init_gain, self.gpu_ids)
        if self.isTrain:
            # define a discriminator; conditional GANs need to take both input and
            # output images; therefore, #channels for D is input_nc + output_nc
            self.netD = networks.define_D(opt.input_nc, opt.ndf, opt.netD, opt.d_norm,
                                          opt.cgan, opt.cat_num, opt.init_type,
                                          opt.init_gain, self.gpu_ids)

        if self.isTrain:  # only defined during training time
            # define loss functions
            self.criterionG = networks.GANLoss(opt.g_loss_mode, 'G', opt.which_D).to(self.device)
            self.criterionD = networks.GANLoss(opt.d_loss_mode, 'D', opt.which_D).to(self.device)
            # initialize optimizers
            self.optimizer_G = torch.optim.Adam(self.netG.parameters(), lr=opt.lr_g,
                                                betas=(opt.beta1, opt.beta2))
            self.optimizer_D = torch.optim.Adam(self.netD.parameters(), lr=opt.lr_d,
                                                betas=(opt.beta1, opt.beta2))
            self.optimizers.append(self.optimizer_G)
            self.optimizers.append(self.optimizer_D)

        # visualize settings
        self.N = int(np.trunc(np.sqrt(min(opt.batch_size, 64))))
        if self.opt.z_type == 'Gaussian':
            self.z_fixed = torch.randn(self.N * self.N, opt.z_dim, 1, 1, device=self.device)
        elif self.opt.z_type == 'Uniform':
            self.z_fixed = torch.rand(self.N * self.N, opt.z_dim, 1, 1,
                                      device=self.device) * 2. - 1.
        if self.opt.cgan:
            yf = self.CatDis.sample([self.N * self.N])
            self.y_fixed = one_hot(yf, [self.N * self.N, self.opt.cat_num])

    def set_input(self, input):
        """input: a dictionary that contains the data itself and its metadata information."""
        self.input_imgs = input['image'].to(self.device)
        if self.opt.cgan:
            self.input_targets = input['target'].to(self.device)

    def forward(self, batch_size=None):
        bs = self.opt.batch_size if batch_size is None else batch_size
        if self.opt.z_type == 'Gaussian':
            z = torch.randn(bs, self.opt.z_dim, 1, 1, device=self.device)
        elif self.opt.z_type == 'Uniform':
            z = torch.rand(bs, self.opt.z_dim, 1, 1, device=self.device) * 2. - 1.
        if not self.opt.cgan:
            self.gen_imgs = self.netG(z)
        else:
            y = self.CatDis.sample([bs])
            self.y_ = one_hot(y, [bs, self.opt.cat_num])
            self.gen_imgs = self.netG(z, self.y_)

    def backward_G(self):
        # pass D
        if not self.opt.cgan:
            self.fake_out = self.netD(self.gen_imgs)
            self.real_out = self.netD(self.real_imgs)
        else:
            self.fake_out = self.netD(self.gen_imgs, self.y_)
            self.real_out = self.netD(self.real_imgs, self.targets)
        self.loss_G_fake, self.loss_G_real = self.criterionG(self.fake_out, self.real_out)
        self.loss_G = self.loss_G_fake + self.loss_G_real
        self.loss_G.backward()

    def backward_D(self):
        self.gen_imgs = self.gen_imgs.detach()
        # pass D
        if not self.opt.cgan:
            self.fake_out = self.netD(self.gen_imgs)
            self.real_out = self.netD(self.real_imgs)
        else:
            self.fake_out = self.netD(self.gen_imgs, self.y_)
            self.real_out = self.netD(self.real_imgs, self.targets)
        self.loss_D_fake, self.loss_D_real = self.criterionD(self.fake_out, self.real_out)
        if self.opt.use_gp is True:
            self.loss_D_gp = networks.cal_gradient_penalty(self.netD, self.real_imgs,
                                                           self.gen_imgs, self.device,
                                                           type='mixed', constant=1.0,
                                                           lambda_gp=10.0)[0]
        else:
            self.loss_D_gp = 0.
        self.loss_D = self.loss_D_fake + self.loss_D_real + self.loss_D_gp
        self.loss_D.backward()

    def optimize_parameters(self):
        for i in range(self.opt.D_iters + 1):
            self.real_imgs = self.input_imgs[i * self.opt.batch_size:(i + 1) * self.opt.batch_size, :, :, :]
            if self.opt.cgan:
                # was self.input_target (undefined); set_input stores input_targets
                self.targets = self.input_targets[i * self.opt.batch_size:(i + 1) * self.opt.batch_size, :]
            self.forward()
            # update G
            if i == 0:
                self.set_requires_grad(self.netD, False)
                self.optimizer_G.zero_grad()
                self.backward_G()
                self.optimizer_G.step()
            # update D
            else:
                self.set_requires_grad(self.netD, True)
                self.optimizer_D.zero_grad()
                self.backward_D()
                self.optimizer_D.step()
def logprob(self, datas, value_data):
    distribution = Categorical(datas)
    return distribution.log_prob(value_data).float().to(device)

def entropy(self, datas):
    distribution = Categorical(datas)
    return distribution.entropy().float().to(device)

def sample(self, datas):
    distribution = Categorical(datas)
    return distribution.sample().float().to(device)
def train_eos(train_data, enc, eos, ldis, rnn=True, device='cpu'):
    enc, _ = enc
    eos, eos_optim = eos
    ldis, dis_optim = ldis
    for data, _, _ in train_data:
        temporal_output = []
        log_prob = []
        past_actions = []
        eos_states = [torch.zeros([1, 1, h], device=device) for h in eos.hidden_sizes]
        if rnn:
            enc_states = [torch.zeros([1, 1, h], device=device) for h in enc.hidden_sizes]
            data = data.unsqueeze(1)
            for d in data:
                d = d.view(1, 1, -1)
                encoded, enc_states = enc(d, enc_states)
                h, eos_states = eos(encoded, eos_states)
                softmax_output = F.softmax(h, -1)
                dist = Categorical(softmax_output)
                action_taken = dist.sample()
                log_prob.append(dist.log_prob(action_taken).squeeze())
                temporal_output.append(encoded.squeeze())
                past_actions.append(action_taken)
        else:
            encoded = enc(data)
            for e in encoded:
                e = e.view(1, 1, -1)
                # the original assigned to enc_states here, which left eos_states
                # stale every step; assigning to eos_states is presumably intended
                h, eos_states = eos(e, eos_states)
                softmax_output = F.softmax(h, -1)
                dist = Categorical(softmax_output)
                action_taken = dist.sample()
                log_prob.append(dist.log_prob(action_taken).squeeze())
                temporal_output.append(e.squeeze())
                past_actions.append(action_taken)
        shuffled_output = shuffle(past_actions, temporal_output).unsqueeze_(1)
        if rnn:
            enc_states = [torch.zeros([1, 1, h], device=device) for h in enc.hidden_sizes]
            encoded, _ = enc(data, enc_states)
        else:
            encoded = enc(data)
        encoded = encoded.unsqueeze_(1)
        dis_states = [torch.zeros([1, 1, h], device=device) for h in ldis.hidden_sizes]
        F_output, _ = ldis(shuffled_output, dis_states.copy())
        T_output, _ = ldis(encoded, dis_states.copy())
        loss = F.binary_cross_entropy(torch.sigmoid(F_output[-1]).sum(),
                                      torch.tensor(0., device=device)) + \
            F.binary_cross_entropy(torch.sigmoid(T_output[-1]).sum(),
                                   torch.tensor(0., device=device))
        dis_optim.zero_grad()
        loss.backward(retain_graph=True)
        dis_optim.step()
        loss = torch.tensor(0., device=device)
        for log_p in log_prob:
            loss -= log_p * torch.sigmoid(F_output[-1]).sum().item()
        eos_optim.zero_grad()
        loss.backward()
        eos_optim.step()
def choose_action(self, state):
    state = torch.unsqueeze(torch.FloatTensor(state), 0)
    probs = self.policy(state)
    c = Categorical(probs)
    action = c.sample()
    return int(action.data.numpy())
def get_action_probabilities(self, observation, temperature=1):
    # note: temperature multiplies the logits here, so it acts as an
    # inverse temperature (larger values sharpen the distribution)
    with torch.no_grad():
        return Categorical(logits=self.forward(observation) * temperature)
def active_learning_taylor(func_name, start_rand_idxs=None, bud=None, valid=True, fac_loc_idx=None):
    start = torch.cuda.Event(enable_timing=True)
    end = torch.cuda.Event(enable_timing=True)
    start.record()
    torch.manual_seed(42)
    np.random.seed(42)
    model = ResNet18(num_cls)
    model = model.to(device)
    idxs = start_rand_idxs
    # (criterion and optimizer were constructed twice in the original; once suffices)
    criterion = nn.CrossEntropyLoss()
    criterion_nored = nn.CrossEntropyLoss(reduction='none')
    optimizer = optim.SGD(model.parameters(), lr=learning_rate)

    if func_name == 'Facility Location':
        idxs = run_stochastic_Facloc(x_trn, y_trn, bud)
        facility_location_warm_start = copy.deepcopy(idxs)

    remainList = set([i for i in range(N)])
    idxs = list(idxs)
    remainList = remainList.difference(idxs)

    subset_trnloader = torch.utils.data.DataLoader(trainset, batch_size=trn_batch_size,
                                                   shuffle=False,
                                                   sampler=SubsetRandomSampler(idxs),
                                                   pin_memory=True)

    if func_name == 'Taylor Online':
        print("Starting Online OneStep Run with taylor on loss!")
    elif func_name == 'Full OneStep':
        print("Starting Online OneStep Run without taylor!")
    elif func_name == 'Facloc Regularized':
        print("Starting Facility Location Regularized Online OneStep Run with taylor!")
    elif func_name == 'Random Greedy':
        print("Starting Randomized Greedy Online OneStep Run with taylor!")
    elif func_name == 'Facility Location':
        print("Starting Facility Location!")
    elif func_name == 'Random':
        print("Starting Random Run!")
    elif func_name == 'Random Perturbation':
        print("Starting Online OneStep Run with taylor with random perturbation!")
    elif func_name == "FASS":
        print("Filtered Active Submodular Selection(FASS)!")
    # elif func_name == 'Proximal':
    #     print("Starting Online Proximal OneStep Run with taylor!")
    # elif func_name == 'Taylor on Logit':
    #     print("Starting Online OneStep Run with taylor on logit!")
    # if valid:
    #     print("Online OneStep Run with Taylor approximation and with Validation Set", file=logfile)
    # else:
    #     print("Online OneStep Run with Taylor approximation and without Validation Set", file=logfile)

    val_accies = np.zeros(no_select)
    test_accies = np.zeros(no_select)
    unlab_accies = np.zeros(no_select)
    # idxs = start_rand_idxs

    def weight_reset(m):
        torch.manual_seed(42)
        torch.cuda.manual_seed(42)
        np.random.seed(42)
        random.seed(42)
        torch.backends.cudnn.deterministic = True
        if isinstance(m, nn.Linear):
            # m.reset_parameters()
            m.weight.data.normal_(0.0, 0.02)
            m.bias.data.fill_(0)
        elif isinstance(m, nn.Conv2d):
            nn.init.xavier_uniform_(m.weight.data)  # xavier_uniform is deprecated
            if m.bias is not None:
                nn.init.xavier_uniform_(m.bias.data)

    fn = nn.Softmax(dim=1)
    for n in range(no_select):
        model.train()
        for i in range(num_epochs):
            accFinal = 0.
            for batch_idx, (inputs, targets) in enumerate(subset_trnloader):
                # targets can have non_blocking=True.
                x, y = inputs.to(device), targets.to(device, non_blocking=True)
                optimizer.zero_grad()
                out = model(x)
                loss = F.cross_entropy(out, y)
                accFinal += torch.sum((torch.max(out, 1)[1] == y).float()).data.item()
                loss.backward()
                if (i % 50 == 0) and (accFinal < 0.2):  # reset if not converging
                    model = model.apply(weight_reset).cuda()
                    optimizer = optim.SGD(model.parameters(), lr=learning_rate)
                # clamp gradients, just in case
                for p in filter(lambda p: p.grad is not None, model.parameters()):
                    p.grad.data.clamp_(min=-.1, max=.1)
                optimizer.step()
        print(n + 1, 'Time', 'SubsetTrn', loss.item())  # , FullTrn, ValLoss: full_trn_loss.item(), val_loss.item()

        curr_X_trn = x_trn[list(remainList)]
        # curr_Y_trn = y_trn[list(remainList)]
        model.eval()
        with torch.no_grad():
            '''full_trn_out = model(x_trn)
            full_trn_loss = criterion(full_trn_out, y_trn).mean()
            sub_trn_out = model(x_trn[idxs])
            sub_trn_loss = criterion(sub_trn_out, y_trn[idxs]).mean()'''
            correct = 0
            total = 0
            for batch_idx, (inputs, targets) in enumerate(valloader):
                inputs, targets = inputs.to(device), targets.to(device, non_blocking=True)
                outputs = model(inputs)
                _, val_predict = outputs.max(1)
                correct += val_predict.eq(targets).sum().item()
                total += targets.size(0)
            val_acc = 100 * correct / total

            correct = 0
            total = 0
            for batch_idx, (inputs, targets) in enumerate(testloader):
                inputs, targets = inputs.to(device), targets.to(device, non_blocking=True)
                outputs = model(inputs)
                _, tst_predict = outputs.max(1)
                correct += tst_predict.eq(targets).sum().item()
                total += targets.size(0)
            tst_acc = 100.0 * correct / total

            remloader = torch.utils.data.DataLoader(trainset, batch_size=trn_batch_size,
                                                    shuffle=False,
                                                    sampler=SubsetRandomSampler(list(remainList)),
                                                    pin_memory=True)
            correct = 0
            total = 0
            cnt = 0
            predictions = []
            for batch_idx, (inputs, targets) in enumerate(remloader):
                inputs, targets = inputs.to(device), targets.to(device, non_blocking=True)
                outputs = model(inputs)
                predictions.append(outputs)
                _, rem_predict = outputs.max(1)
                if cnt == 0:
                    y_rem_trn = rem_predict
                    cnt = cnt + 1
                else:
                    y_rem_trn = torch.cat([y_rem_trn, rem_predict], dim=0)
                correct += rem_predict.eq(targets).sum().item()
                total += targets.size(0)
            rem_acc = 100 * correct / total

        val_accies[n] = val_acc
        test_accies[n] = tst_acc
        unlab_accies[n] = rem_acc

        # if ((i + 1) % select_every == 0) and func_name not in ['Facility Location', 'Random']:
        #     val_in, val_t = x_val.to(device), y_val.to(device)  # Transfer them to device
        cached_state_dict = copy.deepcopy(model.state_dict())
        clone_dict = copy.deepcopy(model.state_dict())
        # Don't put the logs for Selection on logfile!!
        # print("With Taylor approximation", file=logfile)
        # print("selEpoch: %d, Starting Selection:" % i, str(datetime.datetime.now()), file=logfile)
        # t_ng_start = time.time()
        if func_name == 'Random Greedy':
            new_idxs = setf_model.naive_greedy_max(curr_X_trn, y_rem_trn,
                                                   int(0.9 * no_points), clone_dict)
            new_idxs = list(np.array(list(remainList))[new_idxs])
            remainList = remainList.difference(new_idxs)
            new_idxs.extend(list(np.random.choice(list(remainList),
                                                  size=int(0.1 * no_points),
                                                  replace=False)))
            remainList = remainList.difference(new_idxs)
            idxs.extend(new_idxs)
        elif func_name == "FASS":
            cnt = 0
            for pre in predictions:
                soft = fn(pre)
                if cnt == 0:
                    entropy2 = Categorical(probs=soft).entropy()
                    cnt = cnt + 1
                else:
                    entropy2 = torch.cat([entropy2, Categorical(probs=soft).entropy()], dim=0)
            # print(entropy2.shape)
            if 5 * no_points < entropy2.shape[0]:
                values, indices = entropy2.topk(5 * no_points)
                # indices = list(np.array(list(remainList))[indices.cpu()])
            else:
                indices = [i for i in range(entropy2.shape[0])]  # list(remainList)
            knn_idxs_flag_val = perform_knnsb_selection(datadir, data_name,
                                                        curr_X_trn[indices],
                                                        y_rem_trn[indices],
                                                        fraction, selUsing='val')
            # print(knn_idxs_flag_val)
            # print(len(knn_idxs_flag_val), len(indices))
            knn_idxs_flag_val = list(np.array(list(remainList))[indices.cpu()][knn_idxs_flag_val])
            remainList = remainList.difference(knn_idxs_flag_val)
            idxs.extend(knn_idxs_flag_val)
        elif func_name == 'Random':
            state = np.random.get_state()
            np.random.seed(n * n)
            new_idxs = np.random.choice(list(remainList), size=no_points, replace=False)
            np.random.set_state(state)
            remainList = remainList.difference(new_idxs)
            idxs.extend(new_idxs)
            subset_trnloader = torch.utils.data.DataLoader(trainset, batch_size=trn_batch_size,
                                                           shuffle=False,
                                                           sampler=SubsetRandomSampler(idxs),
                                                           pin_memory=True)
        elif func_name == 'Facility Location':
            new_idxs = run_stochastic_Facloc(curr_X_trn, rem_predict, no_points)
            new_idxs = np.array(list(remainList))[new_idxs]
            remainList = remainList.difference(new_idxs)
            idxs.extend(new_idxs)
            subset_trnloader = torch.utils.data.DataLoader(trainset, batch_size=trn_batch_size,
                                                           shuffle=False,
                                                           sampler=SubsetRandomSampler(idxs),
                                                           pin_memory=True)
        else:
            new_idxs = setf_model.naive_greedy_max(curr_X_trn, rem_predict, no_points,
                                                   clone_dict)  # , grads_idxs
            new_idxs = np.array(list(remainList))[new_idxs]
            remainList = remainList.difference(new_idxs)
            idxs.extend(new_idxs)
        '''elif func_name == 'Proximal':
            previous = torch.zeros(N, device=device)
            previous[idxs] = 1.0
            new_idxs = setf_model.naive_greedy_max(bud, clone_dict, None, previous)
            idxs = new_idxs'''
        # print("selEpoch: %d, Selection Ended at:" % (i), str(datetime.datetime.now()), file=logfile)
        # print("Naive greedy total time with taylor:", time.time() - t_ng_start, file=logfile)
        model.load_state_dict(cached_state_dict)

    # Calculate Final SubsetTrn, FullTrn, Val and Test Loss
    # Calculate Val and Test Accuracy
    if func_name == 'Facility Location':
        return val_accies, test_accies, unlab_accies, idxs, facility_location_warm_start
    else:
        return val_accies, test_accies, unlab_accies, idxs
def call_rsample():
    # Categorical has no reparameterized sampling; this raises NotImplementedError
    return Categorical(p).rsample()
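# Categorical.rsample() is unimplemented because discrete sampling admits no
# pathwise derivative, so call_rsample() above raises NotImplementedError. A
# common reparameterized stand-in (shown as a sketch, not a drop-in
# replacement for the code above) is the Gumbel-softmax relaxation:
import torch
from torch.distributions import RelaxedOneHotCategorical

p = torch.tensor([0.2, 0.3, 0.5])
relaxed = RelaxedOneHotCategorical(temperature=torch.tensor(0.5), probs=p)
soft_sample = relaxed.rsample()  # differentiable, approximately one-hot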
def select_action(policy):
    probs = policy()
    m = Categorical(probs)
    action = m.sample()
    # policy.saved_log_probs.append(m.log_prob(action))
    return m.log_prob(action), action.cpu().tolist()
def forward(self, state):
    output = F.relu(self.linear1(state))
    output = F.relu(self.linear2(output))
    output = self.linear3(output)
    distribution = Categorical(F.softmax(output, dim=-1))
    return distribution
def main(args):
    # use_cuda = torch.cuda.is_available()
    use_cuda = False  # Faster on cpu
    device = torch.device("cuda:0" if use_cuda else "cpu")

    # Environment
    if args.task == 'two_step':
        task = Two_step_task(args.p_common_dist, args.r_common_dist, args.p_reversal_dist)
    elif args.task == 'rocket':
        task = Rocket_task(args.p_reversal_dist, args.p_reward_reversal_dist)
    elif args.task == 'rooms_grid':
        task = Rooms_grid_task(args.room_size)
    state_dim = task.state_dim
    action_dim = task.action_dim

    # Model
    if args.model == 'LSTM':
        model = LSTM(state_dim=state_dim, action_dim=action_dim,
                     hidden_dim=args.hidden_dim, device=device)
    if args.load_weights_from is not None:
        model.load_state_dict(torch.load(args.load_weights_from))
    model.to(device)
    model.eval()

    # Construct empty dataframe to record testing results
    if args.task == 'two_step':
        col_names = ['Episode', 'Trial', 'T', 'State', 'Action', 'Reward', 'Rewarded_state']
    elif args.task == 'rocket':
        col_names = ['Episode', 'Trial', 'T', 'State', 'Action', 'Reward',
                     'Rewarded_state', 'Transition_regime']
    elif args.task == 'rooms_grid':
        col_names = ['Episode', 'Trial', 'T', 'State_row', 'State_col', 'Action',
                     'Reward', 'Reward_location_row', 'Reward_location_col']
    n_cols = len(col_names)
    df = np.full_like(np.zeros([1, n_cols]), np.nan)
    row = 1  # keep track of what row we're in

    # Testing loop
    with torch.no_grad():
        for episode in range(args.episodes):
            if episode % args.print_every == 0:
                print("Starting episode: ", episode)
            env = task.sample()
            model.reinitialize()
            r, a = 0, None
            for trial in range(args.trials):
                # Reset environment for new trial
                env.init_new_trial()
                s = env.state
                done = False
                # Run a trial
                T = 0
                while not done:
                    if T > args.timeout:
                        print("Model timed out at T = ", args.timeout)
                        break
                    T += 1
                    # Add new row to dataframe
                    df = np.concatenate((df, np.zeros([1, n_cols])), axis=0)
                    # Record some data
                    df[row, 0] = episode  # episode number
                    df[row, 1] = trial    # trial number
                    df[row, 2] = T        # timestep
                    if args.task in ['two_step', 'rocket']:
                        df[row, 3] = np.nonzero(np.array(s))[0][0]  # state
                        df[row, 6] = env.rewarded_state  # note: recorded before step
                    elif args.task == 'rooms_grid':
                        df[row, 3] = env.state_loc[0]
                        df[row, 4] = env.state_loc[1]
                        df[row, 7] = env.reward_location[0]
                        df[row, 8] = env.reward_location[1]
                    if args.task == 'rocket':
                        df[row, 7] = env.transition_regime
                    # Convert state, previous action and previous reward to torch.tensors
                    s = torch.tensor(s).type(torch.FloatTensor).to(device)
                    a_prev = torch.zeros(action_dim, dtype=torch.float).to(device)
                    if a is not None:
                        a_prev[a] = 1
                    r_prev = torch.tensor(r).type(torch.FloatTensor).to(device)
                    # Generate action and value prediction
                    probs, v = model(s, a_prev, r_prev)
                    m = Categorical(probs)
                    a = m.sample()
                    # Take a step in the environment
                    s, r, done = env.step(a)
                    # Record the rest of the row's data
                    if args.task in ['two_step', 'rocket']:
                        df[row, 4] = a.item()  # action
                        df[row, 5] = r         # reward
                    elif args.task == 'rooms_grid':
                        df[row, 5] = a.item()
                        df[row, 6] = r
                    # Update row
                    row += 1

    # Write output file
    df = df[1:, :]  # Remove first row of nans
    print("Writing results to: ", args.out_data_file)
    np.save(args.out_data_file, df)
def get_entropy(self, obs):
    probs = self.policy_net(obs)
    m = Categorical(probs)
    return m.entropy()
def action_dist(self, state):
    state = torch.tensor(state).float().to(DEVICE)
    return Categorical(F.softmax(self.forward(state), -1))
batch_size = 128
loss_curve = []
reward_list = []
for i in trange(300):
    all_reward = 0
    state = env.reset()
    saved_log_probs = []
    rewards = []
    while True:
        state = torch.from_numpy(state).float().unsqueeze(0)
        # sample an action according to the policy's probabilities
        probs = policy(state)
        m = Categorical(probs)
        action = m.sample()
        saved_log_probs.append(m.log_prob(action))
        # interact with the environment
        next_state, reward, done, info = env.step(action.item())
        rewards.append(reward)
        all_reward += reward
        if done:
            reward_list.append(all_reward)
            break
        state = next_state
    policy_loss = []
def forward(self, state):
    state = torch.tensor(state).float().to(DEVICE)
    state = self.conv(state)
    state = state.view(state.size(0), -1)
    action_logit = self.fc(state)
    return Categorical(F.softmax(action_logit, -1))
def forward(self, jobs, machines, allocable_jobs=None, allocable_machines=None, argmax=False):
    job_input_size = jobs.size(0)
    machine_input_size = machines.size(0)
    E_j, E_m = self.get_embedding(jobs, machines)
    g_1 = self.last_j
    j_logits = self.j_att(g_1, E_j)

    ### selecting processes
    if allocable_jobs is not None:
        x = [j for j in range(job_input_size) if j not in allocable_jobs]
        if len(x) > 0:
            mask = torch.from_numpy(np.array(x, dtype=int))
            # print("MASK!", mask)
            j_logits[mask] = -1e8
    job_softmax = torch.softmax(j_logits, 0)
    job_sampler = Categorical(job_softmax)
    if argmax:
        selected_job = torch.argmax(job_softmax)
    else:
        try:
            selected_job = job_sampler.sample()
        except Exception:
            print("ERROR at job_logits!!")
            print("g_1")
            print(g_1)
            print("E_J")
            print(E_j)
            print("job_logits")
            print(j_logits)
            print("job_softmax")
            print(job_softmax)
            raise  # re-raise the original error (was: raise UnboundLocalError)
    as_i = int(selected_job.detach().numpy())
    e_js = E_j[selected_job]
    m_logits = self.m_att(e_js, E_m)
    self.last_j = e_js.detach()

    ### selecting process
    if allocable_machines is not None:
        x = [m for m in range(machine_input_size) if m not in allocable_machines[as_i]]
        mask = torch.from_numpy(np.array(x, dtype=int))
        m_logits[mask] = -1e8
    machine_softmax = torch.softmax(m_logits, 0)
    # machine_softmax[machine_softmax < 0] = 0
    machine_sampler = Categorical(machine_softmax)
    if argmax:
        selected_machine = torch.argmax(machine_softmax, -1)
    else:
        selected_machine = machine_sampler.sample()
    logpas = machine_sampler.log_prob(selected_machine) + job_sampler.log_prob(selected_job)
    return (int(selected_job.detach().numpy()), int(selected_machine.detach().numpy())), logpas
def forward(self, state):
    state = torch.tensor(state).float().to(DEVICE)
    out = self.common(state)
    action_logit, value = self.action_head(out), self.value_head(out)
    return Categorical(F.softmax(action_logit, -1)), value
class Multinomial(Distribution):
    r"""
    Creates a Multinomial distribution parameterized by `total_count` and
    either `probs` or `logits` (but not both). The innermost dimension of
    `probs` indexes over categories. All other dimensions index over batches.

    Note that `total_count` need not be specified if only :meth:`log_prob` is
    called (see example below).

    .. note:: :attr:`probs` will be normalized to sum to 1.

    -   :meth:`sample` requires a single shared `total_count` for all
        parameters and samples.
    -   :meth:`log_prob` allows different `total_count` for each parameter and
        sample.

    Example::

        >>> m = Multinomial(100, torch.tensor([ 1., 1., 1., 1.]))
        >>> x = m.sample()  # equal probability of 0, 1, 2, 3
        tensor([ 21., 24., 30., 25.])

        >>> Multinomial(probs=torch.tensor([1., 1., 1., 1.])).log_prob(x)
        tensor([-4.1338])

    Args:
        total_count (int): number of trials
        probs (Tensor): event probabilities
        logits (Tensor): event log probabilities
    """
    arg_constraints = {'logits': constraints.real}  # Let logits be the canonical parameterization.

    @property
    def mean(self):
        return self.probs * self.total_count

    @property
    def variance(self):
        return self.total_count * self.probs * (1 - self.probs)

    def __init__(self, total_count=1, probs=None, logits=None, validate_args=None):
        if not isinstance(total_count, Number):
            raise NotImplementedError('inhomogeneous total_count is not supported')
        self.total_count = total_count
        self._categorical = Categorical(probs=probs, logits=logits)
        batch_shape = self._categorical.batch_shape
        event_shape = self._categorical.param_shape[-1:]
        super(Multinomial, self).__init__(batch_shape, event_shape,
                                          validate_args=validate_args)

    def _new(self, *args, **kwargs):
        return self._categorical._new(*args, **kwargs)

    @constraints.dependent_property
    def support(self):
        return constraints.integer_interval(0, self.total_count)

    @property
    def logits(self):
        return self._categorical.logits

    @property
    def probs(self):
        return self._categorical.probs

    @property
    def param_shape(self):
        return self._categorical.param_shape

    def sample(self, sample_shape=torch.Size()):
        sample_shape = torch.Size(sample_shape)
        samples = self._categorical.sample(torch.Size((self.total_count,)) + sample_shape)
        # samples.shape is (total_count, sample_shape, batch_shape); change it to
        # (sample_shape, batch_shape, total_count)
        shifted_idx = list(range(samples.dim()))
        shifted_idx.append(shifted_idx.pop(0))
        samples = samples.permute(*shifted_idx)
        counts = samples.new(self._extended_shape(sample_shape)).zero_()
        counts.scatter_add_(-1, samples, torch.ones_like(samples))
        return counts.type_as(self.probs)

    def log_prob(self, value):
        if self._validate_args:
            self._validate_sample(value)
        logits, value = broadcast_all(self.logits.clone(), value)
        log_factorial_n = torch.lgamma(value.sum(-1) + 1)
        log_factorial_xs = torch.lgamma(value + 1).sum(-1)
        logits[(value == 0) & (logits == -float('inf'))] = 0
        log_powers = (logits * value).sum(-1)
        return log_factorial_n - log_factorial_xs + log_powers
def main(args):
    # Build data loader
    if not os.path.isdir(args.model_path):
        os.makedirs(args.model_path)
    data_loader, ds_class = get_loader(args.data_dir, args.batch_size,
                                       shuffle=True, num_workers=args.num_workers,
                                       ds=args.ds)
    # Build eval data loader
    if hasattr(ds_class, 'lbl2id'):
        eval_data_loader, _ = get_loader(args.data_dir_test, args.batch_size,
                                         shuffle=True, num_workers=args.num_workers,
                                         ds=args.ds, lbl2id=ds_class.lbl2id)
    else:
        eval_data_loader, _ = get_loader(args.data_dir_test, args.batch_size,
                                         shuffle=True, num_workers=args.num_workers,
                                         ds=args.ds)

    # Loss and Optimizer
    model_base = SkeletonAction(args.input_size, args.hidden_size, args.num_class,
                                args.num_action, args.num_layers, dropout=args.dropout)
    model_value = ValueNetwork(args.hidden_size)
    model_policy = PolicyNetwork(args.hidden_size, args.num_action)
    model_c = CoreClassification(args.hidden_size, args.num_class)
    criterion = nn.CrossEntropyLoss()
    criterion_value = nn.SmoothL1Loss()
    if torch.cuda.is_available():
        model_base.cuda()
        model_value.cuda()
        model_policy.cuda()
        model_c.cuda()
        criterion = criterion.cuda()
        criterion_value = criterion_value.cuda()
    params = list(model_base.parameters()) + list(model_c.parameters()) \
        + list(model_value.parameters()) + list(model_policy.parameters())
    # opt = torch.optim.Adam(params, lr=args.learning_rate, weight_decay=args.weight_decay)
    opt = torch.optim.Adam(params, lr=args.learning_rate)
    # opt_value = torch.optim.Adam(model_value.parameters(), lr=args.learning_rate)
    # opt_policy = torch.optim.Adam(model_policy.parameters(), lr=args.learning_rate)
    # opt_c = torch.optim.Adam(model_c.parameters(), lr=args.learning_rate)

    # Load the trained model parameters.
    # Now, we try to find the latest encoder and decoder model.
    if os.path.isdir(args.model_path) and os.listdir(args.model_path):
        m_fn = max(glob.glob(os.path.join(args.model_path, 'model*')),
                   key=os.path.getctime)
        if m_fn:
            logging.info("Loading model from %s", m_fn)
            # `model` was undefined here in the original; model_base is presumably meant
            model_base.load_state_dict(torch.load(m_fn))

    # Train the Models
    total_step = len(data_loader)
    # Initialize some variables.
    h_tensor = torch.zeros(args.batch_size, args.hidden_size)
    if torch.cuda.is_available():
        h_tensor = h_tensor.cuda()
    for epoch in range(args.num_epochs):
        total_train = 0
        total_correct = 0
        total_train_2 = 0
        total_correct_2 = 0
        for i_step, (lbl, data, length) in enumerate(data_loader):
            # Set mini-batch dataset
            lbl = Variable(lbl.squeeze())
            data = Variable(data)
            mask = torch.zeros(data.size(0), data.size(1))
            for i, m in zip(length, mask):
                m[0:i[0]] = 1
            mask = Variable(mask)
            if torch.cuda.is_available():
                lbl = lbl.cuda()
                data = data.cuda()
                mask = mask.cuda()
            h_tensor.resize_(data.size(0), data.size(1))
            init_h = Variable(h_tensor)
            init_hs = [init_h for i in range(args.num_layers)]
            init_cs = init_hs
            zero = torch.zeros(data.size(0),)
            zero = Variable(zero)
            if torch.cuda.is_available():
                zero = zero.cuda()
            hs = []
            action_probs = []
            actions = []
            ht, ct = model_base(data[:, 0, :], zero, init_hs, init_cs)
            hs.append(ht[-1])
            action_prob = model_policy(ht[-1])
            action_probs.append(action_prob)
            action = Categorical(action_prob)
            action = action.sample()
            actions.append(action)
            for j_step in range(1, data.shape[1]):
                ht, ct = model_base(data[:, j_step, :], actions[j_step - 1].float(), ht, ct)
                hs.append(ht[-1])
                action_prob = model_policy(ht[-1])
                # We need to smooth the probability
                action = Categorical((action_prob + action_probs[j_step - 1]) / 2)
                action = action.sample()
                actions.append(action)
                action_probs.append(action_prob)
            # now, we have finished all the actions; we need to backprop.
            # The reward only arrives at the end of the episode.
            hs_t = torch.stack(hs, dim=1)
            hs_t = (hs_t * mask.unsqueeze(2)).sum(dim=1) / mask.sum(dim=1).unsqueeze(1)
            logits = model_c(hs_t)
            # log_p = F.log_softmax(logits, dim=1)
            loss_ent = criterion(logits, lbl)
            # loss = -(mask.squeeze() * log_p[long_idx, lbl.squeeze().data]).sum() / mask.sum()
            pred_lbl = logits.max(dim=1)[1]
            reward = Variable((pred_lbl.data == lbl.data).float())
            reward = reward.view(data.size(0), 1)
            reward = reward.repeat(1, data.size(1))
            loss_value = []
            loss_policy = []
            actions = torch.stack(actions, dim=1)
            action_probs = torch.stack(action_probs, dim=1)
            hs = torch.stack(hs, dim=1)
            hs = hs.view(-1, hs.size(-1))
            exp_reward = model_value(hs)
            exp_reward = exp_reward.view(data.size(0), data.size(1))
            loss_value = (exp_reward - reward) ** 2
            loss_value = (loss_value * mask).sum() / mask.sum()
            advantage = reward - Variable(exp_reward.data)
            idx = torch.LongTensor(range(data.size(0)))
            idx = idx.view(data.size(0), 1)
            idx = idx.repeat(1, data.size(1))
            idx = idx.view(data.size(0) * data.size(1))
            if torch.cuda.is_available():
                idx = idx.cuda()
            action_probs = action_probs.view(action_probs.size(0) * action_probs.size(1),
                                             action_probs.size(-1))
            actions = actions.view(actions.size(0) * actions.size(1))
            log_prob = action_probs[idx, actions]
            log_prob = log_prob.view(mask.size(0), mask.size(1))
            loss_policy = -torch.log(log_prob + 1e-7) * mask * advantage
            loss_policy = loss_policy.sum() / mask.sum()
            loss = loss_ent + loss_policy + loss_value
            # (three stray pdb.set_trace() debugging traps removed from this block)

            # Now we update the value network
            # for j_step, (h, action, action_prob) in enumerate(zip(hs, actions, action_probs)):
            #     # total reward.
            #     target = reward * discount ** (data.size(0) - j_step)
            #     exp_reward = model_value(h)
            #     logging.info('exp_reward: %.4f, target: %.4f',
            #                  exp_reward.mean().data[0], target.mean().data[0])
            #     l_value = criterion_value(exp_reward, target)
            #     loss_value.append(l_value)
            #     advantage = target - exp_reward
            #     c = Categorical(action_prob)
            #     l_policy = -c.log_prob(action) * advantage
            #     loss_policy.append(l_policy.mean())
            # loss_value = torch.stack(loss_value).mean()
            # loss_policy = torch.stack(loss_policy).mean()
            # loss += loss_value + loss_policy
            opt.zero_grad()
            loss.backward()
            old_norm = clip_grad_norm(params, args.grad_clip)  # use grad clip
            opt.step()
            total_train += data.size(0)
            total_correct += (pred_lbl.data.cpu().squeeze() == lbl.data.cpu().squeeze()).sum()

            # Eval the trained model
            # logging.info('Epoch [%d/%d], Loss: %.4f, reward: %5.4f, loss_value: %5.4f, loss_policy: %5.4f',
            #              epoch, args.num_epochs,
            #              loss_ent.data[0], reward.mean().data[0], loss_value.data[0], loss_policy.data[0])
            if i_step % args.log_step == 0:
                accuracy = total_correct * 1.0 / total_train
                logging.info('Epoch [%d/%d], Loss: %.4f, reward: %5.4f, loss_value: %5.4f, '
                             'loss_policy: %5.4f, accuracy: %5.4f',
                             epoch, args.num_epochs, loss_ent.data[0],
                             reward.mean().data[0], loss_value.data[0],
                             loss_policy.data[0], accuracy)
            if i_step % args.eval_step == 0:
                model_base.eval()
                model_c.eval()
                model_policy.eval()
                total_num = 0
                correct_num = 0
                for k_step, (lbl, data, length) in enumerate(eval_data_loader):
                    lbl = Variable(lbl.squeeze())
                    data = Variable(data)
                    mask = torch.zeros(data.size(0), data.size(1))
                    for i, m in zip(length, mask):
                        m[0:i[0]] = 1
                    if torch.cuda.is_available():
                        lbl = lbl.cuda()
                        data = data.cuda()
                        mask = mask.cuda()
                    mask = Variable(mask)
                    h_tensor.resize_(data.size(0), data.size(1))
                    init_h = Variable(h_tensor)
                    init_hs = [init_h for i in range(args.num_layers)]
                    init_cs = init_hs
                    zero = torch.zeros(data.size(0),)
                    zero = Variable(zero)
                    if torch.cuda.is_available():
                        zero = zero.cuda()
                    hs = []
                    action_probs = []
                    actions = []
                    ht, ct = model_base(data[:, 0, :], zero, init_hs, init_cs)
                    hs.append(ht[-1])
                    action_prob = model_policy(ht[-1])
                    action_probs.append(action_prob)
                    action = Categorical(action_prob)
                    action = action.sample()
                    actions.append(action)
                    for j_step in range(1, data.shape[1]):
                        ht, ct = model_base(data[:, j_step, :], action.float(), ht, ct)
                        hs.append(ht[-1])
                        action_prob = model_policy(ht[-1])
                        action = Categorical(action_prob)
                        action = action.sample()
                        actions.append(action)
                    # now, we have finished all the actions; the reward only
                    # arrives at the end of the episode.
                    hs_t = torch.stack(hs, dim=1)
                    hs_t = (hs_t * mask.unsqueeze(2)).sum(dim=1) / mask.sum(dim=1).unsqueeze(1)
                    logits = model_c(hs_t)
                    log_p = F.log_softmax(logits, dim=1)
                    pred_lbl = logits.max(dim=-1)[1].data.cpu()
                    total_num += data.size(0)
                    correct_num += (pred_lbl.squeeze() == lbl.data.cpu().squeeze()).sum()
                    loss = criterion(logits, lbl)
                accuracy = correct_num * 1.0 / total_num
                logging.info('Validating [%d], Loss: %.4f, accuracy: %.4f',
                             epoch, loss.data[0], accuracy)
                model_base.train()
                model_c.train()
                model_policy.train()  # was .eval(); presumably meant to resume training mode
        accuracy = total_correct * 1.0 / total_train
        logging.info('Epoch [%d/%d], Loss: %.4f, accuracy: %5.4f, reward: %5.4f',
                     epoch, args.num_epochs, loss_ent.data[0], accuracy,
                     reward.mean().data[0])
class GMMExperiments(object):
    def __init__(self, n_obs, mu0, sigma0, n_clusters, hidden_dim = 30):
        # dimension of the problem
        self.dim = len(mu0)
        self.n_clusters = n_clusters
        self.n_obs = n_obs

        # prior parameters
        self.mu0 = mu0
        self.sigma0 = torch.Tensor([sigma0])
        # uniform prior on weights
        self.prior_weights = torch.ones(self.n_clusters) / self.n_clusters

        # true parameters
        self.set_true_params()
        self.cat_rv = Categorical(probs = self.prior_weights)

        # the encoder
        # self.gmm_encoder = GMMEncoder(data_dim = self.dim,
        #                               n_classes = self.n_clusters,
        #                               hidden_dim = hidden_dim)
        #
        # self.var_params = {'encoder_params': self.gmm_encoder.parameters()}

        # other variational parameters: we use point masses for
        # the means and variances
        self.set_random_var_params()

        # draw data (n_obs was redundantly re-assigned here in the original)
        self.y, self.z = self.draw_data(n_obs = n_obs)

    def set_var_params(self, init_mu, init_log_sigma):
        self.var_params['centroids'] = init_mu
        self.var_params['log_sigma'] = init_log_sigma

    def set_random_var_params(self):
        init_mu = torch.randn((self.n_clusters, self.dim)) * self.sigma0 + self.mu0
        init_mu.requires_grad_(True)
        init_log_sigma = torch.log(torch.Tensor([self.true_sigma]))  # torch.log(torch.rand(1))
        init_log_sigma.requires_grad_(True)
        self.init_free_class_weights = torch.rand((self.n_obs, self.n_clusters))
        init_free_class_weights = deepcopy(self.init_free_class_weights)
        init_free_class_weights = init_free_class_weights.requires_grad_(True)
        self.var_params = {'free_class_weights': init_free_class_weights}
        self.set_var_params(init_mu, init_log_sigma)

    def set_kmeans_init_var_params(self, n_kmeans_init = 10):
        for i in range(n_kmeans_init):
            km = KMeans(n_clusters = self.n_clusters).fit(self.y)
            enertia = km.inertia_
            if (i == 0):
                enertia_best = enertia
                km_best = deepcopy(km)
            elif (enertia < enertia_best):
                enertia_best = enertia
                km_best = deepcopy(km)
        init_free_class_weights = torch.zeros((self.n_obs, self.n_clusters))
        for n in range(len(km_best.labels_)):
            init_free_class_weights[n, km_best.labels_[n]] = 3.0
        self.init_free_class_weights = deepcopy(init_free_class_weights)
        init_free_class_weights.requires_grad_(True)
        self.var_params['free_class_weights'] = init_free_class_weights
        # init_centroids = torch.Tensor(km_best.cluster_centers_)
        # init_centroids.requires_grad_(True)
        # self.var_params['centroids'] = init_centroids

    def set_true_params(self):
        # draw means from the prior
        # each row is a cluster mean
        self.true_mus = torch.randn((self.n_clusters, self.dim)) * self.sigma0 + self.mu0
        # just set a data variance
        self.true_sigma = 1.0

    def draw_data(self, n_obs = 1):
        y = torch.zeros((n_obs, self.dim))
        z = torch.zeros(n_obs)
        for i in range(n_obs):
            # class belonging
            z_sample = self.cat_rv.sample()
            z[i] = z_sample
            # observed data
            y[i, :] = self.true_mus[z_sample, :] + torch.randn(2) * self.true_sigma
        # some indices we cache and use later
        self.seq_tensor = torch.LongTensor([i for i in range(n_obs)])
        return y, z

    def get_log_q(self):
        # self.log_class_weights = self.gmm_encoder.forward(self.y)
        fudge_lower_bdd = torch.Tensor([-8])
        self.log_class_weights = log_softmax(
            torch.max(self.var_params['free_class_weights'], fudge_lower_bdd))
        return self.log_class_weights

    def _get_centroid_mask(self, z):
        mask = torch.zeros((self.n_obs, self.n_clusters))
        mask[self.seq_tensor, z] = 1
        return mask.detach()

    def f_z(self, z):
        centroids = self.var_params['centroids']
        # log_sigma was only present as a commented-out line in the original,
        # leaving it undefined below; restoring it is the obvious fix
        log_sigma = torch.log(torch.Tensor([self.true_sigma]))
        centroid_mask = self._get_centroid_mask(z)
        centroids_masked = torch.matmul(centroid_mask, centroids)
        loglik_z = get_normal_loglik(self.y, centroids_masked, log_sigma).sum(dim = 1)
        mu_prior_term = get_normal_loglik(centroids, self.mu0, torch.log(self.sigma0)).mean()
        z_prior_term = 0.0  # torch.log(self.prior_weights[z])
        z_entropy_term = (- torch.exp(self.log_class_weights) * self.log_class_weights).mean()
        return - (loglik_z + mu_prior_term + z_prior_term + z_entropy_term)

    def get_pm_loss(self, alpha, topk, use_baseline):
        log_q = self.get_log_q()
        pm_loss = pm_lib.get_partial_marginal_loss(self.f_z, log_q, alpha, topk,
                                                   use_baseline = use_baseline,
                                                   use_term_one_baseline = True)
        return pm_loss

    def get_full_loss(self):
        log_q = self.get_log_q()
        class_weights = torch.exp(log_q)
        return pm_lib.get_full_loss(self.f_z, class_weights)
def hightrain(self):
    buffer, buffer_capacity, batch_size = self.highmemory.show()
    s = torch.tensor(buffer['s'], dtype=torch.double).to(self.device)
    pre_option = torch.tensor(buffer['pre_option'], dtype=torch.double).view(-1, 1).to(self.device)
    s_ = torch.tensor(buffer['s_'], dtype=torch.double).to(self.device)
    option = torch.tensor(buffer['option'], dtype=torch.double).view(-1, 1).to(self.device)
    option_logp = torch.tensor(buffer['option_logp'], dtype=torch.double).view(-1, 1).to(self.device)
    r = torch.tensor(buffer['r'], dtype=torch.double).view(-1, 1).to(self.device)
    done = torch.tensor(buffer['done'], dtype=torch.double).view(-1, 1).to(self.device)
    action_loss_record, value_loss_record, entropy_record, loop_record = 0, 0, 0, 0

    with torch.no_grad():
        value_next = self.highnet(s_)['value']
        option_change_next = torch.where(option > 5, torch.zeros_like(option), option)
        value_next_zeros = torch.gather(value_next, 1, option_change_next.long())
        value_next = torch.where(
            option > 5,
            value_next.sum(dim=1, keepdim=True) / self.config.get('num_options'),
            value_next_zeros)
        value_now = self.highnet(s)['value']
        option_change_now = torch.where(pre_option > 5, torch.zeros_like(pre_option), pre_option)
        value_now_zeros = torch.gather(value_now, 1, option_change_now.long())
        value_now = torch.where(
            pre_option > 5,
            value_now.sum(dim=1, keepdim=True) / self.config.get('num_options'),
            value_now_zeros)
        delta = r + (1 - done) * self.config.get('gamma') * value_next - value_now
        adv = torch.zeros_like(delta)
        adv[-1] = delta[-1]
        # GAE
        for i in reversed(range(buffer_capacity - 1)):
            adv[i] = delta[i] + self.config.get('tau') * (1 - done[i]) * adv[i + 1]
        target_v = value_now + adv
        # Normalize advantage (np.float is deprecated; use np.float64)
        adv = (adv - adv.mean()) / (adv.std() + np.finfo(np.float64).eps)

    for _ in range(self.config.get('ppoepoch')):
        for index in BatchSampler(SubsetRandomSampler(range(buffer_capacity)), batch_size, False):
            q_short, beta_short = self.highnet(s[index])['q'], self.highnet(s[index])['beta']
            pre_option_short = pre_option[index]
            pi_hat_option = self.sample_option_multi(q_short, beta_short, pre_option_short)
            pi_hat_p = torch.gather(pi_hat_option, 1, option[index].long())
            ratio = pi_hat_p / torch.exp(option_logp[index])
            surr1 = ratio * adv[index]
            surr2 = torch.clamp(ratio,
                                1.0 - self.config.get('clip_param'),
                                1.0 + self.config.get('clip_param')) * adv[index]
            action_loss = -torch.min(surr1, surr2).mean()
            m = Categorical(pi_hat_option)
            entropy = m.entropy()
            value_now = self.highnet(s[index])['value']
            option_change_now = torch.where(pre_option[index] > 5,
                                            torch.zeros_like(pre_option[index]),
                                            pre_option[index])
            value_now_zeros = torch.gather(value_now, 1, option_change_now.long())
            value_now = torch.where(
                pre_option[index] > 5,
                value_now.sum(dim=1, keepdim=True) / self.config.get('num_options'),
                value_now_zeros)
            value_loss = F.smooth_l1_loss(value_now, target_v[index])
            self.highoptimizition.zero_grad()
            loss = action_loss + value_loss - self.config.get('entropy_para_high') * entropy.mean()
            loss.backward()
            nn.utils.clip_grad_norm_(self.highnet.parameters(), self.config.get('max_grad_norm'))
            self.highoptimizition.step()
            action_loss_record += action_loss.cpu().detach()
            value_loss_record += value_loss.cpu().detach()
            entropy_record += entropy.mean().cpu().detach()
            loop_record += 1
    return {
        'actionloss': action_loss_record / loop_record,
        'valueloss': value_loss_record / loop_record,
        'entropy': entropy_record / loop_record,
    }
def forward(self, state):
    state = torch.tensor(state).float().to(DEVICE)
    out = self.conv(state).view(state.size(0), -1)
    return Categorical(F.softmax(self.policy(out), -1)), self.value(out)
def forward(self, x):
    value = self.critic(x)
    probs = self.actor(x)
    dist = Categorical(probs)
    return dist, value
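# A self-contained usage sketch (an assumption, not from the source): a toy
# actor-critic module with the forward above, driven for a single step.
import torch
import torch.nn as nn
from torch.distributions import Categorical

class ToyActorCritic(nn.Module):
    def __init__(self, obs_dim=4, n_actions=2):
        super().__init__()
        self.critic = nn.Linear(obs_dim, 1)
        self.actor = nn.Sequential(nn.Linear(obs_dim, n_actions),
                                   nn.Softmax(dim=-1))

    def forward(self, x):
        value = self.critic(x)
        probs = self.actor(x)
        dist = Categorical(probs)
        return dist, value

model = ToyActorCritic()
x = torch.randn(1, 4)
dist, value = model(x)
action = dist.sample()
log_prob = dist.log_prob(action)  # policy-gradient term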