def act(self, state_tensor):
    """Sample one two-component action per batch row from two Beta policies.

    ``state_tensor`` is a batch of tensors rather than a single joint state.
    It is passed to ``self.value_action_predictor``, which returns a value
    estimate plus two parameter tensors; columns 0 and 1 of each parameter
    tensor are used as the two Beta concentrations.

    Returns:
        tuple: ``(value, actions, action_log_probs, action_to_take)`` where
        ``actions`` holds the raw Beta samples (one column per component),
        ``action_log_probs`` is the sum of both components' log-densities
        with a trailing singleton dimension, and ``action_to_take`` is a list
        of ``ActionXY`` objects built from the samples rescaled by
        ``x * 2 - 1``.
    """
    value, vx_params, vy_params = self.value_action_predictor(state_tensor)

    vx_dist = Beta(vx_params[:, 0], vx_params[:, 1])
    vy_dist = Beta(vy_params[:, 0], vy_params[:, 1])

    # Draw vx before vy so the random stream is consumed in a fixed order.
    vx_sample = vx_dist.sample().unsqueeze(1)
    vy_sample = vy_dist.sample().unsqueeze(1)
    actions = torch.cat([vx_sample, vy_sample], dim=1)

    vx_log_prob = vx_dist.log_prob(actions[:, 0]).unsqueeze(1)
    vy_log_prob = vy_dist.log_prob(actions[:, 1]).unsqueeze(1)
    action_log_probs = vx_log_prob + vy_log_prob

    # Beta samples live in [0, 1]; x * 2 - 1 stretches them onto [-1, 1].
    action_to_take = []
    for sampled in actions.cpu().numpy():
        action_to_take.append(ActionXY(sampled[0] * 2 - 1, sampled[1] * 2 - 1))

    return value, actions, action_log_probs, action_to_take
def test2():
    """Draw a single sample from a Beta(0.5, 0.5) distribution.

    The Beta distribution is a family of continuous random variables defined
    on the range 0 to 1. The sample is discarded; the function returns
    ``None``.
    """
    from torch.distributions.beta import Beta

    concentration1 = torch.tensor([0.5])
    concentration0 = torch.tensor(0.5)
    beta_dist = Beta(concentration1, concentration0)
    beta_dist.sample()  # e.g. tensor([0.0594])
class MixUp(Callback):
    """Mixup callback driven by ``Event`` objects.

    While training, every batch is blended with a shuffled copy of itself
    using per-sample weights drawn from ``Beta(α, α)``. The learner's loss
    function is swapped for :meth:`loss_func` for the duration of the fit so
    the loss is blended against both the original and the shuffled targets.

    Args:
        α: Value used for both concentration parameters of the Beta
            distribution.
    """

    # _order = 90  # Runs after normalization and cuda
    # TODO: introduce before_transform / after_transform and move the mixing
    # from begin_batch to after_transform; alternatively make a group of
    # callbacks to control the order.

    def __init__(self, α: float = 0.4):
        self.distrib = Beta(tensor([α]), tensor([α]))

    def begin_fit(self, e: Event):
        """Install the mixup loss and remember the learner and its old loss."""
        self.old_loss_func, e.learn.loss_func = e.learn.loss_func, self.loss_func
        self.learn = e.learn

    def after_preprocessing(self, e: Event):
        """Blend ``e.learn.xb`` with a shuffled copy of itself (training only)."""
        if not e.learn.in_train:
            return  # Only mixup things during training

        batch_size = e.learn.yb.size(0)
        device = e.learn.xb.device

        # flatten() instead of squeeze(): squeeze() would collapse a batch of
        # one sample to a 0-d tensor and make the stack along dim 1 fail.
        λ = self.distrib.sample((batch_size,)).flatten().to(device)
        # Keep the larger of (λ, 1 - λ) for every sample.
        λ = torch.stack([λ, 1 - λ], 1)
        # NOTE(review): `unsqueeze` is a project helper; presumably it appends
        # singleton dims 1..3 so the weight broadcasts over a 4-D batch.
        self.λ = unsqueeze(λ.max(1)[0], [1, 2, 3])

        shuffle = torch.randperm(batch_size).to(device)
        xb1, self.yb1 = e.learn.xb[shuffle], e.learn.yb[shuffle]
        e.learn.xb = lerp(e.learn.xb, xb1, self.λ)

    def after_fit(self, e: Event):
        """Restore the loss function that was active before the fit."""
        e.learn.loss_func = self.old_loss_func

    def loss_func(self, pred, yb):
        """Blend the losses against the original and the shuffled targets.

        Outside of training the original loss function is called unchanged.
        """
        if not self.learn.in_train:
            return self.old_loss_func(pred, yb)
        with NoneReduce(self.old_loss_func) as loss_func:
            loss1 = loss_func(pred, yb)
            loss2 = loss_func(pred, self.yb1)
        # self.λ carries extra singleton dims sized for the input batch. Used
        # as-is against a lower-rank unreduced loss it would broadcast across
        # samples instead of pairing weight i with loss i, so reshape it to
        # the loss's own rank first.
        λ = self.λ.reshape(-1, *([1] * (loss1.dim() - 1)))
        loss = lerp(loss1, loss2, λ)
        return reduce_loss(loss, getattr(self.old_loss_func, 'reduction', 'mean'))
class MixUp(Callback):
    """Mixup callback that blends inputs (and, when possible, targets).

    Per-sample weights are drawn from ``Beta(alpha, alpha)``. When the
    learner's loss function exposes a truthy ``y_int`` attribute the targets
    are left untouched and the loss function is replaced by :meth:`lf`, which
    blends the two unreduced losses instead; otherwise the targets themselves
    are interpolated.

    Args:
        alpha: Value used for both concentration parameters of the Beta
            distribution.
    """

    run_after, run_valid = [Normalize, Cuda], False

    def __init__(self, alpha=0.4):
        self.distrib = Beta(tensor(alpha), tensor(alpha))

    def begin_fit(self):
        """Swap in the blended loss when the loss function has ``y_int`` set."""
        self.stack_y = getattr(self.learn.loss_func, 'y_int', False)
        if self.stack_y:
            self.old_lf, self.learn.loss_func = self.learn.loss_func, self.lf

    def after_fit(self):
        """Restore the original loss function if it was replaced."""
        if self.stack_y:
            self.learn.loss_func = self.old_lf

    def begin_batch(self):
        """Blend the current batch with a shuffled copy of itself."""
        batch_size = self.y.size(0)
        device = self.x.device

        # flatten() instead of squeeze(): squeeze() would collapse a batch of
        # one sample to a 0-d tensor and make the stack along dim 1 fail.
        lam = self.distrib.sample((batch_size,)).flatten().to(device)
        lam = torch.stack([lam, 1 - lam], 1)
        self.lam = lam.max(1)[0]  # keep the larger of (lam, 1 - lam)

        shuffle = torch.randperm(batch_size).to(device)
        xb1 = tuple(L(self.xb).itemgot(shuffle))
        self.yb1 = tuple(L(self.yb).itemgot(shuffle))

        nx_dims = len(self.x.size())
        x_weight = unsqueeze(self.lam, n=nx_dims - 1)
        self.learn.xb = tuple(L(xb1, self.xb).map_zip(torch.lerp, weight=x_weight))

        if not self.stack_y:
            ny_dims = len(self.y.size())
            y_weight = unsqueeze(self.lam, n=ny_dims - 1)
            self.learn.yb = tuple(
                L(self.yb1, self.yb).map_zip(torch.lerp, weight=y_weight))

    def lf(self, pred, *yb):
        """Blend the unreduced losses against shuffled and original targets."""
        if not self.training:
            return self.old_lf(pred, *yb)
        with NoneReduce(self.old_lf) as lf:
            loss = torch.lerp(lf(pred, *self.yb1), lf(pred, *yb), self.lam)
        return reduce_loss(loss, getattr(self.old_lf, 'reduction', 'mean'))
class MixUp(Callback):
    """Mixup callback: blend each training batch with a shuffled copy.

    Per-sample weights are drawn from ``Beta(α, α)``; the runner's loss
    function is replaced by :meth:`loss_func` for the duration of the fit.

    Args:
        α: Value used for both concentration parameters of the Beta
            distribution.
    """

    _order = 90  # Runs after normalization and cuda

    def __init__(self, α=0.4):
        self.distrib = Beta(tensor([α]), tensor([α]))

    def begin_fit(self):
        """Install the mixup loss, remembering the original one."""
        self.old_loss_func, self.run.loss_func = self.run.loss_func, self.loss_func

    def begin_batch(self):
        """Blend ``xb`` with a shuffled copy of itself (training only)."""
        if not self.in_train:
            return  # Only mixup things during training

        batch_size = self.yb.size(0)
        device = self.xb.device

        # flatten() instead of squeeze(): squeeze() would collapse a batch of
        # one sample to a 0-d tensor and make the stack along dim 1 fail.
        λ = self.distrib.sample((batch_size,)).flatten().to(device)
        λ = torch.stack([λ, 1 - λ], 1)
        # NOTE(review): `unsqueeze` is a project helper; presumably it appends
        # singleton dims 1..3 so the weight broadcasts over a 4-D batch.
        self.λ = unsqueeze(λ.max(1)[0], (1, 2, 3))

        shuffle = torch.randperm(batch_size).to(device)
        xb1, self.yb1 = self.xb[shuffle], self.yb[shuffle]
        self.run.xb = lin_comb(self.xb, xb1, self.λ)

    def after_fit(self):
        """Restore the loss function that was active before the fit."""
        self.run.loss_func = self.old_loss_func

    def loss_func(self, pred, yb):
        """Blend the losses against the original and the shuffled targets."""
        if not self.in_train:
            return self.old_loss_func(pred, yb)
        with NoneReduce(self.old_loss_func) as loss_func:
            loss1 = loss_func(pred, yb)
            loss2 = loss_func(pred, self.yb1)
        # self.λ carries extra singleton dims sized for the input batch. Used
        # as-is against a lower-rank unreduced loss it would broadcast across
        # samples instead of pairing weight i with loss i, so reshape it to
        # the loss's own rank first.
        λ = self.λ.reshape(-1, *([1] * (loss1.dim() - 1)))
        loss = lin_comb(loss1, loss2, λ)
        return reduce_loss(loss, getattr(self.old_loss_func, 'reduction', 'mean'))
class MixupBlending(BaseMiniBatchBlending):
    """Implementing Mixup in a mini-batch.

    This module is proposed in `mixup: Beyond Empirical Risk Minimization
    <https://arxiv.org/abs/1710.09412>`_.
    Code Reference https://github.com/open-mmlab/mmclassification/blob/master/mmcls/models/utils/mixup.py  # noqa

    Args:
        num_classes (int): The number of classes.
        alpha (float): Parameters for Beta distribution.
    """

    def __init__(self, num_classes, alpha=.2):
        super().__init__(num_classes=num_classes)
        self.beta = Beta(alpha, alpha)

    def do_blending(self, imgs, label, **kwargs):
        """Blend images and labels with one shared mixup coefficient.

        A single coefficient is sampled for the whole batch; every sample is
        mixed with the sample at a random permutation of the batch indices.
        """
        assert len(kwargs) == 0, f'unexpected kwargs for mixup {kwargs}'

        # Sample the coefficient before the permutation to keep the random
        # stream in the same order.
        lam = self.beta.sample()
        rand_index = torch.randperm(imgs.size(0))

        partner_imgs = imgs[rand_index, :]
        partner_label = label[rand_index, :]

        mixed_imgs = lam * imgs + (1 - lam) * partner_imgs
        mixed_label = lam * label + (1 - lam) * partner_label
        return mixed_imgs, mixed_label
def augmentAndMix(x_orig, k, alpha, preprocess):
    """Build an AugMix-style blend of ``x_orig`` with ``k`` augmentation chains.

    Each chain applies between 1 and ``k`` operations sampled from the
    module-level ``augmentations`` collection, each with a random severity in
    1..5. The preprocessed chain outputs are combined with Dirichlet weights
    and the result is blended with the preprocessed original through a
    Beta-distributed skip-connection weight.

    Args:
        x_orig: Input accepted by both ``preprocess`` and the augmentation
            operations.
        k: Number of chains. It is also the number of operations sampled per
            chain, so it must not exceed ``len(augmentations)``.
        alpha: Sampling constant used for both the Dirichlet and the Beta
            distribution.
        preprocess: Callable turning an image into a tensor.

    Returns:
        The mixed tensor, with the shape of ``preprocess(x_orig)``.
    """
    x_aug = torch.zeros_like(preprocess(x_orig))

    mixing_weight_dist = Dirichlet(torch.empty(k).fill_(alpha))
    mixing_weights = mixing_weight_dist.sample()

    for i in range(k):
        sampled_augs = random.sample(augmentations, k)
        aug_chain_length = random.randint(1, k)
        aug_chain = sampled_augs[:aug_chain_length]

        # Every chain starts from the original input. Previously the working
        # copy was initialised once outside the loop, so chain i was applied
        # on top of the output of chains 0..i-1.
        x_temp = x_orig
        for aug in aug_chain:
            severity = random.randint(1, 5)
            x_temp = aug(x_temp, severity)

        x_aug += mixing_weights[i] * preprocess(x_temp)

    skip_conn_weight_dist = Beta(torch.tensor([alpha]), torch.tensor([alpha]))
    skip_conn_weight = skip_conn_weight_dist.sample()

    x_augmix = skip_conn_weight * x_aug + (1 - skip_conn_weight) * preprocess(x_orig)
    return x_augmix
class MixUp(Callback):
    """Mixup callback: blend each training batch with a shuffled copy.

    Per-sample weights are drawn from ``Beta(alpha, alpha)``; the runner's
    loss function is replaced by :meth:`loss_func` for the duration of the
    fit.

    Args:
        alpha: Value used for both concentration parameters of the Beta
            distribution.
    """

    _order = 90

    def __init__(self, alpha=0.4):
        self.distrib = Beta(tensor([alpha]), tensor([alpha]))

    def begin_fit(self):
        """Install the mixup loss, remembering the original one."""
        self.old_loss_func, self.run.loss_func = self.run.loss_func, self.loss_func

    def begin_batch(self):
        """Blend ``xb`` with a shuffled copy of itself (training only)."""
        if not self.in_train:
            return

        batch_size = self.yb.size(0)
        device = self.xb.device

        # flatten() instead of squeeze(): squeeze() would collapse a batch of
        # one sample to a 0-d tensor and make the stack along dim 1 fail.
        lamb = self.distrib.sample((batch_size,)).flatten().to(device)
        lamb = torch.stack([lamb, 1 - lamb], 1)
        # NOTE(review): `unsqueeze` is a project helper; presumably it appends
        # singleton dims 1..3 so the weight broadcasts over a 4-D batch.
        self.lamb = unsqueeze(lamb.max(1)[0], (1, 2, 3))

        shuffle = torch.randperm(batch_size).to(device)
        xb1, self.yb1 = self.xb[shuffle], self.yb[shuffle]
        self.run.xb = lin_comb(self.xb, xb1, self.lamb)

    def after_fit(self):
        """Restore the loss function that was active before the fit."""
        self.run.loss_func = self.old_loss_func

    def loss_func(self, pred, yb):
        """Blend the losses against the original and the shuffled targets."""
        if not self.in_train:
            return self.old_loss_func(pred, yb)
        with NoneReduce(self.old_loss_func) as loss_func:
            loss1 = loss_func(pred, yb)
            loss2 = loss_func(pred, self.yb1)
        # self.lamb carries extra singleton dims sized for the input batch.
        # Used as-is against a lower-rank unreduced loss it would broadcast
        # across samples instead of pairing weight i with loss i, so reshape
        # it to the loss's own rank first.
        lamb = self.lamb.reshape(-1, *([1] * (loss1.dim() - 1)))
        loss = lin_comb(loss1, loss2, lamb)
        return reduce_loss(loss, getattr(self.old_loss_func, 'reduction', 'mean'))
class MixUp(Callback):
    """Mixup callback operating on tuple-valued ``xb`` / ``yb`` batches.

    Per-sample weights are drawn from ``Beta(alpha, alpha)``; the learner's
    loss function is replaced by :meth:`loss_func` for the duration of the
    fit.

    Args:
        alpha: Value used for both concentration parameters of the Beta
            distribution.
    """

    run_after = [Normalize, Cuda]

    def __init__(self, alpha=0.4):
        self.distrib = Beta(tensor([alpha]), tensor([alpha]))

    def begin_fit(self):
        """Install the mixup loss, remembering the original one."""
        self.old_loss_func, self.learn.loss_func = self.learn.loss_func, self.loss_func

    def begin_batch(self):
        """Blend every input tensor with a shuffled copy (training only)."""
        if not self.training:
            return  # Only mixup things during training

        batch_size = self.y.size(0)
        device = self.x.device

        # flatten() instead of squeeze(): squeeze() would collapse a batch of
        # one sample to a 0-d tensor and make the stack along dim 1 fail.
        lam = self.distrib.sample((batch_size,)).flatten().to(device)
        lam = torch.stack([lam, 1 - lam], 1)
        # Three trailing singleton dims: the weight is shaped for 4-D inputs.
        self.lam = lam.max(1)[0][:, None, None, None]

        shuffle = torch.randperm(batch_size).to(device)
        xb1 = tuple(x[shuffle] for x in self.xb)
        self.yb1 = tuple(y[shuffle] for y in self.yb)
        self.learn.xb = tuple(
            torch.lerp(x1, x, self.lam) for x, x1 in zip(self.xb, xb1))

    def after_fit(self):
        """Restore the loss function that was active before the fit."""
        # Uses `self.learn`, matching begin_fit (this method used `self.run`).
        self.learn.loss_func = self.old_loss_func

    def loss_func(self, pred, *yb):
        """Blend the losses against the original and the shuffled targets."""
        # Uses `self.training`, matching begin_batch (was `self.in_train`).
        if not self.training:
            return self.old_loss_func(pred, *yb)
        with NoneReduce(self.old_loss_func) as loss_func:
            loss1 = loss_func(pred, *yb)
            loss2 = loss_func(pred, *self.yb1)
        # self.lam is shaped (batch, 1, 1, 1) for the inputs. Used as-is
        # against a lower-rank unreduced loss it would broadcast across
        # samples instead of pairing weight i with loss i, so reshape it to
        # the loss's own rank first.
        lam = self.lam.reshape(-1, *([1] * (loss1.dim() - 1)))
        loss = torch.lerp(loss2, loss1, lam)
        return reduce_loss(loss, getattr(self.old_loss_func, 'reduction', 'mean'))
class BatchMixupLayer(BaseMixupLayer):
    """Mixup layer for batch mixup.

    Args:
        alpha (float): Parameters for Beta distribution.
        num_classes (int): The number of classes.
    """

    def __init__(self, alpha, num_classes):
        super().__init__()

        assert isinstance(alpha, float)
        assert isinstance(num_classes, int)

        self.alpha = alpha
        self.num_classes = num_classes
        self.Beta = Beta(self.alpha, self.alpha)

    def mixup(self, img, gt_label):
        """Mix images and one-hot labels with one batch-wide coefficient.

        Returns:
            tuple: ``(mixed_img, mixed_gt_label)``; the labels are the blended
            one-hot encodings of ``gt_label``.
        """
        # Coefficient first, permutation second: same random-stream order.
        lam = self.Beta.sample()
        index = torch.randperm(img.size(0))

        one_hot = F.one_hot(gt_label, num_classes=self.num_classes)
        partner_img = img[index, :]
        partner_one_hot = one_hot[index, :]

        mixed_img = lam * img + (1 - lam) * partner_img
        mixed_gt_label = lam * one_hot + (1 - lam) * partner_one_hot
        return mixed_img, mixed_gt_label

    def __call__(self, img, gt_label):
        """Alias for :meth:`mixup`."""
        return self.mixup(img, gt_label)
class Cutmix(Callback):
    """CutMix callback: paste a random box from a shuffled batch into each image.

    Args:
        alpha: Value used for both concentration parameters of the Beta
            distribution from which the mixing coefficient is drawn.

    Attributes set per batch:
        lam: Python float, the fraction of each image left untouched after
            the box has been clamped to the image.
        yb1: Targets reordered with the same permutation as the pasted
            images, so callers can pass them to :meth:`lf`.
    """

    def __init__(self, alpha=0.3):
        self.distrib = Beta(tensor(alpha), tensor(alpha))

    def on_train_batch_start(self, batch, batch_idx, dataloader_idx):
        """Apply CutMix in place to the input tensor of ``batch``.

        ``batch`` is unpacked as ``(xb, yb)``. ``xb`` is indexed as
        ``[:, :, rows, cols]`` with height at dim 2 and width at dim 3.
        """
        xb, yb = batch
        # The sizes and device are read from the unpacked batch. This method
        # previously mixed the local `xb` with `self.xb` / `self.y`, which
        # nothing in this class assigns.
        w, h = xb.size(3), xb.size(2)
        lam = self.distrib.sample((1,)).squeeze().to(xb.device)

        shuffle = torch.randperm(yb.size(0)).to(xb.device)
        xb1 = xb[shuffle]
        # Previously computed and discarded; kept so the shuffled targets are
        # reachable from the loss computation.
        self.yb1 = yb[shuffle]

        x1, y1, x2, y2 = self.rand_bbox(w, h, lam)
        # Rows (dim 2) are the y range, columns (dim 3) the x range. The
        # original sliced dim 2 with the x range, which is only equivalent
        # for square images.
        xb[:, :, y1:y2, x1:x2] = xb1[:, :, y1:y2, x1:x2]

        # Recompute the coefficient from the clamped box actually pasted.
        self.lam = (1 - ((x2 - x1) * (y2 - y1)) / float(w * h)).item()

    def lf(self, pred, yb1, yb2):
        """Interpolate the losses against two target sets with ``self.lam``.

        This used to be decorated with ``@staticmethod`` although it takes
        ``self``, so an ordinary ``obj.lf(pred, yb1, yb2)`` call bound the
        arguments to the wrong parameters.
        NOTE(review): ``self.loss_fn`` is not assigned in this class;
        presumably the base class or the trainer provides it.
        """
        loss = torch.lerp(self.loss_fn(pred, yb1), self.loss_fn(pred, yb2), self.lam)
        return loss

    def rand_bbox(self, w, h, lam):
        """Return a random box ``(x1, y1, x2, y2)`` clamped to a ``w`` x ``h`` image.

        ``lam`` must be a tensor. The unclamped box covers a ``1 - lam``
        fraction of the image; each returned coordinate is a 1-element
        tensor.
        """
        cut_rat = torch.sqrt(1. - lam)
        cut_w = (w * cut_rat).type(torch.long)
        cut_h = (h * cut_rat).type(torch.long)

        # Place the centre on lam's device (the batch device once lam has
        # been moved there) rather than reading self.xb.
        device = cut_rat.device
        cx = torch.randint(0, w, (1,)).to(device)
        cy = torch.randint(0, h, (1,)).to(device)

        x1 = torch.clamp(cx - cut_w // 2, 0, w)
        y1 = torch.clamp(cy - cut_h // 2, 0, h)
        x2 = torch.clamp(cx + cut_w // 2, 0, w)
        y2 = torch.clamp(cy + cut_h // 2, 0, h)
        return x1, y1, x2, y2
class MixUp(Callback):
    """Mixup callback: blend each training batch with a shuffled copy.

    Per-sample weights are drawn from ``Beta(alpha, alpha)``; the learner's
    loss function is replaced by :meth:`loss_func` for the duration of the
    fit.

    NOTE(review): this class mixes two attribute vocabularies
    (``self.learn`` / ``self.training`` in ``begin_fit`` and ``begin_batch``,
    ``self.run`` / ``self.in_train`` in ``after_fit`` and ``loss_func``).
    Which one the ``Callback`` base provides cannot be determined from here,
    so those names are left as they were; confirm against the base class.

    Args:
        alpha: Value used for both concentration parameters of the Beta
            distribution.
    """

    _order = 90  # Runs after normalization and cuda

    def __init__(self, alpha=0.4):
        self.distrib = Beta(tensor([alpha]), tensor([alpha]))

    def begin_fit(self):
        """Install the mixup loss, remembering the learner's original one."""
        # The saved "old" loss must be the learner's loss. Saving
        # `self.loss_func` here (as before) made loss_func call itself
        # forever and left nothing to restore in after_fit.
        self.old_loss_func, self.learn.loss_func = self.learn.loss_func, self.loss_func

    def begin_batch(self):
        """Blend ``xb`` with a shuffled copy of itself (training only)."""
        if not self.training:
            return  # Only mixup things during training

        batch_size = self.yb.size(0)
        device = self.xb.device

        # flatten() instead of squeeze(): squeeze() would collapse a batch of
        # one sample to a 0-d tensor and make the stack along dim 1 fail.
        lam = self.distrib.sample((batch_size,)).flatten().to(device)
        lam = torch.stack([lam, 1 - lam], 1)
        # Three trailing singleton dims: the weight is shaped for 4-D inputs.
        self.lam = lam.max(1)[0][:, None, None, None]

        shuffle = torch.randperm(batch_size).to(device)
        xb1, self.yb1 = self.xb[shuffle], self.yb[shuffle]
        self.learn.xb = torch.lerp(xb1, self.xb, self.lam)

    def after_fit(self):
        """Restore the loss function that was active before the fit."""
        self.run.loss_func = self.old_loss_func

    def loss_func(self, pred, yb):
        """Blend the losses against the original and the shuffled targets."""
        if not self.in_train:
            return self.old_loss_func(pred, yb)
        with NoneReduce(self.old_loss_func) as loss_func:
            loss1 = loss_func(pred, yb)
            loss2 = loss_func(pred, self.yb1)
        # self.lam is shaped (batch, 1, 1, 1) for the inputs. Used as-is
        # against a lower-rank unreduced loss it would broadcast across
        # samples instead of pairing weight i with loss i, so reshape it to
        # the loss's own rank first.
        lam = self.lam.reshape(-1, *([1] * (loss1.dim() - 1)))
        loss = torch.lerp(loss2, loss1, lam)
        return reduce_loss(loss, getattr(self.old_loss_func, 'reduction', 'mean'))
def __getitem__(self, idx):
    """Generate one random priority-sort example.

    ``idx`` only acts as a counter while generating batches; it does not
    influence the generated data.

    Returns:
        dict: ``{'input': input_seq, 'target': target_seq}``. ``input_seq``
        has ``seq_width`` random-bit columns plus one extra priority column;
        ``target_seq`` holds the bit columns of the ``target_seq_len`` rows
        with the highest priority, in descending priority order.
    """
    bit_prob = 0.5 * torch.ones([self.input_seq_len, self.seq_width],
                                dtype=torch.float64)
    seq = Binomial(1, bit_prob).sample()

    # Extra input channel for providing priority value
    input_seq = torch.zeros([self.input_seq_len, self.seq_width + 1])
    input_seq[:self.input_seq_len, :self.seq_width] = seq

    if self.uniform:
        # torch's Uniform draws samples from the half-open interval
        # [low, high) but in the paper the priorities are drawn from [-1, 1].
        # This minor difference is ignored here as it supposedly does not
        # affect the task.
        priority = Uniform(torch.tensor([-1.0]), torch.tensor([1.0]))
    else:
        alpha = torch.tensor([2.0])
        beta = torch.tensor([5.0])
        if self.random_distr:
            # Draw both Beta parameters from Uniform[0, 100).
            alpha_beta_gen = Uniform(torch.tensor([0.0]), torch.tensor([100.0]))
            alpha = alpha_beta_gen.sample()
            beta = alpha_beta_gen.sample()
        priority = Beta(alpha, beta)

    # One draw per row, written into the last (priority) column.
    priority_col = self.seq_width
    for row in range(self.input_seq_len):
        input_seq[row, priority_col] = priority.sample()

    _, sorted_index = torch.sort(input_seq[:, -1], descending=True)
    ranked = input_seq[sorted_index]
    target_seq = ranked[:self.target_seq_len, :self.seq_width]

    return {'input': input_seq, 'target': target_seq}
def train_step(
        self, sample, model, criterion, optimizer, update_num, ignore_grad=False):
    """Run one mixup training step on a batch paired with a shuffled copy.

    The batch is paired with a random permutation of itself and a per-sample
    mixing coefficient ``lambda_ = max(l, 1 - l)`` with
    ``l ~ Beta(alpha, alpha)`` is passed to ``criterion`` together with both
    halves of the pair.

    Args:
        sample: Batch dict with keys ``id``, ``nsentences``, ``ntokens``,
            ``target`` and ``net_input`` (itself holding ``src_tokens``,
            ``prev_output_tokens`` and ``src_lengths``).
        model: Model; ``train()`` and ``set_num_updates`` are called on it.
        criterion: Called as ``criterion(model, pair_sample, lambda_=...)``
            and expected to return ``(loss, sample_size, logging_output)``.
        optimizer: Object whose ``backward(loss)`` is called.
        update_num: Update counter forwarded to the model.
        ignore_grad: When true the loss is multiplied by zero before the
            backward pass.

    Returns:
        tuple: ``(loss, sample_size, logging_output)`` from the criterion.
    """
    model.train()
    model.set_num_updates(update_num)

    bsz = len(sample["id"])
    shuffled_ids = np.arange(bsz)
    np.random.shuffle(shuffled_ids)

    net_input_a = sample["net_input"]
    net_input_b = {
        key: net_input_a[key][shuffled_ids]
        for key in ("src_tokens", "prev_output_tokens", "src_lengths")
    }

    pair_sample = {
        "id": sample["id"],
        "nsentences": sample["nsentences"],
        "ntokens": sample["ntokens"],
        "net_input_a": net_input_a,
        "net_input_b": net_input_b,
        "target_a": sample["target"],
        "target_b": sample["target"][shuffled_ids],
    }

    dist = Beta(self.args.alpha, self.args.alpha)
    # Put the coefficients on the batch's own device. The previous
    # hard-coded "cuda" failed on CPU-only machines and could pick the wrong
    # GPU when the batch lives on a non-default one.
    lambda_ = dist.sample(sample_shape=[bsz]).to(sample["target"].device)
    lambda_ = torch.max(lambda_, 1 - lambda_)
    if self.args.fp16:
        lambda_ = lambda_.half()

    loss, sample_size, logging_output = criterion(
        model, pair_sample, lambda_=lambda_)
    if ignore_grad:
        loss *= 0
    optimizer.backward(loss)
    return loss, sample_size, logging_output
class MixUp(Callback):
    """Mixup callback written with explicit weighted sums.

    Per-sample weights are drawn from ``Beta(alpha, alpha)``; the runner's
    loss function is replaced by :meth:`loss_func` for the duration of the
    fit.

    Args:
        alpha: Value used for both concentration parameters of the Beta
            distribution.
    """

    _order = 90  # Runs after normalization and cuda

    def __init__(self, alpha: float = 0.4):
        self.distrib = Beta(tensor([alpha]), tensor([alpha]))

    def begin_fit(self):
        """Install the mixup loss, remembering the original one."""
        self.old_loss_func, self.run.loss_func = self.run.loss_func, self.loss_func

    def begin_batch(self):
        """Blend ``xb`` with a shuffled copy of itself (training only)."""
        if not self.in_train:
            return  # Only mixup things during training

        batch_size = self.yb.size(0)
        device = self.xb.device

        # flatten() instead of squeeze(): squeeze() would collapse a batch of
        # one sample to a 0-d tensor, which cannot be indexed with [:, None].
        lambd = self.distrib.sample((batch_size,)).flatten().to(device)
        # Keep the larger of (lambd, 1 - lambd) for every sample.
        self.lambd = torch.cat([lambd[:, None], 1 - lambd[:, None]], 1).max(1)[0]

        shuffle = torch.randperm(batch_size).to(device)
        xb1, self.yb1 = self.xb[shuffle], self.yb[shuffle]

        # Three trailing singleton dims: the weights are shaped for 4-D inputs.
        keep = self.lambd[:, None, None, None]
        other = (1 - self.lambd)[:, None, None, None]
        self.run.xb = self.xb * keep + xb1 * other

    def after_fit(self):
        """Restore the loss function that was active before the fit."""
        self.run.loss_func = self.old_loss_func

    def loss_func(self, pred, yb):
        """Return the mean of the per-sample blended loss."""
        if not self.in_train:
            return self.old_loss_func(pred, yb)
        with NoneReduce(self.old_loss_func) as loss_func:
            loss1 = loss_func(pred, yb)
            loss2 = loss_func(pred, self.yb1)
        return (loss1 * self.lambd + loss2 * (1 - self.lambd)).mean()
def mixup_data(x: torch.FloatTensor, y: torch.LongTensor, alpha: float = 1.0):
    """Mix a batch of two-channel maps with a shuffled copy of itself.

    Args:
        x: 4-D batch ``(B, C, H, W)``; only ``C == 2`` is supported.
        y: Labels, with the same first-dimension size as ``x``.
        alpha: Beta concentration. When ``alpha > 0`` one coefficient
            ``lam ~ Beta(alpha, alpha)`` is drawn for the whole batch;
            otherwise ``lam`` is 1.

    Returns:
        tuple: ``(mixed_x, y_a, y_b, lam)`` where ``y_a`` is ``y``, ``y_b``
        is ``y`` under the same permutation used for the inputs, and ``lam``
        is a Python float.

    Raises:
        ValueError: If ``x`` and ``y`` differ in their first dimension.
        NotImplementedError: If ``x`` does not have exactly two channels.
    """
    if len(x) != len(y):
        raise ValueError(
            "The size of `x` and `y` must match in the first dim.")

    lam = 1.
    if alpha > 0.:
        concentration = torch.tensor([float(alpha)])
        lam = Beta(concentration, torch.tensor([float(alpha)])).sample().item()

    batch_size, num_channels, _, _ = x.size()
    index = torch.randperm(batch_size).to(x.device)

    if num_channels != 2:
        raise NotImplementedError

    # For WM811K, the input tensors `x` have two channels: the first is one
    # for fail and zero for pass, the second is one for valid bins and zero
    # for null bins.
    fail_mix = lam * x[:, 0, :, :] + (1 - lam) * x[index, 0, :, :]      # (B, H, W)
    # Valid-bin masks are combined by a clamped sum, not interpolated.
    valid_mix = torch.clamp(x[:, 1, :, :] + x[index, 1, :, :], min=0, max=1)  # (B, H, W)
    mixed_x = torch.stack([fail_mix, valid_mix], dim=1)                 # (B, 2, H, W)

    return mixed_x, y, y[index], lam
def get_random_domainess(cur_iter, total_iter, batch):
    """Sample ``batch`` random 3-component weight vectors that sum to one.

    ``z`` is drawn from ``Beta(alpha, 1)`` with
    ``alpha = exp((cur_iter - 0.5 * total_iter) / (0.25 * total_iter))``.
    One uniform factor, shared by the whole batch, splits ``z`` into
    ``z2 = z * u`` and ``z - z2``.

    Returns:
        torch.Tensor: Shape ``(batch, 3)`` with columns
        ``[1 - z, z2, z - z2]``.
    """
    schedule = (cur_iter - (0.5 * total_iter)) / (0.25 * total_iter)
    alpha = np.exp(schedule)

    # Beta draw first, uniform draw second: same random-stream order.
    z = Beta(alpha, 1).sample((batch, 1))
    z2 = z * torch.rand(1)

    return torch.cat([1 - z, z2, z - z2], dim=1)
def forward(self, data):
    """Sample one action string per configured action from the policy heads.

    The grid encoding is concatenated with the other inputs, passed through
    ``self.trunk`` and ``self.action_output``, and the absolute values of the
    output are split into Beta parameters (two per continuous action),
    binary probabilities (sigmoid) and categorical probabilities (softmax).
    The sampled values and their distributions are stored on
    ``self.prev_actions`` and ``self.prev_dist``.

    Args:
        data: Mapping with keys ``'grid_vec'`` and ``'target'``;
            ``data['target']`` is indexed with ``'pos'`` and
            ``'previous_pos'``.

    Returns:
        list: One ``to_string`` result per continuous, binary and
        categorical action, in that order.
    """
    grid_vec = data['grid_vec']
    target = data['target']
    pos = target['pos']
    prev_pos = target['previous_pos']  # noqa: F841 - looked up but not used below

    # Any parameter serves as the dtype/device reference for the input.
    ref_param = next(self.parameters())

    # process grid
    grid_enc = self.grid_enc(grid_vec.to(ref_param).flatten())

    # concat grid with other inputs
    self.prev_actions = []
    self.prev_dist = []
    # NOTE(review): `target` is the mapping indexed with string keys just
    # above, yet it is passed to torch.cat as if it were a tensor, while
    # `prev_pos` is never used. This looks like it was meant to concatenate
    # `prev_pos`; left unchanged because the intended order cannot be
    # determined from here - confirm against the trunk's input layout.
    enc = torch.cat([grid_enc.squeeze(), target, pos], dim=0)
    dense = self.trunk(enc)
    params = torch.abs(self.action_output(dense))

    len_cont = len(self.cont_actions) * 2
    len_binary = len(self.binary_actions)

    beta_params = params[:len_cont]
    binary_params = torch.sigmoid(params[len_cont:len_cont + len_binary])
    # `params` is sliced as a 1-D vector here, so dim=0 is the dimension the
    # implicit-dim softmax call resolved to; passing it explicitly avoids the
    # implicit-dimension deprecation warning.
    cat_params = torch.nn.functional.softmax(
        params[len_cont + len_binary:], dim=0)

    actions = []
    # One (concentration1, concentration0) pair per continuous action.
    beta_params = beta_params.reshape((len(beta_params) // 2, 2))

    for a, param in zip(self.cont_actions, beta_params):
        dist = Beta(*param)
        self.prev_dist.append(dist)
        act = dist.sample()
        self.prev_actions.append(act)
        actions.append(a.to_string(a.scale(act)))

    for a, param in zip(self.binary_actions, binary_params):
        dist = Categorical(torch.as_tensor([param, 1 - param]))
        self.prev_dist.append(dist)
        act = dist.sample()
        self.prev_actions.append(act)
        actions.append(a.to_string(act))

    if self.categorical_actions:
        assert(len(self.categorical_actions) == 1)
        a = self.categorical_actions[0]
        dist = Categorical(cat_params)
        self.prev_dist.append(dist)
        act = dist.sample()
        self.prev_actions.append(act)
        actions.append(a.to_string(act))

    return actions
def reinforce(env,
              policy_estimator,
              num_episodes=2000,
              batch_size=10,
              gamma=0.99):
    """Run REINFORCE-style training with a Beta-distributed policy.

    For every episode the environment is stepped until it reports done, with
    actions sampled from ``Beta(a, b)`` where ``(a, b)`` come from
    ``policy_estimator.foward``. Once more than 256 steps have accumulated
    since the last update, the just-finished episode's discounted returns,
    states and actions are handed to ``policy_estimator.update``.

    Args:
        env: Environment exposing ``reset()`` and ``step(action)``.
        policy_estimator: Object exposing ``foward`` and ``update``.
        num_episodes: Number of episodes to run.
        batch_size: Accepted for interface compatibility; not used.
        gamma: Discount factor passed to ``discount_rewards``.

    Returns:
        tuple: ``(total_rewards, days_counter)``, the per-episode reward sums
        and step counts.
    """
    total_rewards, days_counter = [], []
    batch_rewards, batch_states, batch_actions = [], [], []
    counter = 0
    ep = 0
    days = 0

    while ep < num_episodes:
        s_0 = env.reset()
        days = 0
        states, rewards, actions = [], [], []
        done = False

        while done == False:  # noqa: E712 - keep the exact comparison
            if days > 1000:
                print(days)
            processed_state = process(s_0, 50000)
            a, b = policy_estimator.foward(processed_state)
            action = Beta(a, b).sample().detach().numpy()
            s_1, r, done, _ = env.step(action)

            states.append(processed_state)
            rewards.append(r)
            actions.append(action)

            days += 1
            counter += 1
            s_0 = s_1

        ep += 1
        total_rewards.append(sum(rewards))
        days_counter.append(days)

        # NOTE(review): the original nesting was lost; this follows the most
        # literal reading, in which only the episode that crosses the
        # 256-step threshold contributes to the update batch.
        if counter > 256 and done:
            returns = discount_rewards(rewards, gamma)
            batch_states.extend(states)
            batch_rewards.extend(returns)
            batch_actions.extend(actions)

            state_tensor = torch.FloatTensor(batch_states)
            reward_tensor = torch.FloatTensor(batch_rewards)
            a_tnsr, b_tnsr = policy_estimator.foward(state_tensor)
            action_tensor = torch.FloatTensor(batch_actions)

            policy_estimator.update(a_tnsr, b_tnsr, action_tensor,
                                    reward_tensor)

            batch_rewards, batch_actions, batch_states = [], [], []
            counter = 0

    return total_rewards, days_counter
def select_action(self, state, deterministic, reparameterize=False):
    """Sample an action from the Beta policy for ``state``.

    Args:
        state: Input forwarded to ``self.forward``, which returns the two
            Beta concentration tensors.
        deterministic: Accepted for interface compatibility; not used.
        reparameterize: When true, draw with ``rsample`` so gradients can
            flow through the sample; otherwise use ``sample``.

    Returns:
        tuple: ``(action, dist)`` - the sample and the ``Beta`` distribution
        it was drawn from.
    """
    alpha, beta = self.forward(state)
    dist = Beta(concentration1=alpha, concentration0=beta)

    draw = dist.rsample if reparameterize else dist.sample
    action = draw()  # (bsize, action_dim)
    return action, dist
def sample_action(self, s):
    """Sample one action for a single state from the Beta policy.

    The state is wrapped into a batch of one and passed to ``self.forward``.
    The first ``self.act_dim`` output columns and the remaining columns are
    each squashed with ``sigmoid(.) * 5`` and used as the two Beta
    concentrations.

    Returns:
        numpy.ndarray: The sampled action with the batch dimension removed.
    """
    state_batch = T.tensor(s).unsqueeze(0)
    raw = self.forward(state_batch)

    split = self.act_dim
    concentration1 = F.sigmoid(raw[:, :split]) * 5
    concentration0 = F.sigmoid(raw[:, split:]) * 5

    sampled = Beta(concentration1, concentration0).sample()
    return sampled.detach().squeeze(0).numpy()
def get_lambda(self, batch_size):
    """Sample one mixup coefficient per example.

    Each coefficient is ``max(l, 1 - l)`` with
    ``l ~ Beta(self.args.alpha, self.args.alpha)``, so every returned value
    is at least 0.5.

    Args:
        batch_size: Number of coefficients to draw.

    Returns:
        torch.Tensor: Shape ``(batch_size,)``, on CUDA when it is available
        and on the CPU otherwise.
    """
    dist = Beta(self.args.alpha, self.args.alpha)
    # The original referenced an undefined name `bsz` instead of the
    # `batch_size` parameter and moved the result to "cuda" unconditionally,
    # which fails on CPU-only machines.
    device = "cuda" if torch.cuda.is_available() else "cpu"
    lambda_ = dist.sample(sample_shape=[batch_size]).to(device)
    lambda_ = torch.max(lambda_, 1 - lambda_)
    return lambda_
class OutputMixup(Callback):
    """
    Callback that mixes the output of the last layer and the target.

    NOTE: this callback is not suitable for regression problems
    """

    run_after, run_valid = [Normalize], False

    def __init__(self, alpha: float = 0.4):
        "`alpha` is the parameter for the beta law."
        alpha = float(alpha)  # ensures alpha is a float, as an int would crash Beta
        self.distrib = Beta(tensor(alpha), tensor(alpha))

    def begin_fit(self):
        "Injects the new loss function"
        if getattr(self.learn.loss_func, 'y_int', False):
            # classification type of output
            self.old_loss_func = self.learn.loss_func
            self.learn.loss_func = self.mixed_loss
            print('Output mixup: the loss function is now properly wrapped.')
        else:
            # the output type seems unfit for instrumentation
            raise Exception(
                "You cannot use output mixup for regression problems.")

    def after_fit(self):
        "Restores the original loss function."
        self.learn.loss_func = self.old_loss_func

    def mixed_loss(self, pred, *yb):
        """
        Loss function that mixes the prediction before computing the loss and
        weighting it. This requires that the softmax / loss function is done
        fully inside the loss and not in the network.

        Outside of training the original loss function is called unchanged.
        """
        if not self.training:
            return self.old_loss_func(pred, *yb)
        with NoneReduce(self.old_loss_func) as lf:
            batch_size = len(*yb)  # expects exactly one target tensor
            # shuffle used to match batch elements
            shuffle = torch.randperm(batch_size).to(pred.device)
            # lambda used for the linear combination. flatten() instead of
            # squeeze(): squeeze() would collapse a batch of one sample to a
            # 0-d tensor and make the stack along dim 1 fail.
            lam = self.distrib.sample((batch_size,)).flatten().to(pred.device)
            lam = torch.stack([lam, 1 - lam], 1)
            lam = lam.max(1)[0]
            # mixed prediction
            pred_dims = len(pred.size())
            pred_mixed = torch.lerp(pred[shuffle], pred,
                                    weight=unsqueeze(lam, n=pred_dims - 1))
            # shuffled targets
            yb_shuffled = tuple(L(yb).itemgot(shuffle))
            # final loss
            loss = torch.lerp(lf(pred_mixed, *yb_shuffled),
                              lf(pred_mixed, *yb), lam)
        return reduce_loss(loss, getattr(self.old_loss_func, 'reduction', 'mean'))
class CutMix(Callback):
    "Implementation of `https://arxiv.org/abs/1905.04899`"

    run_after, run_valid = [Normalize], False

    def __init__(self, alpha=1.):
        self.distrib = Beta(tensor(alpha), tensor(alpha))

    def begin_fit(self):
        """Swap in the blended loss when the loss function has ``y_int`` set."""
        self.stack_y = getattr(self.learn.loss_func, 'y_int', False)
        if self.stack_y:
            self.old_lf, self.learn.loss_func = self.learn.loss_func, self.lf

    def after_fit(self):
        """Restore the original loss function if it was replaced."""
        if self.stack_y:
            self.learn.loss_func = self.old_lf

    def begin_batch(self):
        """Paste a random box from a shuffled copy of the batch into each image.

        The first input tensor is indexed as ``[:, :, rows, cols]`` with
        height at dim 2 and width at dim 3. ``self.lam`` ends up as the
        fraction of the image left untouched after clamping the box.
        """
        W, H = self.xb[0].size(3), self.xb[0].size(2)

        lam = self.distrib.sample((1, )).squeeze().to(self.x.device)
        lam = torch.stack([lam, 1 - lam])
        self.lam = lam.max()  # keep the larger of (lam, 1 - lam)

        shuffle = torch.randperm(self.y.size(0)).to(self.x.device)
        xb1 = tuple(L(self.xb).itemgot(shuffle))
        self.yb1 = tuple(L(self.yb).itemgot(shuffle))

        x1, y1, x2, y2 = self.rand_bbox(W, H, self.lam)
        # Rows (dim 2) are the y range, columns (dim 3) the x range. The
        # original sliced dim 2 with the x range, which is only equivalent
        # for square images and otherwise makes the pasted area disagree
        # with the coefficient computed below.
        self.learn.xb[0][:, :, y1:y2, x1:x2] = xb1[0][:, :, y1:y2, x1:x2]
        self.lam = (1 - ((x2 - x1) * (y2 - y1)) / (W * H)).type(torch.float)

        if not self.stack_y:
            ny_dims = len(self.y.size())
            self.learn.yb = tuple(
                L(self.yb1, self.yb).map_zip(
                    torch.lerp, weight=unsqueeze(self.lam, n=ny_dims - 1)))

    def lf(self, pred, *yb):
        """Blend the unreduced losses against shuffled and original targets."""
        if not self.training:
            return self.old_lf(pred, *yb)
        with NoneReduce(self.old_lf) as lf:
            loss = torch.lerp(lf(pred, *self.yb1), lf(pred, *yb), self.lam)
        return reduce_loss(loss, getattr(self.old_lf, 'reduction', 'mean'))

    def rand_bbox(self, W, H, lam):
        """Return a random box ``(x1, y1, x2, y2)`` clamped to a ``W`` x ``H`` image.

        The unclamped box covers a ``1 - lam`` fraction of the image; each
        returned coordinate is a 1-element tensor.
        """
        cut_rat = torch.sqrt(1. - lam)
        cut_w = (W * cut_rat).type(torch.long)
        cut_h = (H * cut_rat).type(torch.long)
        # uniform
        cx = torch.randint(0, W, (1, )).to(self.x.device)
        cy = torch.randint(0, H, (1, )).to(self.x.device)
        x1 = torch.clamp(cx - cut_w // 2, 0, W)
        y1 = torch.clamp(cy - cut_h // 2, 0, H)
        x2 = torch.clamp(cx + cut_w // 2, 0, W)
        y2 = torch.clamp(cy + cut_h // 2, 0, H)
        return x1, y1, x2, y2
class CutmixBlending(BaseMiniBatchBlending):
    """Implementing Cutmix in a mini-batch.

    This module is proposed in `CutMix: Regularization Strategy to Train
    Strong Classifiers with Localizable Features
    <https://arxiv.org/abs/1905.04899>`_.
    Code Reference https://github.com/clovaai/CutMix-PyTorch

    Args:
        num_classes (int): The number of classes.
        alpha (float): Parameters for Beta distribution.
    """

    def __init__(self, num_classes, alpha=.2):
        super().__init__(num_classes=num_classes)
        self.beta = Beta(alpha, alpha)

    @staticmethod
    def rand_bbox(img_size, lam):
        """Generate a random bounding box.

        The last two entries of ``img_size`` are read as height and width.
        Returns ``(bbx1, bby1, bbx2, bby2)`` clamped to the image.
        """
        h, w = img_size[-2], img_size[-1]

        cut_rat = torch.sqrt(1. - lam)
        half_w = torch.tensor(int(w * cut_rat)) // 2
        half_h = torch.tensor(int(h * cut_rat)) // 2

        # uniform centre; x is drawn before y
        cx = torch.randint(w, (1, ))[0]
        cy = torch.randint(h, (1, ))[0]

        bbx1 = torch.clamp(cx - half_w, 0, w)
        bby1 = torch.clamp(cy - half_h, 0, h)
        bbx2 = torch.clamp(cx + half_w, 0, w)
        bby2 = torch.clamp(cy + half_h, 0, h)

        return bbx1, bby1, bbx2, bby2

    def do_blending(self, imgs, label, **kwargs):
        """Blending images with cutmix.

        ``imgs`` is modified in place: a random box is overwritten with the
        same box from a permuted copy of the batch. The labels are blended
        with the fraction of the image left untouched.
        """
        assert len(kwargs) == 0, f'unexpected kwargs for cutmix {kwargs}'

        # Permutation, then coefficient, then box: same random-stream order.
        rand_index = torch.randperm(imgs.size(0))
        lam = self.beta.sample()

        bbx1, bby1, bbx2, bby2 = self.rand_bbox(imgs.size(), lam)
        imgs[:, ..., bby1:bby2, bbx1:bbx2] = imgs[rand_index, ..., bby1:bby2,
                                                  bbx1:bbx2]

        # Recompute the coefficient from the clamped box actually pasted.
        box_area = 1.0 * (bbx2 - bbx1) * (bby2 - bby1)
        lam = 1 - (box_area / (imgs.size()[-1] * imgs.size()[-2]))

        label = lam * label + (1 - lam) * label[rand_index, :]
        return imgs, label
class Mixup(Callback):
    """Mixup callback that overrides the learner's loss after each batch.

    Per-sample weights ``t`` are drawn from ``Beta(alpha, alpha)``. The input
    batch is replaced by a blend with a shuffled copy, and after the loss has
    been computed it is overwritten with the mean of the matching blend of
    unreduced losses.

    Args:
        alpha: Value used for both concentration parameters of the Beta
            distribution.
    """

    run_valid = False

    def __init__(self, alpha=0.4):
        self.distrib = Beta(tensor(alpha), tensor(alpha))

    def before_batch(self):
        """Blend ``x`` with a shuffled copy and store it as ``learn.xb``."""
        batch_size = self.y.size(0)
        device = self.x.device

        # flatten() instead of squeeze(): squeeze() would collapse a batch of
        # one sample to a 0-d tensor, which cannot be indexed with [:, None].
        self.t = self.distrib.sample((batch_size,)).flatten().to(device)

        shuffle = torch.randperm(batch_size).to(device)
        x1, self.y1 = self.x[shuffle], self.y[shuffle]

        # Three trailing singleton dims: the weight is shaped for 4-D inputs.
        t = self.t[:, None, None, None]
        self.learn.xb = (x1 * (1 - t) + self.x * t,)

    def after_loss(self):
        """Replace ``learn.loss`` with the mean blended per-sample loss."""
        with NoneReduce(self.loss_func) as lf:
            loss = lf(self.pred, self.y1) * (1 - self.t) + lf(self.pred, self.y) * self.t
        self.learn.loss = loss.mean()
class Prior:
    """Prior over four parameters: bottom, ee50, slope and top.

    ``bottom`` and ``top`` follow Beta distributions; ``ee50`` and ``slope``
    follow Normal distributions.

    NOTE(review): ``torch.distributions.Normal`` takes a standard deviation
    as its second argument, yet the values here are written as squares
    (``15 ** 2``, ``0.1 ** 2``), which reads like variances. Confirm which
    one is intended.
    """

    def __init__(self):
        self.r_bottom = Beta(4, 96)
        self.r_ee50 = Normal(-50, 15 ** 2)
        self.r_slope = Normal(-0.15, 0.1 ** 2)
        self.r_top = Beta(25, 75)

    def sample(self, sample_shape=torch.Size()):
        """Draw joint samples.

        Returns:
            torch.Tensor: One row per draw, with columns
            ``[bottom, ee50, slope, top]``.
        """
        # The four components are drawn in column order.
        components = (self.r_bottom, self.r_ee50, self.r_slope, self.r_top)
        columns = [dist.sample(sample_shape).view(-1, 1) for dist in components]
        return torch.cat(columns, dim=1)

    def log_prob(self, sample):
        """Not available for this prior; always raises ``IntractableException``."""
        raise IntractableException
def train_on_batch(self, batch):
    """Perform one optimization step.

    Args:
        batch (tuple): batches of environment observations, calling
            programs, lstm hidden and cell states, policy labels and value
            labels (read from positions 0 to 4).

    Returns:
        policy loss, value loss, total loss combining policy and value losses
    """
    e_t = torch.FloatTensor(np.stack(batch[0]))
    i_t = batch[1]

    # Split the (hidden, cell) pairs into two stacked, squeezed tensors.
    hidden_states, cell_states = zip(*batch[2])
    h_t = torch.squeeze(torch.stack(list(hidden_states)))
    c_t = torch.squeeze(torch.stack(list(cell_states)))

    policy_labels = torch.squeeze(torch.stack(batch[3]))
    value_labels = torch.stack(batch[4]).view(-1, 1)

    self.optimizer.zero_grad()
    policy_predictions, value_predictions, _, _ = self.predict_on_batch(
        e_t, i_t, h_t, c_t)

    # The first two policy outputs are used as Beta concentrations.
    beta = Beta(policy_predictions[0], policy_predictions[1])
    policy_action = beta.sample()
    prob_action = beta.log_prob(policy_action)
    log_mcts = self.temperature * torch.log(policy_labels)

    # NOTE(review): the original nesting was lost; only this difference is
    # assumed to be computed without gradient tracking.
    with torch.no_grad():
        modified_kl = prob_action - log_mcts

    # NOTE(review): torch.log(modified_kl) is NaN wherever modified_kl is
    # negative - confirm the intended formula.
    policy_loss = -modified_kl * (torch.log(modified_kl) + prob_action)
    entropy_loss = self.entropy_lambda * beta.entropy()
    policy_network_loss = policy_loss + entropy_loss

    squared_error = torch.pow(value_predictions - value_labels, 2)
    value_network_loss = squared_error.mean()

    total_loss = (policy_network_loss + value_network_loss) / 2
    total_loss.backward()
    self.optimizer.step()

    return policy_network_loss, value_network_loss, total_loss
class AugMix(nn.Module):
    """AugMix data augmentation plus the Jensen-Shannon consistency term.

    Args:
        k: Number of augmentation chains mixed together.
        alpha: Concentration used for both the Dirichlet (chain weights) and
            the Beta (blend with the original image) distributions.
        severity: Severity passed to every augmentation operation.
    """

    def __init__(self, k=3, alpha=1, severity=3):
        super(AugMix, self).__init__()
        self.k = k
        self.alpha = alpha
        self.severity = severity
        self.dirichlet = Dirichlet(
            torch.full(torch.Size([k]), alpha, dtype=torch.float32))
        self.beta = Beta(alpha, alpha)
        self.augs = augmentations
        self.kl = nn.KLDivLoss(reduction='batchmean')

    def augment_and_mix(self, images, preprocess):
        '''
        Args:
            images: PIL Image
            preprocess: transform[ToTensor, Normalize]

        Returns: AugmentAndMix Tensor
        '''
        mix = torch.zeros_like(preprocess(images))
        w = self.dirichlet.sample()
        for i in range(self.k):
            # Every chain starts from a fresh copy of the input.
            aug = images.copy()
            depth = np.random.randint(1, 4)  # chain length in 1..3
            for _ in range(depth):
                op = np.random.choice(self.augs)
                # Honour the configured severity. This call previously passed
                # a literal 3, so the constructor argument had no effect
                # (the default, 3, behaves exactly as before).
                aug = op(aug, self.severity)
            mix = mix + w[i] * preprocess(aug)

        m = self.beta.sample()
        augmix = m * preprocess(images) + (1 - m) * mix
        return augmix

    def jensen_shannon(self, logits_o, logits_1, logits_2):
        """Return the Jensen-Shannon divergence between three predictions.

        Each logits tensor is turned into probabilities with a softmax over
        dim 1; the result is the average KL divergence of each distribution
        from their clamped mean.
        """
        p_o = F.softmax(logits_o, dim=1)
        p_1 = F.softmax(logits_1, dim=1)
        p_2 = F.softmax(logits_2, dim=1)

        # kl(q.log(), p) -> KL(p, q)
        M = torch.clamp((p_o + p_1 + p_2) / 3, 1e-7, 1)  # to avoid exploding
        log_m = M.log()
        js = (self.kl(log_m, p_o) + self.kl(log_m, p_1) +
              self.kl(log_m, p_2)) / 3
        return js
class MixUpCallback(Callback):
    """Mixup callback: blend each training batch with a shuffled copy.

    Per-sample weights are drawn from ``Beta(α, α)``; the runner's loss
    function is replaced by :meth:`loss_func` for the duration of the fit.

    Args:
        α: Value used for both concentration parameters of the Beta
            distribution.
    """

    order = 90  # Runs after normalization and cuda

    def __init__(self, α: float = 0.4):
        super().__init__()
        self.distrib = Beta(tensor([α]), tensor([α]))
        self.old_loss_func = None
        self.λ, self.yb1 = None, None

    def begin_fit(self):
        ''' Install the mixup loss, remembering the original one '''
        self.old_loss_func, self.run.loss_func = self.run.loss_func, self.loss_func

    def begin_batch(self):
        ''' Mix the x_batch and y_batch at the beginning of each batch '''
        if not self.in_train:
            return  # Only mixup things during training

        batch_size = self.y_batch.size(0)
        device = self.x_batch.device

        # flatten() instead of squeeze(): squeeze() would collapse a batch of
        # one sample to a 0-d tensor and make the stack along dim 1 fail.
        λ = self.distrib.sample((batch_size,)).flatten().to(device)
        λ = torch.stack([λ, 1 - λ], 1)
        # NOTE(review): `unsqueeze` is a project helper; presumably it appends
        # singleton dims 1..3 so the weight broadcasts over a 4-D batch.
        self.λ = unsqueeze(λ.max(1)[0], (1, 2, 3))

        shuffle = torch.randperm(batch_size).to(device)
        xb1, self.yb1 = self.x_batch[shuffle], self.y_batch[shuffle]
        self.run.xb = lin_comb(self.x_batch, xb1, self.λ)

    def after_fit(self):
        ''' Returns the loss function to the original loss function '''
        self.run.loss_func = self.old_loss_func

    def loss_func(self, pred, yb):
        ''' The loss function for the mixup '''
        if not self.in_train:
            return self.old_loss_func(pred, yb)
        with NoneReduce(self.old_loss_func) as loss_func:
            loss1 = loss_func(pred, yb)
            loss2 = loss_func(pred, self.yb1)
        # self.λ carries extra singleton dims sized for the input batch. Used
        # as-is against a lower-rank unreduced loss it would broadcast across
        # samples instead of pairing weight i with loss i, so reshape it to
        # the loss's own rank first.
        λ = self.λ.reshape(-1, *([1] * (loss1.dim() - 1)))
        loss = lin_comb(loss1, loss2, λ)
        return reduce_loss(loss, getattr(self.old_loss_func, 'reduction', 'mean'))