def __init__(self, config, model, train_loader, val_loader):
    """Set up training state: loaders, loss modules, optimizer and LR schedule.

    Args:
        config: options object exposing ``epoch``, ``check_point``,
            ``save_point`` and ``base_lr``.
        model: the network to optimize.
        train_loader, val_loader: torch DataLoader instances.
    """
    super(Trainer, self).__init__()
    self.epoch = config.epoch
    self.train_loader = train_loader
    self.val_loader = val_loader
    self.train_batch_size = train_loader.batch_size
    # was: train_loader.__len__() — call len() instead of the dunder.
    self.train_epoch_step = len(train_loader)
    # Clamp the checkpoint/save intervals so they never exceed one epoch
    # (was an explicit conditional expression; min() is equivalent).
    self.check_point = min(config.check_point, self.train_epoch_step)
    self.save_point = min(config.save_point, self.train_epoch_step)
    # Criterion zoo used by the training loops.
    self.get_ce_loss = nn.CrossEntropyLoss()
    self.get_nonreduce_celoss = nn.CrossEntropyLoss(reduction='none')
    self.get_kld_loss = nn.KLDivLoss(reduction='batchmean')
    self.get_l1_loss = nn.L1Loss()
    self.get_smooth_l1_loss = nn.SmoothL1Loss()
    self.get_sim_loss_noreduce = nn.CosineEmbeddingLoss(reduction='none')
    self.get_sim_loss = nn.CosineEmbeddingLoss()
    self.get_sim = nn.CosineSimilarity()
    self.model = model
    self.optim = optim.SGD(self.model.parameters(), lr=config.base_lr, momentum=0.9)
    # Step-decay LR: x0.1 at epochs 20/40/60/80.
    self.scheduler = lr_scheduler.MultiStepLR(self.optim, milestones=[20, 40, 60, 80], gamma=0.1)
    self.epoch_num = 0
    self.step = 0
def compute_cycle_loss(feat1, feat2, paired=True, device='cuda'):
    """Cosine-embedding cycle loss between two feature batches.

    Args:
        feat1, feat2: (N, D) feature tensors.
        paired: if True the rows are positive pairs (target +1, pull
            together); otherwise negatives (target -1, margin 0.3).
        device: device for the +/-1 target vector.

    Returns:
        Scalar loss tensor.
    """
    # was: two branches each constructing their own CosineEmbeddingLoss —
    # only the target's sign differs, so build one criterion and one target.
    criterion = nn.CosineEmbeddingLoss(0.3)
    sign = 1.0 if paired else -1.0
    target = sign * torch.ones(feat1.shape[0], device=device)
    return criterion(feat1, feat2, target)
def __init__(self, margin=None, dist="euc"):
    """Configure the ranking criterion.

    Args:
        margin: margin for the ranking loss; when None a margin-free
            criterion is used (SoftMarginLoss for euclidean, zero-margin
            cosine otherwise).
        dist: "euc" for euclidean-distance ranking, "cos" for cosine.

    Raises:
        ValueError: if ``dist`` is neither "euc" nor "cos" (previously an
            unknown value silently left ``ranking_loss`` unset).
    """
    self.margin = margin
    if dist not in ("euc", "cos"):
        raise ValueError("dist must be 'euc' or 'cos', got: %r" % (dist,))
    if margin is not None:
        if dist == "euc":
            self.ranking_loss = nn.MarginRankingLoss(margin=margin)
        else:
            self.ranking_loss = nn.CosineEmbeddingLoss(margin=margin)
    else:
        if dist == "euc":
            # was: self.ranking_loss = self.ranking_loss = ... (redundant
            # double assignment removed).
            self.ranking_loss = nn.SoftMarginLoss()
        else:
            self.ranking_loss = nn.CosineEmbeddingLoss(margin=0)
def __init__(self, d_dim, margin, lamb, dim1, dim2, dim_label, dim_domain, num_epochs, batch_size, model_path,exp_id, use_gpu=True, validation=True):
    # Adversarial domain-generalization trainer: builds the scDGN network
    # plus its label, reconstruction and domain-contrastive criteria.
    #Setup network
    super(ADGTrainer, self).__init__(d_dim, dim1, dim2, dim_label, num_epochs, batch_size, model_path, exp_id,use_gpu, validation)
    self.lamb = lamb  # weight of the adversarial/domain term (used by caller)
    self.dim_domain = dim_domain
    # NOTE(review): self.d_dim / self.dim1 / self.dim2 / self.dim_label are
    # assumed to be set by the base-class __init__ above — confirm in parent.
    if(use_gpu):
        self.D = scDGN(self.d_dim, self.dim1, self.dim2, self.dim_label, self.dim_domain).cuda()
    else:
        self.D = scDGN(self.d_dim, self.dim1, self.dim2, self.dim_label, self.dim_domain)
    # NOTE(review): the criteria below call .cuda() unconditionally even when
    # use_gpu is False — this will fail on CPU-only machines; verify intent.
    self.L_L = nn.CrossEntropyLoss().cuda()
    self.decoder_loss1 = nn.CosineEmbeddingLoss().cuda()
    self.decoder_loss2 = nn.CosineEmbeddingLoss().cuda()
    self.L_D = ContrastiveLoss(margin=margin).cuda()
    self.optimizer = optim.SGD([{'params':self.D.parameters()}], lr=1e-3, momentum=0.9, weight_decay=1e-6, nesterov=True)
def __init__(self, config: Dict):
    """Wrap a freshly-initialised DistilBERT encoder with a cosine-embedding
    criterion.

    Args:
        config: dict whose "model" entry holds DistilBertConfig kwargs.
    """
    super().__init__()
    self.config = config
    model_kwargs = self.config["model"]
    self.model_config = DistilBertConfig(**model_kwargs)
    self.model = DistilBertModel(self.model_config)
    # margin=0.0 / reduction='mean' are the library defaults, kept explicit.
    self.criterion = nn.CosineEmbeddingLoss(margin=0.0, reduction='mean')
def loss_func(data, decoded):
    """Cosine reconstruction loss between input and decoded batches.

    Args:
        data: (N, D) original vectors.
        decoded: (N, D) reconstructed vectors.

    Returns:
        Scalar tensor: mean cosine-embedding loss with an all +1 target.
    """
    cossim_loss = nn.CosineEmbeddingLoss()  # built-in cosine criterion
    # Target +1 per row: every (data_i, decoded_i) pair should be similar.
    # was: built via numpy as shape (N, 1); a flat (N,) tensor is the
    # documented target shape and yields the same value for all-ones targets.
    y = torch.ones(data.shape[0])
    # was: an nn.MSELoss() instance was constructed but never used — removed.
    return cossim_loss(data, decoded, y)
def cosine_embedding_loss(device, probe_embeddings, labels, gallery_loader, train_idx_to_class):
    """Cosine loss pulling each probe toward its matching gallery identity and
    pushing it away from up to 10 non-matching gallery samples.

    Args:
        device: torch device for the +/-1 targets.
        probe_embeddings: iterable of 512-d probe embeddings.
        labels: class indices aligned with ``probe_embeddings``.
        gallery_loader: yields (embedding, [class_name]) pairs.
        train_idx_to_class: maps a label index to its class name.

    Returns:
        Positive-pair loss plus the mean of the collected negative losses
        (just the positive-pair loss if no negatives were collected).
    """
    criterion = nn.CosineEmbeddingLoss()
    y1 = torch.ones(1).to(device)    # target +1: similar pair
    y2 = -torch.ones(1).to(device)   # target -1: dissimilar pair
    loss1 = 0
    loss2 = []
    flag = False  # set once the matching gallery entry has been seen
    for idx, probe_embd in enumerate(probe_embeddings):
        probe_target = train_idx_to_class[labels[idx].item()]
        for gallery_embedding, gallery_target in gallery_loader:
            if probe_target == gallery_target[0]:
                # NOTE: loss1 keeps only the most recent positive match,
                # as in the original implementation.
                loss1 = criterion(probe_embd.reshape([1, 512]), gallery_embedding.to(device), y1)
                flag = True
            else:
                # Cap the negative set at 10; once the positive has also been
                # found we can stop scanning the gallery entirely.
                if len(loss2) >= 10 and flag:
                    break
                elif len(loss2) >= 10:
                    continue
                tmp_loss = criterion(probe_embd.reshape([1, 512]), gallery_embedding.to(device), y2)
                loss2.append(tmp_loss)
    if not loss2:
        # was: unconditional division — ZeroDivisionError when no negatives
        # were collected (e.g. gallery holds only matching identities).
        return loss1
    return loss1 + (sum(loss2) / len(loss2))
def embedding_expander(source, target, logger):
    """Learn a mapping from ``source`` word vectors to ``target`` space on the
    shared vocabulary, then project the source-only words through it.

    Args:
        source, target: keyed-vector objects exposing ``vocab``,
            ``vector_size`` and ``get_vector`` (gensim-style — assumed
            interface, confirm at call site).
        logger: standard logger for progress messages.

    Returns:
        dict mapping each word present only in ``source`` to its predicted
        vector in the target embedding space (numpy array).
    """
    source_words = set(source.vocab.keys())
    target_words = set(target.vocab.keys())
    intersection = source_words.intersection(target_words)
    logger.info(f"Intersection words: {len(intersection)}")
    logger.info(f"Creating loader...")
    loader = create_loader(intersection, source, target, "cpu")
    model = VectorTransformer(source.vector_size, target.vector_size)
    model.to("cpu")
    optimizer = optim.Adam(model.parameters())
    # Cosine loss with an all +1 target: each predicted vector should point
    # in the same direction as its target-space counterpart.
    loss_fn = nn.CosineEmbeddingLoss()
    logger.info(f"Training Vector Transformer...")
    for i in range(20):  # fixed 20-epoch training budget
        model.train()
        avg_loss = 0.
        for (x_batch, y_batch) in loader:
            y_pred = model(x_batch)
            dummy = torch.ones((y_batch.size(0), ))
            loss = loss_fn(y_pred, y_batch, dummy)
            optimizer.zero_grad()
            loss.backward()
            optimizer.step()
            avg_loss += loss.item() / len(loader)
        logger.info(f"Epoch {i + 1} avg_loss: {avg_loss:.4f}")
    # Words with no target-side vector: translate them with the learned model.
    source_only_words = source_words - intersection
    expanded_embedding = dict()
    for word in source_only_words:
        emb = source.get_vector(word)
        tensor = torch.tensor(emb, dtype=torch.float32).to("cpu")
        pred = model(tensor).detach().numpy()
        expanded_embedding[word] = pred
    return expanded_embedding
def train(sen, optimizer, train_set, nspeakers, batch_size, alpha=0.5):
    """Run one epoch of joint classification + embedding training for a
    siamese speaker-embedding network.

    Args:
        sen: siamese network returning ((pred, embed), (pred, embed)).
        optimizer: optimizer over ``sen``'s parameters.
        train_set: dataset exposing ``labels`` for the batch sampler.
        nspeakers: number of speaker classes for the contrastive sampler.
        batch_size: sampler batch size.
        alpha: mixing weight — alpha * classification + (1 - alpha) * embedding.

    Returns:
        Sum of the mixed per-batch losses over the epoch.
    """
    criterion = nn.CosineEmbeddingLoss()
    cuda = torch.cuda.is_available()
    print("CUDA", cuda)
    sampler = ContrastiveBatchSampler(train_set.labels, nspeakers, batch_size)
    loader = DataLoader(train_set, batch_sampler=sampler, num_workers=20, pin_memory=True)
    epoch_loss = 0
    for utterance_batch1, utterance_batch2, label_batch1, label_batch2 in loader:
        optimizer.zero_grad()
        if cuda:
            utterance_batch1, utterance_batch2, label_batch1, label_batch2 = utterance_batch1.cuda(
            ), utterance_batch2.cuda(), label_batch1.cuda(), label_batch2.cuda(
            )
        out1, out2 = sen(utterance_batch1, utterance_batch2)
        pred1, embed1 = out1
        pred2, embed2 = out2
        # +1 target for same-speaker pairs, -1 otherwise.
        # NOTE(review): torch.cuda.FloatTensor is used unconditionally here,
        # so this line fails on CPU-only runs even though ``cuda`` is checked
        # above — confirm whether CPU support is intended.
        embed_labels = 2 * (label_batch1 == label_batch2).type(
            torch.cuda.FloatTensor) - 1
        embed_loss = criterion(embed1, embed2, embed_labels)
        closs = classification_loss(pred1, label_batch1) + classification_loss(
            pred2, label_batch2)
        # Mix classification and embedding objectives.
        loss = alpha * closs + (1 - alpha) * embed_loss
        epoch_loss += loss.item()
        loss.backward()
        optimizer.step()
    return epoch_loss
def __init__(self, n_cla_per_tsk: Union[np.ndarray, List[int]], class_names_to_idx: Dict[str, int], config: Dict):
    """LUCIR-style incremental model: cosine output layer, frozen copy of the
    previous network, and the distillation / margin-ranking criteria.

    Args:
        n_cla_per_tsk: number of classes introduced by each task.
        class_names_to_idx: class-name -> index mapping.
        config: expects "batch_size", "lucir_lambda", "lucir_margin_1",
            "lucir_margin_2".
    """
    super(Model, self).__init__(n_cla_per_tsk, class_names_to_idx, config)
    self.sigma = True  # learnable scale inside the CosineLinear head
    device = next(self.net.parameters()).device
    # Replace the plain linear head with a cosine-similarity classifier.
    self.net.model.output_layer = cosine_linear.CosineLinear(in_features=self.latent_dim, out_features=n_cla_per_tsk[0], sigma=self.sigma).to(device)
    self.reset_optimizer_and_scheduler()
    # Frozen snapshot of the network, used as the distillation teacher.
    self.old_net = copy_freeze(self.net)  # type: Union[ResNet, ResNetCIFAR]
    self.batch_size = config["batch_size"]
    # lambda weights the distillation term; lambda_cur is rescaled per task.
    self.lambda_base = config["lucir_lambda"]
    self.lambda_cur = self.lambda_base
    self.K = 2  # number of hard negatives for the margin-ranking loss
    self.margin_1 = config["lucir_margin_1"]
    self.margin_2 = config["lucir_margin_2"]
    # setup losses
    # self.loss_classification = nn.CrossEntropyLoss(reduction="mean")
    self.loss_classification = nn.BCEWithLogitsLoss(reduction="mean")
    self.loss_distill = nn.CosineEmbeddingLoss(reduction="mean")
    # several losses to allow for the use of different margins
    self.loss_mr_1 = nn.MarginRankingLoss(margin=self.margin_1, reduction="mean")
    self.loss_mr_2 = nn.MarginRankingLoss(margin=self.margin_2, reduction="mean")
    self.method_variables.extend(["lambda_base", "lambda_cur", "K", "margin_1", "margin_2", "sigma"])
def train_caltech(n_epoch=500, dataset_cls=Caltech10):
    """Train a ResNet-18 on a Caltech dataset, either with a kWTA embedding
    head (contrastive cosine loss) or as a plain softmax classifier.

    Args:
        n_epoch: number of training epochs.
        dataset_cls: dataset class; its name suffix gives the class count
            (e.g. "Caltech10" -> 10).
    """
    dataset_name = dataset_cls.__name__
    # Derive the softmax width from the dataset name suffix.
    models.caltech.set_out_features(key='softmax', value=int(dataset_name.lstrip("Caltech")))
    kwta = KWinnersTakeAllSoft(sparsity=0.05)
    model = models.caltech.resnet18(kwta=kwta)
    data_loader = DataLoader(dataset_cls)
    # NOTE(review): ``kwta`` is always a freshly-constructed object here, so
    # this branch is always taken and the CrossEntropy fallback below is dead
    # code — confirm whether kwta was meant to be optional/None-able.
    if kwta:
        criterion = ContrastiveLossSampler(nn.CosineEmbeddingLoss(margin=0.5))
        optimizer, scheduler = get_optimizer_scheduler(model)
        # Anneal sparsity down and hardness up over training.
        kwta_scheduler = KWTAScheduler(model=model, step_size=10, gamma_sparsity=0.7, min_sparsity=0.05, gamma_hardness=2, max_hardness=20)
        trainer = TrainerEmbeddingKWTA(model=model, criterion=criterion, data_loader=data_loader, optimizer=optimizer, scheduler=scheduler, kwta_scheduler=kwta_scheduler)
    else:
        criterion = nn.CrossEntropyLoss()
        optimizer, scheduler = get_optimizer_scheduler(model)
        trainer = TrainerGrad(model=model, criterion=criterion, data_loader=data_loader, optimizer=optimizer, scheduler=scheduler)
    trainer.train(n_epochs=n_epoch)
def Cosloss(inputs1, inputs2, targets, size_avarage=False):
    """Masked cosine-embedding loss for rating-style targets.

    Entries with target 0 ("no rating") are zeroed out of both inputs; the
    remaining targets act as the +/-1 similarity labels.

    Args:
        inputs1, inputs2: (N, D) batches to compare.
        targets: (N,) tensor; 0 marks "no rating", non-zero is the label.
        size_avarage: if True, mean-reduce the loss and report a normalizer
            of [1.0]; otherwise sum-reduce and report the rated-entry count.
            (Name keeps the original misspelling for caller compatibility.)

    Returns:
        (loss, normalizer) tuple.
    """
    mask = targets != 0
    num_ratings = torch.sum(mask.float())
    # was: size_average= — deprecated kwarg replaced by the equivalent
    # reduction= (True -> 'mean', False -> 'sum').
    criterion = nn.CosineEmbeddingLoss(reduction='mean' if size_avarage else 'sum')
    loss = criterion(inputs1 * mask.float(), inputs2 * mask.float(), targets)
    # was: Variable(torch.Tensor([1.0])) — Variable is a deprecated no-op.
    normalizer = torch.tensor([1.0]) if size_avarage else num_ratings
    return loss, normalizer
def align_loss(otmap_gather_list, pred_gather_list, ctbank_gather_list, err_ctbank_gather_list, use_structure, use_context, structure_max):
    """Structure/context alignment losses (CUDA-only: results are .cuda()'d).

    Args:
        otmap_gather_list: list of (structure_max x structure_max) transport maps.
        pred_gather_list: list of predicted context vectors.
        ctbank_gather_list: matching "correct" context-bank vectors.
        err_ctbank_gather_list: matching "erroneous" context-bank vectors.
        use_structure, use_context: enable the respective loss terms.
        structure_max: side length of the ideal (identity) transport map.

    Returns:
        (entropy, map loss, context-correct loss, context-error loss),
        all moved to CUDA.
    """
    get_entropy = Entropy()
    get_sim_loss = nn.CosineEmbeddingLoss()
    get_smooth_l1_loss = nn.SmoothL1Loss()
    entropy_val = torch.tensor(0.0)
    map_loss = torch.tensor(0.0)
    otmap_len = len(otmap_gather_list)
    if otmap_len > 0 and use_structure:
        otmap_gather_stack = torch.stack(otmap_gather_list)
        # The ideal transport map is the identity: each element aligns to itself.
        otmap_best_label = [torch.eye(structure_max) for x in range(otmap_len)]
        otmap_best_label = torch.stack(otmap_best_label).cuda()
        otmap_best_label = Variable(otmap_best_label)
        entropy_val = get_entropy(otmap_gather_stack)
        map_loss = get_smooth_l1_loss(otmap_gather_stack, otmap_best_label)
        #map_loss = get_ce_loss(otmap_gather_stack.view(
    ct_cor_loss = torch.tensor(0.0)
    ct_err_loss = torch.tensor(0.0)
    if len(pred_gather_list) > 0 and use_context:
        pred_gather_stack = torch.stack(pred_gather_list)
        ctbank_gather_stack = torch.stack(ctbank_gather_list)
        err_ctbank_gather_stack = torch.stack(err_ctbank_gather_list)
        # NOTE(review): CosineEmbeddingLoss expects a (N,)-shaped target of
        # +1/-1; here the targets are (N, 1) and the "error" target is all
        # zeros rather than -1 (zeros fall into the max(0, cos) branch, i.e.
        # "push cosine below 0" with no margin) — confirm this is intended.
        ct_cor_loss = get_sim_loss(pred_gather_stack, ctbank_gather_stack, torch.ones(len(pred_gather_list), 1).cuda())
        ct_err_loss = get_sim_loss(
            pred_gather_stack, err_ctbank_gather_stack, torch.zeros(len(pred_gather_list), 1).cuda())
    return entropy_val.cuda(), map_loss.cuda(), ct_cor_loss.cuda(
    ), ct_err_loss.cuda()
def main():
    """Evaluate a trained im2recipe model: build criteria, restore the
    checkpoint, prepare the test loader and run test().

    Relies on module-level ``opts`` (CLI options), ``device`` and
    ``data_path``.
    """
    model = im2recipe(inst_emb=opts.inst_emb)
    model.visionMLP = torch.nn.DataParallel(model.visionMLP)
    model.to(device)
    # define loss function (criterion) and optimizer
    # cosine similarity between embeddings -> input1, input2, target
    cosine_crit = nn.CosineEmbeddingLoss(0.1).to(device)
    if opts.semantic_reg:
        weights_class = torch.Tensor(opts.numClasses).fill_(1)
        weights_class[0] = 0  # the background class is set to 0, i.e. ignore
        # CrossEntropyLoss combines LogSoftMax and NLLLoss in one single class
        class_crit = nn.CrossEntropyLoss(weight=weights_class).to(device)
        # we will use two different criteria
        criterion = [cosine_crit, class_crit]
    else:
        criterion = cosine_crit
    print("=> loading checkpoint '{}'".format(opts.model_path))
    # latin1 encoding keeps py2-era pickled checkpoints loadable.
    if device.type == 'cpu':
        checkpoint = torch.load(opts.model_path, encoding='latin1', map_location='cpu')
    else:
        checkpoint = torch.load(opts.model_path, encoding='latin1')
    opts.start_epoch = checkpoint['epoch']
    model.load_state_dict(checkpoint['state_dict'])
    print("=> loaded checkpoint '{}' (epoch {})".format(
        opts.model_path, checkpoint['epoch']))
    # data preparation, loaders — ImageNet normalization statistics.
    normalize = transforms.Normalize(mean=[0.485, 0.456, 0.406], std=[0.229, 0.224, 0.225])
    # preparing test loader
    test_loader = torch.utils.data.DataLoader(
        ImagerLoader(
            opts.img_path,
            transforms.Compose([
                transforms.Resize(
                    256
                ),  # rescale the image keeping the original aspect ratio
                transforms.CenterCrop(
                    224),  # we get only the center of that rescaled
                transforms.ToTensor(),
                normalize,
            ]),
            data_path=data_path,
            sem_reg=opts.semantic_reg,
            partition='test',
            n_samples=opts.n_samples),
        batch_size=opts.batch_size,
        shuffle=False,
        num_workers=opts.workers,
        pin_memory=True)
    print('Test loader prepared.')
    # run test
    test(test_loader, model, criterion)
def main(args):
    """Train a title/description matching network with a cosine-embedding loss.

    Args:
        args: namespace with vocab_size, hidden_size, embed_size, lr,
            batch_size, seq_len and max_epochs.
    """
    net = Net(args.vocab_size, args.hidden_size, args.embed_size)
    data = load_data()
    critic = nn.CosineEmbeddingLoss()
    # Only optimize trainable parameters (frozen embeddings are skipped).
    params = filter(lambda p: p.requires_grad, net.parameters())
    optimizer = Adam(params, lr=args.lr)
    max_words = args.vocab_size
    vocab, _ = make_vocab(data[2], max_words)
    train_loader, test_loader = get_loaders(data, args.batch_size, 0.1, False)
    sent2num_func = make_sent2num(vocab, args.seq_len)
    step = 0
    for e in range(args.max_epochs):
        for i, batch_data in enumerate(train_loader):
            step += 1
            optimizer.zero_grad()
            description_tensor = batch2nums(batch_data["description"], sent2num_func)
            title_tensor = batch2nums(batch_data["title"], sent2num_func)
            # +1/-1 relatedness labels for the cosine-embedding criterion.
            y = Variable(torch.LongTensor(batch_data["related"]))
            context_vec, title_vec, att = net(description_tensor, title_tensor)
            loss = critic(context_vec, title_vec, y)
            loss.backward()
            optimizer.step()
            # was: loss.data[0] — indexing a 0-d tensor raises an error on
            # PyTorch >= 0.5; .item() is the supported scalar accessor.
            log_value("loss", loss.item(), step=step)
def get_loss(criteria, criteria_mask, ehr, ehr_mask, demo, label, query_network, ehr_network, ec_network, device):
    """Classification loss plus a masked response/query cosine-similarity term.

    Label semantics for the similarity term: 0 -> similar pair (target +1),
    1 -> dissimilar (target -1), 2 -> excluded (target +1 but masked out).

    Returns:
        (ce loss, masked mean similarity loss, softmax predictions,
         attention, response, query)
    """
    memory = ehr_network(ehr, demo, ehr_mask)            # batch_size, class_num
    criteria_embd = ec_network(criteria, criteria_mask)  # ec_num, mem_dim
    # Per-label (similarity target, mask) lookup table replaces the original
    # if/elif chain.
    target_and_mask = {0: (1, 1), 1: (-1, 1), 2: (1, 0)}
    pairs = [target_and_mask[int(item)] for item in label]
    similarity_label = torch.tensor([p[0] for p in pairs], dtype=torch.long).to(device)
    label_mask = torch.tensor([p[1] for p in pairs], dtype=torch.float32).to(device)
    ce_loss = nn.CrossEntropyLoss()
    sm_loss = nn.CosineEmbeddingLoss(margin=0.3, reduction='none')
    output, response, query, attention = query_network(memory, criteria_embd)  # bs, 3
    pred = torch.softmax(output, dim=-1)
    loss = ce_loss(output, label)
    per_pair = sm_loss(response, query, similarity_label)
    # Mask out the label-2 pairs and average over the remaining ones.
    similarity = torch.sum(per_pair * label_mask) / torch.sum(label_mask)
    return loss, similarity, pred, attention, response, query
def __init__(self, caption_vec_size, image_vec_size, n_keys, cm_val):
    """Caption<->image alignment module: captions pass through unchanged while
    images go through a key-value mapping back into caption space; trained
    with a cosine-embedding loss.

    Args:
        caption_vec_size: dimensionality of caption vectors.
        image_vec_size: dimensionality of image vectors.
        n_keys: number of keys in the KV mapping.
        cm_val: stored as-is; semantics defined by the caller — confirm.
    """
    super().__init__()
    self.save_hyperparameters("caption_vec_size", "image_vec_size", "n_keys")
    self.cm_val = cm_val
    # Earlier experiment: a small MLP caption translator (kept for reference).
    # cap_hsize = 2*caption_vec_size
    # self.cap_translator = nn.Sequential(
    #     nn.Linear(caption_vec_size, cap_hsize),
    #     nn.Dropout(p=.3),
    #     nn.Sigmoid(),
    #     nn.Linear(cap_hsize, image_vec_size),
    #     nn.Dropout(p=.3)
    # )
    # Identity: captions are used as-is.
    self.cap_translator = lambda x: x
    # self.img_translator = #nn.Sequential(#nn.LayerNorm(torch.Size([image_vec_size])),
    self.img_translator = KVMapping(
        k_size=image_vec_size,
        n_keys=n_keys,  # similar to kd tree values
        v_size=caption_vec_size,
    )
    # )
    # self.img_translator = lambda x : x  # worked better than training both at
    # the same time; may need to tune them separately
    self.loss = nn.CosineEmbeddingLoss()
def __init__(self, config: dict):
    """Time-series training session: builds a TSNet, Adam optimizer, cosine
    criterion and UCR data loaders from a config dict.

    Config keys (defaults): epoch=30, lr_init=3e-4, batch_size=10,
    data_root='data_in', sample_length=100; data_name must be specified.
    """
    super().__init__(epoch=config.get('epoch', 30))
    self.lr_init = config.get('lr_init', 3e-4)
    self.batch_size = config.get('batch_size', 10)
    data_root = config.get('data_root', 'data_in')
    data_name = config.get('data_name', None)  # must be specified
    self.sample_length = config.get('sample_length', 100)
    # Single-channel series of fixed length.
    self.input_size = (1, self.sample_length)
    model = TSNet(in_channel=1, middle_channel=50, out_channel=10, num_layers=5)
    self.add_model('tsnet', model, input_size=self.input_size)
    self.add_optimizer(
        'adam',
        optim.Adam(model.parameters(), lr=self.lr_init, weight_decay=0.01))
    # Cosine-embedding loss with a 0.3 margin for negative pairs.
    self.add_criterion('cosine_embedding_loss', nn.CosineEmbeddingLoss(margin=0.3))
    self.dataloader_maker = UCRDataLoaderBuilder(
        data_root, data_name, self.batch_size, sample_length=self.sample_length)
    self.set_dataloaders(self.dataloader_maker)
def __init__(self, discriminator, generator, utils, embedder):
    """Text CycleGAN: generator G, reconstructor R (a deep copy of G) and
    discriminator D, with Noam-scheduled Adam for G and R.

    Args:
        discriminator, generator: the adversarial pair; R is cloned from G.
        utils: exposes ``emb_mat`` (vocab x emb_dim embedding matrix).
        embedder: embedding lookup used by the training loops.
    """
    super(CycleGAN, self).__init__()
    self.D = discriminator
    self.G = generator
    # Reconstruction network starts as an exact copy of the generator.
    self.R = copy.deepcopy(generator)
    self.D_opt = torch.optim.Adam(self.D.parameters())
    # self.G_opt = torch.optim.Adam(self.G.parameters())
    # Noam schedule: 4000 warmup steps, scaled by the embedding dimension.
    self.G_opt = NoamOpt(
        utils.emb_mat.shape[1], 1, 4000,
        torch.optim.Adam(self.G.parameters(), lr=0, betas=(0.9, 0.98), eps=1e-9))
    # self.R_opt = torch.optim.Adam(self.R.parameters())
    self.R_opt = NoamOpt(
        utils.emb_mat.shape[1], 1, 4000,
        torch.optim.Adam(self.R.parameters(), lr=0, betas=(0.9, 0.98), eps=1e-9))
    self.embed = embedder
    self.utils = utils
    # Index -1 marks ignored positions in the CE loss.
    self.criterion = nn.CrossEntropyLoss(ignore_index=-1)
    self.mse = nn.MSELoss()
    self.cos = nn.CosineSimilarity(dim=-1)
    self.cosloss = nn.CosineEmbeddingLoss()
    self.r_criterion = LabelSmoothing(size=utils.emb_mat.shape[0], padding_idx=0, smoothing=0.0)
    self.r_loss_compute = SimpleLossCompute(self.R.generator, self.r_criterion, self.R_opt)
def __init__(self, criterion: str = None, temperature: float = 1., metric_key: str = "diff_loss"):
    """
    KL Div loss on output callback.

    Args:
        criterion: criterion for loss on outputs. Can be kl, mse or cos.
        temperature: temperature for logits.
        metric_key: key for metric in batch_metrics dict.

    Raises:
        TypeError: if criterion is not correct.
    """
    import warnings

    super().__init__(CallbackOrder.Metric)
    if criterion is None:
        criterion = "kl"
    self.criterion = criterion
    # was: self.temperature was only assigned inside the "kl" branch, so the
    # check below raised AttributeError for mse/cos; store it unconditionally.
    self.temperature = temperature
    if criterion == "kl":
        self.criterion_fn = nn.KLDivLoss()
    elif criterion == "mse":
        self.criterion_fn = nn.MSELoss(reduction="sum")
    elif criterion == "cos":
        self.criterion_fn = nn.CosineEmbeddingLoss(reduction="mean")
    else:
        raise TypeError(
            f"Criterion should be string one of the kl, mse or cos")
    if not (self.temperature == 1. or self.criterion == "kl"):
        # was: bare Warning(...) constructed an exception object and discarded
        # it, emitting nothing; actually warn the user.
        warnings.warn("Temperature affects only if criterion is kl")
    self.metric_key = metric_key
def nomal_loss(pred, targetN, params, depthI, depthJ):
    """Surface-normal consistency loss between predicted depth and target normals.

    Builds two tangent vectors from horizontal/vertical depth differences
    back-projected through the pinhole model, crosses them into a predicted
    normal, and penalizes 1 - cos against ``targetN``.

    Args:
        pred: predicted depth map.
        targetN: ground-truth normals, NHWC with 3 channels.
        params: NHWC camera intrinsics — [..., 0]=f, [..., 1]=cx, [..., 2]=cy.
        depthI, depthJ: depth maps shifted along the i/j image axes
            (NCHW; permuted to NHWC below).

    NOTE(review): relies on module-level pixel-coordinate grids MatI/MatJ and
    on CUDA being available — confirm both at the call site.
    """
    depthI = depthI.permute(0, 2, 3, 1)
    depthJ = depthJ.permute(0, 2, 3, 1)
    predN_1 = torch.zeros_like(targetN)
    predN_2 = torch.zeros_like(targetN)
    f = params[:, :, :, 0]
    cx = params[:, :, :, 1]
    cy = params[:, :, :, 2]
    # Tangent along j: depth finite difference back-projected to 3D.
    z1 = depthJ - pred
    z1 = torch.squeeze(z1)
    depthJ = torch.squeeze(depthJ)
    predN_1[:, :, :, 0] = ((MatJ - cx) * z1 + depthJ) * 1.0 / f
    predN_1[:, :, :, 1] = (MatI - cy) * z1 * 1.0 / f
    predN_1[:, :, :, 2] = z1
    # Tangent along i.
    z2 = depthI - pred
    z2 = torch.squeeze(z2)
    depthI = torch.squeeze(depthI)
    predN_2[:, :, :, 0] = (MatJ - cx) * z2 * 1.0 / f
    predN_2[:, :, :, 1] = ((MatI - cy) * z2 + depthI) * 1.0 / f
    predN_2[:, :, :, 2] = z2
    # Predicted normal = cross product of the two tangents, unit-normalized.
    predN = torch.cross(predN_1, predN_2)
    pred_n = F.normalize(predN)
    pred_n = pred_n.contiguous().view(-1, 3)
    target_n = targetN.contiguous().view(-1, 3)
    # Target +1 everywhere: predicted and GT normals should align.
    loss_function = nn.CosineEmbeddingLoss()
    loss = loss_function(
        pred_n, target_n,
        Variable(torch.Tensor(pred_n.size(0)).cuda().fill_(1.0)))
    return loss
def train(self, dataloader, training=True, device='cpu'):
    """One pass of teacher-forced seq2seq autoencoder training (or evaluation).

    Args:
        dataloader: yields (x, lengths) batches of token ids.
        training: if False, skips backprop and optimizer steps.
        device: device for the <bos> start tokens.

    NOTE(review): the running loss is printed but never returned, and
    ``aug_loss`` / ``aug_loss_fn`` belong to a disabled embedding-matching
    experiment (see the commented lines below) — confirm before removing.
    """
    total_loss = 0
    aug_loss_fn = nn.CosineEmbeddingLoss(reduction='sum')
    for i, (x, lengths) in enumerate(dataloader):
        loss = 0
        aug_loss = 0
        self.optim.zero_grad()
        B, S = x.size()
        state = self.encoder(x, lengths)  # (B, 2H)
        # Start every sequence from the embedded <bos> token.
        inp = th.LongTensor([[self.vocab("<bos>")]]*B).view(B, 1).to(device)  # (B, 1)
        inp = self.encoder.embedding(inp).view(B, -1)  # (B, E)
        for t in range(S):
            vec , logit, state = self.decoder(inp, state)
            loss += self.loss_fn(logit, x[:,t])  # Normal seq2seq
            # Teacher forcing: feed the gold token's embedding at each step.
            inp = self.encoder.embedding(x[:, t])  # Training using embedding
            #inp = vec.view(B, -1)
            #aug_loss += aug_loss_fn(vec, self.encoder.embedding(x[:, t]).data, th.ones((B,1)))
            #loss += aug_loss
        # Normalize by the total number of tokens in the batch.
        loss /= lengths.sum().item()
        if training:
            loss.backward()
            self.optim.step()
        total_loss += loss.item()
        print("\rbatch:{}/{}, loss:{}, total loss:{}".format(i, len(dataloader), loss.item(), total_loss / (i + 1)), end='')
def __init__(self, predictor: TracePredictor, vocab_train: ScreenVocab, vocab_test: ScreenVocab, dataloader_train, dataloader_test, l_rate: float, neg_samp: int, loss_type='cel'):
    """
    predictor: TracePredictor module
    vocab_train: a ScreenVocab from which to find a negative sample for the training data
    vocab_test: a ScreenVocab from which to find a negative sample for the testing data
    dataloader_train, dataloader_test: dataloaders
    l_rate: learning rate for optimizer
    neg_samp: number of negative samples to compare against for training data
    loss_type: 'cel' (cross-entropy) or 'cossim' (cosine embedding)

    Raises:
        ValueError: for an unrecognized loss_type (previously this silently
        left ``self.loss`` unset, failing later with AttributeError).
    """
    self.predictor = predictor
    self.loss_type = loss_type
    if self.loss_type == 'cel':
        self.loss = nn.CrossEntropyLoss(reduction='sum')
    elif self.loss_type == 'cossim':
        self.loss = nn.CosineEmbeddingLoss(reduction='sum')
    else:
        raise ValueError(f"unknown loss_type: {loss_type!r}")
    self.optimizer = Adam(self.predictor.parameters(), lr=l_rate)
    self.vocab_train = vocab_train
    self.vocab_test = vocab_test
    self.train_data = dataloader_train
    self.test_data = dataloader_test
    self.neg_sample_num = neg_samp
def __init__(self, config):
    '''
    Expected config keys (example values):
    seqlen = 16
    person_num = 150
    rnn_type = 'RNN'
    learning_rate = 0.001
    lr_decay_epoch = 300
    cuda = True
    plus 'margin' for the hinge embedding loss.
    '''
    self.config = config
    # Only use CUDA when both requested and actually available.
    self.config['cuda'] = torch.cuda.is_available() and self.config['cuda']
    self.classify_loss = nn.NLLLoss()
    self.hinge_loss = nn.HingeEmbeddingLoss(self.config['margin'])
    # Cosine-embedding loss with margin 0.1 for negative pairs.
    self.cos_loss = nn.CosineEmbeddingLoss(0.1)
    self.model = full_model(self.config)
    if self.config['cuda'] is True:
        self.model.cuda()
    self.optimizer = optim.SGD(self.model.parameters(), lr=self.config['learning_rate'], momentum=0.9)
    # self.optimizer = optim.Adam(self.model.parameters(), lr=self.config['learning_rate'])
    # Tensor constructor aliases matching the selected device.
    # NOTE(review): the CPU fallback mixes torch.Tensor (float) with
    # torch.LongTensor — asymmetric but kept as-is; confirm intent.
    self.FloatTensor = torch.cuda.FloatTensor if self.config[
        'cuda'] else torch.Tensor
    self.LongTensor = torch.cuda.LongTensor if self.config[
        'cuda'] else torch.LongTensor
def cosine_loss(
    s_hidden_states: FloatTensor,
    t_hidden_states: FloatTensor,
    attention_mask: LongTensor = None,
) -> FloatTensor:
    """Cosine loss between hidden states.

    Args:
        s_hidden_states (FloatTensor): student hiddens
        t_hidden_states (FloatTensor): teacher hiddens
        attention_mask (LongTensor, optional): attention mask if you are
            using transformers. Defaults to None.

    Returns:
        FloatTensor: scalar mean cosine-embedding loss over all positions.
    """
    if attention_mask is not None:
        # HF transformers case
        return _cosine_loss_hf(
            s_hidden_states=s_hidden_states,
            t_hidden_states=t_hidden_states,
            attention_mask=attention_mask,
        )
    loss_fn = nn.CosineEmbeddingLoss()
    hidden_dim = s_hidden_states.size(-1)
    # Flatten any leading dims so every position is one comparison row.
    s_hidden_states = s_hidden_states.reshape(-1, hidden_dim)
    t_hidden_states = t_hidden_states.reshape(-1, hidden_dim)
    assert s_hidden_states.shape == t_hidden_states.shape
    # was: torch.ones(n) — always allocated on CPU, which raised a device
    # mismatch for GPU inputs; follow the hidden states' device instead.
    target = torch.ones(t_hidden_states.size(0), device=t_hidden_states.device)
    return loss_fn(s_hidden_states, t_hidden_states, target)
def __init__(self, args, Y, dicts):
    """Multi-filter residual CNN for multi-label text classification.

    One convolution channel per filter size (args.filter_size,
    comma-separated), each a base conv followed by args.conv_layer residual
    blocks; channel outputs are concatenated before the mapper/output layer.

    Args:
        args: hyperparameters (filter_size, conv_layer, num_filter_maps,
            dropout, ...).
        Y: number of labels.
        dicts: lookup dictionaries forwarded to WordRep/OutputLayer.
    """
    super(MultiResCNN, self).__init__()
    self.word_rep = WordRep(args, Y, dicts)
    self.conv = nn.ModuleList()
    filter_sizes = args.filter_size.split(',')
    self.filter_num = len(filter_sizes)
    for filter_size in filter_sizes:
        filter_size = int(filter_size)
        one_channel = nn.ModuleList()
        # Base conv preserves the feature dimension; 'same'-style padding.
        tmp = nn.Conv1d(self.word_rep.feature_size, self.word_rep.feature_size, kernel_size=filter_size, padding=int(floor(filter_size / 2)))
        xavier_uniform(tmp.weight)
        one_channel.add_module('baseconv', tmp)
        # Residual tower: channel widths come from the conv_dict preset.
        conv_dimension = self.word_rep.conv_dict[args.conv_layer]
        for idx in range(args.conv_layer):
            tmp = ResidualBlock(conv_dimension[idx], conv_dimension[idx + 1], filter_size, 1, True, args.dropout)
            one_channel.add_module('resconv-{}'.format(idx), tmp)
        self.conv.add_module('channel-{}'.format(filter_size), one_channel)
    self.mapper = Mapper(self.filter_num * args.num_filter_maps, Y)
    self.output_layer = OutputLayer(args, Y, dicts, self.filter_num * args.num_filter_maps)
    # Cosine-embedding criterion (despite the name, this is a loss module).
    self.sim = nn.CosineEmbeddingLoss()
def forward(self, new_outputs, new_targets, old_features, new_features, num_classes):
    '''LUCIR-style loss: BCE classification plus cosine feature distillation.

    Args:
        new_outputs: torch.tensor(). Size = [batch, num_classes]. Logits.
        new_targets: torch.tensor(). Same size; multi-hot targets.
        old_features, new_features: feature embeddings from the previous and
            current model (unused for the first task).
        num_classes: total classes seen so far; 10 means "first task".
    '''
    lambda_base = 5  # from paper
    # NOTE(review): operator precedence makes this num_classes - (10 / num_classes);
    # if the paper's sqrt((num_classes - 10) / num_classes) was intended the
    # parentheses are missing. Kept as-is to preserve training behavior —
    # confirm against the reference implementation.
    cur_lambda = lambda_base * sqrt(num_classes - 10 / num_classes)  # from paper
    clf_criterion = nn.BCEWithLogitsLoss()
    clf_loss = clf_criterion(new_outputs, new_targets)
    if num_classes == 10:
        # First task: no old model to distill from.
        return clf_loss
    dist_criterion = nn.CosineEmbeddingLoss()
    # was: torch.ones(64).cuda() — a hard-coded batch size that crashed on a
    # smaller final batch and forced CUDA; size/device now follow the features.
    batch = new_features.size(0)
    dist_loss = dist_criterion(new_features, old_features,
                               torch.ones(batch, device=new_features.device))
    # Weight distillation by the fraction of old classes, classification by new.
    dist = (num_classes - 10) / num_classes
    clf = 10 / num_classes
    loss = clf * clf_loss + dist * dist_loss * cur_lambda
    return loss
def __init__(self, margin=0.0):
    """Sum-reduced cosine-embedding criterion wrapper.

    Args:
        margin: margin passed through to nn.CosineEmbeddingLoss.
    """
    super(CosineSIM, self).__init__()
    # was: size_average=None, reduce=None were passed explicitly — both are
    # deprecated aliases whose None defaults are already implied once
    # reduction= is given; dropped with no behavior change.
    self.criterion = nn.CosineEmbeddingLoss(margin=margin, reduction='sum')
    logging.info('built criterion (cosine)')
def cosine2by2(representation1, representation2, device='cpu'):
    """2x2 cosine contrastive loss between two 2-row representation batches.

    Aligned rows (r1[i], r2[i]) are positives (target +1); the crossed rows
    (r1[i], r2[1-i]) are negatives (target -1).

    Args:
        representation1, representation2: (2, D) tensors.
        device: device the comparison runs on.

    Returns:
        Scalar mean cosine-embedding loss over the four pairs.
    """
    loss_func = nn.CosineEmbeddingLoss()
    # Positives: aligned rows; negatives: rows crossed via flipud.
    x1 = torch.cat([representation1, representation1])
    x2 = torch.cat([representation2, torch.flipud(representation2)])
    y = torch.tensor([1, 1, -1, -1])
    # was: the four pairs were shuffled with np.random.choice before the loss.
    # The mean reduction is permutation-invariant, so the shuffle changed
    # nothing except numpy's RNG state — removed.
    return loss_func(x1.to(device), x2.to(device), y.to(device))
def forward(self, outputs):
    # Mixed cosine-embedding + MSE loss over three samples' paired heads.
    # outputs[sample][head]: heads 0/1 are two feature embeddings, heads 2/3
    # a prediction/target pair for the MSE term. Pairs (0,1) on head 0 and
    # (0,2) on head 1 are pulled together (+1 target); the crossed pairs are
    # pushed apart (-1 target, margin 0.3). CUDA-only (targets are .cuda()'d).
    # NOTE(review): semantics inferred from indexing alone — confirm which
    # sample plays anchor/positive/negative.
    #import math
    """oss = torch.max(F.pairwise_distance(outputs[0][0], outputs[1][0]))
    print("Loss shape = " + str(loss.size(0)))
    print("Loss value = " + str(loss.data))
    loss = torch.max(nn.PairwiseDistance(outputs[0][0], outputs[1][0]) -nn.PairwiseDistance(outputs[0][0], outputs[2][0]) + self.margin, 0) \
    + torch.max(nn.PairwiseDistance(outputs[0][1], outputs[2][1]) -nn.PairwiseDistance(outputs[0][1], outputs[1][1]) + self.margin, 0) \
    + nn.BCELoss(outputs[0][2], outputs[0][3]) + nn.BCELoss(outputs[1][2], outputs[1][3]) + nn.BCELoss(outputs[2][2], outputs[2][3])
    """
    loss_func = nn.MSELoss()
    # cos = nn.CosineSimilarity(dim=1, eps=1e-6)
    cos = nn.CosineEmbeddingLoss(margin=0.3)
    # Superseded loss formulations kept below for reference.
    # loss = torch.mean(cos(outputs[0][0], outputs[1][0]) - cos(outputs[0][0], outputs[2][0])) + torch.mean(cos(outputs[0][1], outputs[2][1]) - cos(outputs[0][1], outputs[1][1])) + loss_func(outputs[0][2], outputs[0][3]) + loss_func(outputs[1][2], outputs[1][3]) + loss_func(outputs[2][2], outputs[2][3])
    N, L = outputs[0][0].size()
    # print(outputs[0][0].size())
    # Z is only used by the commented-out hinge variants below.
    Z = Variable(torch.zeros(N).cuda())
    # loss = torch.sum(torch.max(cos(outputs[0][0], outputs[1][0]) - cos(outputs[0][0], outputs[2][0]), Z)) + torch.sum(torch.max(cos(outputs[0][1], outputs[2][1]) - cos(outputs[0][1], outputs[1][1]), Z)) + loss_func(outputs[0][2], outputs[0][3]) + loss_func(outputs[1][2], outputs[1][3]) + loss_func(outputs[2][2], outputs[2][3])
    # loss = torch.sum(torch.max(torch.abs(cos(outputs[0][0], outputs[1][0])) - torch.abs(cos(outputs[0][0], outputs[2][0])) + self.margin, Z)) + torch.sum(torch.max(torch.abs(cos(outputs[0][1], outputs[2][1])) - torch.abs(cos(outputs[0][1], outputs[1][1])) + self.margin, Z)) + loss_func(outputs[0][2], outputs[0][3]) + loss_func(outputs[1][2], outputs[1][3]) + loss_func(outputs[2][2], outputs[2][3])
    loss = cos(outputs[0][0], outputs[1][0], Variable(
        torch.ones(N).cuda())) + cos(
        outputs[0][1], outputs[1][1],
        Variable(-1 * torch.ones(N).cuda())) + cos(
        outputs[0][1], outputs[2][1], Variable(
        torch.ones(N).cuda())) + cos(
        outputs[0][0], outputs[2][0],
        Variable(-1 * torch.ones(N).cuda())) + loss_func(
        outputs[0][2], outputs[0][3])
    """ loss = cos(outputs[0][0], outputs[1][0]) - cos(outputs[0][0], outputs[2][0]) + cos(outputs[0][1], outputs[2][1]) - cos(outputs[0][1], outputs[1][1]) + nn.BCELoss(outputs[0][2], outputs[0][3]) + nn.BCELoss(outputs[1][2], outputs[1][3]) + nn.BCELoss(outputs[2][2], outputs[2][3]) """
    return loss