def forward(
    self,
    input_ids=None,
    attention_mask=None,
    token_type_ids=None,
    position_ids=None,
    head_mask=None,
    inputs_embeds=None,
    labels=None,
    output_attentions=None,
):
    r"""Run the ELECTRA discriminator (replaced-token detection) head.

    labels (``torch.LongTensor`` of shape ``(batch_size, sequence_length)``, `optional`):
        Per-token targets in ``[0, 1]``: ``0`` marks an original token,
        ``1`` marks a replaced token.

    Returns a tuple of:
        loss (`optional`, only when ``labels`` is given): scalar ELECTRA loss;
        scores: per-token prediction logits of shape
        ``(batch_size, sequence_length)``;
        plus whatever extra outputs (hidden states / attentions) the base
        model returned after its first element.
    """
    hidden_states = self.electra(
        input_ids,
        attention_mask,
        token_type_ids,
        position_ids,
        head_mask,
        inputs_embeds,
        output_attentions,
    )
    sequence_output = hidden_states[0]
    logits = self.discriminator_predictions(sequence_output, attention_mask)

    outputs = (logits, )
    if labels is not None:
        loss_fct = nn.BCEWithLogitsLoss()
        seq_len = sequence_output.shape[1]
        flat_logits = logits.view(-1, seq_len)
        if attention_mask is not None:
            # Restrict the loss to non-padding positions.
            active = attention_mask.view(-1, seq_len) == 1
            loss = loss_fct(flat_logits[active], labels[active].float())
        else:
            loss = loss_fct(flat_logits, labels.float())
        outputs = (loss, ) + outputs

    # Forward any additional base-model outputs (hidden states, attentions).
    return outputs + hidden_states[1:]
method = "ap-perf" # uncomment if we want to use ap-perf objective # method = "bce-loss" # uncomment if we want to use bce-loss objective torch.manual_seed(1) use_cuda = torch.cuda.is_available() device = torch.device("cuda" if use_cuda else "cpu") nvar = X_tr.shape[1] model = Net(nvar).to(device) if method == "ap-perf": criterion = MetricLayer(f2).to(device) lr = 3e-3 weight_decay = 1e-3 else: criterion = nn.BCEWithLogitsLoss().to(device) lr = 1e-2 weight_decay = 1e-3 optimizer = optim.SGD(model.parameters(), lr=lr, weight_decay=weight_decay) for epoch in range(100): for i, (inputs, labels) in enumerate(trainloader): inputs = inputs.to(device) labels = labels.to(device) optimizer.zero_grad() output = model(inputs) loss = criterion(output, labels)
def train_model_2_class(directory, f, opt):
    """Train a binary CNN classifier on each cross-validation fold of a CSV.

    Args:
        directory: folder containing the data file.
        f: file name (without extension); also reused for plot labels/paths.
        opt: optimizer name, one of 'SGD', 'Adam' or 'RMSprop'.

    Returns:
        (best, train_sizes, test_sizes) where ``best`` holds, per fold, the
        mean of the last 10 validation accuracies, and the size lists give
        the number of train/test samples per fold.

    NOTE(review): relies on module-level globals (device, net_size,
    n_schedule, n_epochs, n_early, min_epochs, plot_, savedir, utils, mean,
    plot, load_data) defined outside this chunk.
    """
    filepath = directory + f
    print('Loading data...')
    # load_data presumably yields one (train, test) loader pair per fold.
    train_ldrs, test_ldrs = load_data(filepath + '.csv')
    tr_loss = []
    tr_acc = []
    vl_loss = []
    vl_acc = []
    train_sizes = []
    test_sizes = []
    for train_ldr, test_ldr in zip(train_ldrs, test_ldrs):
        train_sizes.append(len(train_ldr.dataset))
        test_sizes.append(len(test_ldr.dataset))
        # Fresh network per fold; argument 2 presumably is the class count.
        net = utils.SmallNet(2).to(device)
        net.size = net_size
        net.n_filters = net.size
        criterion = nn.BCEWithLogitsLoss()
        if opt == 'SGD':
            optimizer = optim.SGD(net.parameters(), lr=0.1, momentum=0.9,
                                  weight_decay=0.0005, nesterov=True)
        elif opt == 'Adam':
            optimizer = optim.Adam(net.parameters(), lr=0.001,
                                   betas=(0.9, 0.999), weight_decay=0.0005,
                                   amsgrad=False)
        elif opt == 'RMSprop':
            optimizer = optim.RMSprop(net.parameters(), lr=0.01,
                                      weight_decay=0.0005, momentum=0.9)
        else:
            # NOTE(review): message omits 'RMSprop' even though it is accepted.
            raise ValueError('Invalid optimizer selected. Choose \'SGD\' or '
                             '\'Adam\'.')
        scheduler = optim.lr_scheduler.MultiStepLR(optimizer,
                                                   milestones=n_schedule,
                                                   gamma=0.1)
        print('Training...')
        print('Filters per layer:', net.n_filters)
        print('Criterion:', criterion)
        print(optimizer)
        # losses[1] is seeded with 100 so the first early-stopping comparison
        # (val_loss >= losses[1][-2]) has a previous value to look at.
        losses = [[], [100]]
        accs = [[], []]
        early_stopping = 0
        for epoch in range(n_epochs):
            # Training
            net.training = True
            train_correct = 0
            train_total = 0
            train_loss = 0.0
            for local_batch, local_labels in train_ldr:
                # Transfer to GPU
                local_batch = local_batch.to(device, dtype=torch.float)
                local_labels = local_labels.view(-1, 1).to(device,
                                                           dtype=torch.float)
                # Train
                optimizer.zero_grad()
                # Forward + backward + optimize
                logits = net(local_batch).view(-1, 1)
                loss = criterion(logits, local_labels)
                loss.backward()
                optimizer.step()
                # Tracking: threshold sigmoid outputs at 0.5 for accuracy.
                train_loss += loss.item()
                outputs = torch.sigmoid(logits)
                predicted = (outputs >= 0.5).view(-1).to(device,
                                                         dtype=torch.long)
                local_labels = local_labels.view(-1).to(device,
                                                        dtype=torch.long)
                train_total += local_labels.size(0)
                train_correct += (predicted == local_labels).sum().item()
            train_acc = train_correct / train_total
            scheduler.step()
            # Validation
            net.training = False
            val_correct = 0
            val_total = 0
            val_loss = 0
            with torch.no_grad():
                for local_batch, local_labels in test_ldr:
                    # Transfer to GPU
                    local_batch = local_batch.to(device, dtype=torch.float)
                    local_labels = local_labels.view(-1, 1).to(
                        device, dtype=torch.float)
                    # Test
                    logits = net(local_batch).view(-1, 1)
                    loss = criterion(logits, local_labels)
                    # Tracking
                    val_loss += loss.item()
                    outputs = torch.sigmoid(logits)
                    predicted = (outputs >= 0.5).view(-1).to(device,
                                                             dtype=torch.long)
                    local_labels = local_labels.view(-1).to(device,
                                                            dtype=torch.long)
                    val_total += local_labels.size(0)
                    val_correct += (predicted == local_labels).sum().item()
            val_acc = val_correct / val_total
            losses[0].append(train_loss)
            losses[1].append(val_loss)
            accs[0].append(train_acc)
            accs[1].append(val_acc)
            # Early-stopping counter: up when val loss fails to improve on
            # the previous epoch, down (floored at 0) when it improves.
            if val_loss >= losses[1][-2]:
                early_stopping += 1
            elif early_stopping > 0:
                early_stopping -= 1
            early = False
            if early_stopping >= n_early and epoch > min_epochs:
                early = True
            if epoch % 10 == 9 or early:
                print('Epoch:', epoch + 1,
                      '| Train Acc:', round(train_acc, 8),
                      '| Train Loss:', round(train_loss, 8),
                      '| Val Acc:', round(val_acc, 8),
                      '| Val Loss:', round(val_loss, 8),
                      '| Early:', early_stopping)
            if early:
                print('Early stopping.')
                break
        # Drop the 100 sentinel before recording this fold's loss history.
        losses[1] = losses[1][1:]
        tr_loss.append(losses[0])
        tr_acc.append(accs[0])
        vl_loss.append(losses[1])
        vl_acc.append(accs[1])
    # Per-fold score: mean of the last 10 validation accuracies.
    best = [mean(a[-10:]) for a in vl_acc]
    if plot_:
        # Plot loss and accuracy
        # NOTE(review): '\c' in this literal is an invalid escape sequence
        # (DeprecationWarning); a raw string or os.path.join would be safer.
        savedir_ = savedir + '\cnn-2d\\' + f[1:] + '\\'
        plot(savedir_, f, tr_loss, tr_acc, vl_loss, vl_acc, best)
    return best, train_sizes, test_sizes
def compute_loss(p, targets, model):  # predictions, targets, model
    """YOLO-style multi-part detection loss.

    Combines a GIoU box-regression term (lbox), an objectness term (lobj)
    and a classification term (lcls) over all prediction layers in ``p``,
    weighted by the model hyperparameters.

    Returns:
        (loss, detached 4-vector [lbox, lobj, lcls, loss]) for logging.
    """
    # Match tensor device of the predictions for the running loss scalars.
    ft = torch.cuda.FloatTensor if p[0].is_cuda else torch.Tensor
    lcls, lbox, lobj = ft([0]), ft([0]), ft([0])
    tcls, tbox, indices, anchor_vec = build_targets(model, targets)
    h = model.hyp  # hyperparameters
    arc = model.arc  # (default, uCE, uBCE) detection architectures

    # Define criteria; pos_weight rebalances positive examples.
    BCEcls = nn.BCEWithLogitsLoss(pos_weight=ft([h['cls_pw']]))
    BCEobj = nn.BCEWithLogitsLoss(pos_weight=ft([h['obj_pw']]))
    BCE = nn.BCEWithLogitsLoss()
    CE = nn.CrossEntropyLoss()  # weight=model.class_weights

    if 'F' in arc:  # add focal loss
        g = h['fl_gamma']
        BCEcls, BCEobj, BCE, CE = FocalLoss(BCEcls, g), FocalLoss(
            BCEobj, g), FocalLoss(BCE, g), FocalLoss(CE, g)

    # Compute losses
    for i, pi in enumerate(p):  # layer index, layer predictions
        b, a, gj, gi = indices[i]  # image, anchor, gridy, gridx
        tobj = torch.zeros_like(pi[..., 0])  # target obj

        # Compute losses
        nb = len(b)
        if nb:  # number of targets
            ps = pi[b, a, gj, gi]  # prediction subset corresponding to targets
            tobj[b, a, gj, gi] = 1.0  # obj
            # ps[:, 2:4] = torch.sigmoid(ps[:, 2:4])  # wh power loss (uncomment)

            # GIoU: decode xy via sigmoid, wh via exp * anchors (clamped to
            # avoid inf), then score against target boxes.
            pxy = torch.sigmoid(
                ps[:, 0:2])  # pxy = pxy * s - (s - 1) / 2, s = 1.5 (scale_xy)
            pbox = torch.cat(
                (pxy, torch.exp(ps[:, 2:4]).clamp(max=1E4) * anchor_vec[i]),
                1)  # predicted box
            giou = bbox_iou(pbox.t(), tbox[i], x1y1x2y2=False,
                            GIoU=True)  # giou computation
            lbox += (1.0 - giou).mean()  # giou loss

            if 'default' in arc and model.nc > 1:  # cls loss (only if multiple classes)
                # One-hot targets for per-class BCE.
                t = torch.zeros_like(ps[:, 5:])  # targets
                t[range(nb), tcls[i]] = 1.0
                lcls += BCEcls(ps[:, 5:], t)  # BCE
                # lcls += CE(ps[:, 5:], tcls[i])  # CE

                # Instance-class weighting (use with reduction='none')
                # nt = t.sum(0) + 1  # number of targets per class
                # lcls += (BCEcls(ps[:, 5:], t) / nt).mean() * nt.mean()  # v1
                # lcls += (BCEcls(ps[:, 5:], t) / nt[tcls[i]].view(-1,1)).mean() * nt.mean()  # v2

            # Append targets to text file
            # with open('targets.txt', 'a') as file:
            #     [file.write('%11.5g ' * 4 % tuple(x) + '\n') for x in torch.cat((txy[i], twh[i]), 1)]

        if 'default' in arc:  # separate obj and cls
            lobj += BCEobj(pi[..., 4], tobj)  # obj loss
        elif 'BCE' in arc:  # unified BCE (80 classes)
            t = torch.zeros_like(pi[..., 5:])  # targets
            if nb:
                t[b, a, gj, gi, tcls[i]] = 1.0
            lobj += BCE(pi[..., 5:], t)
        elif 'CE' in arc:  # unified CE (1 background + 80 classes)
            # Class 0 is background; real classes are shifted up by one.
            t = torch.zeros_like(pi[..., 0], dtype=torch.long)  # targets
            if nb:
                t[b, a, gj, gi] = tcls[i] + 1
            lcls += CE(pi[..., 4:].view(-1, model.nc + 1), t.view(-1))

    # Weight each component by its hyperparameter before summing.
    lbox *= h['giou']
    lobj *= h['obj']
    lcls *= h['cls']
    loss = lbox + lobj + lcls
    return loss, torch.cat((lbox, lobj, lcls, loss)).detach()
UNK_IDX = TEXT.vocab.stoi[TEXT.unk_token] model.static_embedding.weight.data[UNK_IDX] = torch.zeros(EMBEDDING_DIM) model.static_embedding.weight.data[PAD_IDX] = torch.zeros(EMBEDDING_DIM) else: model.non_static_embedding.weight.data.copy_(pretrained_embeddings) UNK_IDX = TEXT.vocab.stoi[TEXT.unk_token] model.non_static_embedding.weight.data[UNK_IDX] = torch.zeros( EMBEDDING_DIM) model.non_static_embedding.weight.data[PAD_IDX] = torch.zeros( EMBEDDING_DIM) # setting optimizer optimizer = optim.Adam(filter(lambda p: p.requires_grad, model.parameters())) # setting loss function criterion = nn.BCEWithLogitsLoss() criterion = criterion.to(device) # setting some vars N_EPOCHS = 20 best_valid_loss = float('inf') last_valid_loss = float('inf') model = model.to(device) for epoch in range(N_EPOCHS): start_time = time.time() train_loss, train_acc = utils.train(model, train_iterator, optimizer, criterion)
def build_model(self):
    """Build datasets/loaders, the four discriminators and two generators,
    losses, optimizers and the rho clipper for a UGATIT-style trainer.

    NOTE(review): assumes self.img_size, self.dataset, self.batch_size,
    self.ch, self.n_res, self.light, self.device, self.lr and
    self.weight_decay were set in __init__ — confirm against the class.
    """
    """ DataLoader """
    train_transform = transforms.Compose([
        transforms.RandomHorizontalFlip(),
        # resize to +15 for 128 pix image, +30 for 256 pix image
        transforms.Resize((self.img_size + 15, self.img_size + 15)),
        transforms.RandomCrop(self.img_size),
        transforms.ToTensor(),
        # Normalize to [-1, 1] per channel.
        transforms.Normalize(mean=(0.5, 0.5, 0.5), std=(0.5, 0.5, 0.5))
    ])
    test_transform = transforms.Compose([
        transforms.Resize((self.img_size, self.img_size)),
        transforms.ToTensor(),
        transforms.Normalize(mean=(0.5, 0.5, 0.5), std=(0.5, 0.5, 0.5))
    ])

    self.trainA = ImageFolder(
        os.path.join('dataset', self.dataset, 'trainA'), train_transform)
    self.trainB = ImageFolder(
        os.path.join('dataset', self.dataset, 'trainB'), train_transform)
    self.testA = ImageFolder(
        os.path.join('dataset', self.dataset, 'testA'), test_transform)
    self.testB = ImageFolder(
        os.path.join('dataset', self.dataset, 'testB'), test_transform)
    self.trainA_loader = DataLoader(self.trainA,
                                    batch_size=self.batch_size,
                                    shuffle=True)
    self.trainB_loader = DataLoader(self.trainB,
                                    batch_size=self.batch_size,
                                    shuffle=True)
    # Test loaders run one image at a time, in order.
    self.testA_loader = DataLoader(self.testA, batch_size=1, shuffle=False)
    self.testB_loader = DataLoader(self.testB, batch_size=1, shuffle=False)

    """ Define Generator, Discriminator """
    self.genA2B = ResnetGenerator(
        input_nc=3,
        output_nc=3,
        ngf=self.ch,
        n_blocks=self.n_res,
        img_size=self.img_size,
        light=self.light,
    ).to(self.device)
    self.genB2A = ResnetGenerator(
        input_nc=3,
        output_nc=3,
        ngf=self.ch,
        n_blocks=self.n_res,
        img_size=self.img_size,
        light=self.light,
    ).to(self.device)
    # Global (7-layer) and local (5-layer) discriminators per domain.
    self.disGA = Discriminator(
        input_nc=3,
        ndf=self.ch,
        n_layers=7,
    ).to(self.device)
    self.disGB = Discriminator(
        input_nc=3,
        ndf=self.ch,
        n_layers=7,
    ).to(self.device)
    self.disLA = Discriminator(
        input_nc=3,
        ndf=self.ch,
        n_layers=5,
    ).to(self.device)
    self.disLB = Discriminator(
        input_nc=3,
        ndf=self.ch,
        n_layers=5,
    ).to(self.device)

    """ Define Loss """
    self.L1_loss = nn.L1Loss().to(self.device)
    self.MSE_loss = nn.MSELoss().to(self.device)
    self.BCE_loss = nn.BCEWithLogitsLoss().to(self.device)

    """ Trainer """
    # One optimizer over both generators, one over all four discriminators.
    self.G_optim = torch.optim.Adam(
        itertools.chain(
            self.genA2B.parameters(),
            self.genB2A.parameters(),
        ),
        lr=self.lr,
        betas=(0.5, 0.999),
        weight_decay=self.weight_decay,
    )
    self.D_optim = torch.optim.Adam(
        itertools.chain(
            self.disGA.parameters(),
            self.disGB.parameters(),
            self.disLA.parameters(),
            self.disLB.parameters(),
        ),
        lr=self.lr,
        betas=(0.5, 0.999),
        weight_decay=self.weight_decay,
    )

    """ Define Rho clipper to constraint the value of rho in AdaILN and ILN"""
    self.Rho_clipper = RhoClipper(0, 1)
def __init__(
    self,
    document_embeddings: flair.embeddings.DocumentEmbeddings,
    label_dictionary: Dictionary,
    label_type: str = None,
    multi_label: bool = None,
    multi_label_threshold: float = 0.5,
    beta: float = 1.0,
    loss_weights: Dict[str, float] = None,
):
    """
    Initializes a TextClassifier
    :param document_embeddings: embeddings used to embed each data point
    :param label_dictionary: dictionary of labels you want to predict
    :param label_type: name of the label type (optional)
    :param multi_label: auto-detected by default, but you can set this to True
    to force multi-label prediction or False to force single-label prediction
    :param multi_label_threshold: If multi-label you can set the threshold to
    make predictions
    :param beta: Parameter for F-beta score for evaluation and training annealing
    :param loss_weights: Dictionary of weights for labels for the loss function
    (if any label's weight is unspecified it will default to 1.0)
    """
    super(TextClassifier, self).__init__()

    self.document_embeddings: flair.embeddings.DocumentEmbeddings = document_embeddings
    self.label_dictionary: Dictionary = label_dictionary
    self.label_type = label_type

    # Fall back to the label dictionary's own multi-label flag when the
    # caller does not force a mode.
    if multi_label is not None:
        self.multi_label = multi_label
    else:
        self.multi_label = self.label_dictionary.multi_label

    self.multi_label_threshold = multi_label_threshold
    self.beta = beta
    self.weight_dict = loss_weights

    # Initialize the weight tensor: 1.0 per class by default, overridden by
    # any explicit entries in loss_weights.
    if loss_weights is not None:
        n_classes = len(self.label_dictionary)
        weight_list = [1. for i in range(n_classes)]
        for i, tag in enumerate(self.label_dictionary.get_items()):
            if tag in loss_weights.keys():
                weight_list[i] = loss_weights[tag]
        self.loss_weights = torch.FloatTensor(weight_list).to(flair.device)
    else:
        self.loss_weights = None

    # Single linear decoder from document embedding to label logits.
    self.decoder = nn.Linear(self.document_embeddings.embedding_length,
                             len(self.label_dictionary))
    nn.init.xavier_uniform_(self.decoder.weight)

    # Multi-label -> independent per-label BCE on logits;
    # single-label -> softmax cross-entropy.
    if self.multi_label:
        self.loss_function = nn.BCEWithLogitsLoss(weight=self.loss_weights)
    else:
        self.loss_function = nn.CrossEntropyLoss(weight=self.loss_weights)

    # auto-spawn on GPU if available
    self.to(flair.device)
def forward(self, teacher_features, features, y_pred, labels):
    """Weighted sum of a feature-consistency (distillation) MSE term and a
    BCE-with-logits classification term; the two weights come from
    ``self.weights``."""
    mse = nn.MSELoss()
    bce = nn.BCEWithLogitsLoss()
    # Flatten both feature tensors so the MSE is element-wise regardless of shape.
    distill_term = mse(teacher_features.view(-1), features.view(-1))
    cls_term = bce(y_pred, labels)
    return self.weights[0] * distill_term + self.weights[1] * cls_term
def train_loop(folds, fold):
    """Train/validate one CV fold of a teacher-student model on GPU or TPU.

    Builds the fold's loaders, loads a frozen teacher, trains the student
    with a custom loss, tracks best score and best loss checkpoints, and
    returns ``valid_folds`` with per-target prediction columns filled from
    the best-score checkpoint.

    NOTE(review): depends on many module-level names (CFG, LOGGER, xm, pl,
    idist, train_annotations, TrainDataset, get_transforms, train_fn,
    valid_fn, get_score, OUTPUT_DIR, model classes) defined outside this
    chunk.
    """
    if CFG.device == 'GPU':
        LOGGER.info(f"========== fold: {fold} training ==========")
    elif CFG.device == 'TPU':
        if CFG.nprocs == 1:
            LOGGER.info(f"========== fold: {fold} training ==========")
        elif CFG.nprocs == 8:
            xm.master_print(f"========== fold: {fold} training ==========")

    # ====================================================
    # loader
    # ====================================================
    trn_idx = folds[folds['fold'] != fold].index
    val_idx = folds[folds['fold'] == fold].index

    train_folds = folds.loc[trn_idx].reset_index(drop=True)
    valid_folds = folds.loc[val_idx].reset_index(drop=True)
    # Keep only training studies that have annotations.
    train_folds = train_folds[train_folds['StudyInstanceUID'].isin(
        train_annotations['StudyInstanceUID'].unique())].reset_index(drop=True)
    valid_labels = valid_folds[CFG.target_cols].values

    train_dataset = TrainDataset(train_folds, train_annotations,
                                 use_annot=True,
                                 transform=get_transforms(data='train'))
    valid_dataset = TrainDataset(valid_folds, train_annotations,
                                 use_annot=False,
                                 transform=get_transforms(data='valid'))

    if CFG.device == 'GPU':
        train_loader = DataLoader(train_dataset, batch_size=CFG.batch_size,
                                  shuffle=True, num_workers=CFG.num_workers,
                                  pin_memory=True, drop_last=True)
        valid_loader = DataLoader(valid_dataset,
                                  batch_size=CFG.batch_size * 2,
                                  shuffle=False, num_workers=CFG.num_workers,
                                  pin_memory=True, drop_last=False)
    elif CFG.device == 'TPU':
        # Distributed samplers shard data across the TPU cores.
        train_sampler = torch.utils.data.distributed.DistributedSampler(
            train_dataset, num_replicas=xm.xrt_world_size(),
            rank=xm.get_ordinal(), shuffle=True)
        train_loader = torch.utils.data.DataLoader(
            train_dataset, batch_size=CFG.batch_size, sampler=train_sampler,
            drop_last=True, num_workers=CFG.num_workers)
        valid_sampler = torch.utils.data.distributed.DistributedSampler(
            valid_dataset, num_replicas=xm.xrt_world_size(),
            rank=xm.get_ordinal(), shuffle=False)
        valid_loader = torch.utils.data.DataLoader(
            valid_dataset, batch_size=CFG.batch_size * 2,
            sampler=valid_sampler, drop_last=False,
            num_workers=CFG.num_workers)

    # ====================================================
    # scheduler
    # ====================================================
    def get_scheduler(optimizer):
        # Build the LR scheduler selected by CFG.scheduler.
        if CFG.scheduler=='ReduceLROnPlateau':
            scheduler = ReduceLROnPlateau(optimizer, mode='min',
                                          factor=CFG.factor,
                                          patience=CFG.patience,
                                          verbose=True, eps=CFG.eps)
        elif CFG.scheduler=='CosineAnnealingLR':
            scheduler = CosineAnnealingLR(optimizer, T_max=CFG.T_max,
                                          eta_min=CFG.min_lr, last_epoch=-1)
        elif CFG.scheduler=='CosineAnnealingWarmRestarts':
            scheduler = CosineAnnealingWarmRestarts(optimizer, T_0=CFG.T_0,
                                                    T_mult=1,
                                                    eta_min=CFG.min_lr,
                                                    last_epoch=-1)
        return scheduler

    # ====================================================
    # model & optimizer
    # ====================================================
    if CFG.device == 'TPU':
        device = xm.xla_device()
    elif CFG.device == 'GPU':
        device = torch.device('cuda' if torch.cuda.is_available() else 'cpu')

    # Frozen teacher: weights loaded from CFG.teacher, gradients disabled.
    teacher_model = CustomSeResNet152D(CFG.model_name, pretrained=False)
    teacher_model.to(device)
    state = torch.load(CFG.teacher)
    teacher_model.load_state_dict(state['model'])
    for param in teacher_model.parameters():
        param.requires_grad = False
    teacher_model.eval()
    # teacher_model.to(device)

    model = CustomSeResNet152D_WLF(CFG.model_name, pretrained=True)
    model.to(device)
    # state = torch.load(CFG.student)
    # model.load_state_dict(state['model'])

    optimizer = Adam(model.parameters(), lr=CFG.lr,
                     weight_decay=CFG.weight_decay, amsgrad=False)
    scheduler = get_scheduler(optimizer)

    # ====================================================
    # loop
    # ====================================================
    train_criterion = CustomLoss(weights=CFG.weights)
    valid_criterion = nn.BCEWithLogitsLoss()

    best_score = 0.
    best_loss = np.inf

    for epoch in range(CFG.epochs):
        start_time = time.time()
        # train
        if CFG.device == 'TPU':
            if CFG.nprocs == 1:
                avg_loss = train_fn(train_loader, teacher_model, model,
                                    train_criterion, optimizer, epoch,
                                    scheduler, device)
            elif CFG.nprocs == 8:
                para_train_loader = pl.ParallelLoader(train_loader, [device])
                avg_loss = train_fn(
                    para_train_loader.per_device_loader(device),
                    teacher_model, model, train_criterion, optimizer, epoch,
                    scheduler, device)
        elif CFG.device == 'GPU':
            avg_loss = train_fn(train_loader, teacher_model, model,
                                train_criterion, optimizer, epoch, scheduler,
                                device)
        # eval
        if CFG.device == 'TPU':
            if CFG.nprocs == 1:
                avg_val_loss, preds, _ = valid_fn(valid_loader, model,
                                                  valid_criterion, device)
            elif CFG.nprocs == 8:
                # Gather predictions and labels from all 8 cores.
                para_valid_loader = pl.ParallelLoader(valid_loader, [device])
                avg_val_loss, preds, valid_labels = valid_fn(
                    para_valid_loader.per_device_loader(device), model,
                    valid_criterion, device)
                preds = idist.all_gather(torch.tensor(preds)).to('cpu').numpy()
                valid_labels = idist.all_gather(
                    torch.tensor(valid_labels)).to('cpu').numpy()
        elif CFG.device == 'GPU':
            avg_val_loss, preds, _ = valid_fn(valid_loader, model,
                                              valid_criterion, device)

        if isinstance(scheduler, ReduceLROnPlateau):
            scheduler.step(avg_val_loss)
        elif isinstance(scheduler, CosineAnnealingLR):
            scheduler.step()
        elif isinstance(scheduler, CosineAnnealingWarmRestarts):
            scheduler.step()

        # scoring
        score, scores = get_score(valid_labels, preds)

        elapsed = time.time() - start_time

        if CFG.device == 'GPU':
            LOGGER.info(f'Epoch {epoch+1} - avg_train_loss: {avg_loss:.4f}  avg_val_loss: {avg_val_loss:.4f}  time: {elapsed:.0f}s')
            LOGGER.info(f'Epoch {epoch+1} - Score: {score:.4f}  Scores: {np.round(scores, decimals=4)}')
        elif CFG.device == 'TPU':
            if CFG.nprocs == 1:
                LOGGER.info(f'Epoch {epoch+1} - avg_train_loss: {avg_loss:.4f}  avg_val_loss: {avg_val_loss:.4f}  time: {elapsed:.0f}s')
                LOGGER.info(f'Epoch {epoch+1} - Score: {score:.4f}  Scores: {np.round(scores, decimals=4)}')
            elif CFG.nprocs == 8:
                xm.master_print(f'Epoch {epoch+1} - avg_train_loss: {avg_loss:.4f}  avg_val_loss: {avg_val_loss:.4f}  time: {elapsed:.0f}s')
                xm.master_print(f'Epoch {epoch+1} - Score: {score:.4f}  Scores: {np.round(scores, decimals=4)}')

        # Checkpoint on best validation score.
        if score > best_score:
            best_score = score
            if CFG.device == 'GPU':
                LOGGER.info(f'Epoch {epoch+1} - Save Best Score: {best_score:.4f} Model')
                torch.save({'model': model.state_dict(), 'preds': preds},
                           OUTPUT_DIR+f'{CFG.model_name}_fold{fold}_best_score.pth')
            elif CFG.device == 'TPU':
                if CFG.nprocs == 1:
                    LOGGER.info(f'Epoch {epoch+1} - Save Best Score: {best_score:.4f} Model')
                elif CFG.nprocs == 8:
                    xm.master_print(f'Epoch {epoch+1} - Save Best Score: {best_score:.4f} Model')
                xm.save({'model': model, 'preds': preds},
                        OUTPUT_DIR+f'{CFG.model_name}_fold{fold}_best_score.pth')

        # Checkpoint on best (lowest) validation loss.
        if avg_val_loss < best_loss:
            best_loss = avg_val_loss
            if CFG.device == 'GPU':
                LOGGER.info(f'Epoch {epoch+1} - Save Best Loss: {best_loss:.4f} Model')
                torch.save({'model': model.state_dict(), 'preds': preds},
                           OUTPUT_DIR+f'{CFG.model_name}_fold{fold}_best_loss.pth')
            elif CFG.device == 'TPU':
                if CFG.nprocs == 1:
                    LOGGER.info(f'Epoch {epoch+1} - Save Best Loss: {best_loss:.4f} Model')
                elif CFG.nprocs == 8:
                    xm.master_print(f'Epoch {epoch+1} - Save Best Loss: {best_loss:.4f} Model')
                xm.save({'model': model, 'preds': preds},
                        OUTPUT_DIR+f'{CFG.model_name}_fold{fold}_best_loss.pth')

        # # Save every epoch for inference use
        # if CFG.device == 'TPU':
        #     xm.save({'model': model.state_dict()}, OUTPUT_DIR+f'{CFG.model_name}_fold{fold}_epoch{epoch+1}.pth')
        # elif CFG.device == 'GPU':
        #     torch.save({'model': model.state_dict()}, OUTPUT_DIR+f'{CFG.model_name}_fold{fold}_epoch{epoch+1}.pth')

    # Fill prediction columns from the best-score checkpoint
    # (skipped under 8-process TPU, where saving happens via xm.save).
    if CFG.nprocs != 8:
        check_point = torch.load(OUTPUT_DIR+f'{CFG.model_name}_fold{fold}_best_score.pth')
        for c in [f'pred_{c}' for c in CFG.target_cols]:
            valid_folds[c] = np.nan
        valid_folds[[f'pred_{c}' for c in CFG.target_cols]] = check_point['preds']

    return valid_folds
def bce_loss(output, target):
    """Mean binary cross-entropy computed directly on raw logits."""
    return nn.BCEWithLogitsLoss(reduction='mean')(output, target)
def __init__(self, batch=True):
    """Set up the combined dice + BCE loss.

    ``batch`` presumably toggles batch-wise dice reduction — confirm
    against this class's forward/dice implementation.
    """
    super(dice_bce_loss, self).__init__()
    self.batch = batch
    # BCEWithLogitsLoss fuses the sigmoid into the loss, which is more
    # numerically stable than nn.BCELoss on pre-sigmoided probabilities.
    self.bce_loss = nn.BCEWithLogitsLoss()
# Script-level GAN setup: generator (VAE_SR), discriminator and a VGG feature
# extractor for perceptual loss; a pre-trained denoiser is then loaded.
# NOTE(review): opt, denoiser and gpus_list come from outside this chunk.
G = VAE_SR(input_dim=3, dim=64, scale_factor=opt.upscale_factor)
D = discriminator(num_channels=3, base_filter=64,
                  image_size=opt.patch_size * opt.upscale_factor)
# VGG features from layer 34 (no BN), inputs normalized inside the extractor.
feat_extractor = VGGFeatureExtractor(feature_layer=34, use_bn=False,
                                     use_input_norm=True, device='cuda')
# Wrap everything for multi-GPU execution.
denoiser = torch.nn.DataParallel(denoiser, device_ids=gpus_list)
G = torch.nn.DataParallel(G, device_ids=gpus_list)
D = torch.nn.DataParallel(D, device_ids=gpus_list)
feat_extractor = torch.nn.DataParallel(feat_extractor, device_ids=gpus_list)
L1_loss = nn.L1Loss()
BCE_loss = nn.BCEWithLogitsLoss()  # adversarial loss on raw discriminator logits
print('---------- Generator architecture -------------')
print_network(G)
print('---------- Discriminator architecture -------------')
print_network(D)
print('----------------------------------------------')
# NOTE(review): os.path.join is given a single pre-concatenated string here,
# so it performs no joining; opt.save_folder must already end with a separator.
model_denoiser = os.path.join(opt.save_folder + 'VAE_denoiser.pth')
# map_location keeps the load on CPU; DataParallel handles device placement.
denoiser.load_state_dict(
    torch.load(model_denoiser, map_location=lambda storage, loc: storage))
print('Pre-trained Denoiser model is loaded.')
if opt.pretrained:
    model_G = os.path.join(opt.save_folder + opt.pretrained_sr)
    model_D = os.path.join(opt.save_folder + opt.pretrained_D)
def __init__(self, num_hard=0):
    """Initialize the detection loss.

    ``num_hard`` is accepted but not stored in the code visible here —
    presumably consumed by hard-negative mining elsewhere; confirm.
    """
    super(Loss, self).__init__()
    # Classification term: BCE computed on raw logits.
    self.classify_loss = nn.BCEWithLogitsLoss()
def train_model(model, tag_name, target_cols_now, fine_tune_scheduler=None):
    """Build loaders/optimizer for a MoA model and collect OOF predictions.

    NOTE(review): the entire epoch/training loop is commented out below, so
    as written this only runs validation on the given (presumably already
    trained) model and fills the out-of-fold array. Depends on module-level
    globals (train_df, valid_df, feature_cols, BATCH_SIZE, LEARNING_RATE,
    WEIGHT_DECAY, PCT_START, DIV_FACTOR, MAX_LR, EPOCHS, smoothing, train,
    val_idx, DEVICE, valid_fn, MoADataset, SmoothCrossEntropyLoss).

    Returns:
        oof: (len(train), len(target_cols_now)) array with validation
        predictions written at val_idx rows.
    """
    print(f'=={tag_name}==')
    train_losses = list()
    valid_losses = list()
    x_train, y_train = train_df[feature_cols].values, train_df[
        target_cols_now].values
    x_valid, y_valid = valid_df[feature_cols].values, valid_df[
        target_cols_now].values
    train_dataset = MoADataset(x_train, y_train)
    valid_dataset = MoADataset(x_valid, y_valid)
    trainloader = torch.utils.data.DataLoader(train_dataset,
                                              batch_size=BATCH_SIZE,
                                              shuffle=True)
    validloader = torch.utils.data.DataLoader(valid_dataset,
                                              batch_size=BATCH_SIZE,
                                              shuffle=False)
    # Per-tag hyperparameters come from dicts keyed by tag_name.
    optimizer = torch.optim.Adam(model.parameters(), lr=LEARNING_RATE,
                                 weight_decay=WEIGHT_DECAY[tag_name])
    scheduler = optim.lr_scheduler.OneCycleLR(
        optimizer=optimizer, steps_per_epoch=len(trainloader),
        pct_start=PCT_START, div_factor=DIV_FACTOR[tag_name],
        max_lr=MAX_LR[tag_name], epochs=EPOCHS)
    # Metric loss (BCE on logits) and smoothed training loss.
    loss_fn = nn.BCEWithLogitsLoss()
    loss_tr = SmoothCrossEntropyLoss(smoothing=smoothing)
    oof = np.zeros((len(train), len(target_cols_now)))
    best_loss = np.inf
    # for epoch in range(EPOCHS):
    #     if fine_tune_scheduler is not None:
    #         fine_tune_scheduler.step(epoch, model)
    #     print(f'-----EPOCH{epoch+1}-----')
    #     train_loss, train_metric = train_fn(model, optimizer, scheduler, loss_tr, loss_fn, trainloader, DEVICE)
    #     print(f'train_loss: {train_loss:.5f}, train_metric: {train_metric:.5f}')
    #     train_losses.append(train_loss)
    #     valid_loss, valid_metric, valid_preds = valid_fn(model, loss_tr, loss_fn, validloader, DEVICE)
    #     print(f'valid_loss: {valid_loss:.5f}, valid_metric: {valid_metric:.5f}')
    #     valid_losses.append(valid_loss)
    #     if valid_loss < best_loss:
    #         best_loss = valid_loss
    #         oof[val_idx] = valid_preds
    #         torch.save(model.state_dict(), f'Simple_Deep_FOLD{fold+1}_SEED{seed}.pth')
    # plt.plot(train_losses, label='train_losses')
    # plt.plot(valid_losses, label='valid_losses')
    # plt.xlabel('epochs')
    # plt.ylabel('loss')
    # if tag_name == 'ALL_TARGETS':
    #     plt.ylim([6e-3, 1.5e-2])
    # else:
    #     plt.ylim([1e-2, 2e-2])
    # plt.title(f'fold{fold+1} losses')
    # plt.show()
    valid_loss, valid_metric, valid_preds = valid_fn(
        model, loss_tr, loss_fn, validloader, DEVICE)
    oof[val_idx] = valid_preds
    return oof
def main():
    """Evaluate a trained VAD/attention model over noise x SNR test sets and
    print AUC tables aggregated per SNR, for all noises and for a fixed
    five-noise subset.

    Fixes vs. the previous version:
      * The five-noise membership test compared ``name.split('_')[0]`` to
        'F16_cockpit', which can never match because the noise name itself
        contains an underscore ('F16_cockpit_SNR-5'.split('_')[0] == 'F16').
        Membership is now a prefix check against the real noise prefixes.
      * The four copy-pasted SNR accumulation branches are collapsed into a
        single index lookup.

    NOTE(review): relies on module-level globals (args, c, cudnn, Model,
    read_DB_structure, select_test_DB, test, tabulate) defined outside this
    chunk.
    """
    args.cuda = not args.no_cuda and torch.cuda.is_available()
    if args.cuda:
        cudnn.benchmark = True

    noise_list = ['airport_', 'babble_', 'car_', 'destroyerengine_',
                  'F16_cockpit_', 'factory_', 'machinegun_', 'street_',
                  'train_', 'volvo_']
    SNR_list = ['SNR-5', 'SNR0', 'SNR5', 'SNR10']
    # One test set per (noise, SNR) combination, e.g. 'babble_SNR-5'.
    DB_list = [noise + snr_tag for noise in noise_list for snr_tag in SNR_list]

    LOG_DIR = args.log_dir + str(
        args.seed) + '/Padding-{}/Atype-{}_Loss-{}_gamma-{}'.format(
            args.padding_time, args.attention_type, args.loss, args.gamma)
    print(LOG_DIR)

    input_size = c.FILTER_BANK
    model = Model(rnn_model=args.RNN_model, input_size=input_size,
                  rnn_hidden_size=args.hidden_size,
                  num_layers=args.num_layers,
                  dnn_hidden_size=c.P_DNN_HIDDEN_SIZE, seq_len=args.seq_len,
                  attention_type=args.attention_type)
    test_DB = read_DB_structure(
        os.path.join(c.MFB_DIR + '_' + str(1.0), 'test_folder'), 'test')
    device_num = 'cuda:' + args.gpu_id
    device = torch.device(device_num)
    if args.cuda:
        model.to(device)

    print('=> loading checkpoint: CP_NUM = ' + str(args.cp_num))
    checkpoint = torch.load(LOG_DIR + '/checkpoint ' + str(args.cp_num) + '.pth')
    model.load_state_dict(checkpoint['state_dict'])
    model.eval()
    criterion = nn.BCEWithLogitsLoss()

    # Accumulators indexed by SNR: [-5, 0, 5, 10] dB.
    snr_index = {'-5': 0, '0': 1, '5': 2, '10': 3}
    # Prefixes of the five-noise subset reported separately.
    five_noise_prefixes = ('babble_', 'destroyerengine_', 'F16_cockpit_',
                           'factory_', 'street_')
    snr_files = np.zeros(4)
    snr_AUC = np.zeros(4)
    five_files = np.zeros(4)
    five_noises_auc = np.zeros(4)

    for db_name in DB_list:
        selected_DB = select_test_DB(test_DB, db_name)
        print(db_name)
        m_Acc, m_AUC, m_EER, m_cost, temp_AUC, n_files = test(
            model, selected_DB, criterion)
        # 'xxx_SNR-5' -> '-5' (strip the 'SNR' prefix from the last token).
        snr = db_name.split('_')[-1][3:]
        idx = snr_index.get(snr)
        if idx is not None:
            snr_files[idx] = snr_files[idx] + n_files
            snr_AUC[idx] = snr_AUC[idx] + temp_AUC
            # Prefix match handles noise names that contain underscores
            # (the old split('_')[0] check silently dropped F16_cockpit).
            if db_name.startswith(five_noise_prefixes):
                five_noises_auc[idx] = five_noises_auc[idx] + temp_AUC
                five_files[idx] = five_files[idx] + n_files

    print('-'*7 + 'All Noises' + '-'*7)
    print(tabulate([['-5dB AUC', 100*(snr_AUC[0] / snr_files[0])],
                    [' 0dB AUC', 100*(snr_AUC[1] / snr_files[1])],
                    [' 5dB AUC', 100*(snr_AUC[2] / snr_files[2])],
                    ['10dB AUC', 100*(snr_AUC[3] / snr_files[3])],
                    ['-5,0dB AVG', 100*((snr_AUC[0]/snr_files[0] + snr_AUC[1]/snr_files[1])/2)],
                    ['Total AVG', 100*((snr_AUC[0]/snr_files[0] + snr_AUC[1]/snr_files[1] + snr_AUC[2]/snr_files[2] + snr_AUC[3]/snr_files[3])/4)]],
                   tablefmt='grid'))
    print('-' * 7 + '5 Noises' + '-' * 7)
    print(tabulate([['-5dB AUC', 100 * (five_noises_auc[0] / five_files[0])],
                    [' 0dB AUC', 100 * (five_noises_auc[1] / five_files[1])],
                    [' 5dB AUC', 100 * (five_noises_auc[2] / five_files[2])],
                    ['10dB AUC', 100 * (five_noises_auc[3] / five_files[3])],
                    ['-5,0dB AVG', 100*((five_noises_auc[0]/five_files[0] + five_noises_auc[1]/five_files[1])/2)],
                    ['Total AVG', 100*((five_noises_auc[0]/five_files[0] + five_noises_auc[1]/five_files[1] + five_noises_auc[2]/five_files[2] + five_noises_auc[3]/five_files[3])/4)]],
                   tablefmt='grid'))
# Script-level setup for sleep-staging training: datasets, model, loss and
# optimizer, then the epoch loop.
# NOTE(review): SleepDataset, Model_V1, SleepDatasetTest, window_size,
# hanning_window and learning_rate are defined outside this chunk.
train_dataset = SleepDataset(
    '/beegfs/ga4493/projects/groupb/data/training/RECORDS',
    '/beegfs/ga4493/projects/groupb/data/training/', 100, 150)
train_loaders = torch.utils.data.DataLoader(dataset=train_dataset,
                                            batch_size=1, shuffle=True)
model_v1 = Model_V1(window_size, hanning_window, window_size)
if torch.cuda.is_available():
    print('using cuda')
    model_v1.cuda()
# Summed (not mean) BCE on raw logits. 'reduction' replaces the deprecated
# size_average flag (size_average=False is equivalent to reduction='sum').
criterion = nn.BCEWithLogitsLoss(reduction='sum')
optimizer = torch.optim.Adam(model_v1.parameters(),
                             lr=learning_rate)  # , momentum=0.9)#, weight_decay=1e-3)
sig = nn.Sigmoid()
test_dataset = SleepDatasetTest(
    '/beegfs/ga4493/projects/groupb/data/training/RECORDS',
    '/beegfs/ga4493/projects/groupb/data/training/', 0, 10)
test_loader = torch.utils.data.DataLoader(dataset=test_dataset,
                                          batch_size=1, shuffle=False)
# i, ((data, cent), v_l) = next(enumerate(test_loader))
losses = []
v_losses = []
l = None
for epoch in range(50):
    loss_t = 0.0
def train_model(X_train, y_train, subject_indices, model="binary",
                criterion="BCE", optimizer="SGD", smote=False,
                activation_function="relu", batch_size=128,
                hidden_layer_dims=[150], epochs=1000, learning_rate=0.0001,
                dropout=0.5, weight_decay=0.5, split_val=0.33, verbose=True,
                random_state=SEED):
    """Modular function for initializing and training a model.

    Args:
        X_train (np.array): training samples
        y_train (np.array): training labels
        subject_indices (np.array): indices of distinct subjects; used for a
            group-aware split so no subject appears in both train and val
        model (string): type of model; currently implemented: ["binary"]
        criterion (string): loss function; currently implemented: ["BCE"]
        optimizer (string): optimizer; currently implemented: ["Adam", "SGD"]
        smote (bool): whether to perform SMOTE class equalization
        activation_function (string): activation function; currently
            implemented: ["relu"]
        batch_size (int): size of training batches
        hidden_layer_dims (list of ints): the dimensions (and amount) of
            hidden layers, ordered from input to output (without input and
            output layer sizes)
        epochs (int): max number of training epochs
        learning_rate (float): learning rate for the model
        dropout (float): dropout rate
        weight_decay (float): weight decay rate (i.e. regularization)
        split_val (float): validation dataset size
        verbose (bool): print information
        random_state (int): seed for random functions

    Returns:
        model (nn.Module): the trained model, restored to its best epoch
    """
    # Group-aware split: each subject lands entirely in train OR validation.
    # BUGFIX: previously hard-coded SEED here, silently ignoring the
    # `random_state` parameter (default unchanged, so behavior for default
    # callers is identical).
    gss = GroupShuffleSplit(n_splits=1, train_size=1 - split_val,
                            random_state=random_state)
    # n_splits=1 -> take the single (train, val) index pair directly
    train_idx, val_idx = next(gss.split(X_train, y_train, subject_indices))
    X_val = X_train[val_idx]
    y_val = y_train[val_idx]
    X_train = X_train[train_idx]
    y_train = y_train[train_idx]

    # apply class imbalance equalization using SMOTE (training split only,
    # so the validation set stays untouched)
    if smote:
        oversample = SMOTE(random_state=random_state)
        X_train, y_train = oversample.fit_resample(X_train, y_train)

    y_train = y_train.reshape((-1, 1))

    # torch data loaders; validation loader yields one full-size batch
    train_loader = get_data_loader(X_train, y_train, batch_size)
    val_loader = get_data_loader(X_val, y_val, X_val.shape[0])

    # initiate the correct model
    # BUGFIX: `model in "binary"` was a substring test (e.g. "b" or "in"
    # would pass); membership must be checked against a collection.
    assert model in ("binary",), "Model not implemented yet!"
    if model == "binary":
        model = BinaryClassification([X_train.shape[1], *hidden_layer_dims, 1],
                                     dropout=dropout,
                                     activation_function=activation_function)

    # cpu or gpu, depending on setup
    model.to(device)

    # print model summary
    if verbose:
        print(model)

    # initiate the correct criterion/loss function
    if criterion == "BCE":
        criterion = nn.BCEWithLogitsLoss()

    # initiate the correct optimizer
    if optimizer == "Adam":
        optimizer = optim.Adam(model.parameters(), lr=learning_rate,
                               weight_decay=weight_decay)
    elif optimizer == "SGD":
        optimizer = torch.optim.SGD(model.parameters(), lr=learning_rate,
                                    weight_decay=weight_decay, momentum=0.5)

    # per-epoch validation metrics
    val_losses_epochs = []
    val_acc_epochs = []
    val_auc_epochs = []
    # weights and epoch index of the best model seen so far
    best_model = None
    best_epoch = -1

    for epoch in range(1, epochs + 1):
        # training mode (relevant for batch norm, dropout, ...)
        model.train()
        for X_batch, y_batch in train_loader:
            X_batch, y_batch = X_batch.to(device), y_batch.to(device)
            optimizer.zero_grad()
            y_pred = model(X_batch)
            # squeeze trailing dim so prediction/target shapes match for BCE
            if y_pred.ndim == 2:
                y_pred = y_pred[:, 0]
            if y_batch.ndim == 2:
                y_batch = y_batch[:, 0]
            loss = criterion(y_pred, y_batch)
            loss.backward()
            optimizer.step()

        # evaluation mode; validation loader holds a single full-size batch
        model.eval()
        # NOTE(review): validation tensors are not moved to `device`; this
        # only works when `device` is the CPU — confirm with callers.
        for X_batch, y_batch in val_loader:
            y_pred = model(X_batch)
            if y_pred.ndim == 2:
                y_pred = y_pred[:, 0]
            if y_batch.ndim == 2:
                y_batch = y_batch[:, 0]
            loss = criterion(y_pred, y_batch)
            acc = binary_acc(y_pred, y_batch)
            # AUC is undefined when the batch contains only one class
            if len(np.unique(y_batch)) == 1:
                auc = "None"
            else:
                auc = area_under_the_curve(y_pred.detach().numpy(),
                                           y_batch.detach().numpy())

        # one full-size batch -> these are the epoch-level metrics
        val_losses_epochs.append(loss.item())
        val_acc_epochs.append(acc)
        val_auc_epochs.append(auc)

        # for printing
        indicator_string = ""

        # model selection: prefer max AUC; fall back to min loss as soon as
        # any epoch produced an undefined ("None") AUC
        try_acc = False
        if not ("None" in val_auc_epochs) and (np.max(val_auc_epochs) == val_auc_epochs[-1]):
            best_epoch = epoch
            best_model = model.state_dict()
            indicator_string += "!"
        elif "None" in val_auc_epochs:
            try_acc = True
        if try_acc and np.min(val_losses_epochs) == val_losses_epochs[-1]:
            best_epoch = epoch
            best_model = model.state_dict()
            indicator_string += "!"

        if verbose:
            if isinstance(auc, str):
                print(
                    f'Epoch {epoch + 0:03}: | Validation Loss: {val_losses_epochs[-1]:.3f} | ACC: {val_acc_epochs[-1]:.3f} | AUC: {val_auc_epochs[-1]} {indicator_string}'
                )
            else:
                print(
                    f'Epoch {epoch + 0:03}: | Validation Loss: {val_losses_epochs[-1]:.3f} | ACC: {val_acc_epochs[-1]:.3f} | AUC: {val_auc_epochs[-1]:.3f} {indicator_string}'
                )

        # convergence criterion: no new best model within the last 5 epochs
        if (epoch - best_epoch) >= 5:
            break

    # restore the best weights before returning
    model.load_state_dict(best_model)
    return model
def train_superres(load_trained):
    """Train the SuperRes network (low-res mel input -> high-res magnitude
    spectrogram output, per the batch keys used below).

    Args:
        load_trained (int): if > 0, resume from the checkpoint saved after
            `load_trained` thousand batches.
    """
    # logdir
    logdir = os.path.join(Hyper.logdir, "superres")
    if not os.path.exists(logdir):
        os.makedirs(logdir)
    if not os.path.exists(os.path.join(logdir, "pkg")):
        os.mkdir(os.path.join(logdir, "pkg"))
    # device
    device = Hyper.device_superres
    # graph
    graph = SuperRes().to(device)
    graph.train()
    # load data
    names, lengths, texts = load_data()
    batch_maker = BatchMaker(Hyper.batch_size, names, lengths, texts)
    # loss: L1 on predicted magnitudes plus BCE on the raw logits
    criterion_mags = nn.L1Loss().to(device)
    criterion_bd2 = nn.BCEWithLogitsLoss().to(device)
    lossplot_mags = LogHelper("mag_l1", logdir)
    lossplot_bd2 = LogHelper("mag_BCE", logdir)
    # optim
    optimizer = torch.optim.Adam(graph.parameters(),
                                 lr=Hyper.adam_alpha,
                                 betas=Hyper.adam_betas,
                                 eps=Hyper.adam_eps)
    # load checkpoint (global_step resumes from the saved iteration)
    global_step = 0
    if load_trained > 0:
        print("load model trained for {}k batches".format(load_trained))
        global_step = load(
            os.path.join(logdir, "pkg/save_{}k.pkg".format(load_trained)),
            graph, {
                "mags": criterion_mags,
                "bd2": criterion_bd2
            }, optimizer)
    # outer loop repeats the dataset enough times to reach Hyper.num_batches
    for loop_cnt in range(
            int(Hyper.num_batches / batch_maker.num_batches() + 0.5)):
        print("loop", loop_cnt)
        bar = PrettyBar(batch_maker.num_batches())
        bar.set_description("training...")
        loss_str0 = MovingAverage()
        loss_str1 = MovingAverage()
        for bi in bar:
            batch = batch_maker.next_batch()
            # low res
            mels = torch.FloatTensor(batch["mels"]).to(device)
            # high res
            mags = torch.FloatTensor(batch["mags"]).to(device)
            # forward
            mag_logits, mag_pred = graph(mels)
            # loss
            loss_mags = criterion_mags(mag_pred, mags)
            loss_bd2 = criterion_bd2(mag_logits, mags)
            loss = loss_mags + loss_bd2
            # backward
            graph.zero_grad()
            optimizer.zero_grad()
            loss.backward()
            # clip grad
            nn.utils.clip_grad_value_(graph.parameters(), 1)
            optimizer.step()
            # log (moving averages of each loss term)
            loss_str0.add(loss_mags.cpu().data.mean())
            loss_str1.add(loss_bd2.cpu().data.mean())
            lossplot_mags.add(loss_str0(), global_step)
            lossplot_bd2.add(loss_str1(), global_step)
            bar.set_description("gs: {}, mags: {}, bd2: {}".format(
                global_step, loss_str0(), loss_str1()))
            # plot
            # NOTE(review): gs is forced to 0, so each plot overwrites the
            # previous one instead of being tagged with global_step — confirm
            # this is intended.
            if global_step % 100 == 0:
                gs = 0
                plot_spectrum(mag_pred[0].cpu().data, "pred", gs, dir=logdir)
                plot_spectrum(mags[0].cpu().data, "true", gs, dir=logdir)
                plot_spectrum(mels[0].cpu().data, "input", gs, dir=logdir)
            if global_step % 100 == 0:
                lossplot_mags.plot()
                lossplot_bd2.plot()
            # periodic checkpoint (every 10k steps)
            if global_step % 10000 == 0:
                save(
                    os.path.join(logdir, "pkg/save_{}k.pkg").format(
                        global_step // 1000), graph, {
                            "mags": criterion_mags,
                            "bd2": criterion_bd2
                        }, optimizer, global_step, True)
            global_step += 1
# NOTE(review): the next lines are the tail of a discriminator-builder
# function whose `def` line lies outside this chunk.
              nn.Linear(256, 256), nn.LeakyReLU(0.2),
              nn.Linear(256, 1))
    return net


# generator network
def generator(noise_dim=NOISE_DIM):
    # NOTE(review): `nn.oSequential` looks like a typo for `nn.Sequential`
    # — as written this raises AttributeError; confirm and fix.
    net = nn.oSequential(nn.Linear(noise_dim, 1024), nn.ReLU(True),
                         nn.Linear(1024, 1024), nn.ReLU(True),
                         nn.Linear(1024, 784), nn.Tanh())
    return net


# The discriminator's loss pushes real-data scores toward 1 and fake-data
# scores toward 0, while the generator's loss pushes fake-data scores toward 1.
bce_loss = nn.BCEWithLogitsLoss()  # cross-entropy loss on raw logits


def discriminator_loss(logits_real, logits_fake):
    # discriminator loss: real batch labeled 1, fake batch labeled 0
    size = logits_real.shape[0]
    true_labels = Variable(torch.ones(size, 1)).float()
    false_labels = Variable(torch.zeros(size, 1)).float()
    loss = bce_loss(logits_real, true_labels) + bce_loss(
        logits_fake, false_labels)
    return loss


def generator_loss(logits_fake):
    # generator loss: fake batch labeled 1 (fool the discriminator)
    size = logits_fake.shape[0]
    true_labels = Variable(torch.ones(size, 1)).float()
    loss = bce_loss(logits_fake, true_labels)
    # NOTE(review): no `return loss` is visible here — the statement may be
    # cut off at the chunk boundary; confirm against the original file.
def train_text2mel(load_trained):
    """Train the Text2Mel network (text -> mel spectrogram) with L1 + BCE
    losses on the mel output and a guided-attention penalty.

    Args:
        load_trained (int): if > 0, resume from the checkpoint saved after
            `load_trained` thousand batches.
    """
    # create log dir
    logdir = os.path.join(Hyper.logdir, "text2mel")
    if not os.path.exists(logdir):
        os.makedirs(logdir)
    if not os.path.exists(os.path.join(logdir, "pkg")):
        os.mkdir(os.path.join(logdir, "pkg"))
    # device ##cuda:0
    device = Hyper.device_text2mel
    graph = Text2Mel().to(device)
    # set the training flag
    graph.train()
    # load data and get batch maker
    names, lengths, texts = load_data()
    batch_maker = BatchMaker(Hyper.batch_size, names, lengths, texts)
    criterion_mels = nn.L1Loss().to(device)
    criterion_bd1 = nn.BCEWithLogitsLoss().to(device)
    criterion_atten = nn.L1Loss().to(device)
    optimizer = torch.optim.Adam(graph.parameters(),
                                 lr=Hyper.adam_alpha,
                                 betas=Hyper.adam_betas,
                                 eps=Hyper.adam_eps)
    lossplot_mels = LogHelper("mel_l1", logdir)
    lossplot_bd1 = LogHelper("mel_BCE", logdir)
    lossplot_atten = LogHelper("atten", logdir)
    # attention-guide weight, decayed each step toward a lower bound
    dynamic_guide = float(Hyper.guide_weight)
    global_step = 0
    # check if load
    if load_trained > 0:
        print("load model trained for {}k batches".format(load_trained))
        global_step = load(
            os.path.join(logdir, "pkg/save_{}k.pkg".format(load_trained)),
            graph, {
                "mels": criterion_mels,
                "bd1": criterion_bd1,
                "atten": criterion_atten
            }, optimizer)
        # fast-forward the guide-weight decay to match the resumed step count
        dynamic_guide *= Hyper.guide_decay**(load_trained * 1000)
    # outer loop repeats the dataset enough times to reach Hyper.num_batches
    for loop_cnt in range(
            int(Hyper.num_batches / batch_maker.num_batches() + 0.5)):
        print("loop", loop_cnt)
        bar = PrettyBar(batch_maker.num_batches())
        bar.set_description("training...")
        loss_str0 = MovingAverage()
        loss_str1 = MovingAverage()
        loss_str2 = MovingAverage()
        for bi in bar:
            batch = batch_maker.next_batch()
            # make batch
            texts = torch.LongTensor(batch["texts"]).to(device)
            # shift mel: prepend a zero frame and drop the last frame, so the
            # network predicts frame t from frames < t (teacher forcing)
            shift_mels = torch.FloatTensor(
                np.concatenate((np.zeros(
                    (batch["mels"].shape[0], batch["mels"].shape[1], 1)),
                                batch["mels"][:, :, :-1]),
                               axis=2)).to(device)
            # ground truth
            mels = torch.FloatTensor(batch["mels"]).to(device)
            # forward
            pred_logits, pred_mels = graph(texts, shift_mels)
            # loss (dead branch kept: per-sample losses truncated to each
            # sample's true mel length)
            if False:
                loss_mels = sum(
                    criterion_mels(
                        torch.narrow(pred_mels[i], -1, 0,
                                     batch["mel_lengths"][i]),
                        torch.narrow(mels[i], -1, 0, batch["mel_lengths"][i]))
                    for i in range(batch_maker.batch_size())) / float(
                        batch_maker.batch_size())
                loss_bd1 = sum(
                    criterion_bd1(
                        torch.narrow(pred_logits[i], -1, 0,
                                     batch["mel_lengths"][i]),
                        torch.narrow(mels[i], -1, 0, batch["mel_lengths"][i]))
                    for i in range(batch_maker.batch_size())) / float(
                        batch_maker.batch_size())
            else:
                loss_mels = criterion_mels(pred_mels, mels)
                loss_bd1 = criterion_bd1(pred_logits, mels)
            # guide attention
            atten_guide = torch.FloatTensor(batch["atten_guides"]).to(device)
            atten_mask = torch.FloatTensor(batch["atten_masks"]).to(device)
            # NOTE(review): the line above is immediately overwritten — the
            # batch-provided attention mask is discarded and masking is
            # effectively disabled. Looks like a debugging leftover; confirm.
            atten_mask = torch.ones_like(graph.attention)
            loss_atten = criterion_atten(
                atten_guide * graph.attention * atten_mask,
                torch.zeros_like(graph.attention)) * dynamic_guide
            loss = loss_mels + loss_bd1 + loss_atten
            # backward
            graph.zero_grad()
            optimizer.zero_grad()
            loss.backward()
            # clip grad
            nn.utils.clip_grad_value_(graph.parameters(), 1)
            optimizer.step()
            # log (moving averages of each loss term)
            loss_str0.add(loss_mels.cpu().data.mean())
            loss_str1.add(loss_bd1.cpu().data.mean())
            loss_str2.add(loss_atten.cpu().data.mean())
            lossplot_mels.add(loss_str0(), global_step)
            lossplot_bd1.add(loss_str1(), global_step)
            lossplot_atten.add(loss_str2(), global_step)
            # adjust dynamic_guide
            # dynamic_guide = float((loss_mels + loss_bd1).cpu().data.mean() / loss_atten.cpu().data.mean())
            dynamic_guide *= Hyper.guide_decay
            if dynamic_guide < Hyper.guide_lowbound:
                dynamic_guide = Hyper.guide_lowbound
            bar.set_description(
                "gs: {}, mels: {}, bd1: {}, atten: {}, scale: {}".format(
                    global_step, loss_str0(), loss_str1(), loss_str2(),
                    "%4f" % dynamic_guide))
            # plot
            # NOTE(review): gs is forced to 0, so each plot overwrites the
            # previous file instead of being tagged with the step — confirm.
            if global_step % 100 == 0:
                gs = 0
                plot_spectrum(mels[0].cpu().data, "mel_true", gs, dir=logdir)
                plot_spectrum(shift_mels[0].cpu().data,
                              "mel_input",
                              gs,
                              dir=logdir)
                plot_spectrum(pred_mels[0].cpu().data,
                              "mel_pred",
                              gs,
                              dir=logdir)
                plot_spectrum(graph.query[0].cpu().data,
                              "query",
                              gs,
                              dir=logdir)
                plot_attention(graph.attention[0].cpu().data,
                               "atten",
                               gs,
                               True,
                               dir=logdir)
                plot_attention((atten_guide)[0].cpu().data,
                               "atten_guide",
                               gs,
                               True,
                               dir=logdir)
            if global_step % 500 == 0:
                lossplot_mels.plot()
                lossplot_bd1.plot()
                lossplot_atten.plot()
            # periodic checkpoint (every 10k steps)
            if global_step % 10000 == 0:
                save(
                    os.path.join(logdir, "pkg/save_{}k.pkg").format(
                        global_step // 1000), graph, {
                            "mels": criterion_mels,
                            "bd1": criterion_bd1,
                            "atten": criterion_atten
                        }, optimizer, global_step, True)
            # increase global step
            global_step += 1
def main(args):
    """End-to-end classification training driver.

    Builds the requested model, optionally loads finetuning/resume weights,
    sets up the loss, data loaders and LR scheduler from `args`, then runs the
    train/validate loop with TensorBoard logging and checkpointing.

    Args:
        args: parsed argparse namespace (model, dataset, paths, hyperparams).
    """
    # -----------------------------------------------------------------------------
    # Create model
    # -----------------------------------------------------------------------------
    if args.model == 'dicenet':
        from model.classification import dicenet as net
        model = net.CNNModel(args)
    elif args.model == 'espnetv2':
        from model.classification import espnetv2 as net
        model = net.EESPNet(args)
    elif args.model == 'shufflenetv2':
        from model.classification import shufflenetv2 as net
        model = net.CNNModel(args)
    else:
        print_error_message('Model {} not yet implemented'.format(args.model))
        exit()

    if args.finetune:
        # load the weights for finetuning
        if os.path.isfile(args.weights_ft):
            pretrained_dict = torch.load(args.weights_ft,
                                         map_location=torch.device('cpu'))
            print_info_message('Loading pretrained basenet model weights')
            model_dict = model.state_dict()
            # only keep pretrained entries whose keys exist in this model
            overlap_dict = {
                k: v
                for k, v in model_dict.items() if k in pretrained_dict
            }
            total_size_overlap = 0
            for k, v in enumerate(overlap_dict):
                total_size_overlap += torch.numel(overlap_dict[v])
            total_size_pretrain = 0
            for k, v in enumerate(pretrained_dict):
                total_size_pretrain += torch.numel(pretrained_dict[v])
            if len(overlap_dict) == 0:
                print_error_message(
                    'No overlaping weights between model file and pretrained weight file. Please check'
                )
            print_info_message('Overlap ratio of weights: {:.2f} %'.format(
                (total_size_overlap * 100.0) / total_size_pretrain))
            model_dict.update(overlap_dict)
            # strict=False: tolerate keys present in the model but absent
            # from the pretrained checkpoint
            model.load_state_dict(model_dict, strict=False)
            print_info_message('Pretrained basenet model loaded!!')
        else:
            print_error_message('Unable to find the weights: {}'.format(
                args.weights_ft))

    # -----------------------------------------------------------------------------
    # Writer for logging
    # -----------------------------------------------------------------------------
    if not os.path.isdir(args.savedir):
        os.makedirs(args.savedir)
    writer = SummaryWriter(log_dir=args.savedir,
                           comment='Training and Validation logs')
    writer.add_graph(model,
                     input_to_model=torch.randn(1, 3, args.inpSize,
                                                args.inpSize))

    # network properties
    num_params = model_parameters(model)
    flops = compute_flops(model)
    print_info_message('FLOPs: {:.2f} million'.format(flops))
    print_info_message('Network Parameters: {:.2f} million'.format(num_params))

    # -----------------------------------------------------------------------------
    # Optimizer
    # -----------------------------------------------------------------------------
    optimizer = torch.optim.SGD(model.parameters(),
                                args.lr,
                                momentum=args.momentum,
                                weight_decay=args.weight_decay)

    # optionally resume from a checkpoint
    best_acc = 0.0
    num_gpus = torch.cuda.device_count()
    device = 'cuda' if num_gpus >= 1 else 'cpu'
    if args.resume:
        if os.path.isfile(args.resume):
            print_info_message("=> loading checkpoint '{}'".format(
                args.resume))
            # BUGFIX: `map_location` is an argument of torch.load, not of
            # nn.Module.load_state_dict (which has no such kwarg and raised
            # TypeError). Map the checkpoint to the active device on load.
            checkpoint = torch.load(args.resume,
                                    map_location=torch.device(device))
            args.start_epoch = checkpoint['epoch']
            best_acc = checkpoint['best_prec1']
            model.load_state_dict(checkpoint['state_dict'])
            optimizer.load_state_dict(checkpoint['optimizer'])
            print_info_message("=> loaded checkpoint '{}' (epoch {})".format(
                args.resume, checkpoint['epoch']))
        else:
            print_warning_message("=> no checkpoint found at '{}'".format(
                args.resume))

    # -----------------------------------------------------------------------------
    # Loss Fn
    # -----------------------------------------------------------------------------
    if args.dataset == 'imagenet':
        criterion = nn.CrossEntropyLoss()
        acc_metric = 'Top-1'
    elif args.dataset == 'coco':
        # multi-label classification -> per-class BCE on logits
        criterion = nn.BCEWithLogitsLoss()
        acc_metric = 'F1'
    else:
        print_error_message('{} dataset not yet supported'.format(
            args.dataset))

    if num_gpus >= 1:
        model = torch.nn.DataParallel(model)
        model = model.cuda()
        criterion = criterion.cuda()
        if torch.backends.cudnn.is_available():
            import torch.backends.cudnn as cudnn
            cudnn.benchmark = True
            cudnn.deterministic = True

    # -----------------------------------------------------------------------------
    # Data Loaders
    # -----------------------------------------------------------------------------
    # Data loading code
    if args.dataset == 'imagenet':
        train_loader, val_loader = img_loader.data_loaders(args)
        # import the loaders too
        from utilities.train_eval_classification import train, validate
    elif args.dataset == 'coco':
        from data_loader.classification.coco import COCOClassification
        train_dataset = COCOClassification(root=args.data,
                                           split='train',
                                           year='2017',
                                           inp_size=args.inpSize,
                                           scale=args.scale,
                                           is_training=True)
        val_dataset = COCOClassification(root=args.data,
                                         split='val',
                                         year='2017',
                                         inp_size=args.inpSize,
                                         is_training=False)
        train_loader = torch.utils.data.DataLoader(
            train_dataset,
            batch_size=args.batch_size,
            shuffle=True,
            pin_memory=True,
            num_workers=args.workers)
        val_loader = torch.utils.data.DataLoader(val_dataset,
                                                 batch_size=args.batch_size,
                                                 shuffle=False,
                                                 pin_memory=True,
                                                 num_workers=args.workers)
        # import the loaders too (multi-label variants)
        from utilities.train_eval_classification import train_multi as train
        from utilities.train_eval_classification import validate_multi as validate
    else:
        print_error_message('{} dataset not yet supported'.format(
            args.dataset))

    # -----------------------------------------------------------------------------
    # LR schedulers
    # -----------------------------------------------------------------------------
    if args.scheduler == 'fixed':
        step_sizes = args.steps
        from utilities.lr_scheduler import FixedMultiStepLR
        lr_scheduler = FixedMultiStepLR(base_lr=args.lr,
                                        steps=step_sizes,
                                        gamma=args.lr_decay)
    elif args.scheduler == 'clr':
        from utilities.lr_scheduler import CyclicLR
        step_sizes = args.steps
        lr_scheduler = CyclicLR(min_lr=args.lr,
                                cycle_len=5,
                                steps=step_sizes,
                                gamma=args.lr_decay)
    elif args.scheduler == 'poly':
        from utilities.lr_scheduler import PolyLR
        lr_scheduler = PolyLR(base_lr=args.lr, max_epochs=args.epochs)
    elif args.scheduler == 'linear':
        from utilities.lr_scheduler import LinearLR
        lr_scheduler = LinearLR(base_lr=args.lr, max_epochs=args.epochs)
    elif args.scheduler == 'hybrid':
        from utilities.lr_scheduler import HybirdLR
        lr_scheduler = HybirdLR(base_lr=args.lr,
                                max_epochs=args.epochs,
                                clr_max=args.clr_max)
    else:
        print_error_message('Scheduler ({}) not yet implemented'.format(
            args.scheduler))
        exit()

    print_info_message(lr_scheduler)

    # set up the epoch variable in case resuming training
    if args.start_epoch != 0:
        for epoch in range(args.start_epoch):
            lr_scheduler.step(epoch)

    # persist the full run configuration next to the checkpoints
    with open(args.savedir + os.sep + 'arguments.json', 'w') as outfile:
        import json
        arg_dict = vars(args)
        arg_dict['model_params'] = '{} '.format(num_params)
        arg_dict['flops'] = '{} '.format(flops)
        json.dump(arg_dict, outfile)

    # -----------------------------------------------------------------------------
    # Training and Val Loop
    # -----------------------------------------------------------------------------
    extra_info_ckpt = args.model + '_' + str(args.s)
    for epoch in range(args.start_epoch, args.epochs):
        lr_log = lr_scheduler.step(epoch)
        # set the optimizer with the learning rate
        # This can be done inside the MyLRScheduler
        for param_group in optimizer.param_groups:
            param_group['lr'] = lr_log
        print_info_message("LR for epoch {} = {:.5f}".format(epoch, lr_log))
        train_acc, train_loss = train(data_loader=train_loader,
                                      model=model,
                                      criteria=criterion,
                                      optimizer=optimizer,
                                      epoch=epoch,
                                      device=device)
        # evaluate on validation set
        val_acc, val_loss = validate(data_loader=val_loader,
                                     model=model,
                                     criteria=criterion,
                                     device=device)

        # remember best prec@1 and save checkpoint
        is_best = val_acc > best_acc
        best_acc = max(val_acc, best_acc)
        # DataParallel wraps the model on GPU, so unwrap via .module there
        weights_dict = model.module.state_dict(
        ) if device == 'cuda' else model.state_dict()
        save_checkpoint(
            {
                'epoch': epoch + 1,
                'state_dict': weights_dict,
                'best_prec1': best_acc,
                'optimizer': optimizer.state_dict(),
            }, is_best, args.savedir, extra_info_ckpt)

        writer.add_scalar('Classification/LR/learning_rate', lr_log, epoch)
        writer.add_scalar('Classification/Loss/Train', train_loss, epoch)
        writer.add_scalar('Classification/Loss/Val', val_loss, epoch)
        writer.add_scalar('Classification/{}/Train'.format(acc_metric),
                          train_acc, epoch)
        writer.add_scalar('Classification/{}/Val'.format(acc_metric), val_acc,
                          epoch)
        writer.add_scalar('Classification/Complexity/Top1_vs_flops', best_acc,
                          round(flops, 2))
        writer.add_scalar('Classification/Complexity/Top1_vs_params',
                          best_acc, round(num_params, 2))

    writer.close()
def compute_loss(p, targets, model):  # predictions, targets, model
    """YOLO-style multi-part loss: GIoU box loss + objectness BCE + class BCE.

    Args:
        p: list of per-layer prediction tensors.
        targets: raw targets, converted via build_targets().
        model: detection model carrying hyp (hyperparameters), gr, nc.

    Returns:
        (total loss, detached tensor of [lbox, lobj, lcls, loss]).
    """
    ft = torch.cuda.FloatTensor if p[0].is_cuda else torch.Tensor
    lcls, lbox, lobj = ft([0]), ft([0]), ft([0])
    tcls, tbox, indices, anchor_vec = build_targets(model, targets)
    h = model.hyp  # hyperparameters
    red = 'mean'  # Loss reduction (sum or mean)

    # Define criteria (pos_weight rebalances positive examples per hyp)
    BCEcls = nn.BCEWithLogitsLoss(pos_weight=ft([h['cls_pw']]), reduction=red)
    BCEobj = nn.BCEWithLogitsLoss(pos_weight=ft([h['obj_pw']]), reduction=red)

    # class label smoothing https://arxiv.org/pdf/1902.04103.pdf eqn 3
    cp, cn = smooth_BCE(eps=0.0)

    # focal loss
    g = h['fl_gamma']  # focal loss gamma
    if g > 0:
        BCEcls, BCEobj = FocalLoss(BCEcls, g), FocalLoss(BCEobj, g)

    # Compute losses
    # NOTE(review): local `np` shadows the conventional numpy alias inside
    # this function — numpy is unusable below this point; consider renaming.
    np, ng = 0, 0  # number grid points, targets
    for i, pi in enumerate(p):  # layer index, layer predictions
        b, a, gj, gi = indices[i]  # image, anchor, gridy, gridx
        tobj = torch.zeros_like(pi[..., 0])  # target obj
        np += tobj.numel()

        # Compute losses
        nb = len(b)
        if nb:  # number of targets
            ng += nb
            ps = pi[b, a, gj, gi]  # prediction subset corresponding to targets
            # ps[:, 2:4] = torch.sigmoid(ps[:, 2:4])  # wh power loss (uncomment)

            # GIoU
            pxy = torch.sigmoid(ps[:, 0:2])  # pxy = pxy * s - (s - 1) / 2,  s = 1.5  (scale_xy)
            # clamp exp() to avoid inf box sizes from unstable early training
            pwh = torch.exp(ps[:, 2:4]).clamp(max=1E3) * anchor_vec[i]
            pbox = torch.cat((pxy, pwh), 1)  # predicted box
            giou = bbox_iou(pbox.t(), tbox[i], x1y1x2y2=False, GIoU=True)  # giou computation
            lbox += (1.0 - giou).sum() if red == 'sum' else (1.0 - giou).mean()  # giou loss
            # objectness target blends 1.0 with the (detached) GIoU score
            tobj[b, a, gj, gi] = (1.0 - model.gr) + model.gr * giou.detach().clamp(0).type(tobj.dtype)  # giou ratio

            if model.nc > 1:  # cls loss (only if multiple classes)
                t = torch.full_like(ps[:, 5:], cn)  # targets
                t[range(nb), tcls[i]] = cp
                lcls += BCEcls(ps[:, 5:], t)  # BCE
                # lcls += CE(ps[:, 5:], tcls[i])  # CE

            # Append targets to text file
            # with open('targets.txt', 'a') as file:
            #     [file.write('%11.5g ' * 4 % tuple(x) + '\n') for x in torch.cat((txy[i], twh[i]), 1)]

        lobj += BCEobj(pi[..., 4], tobj)  # obj loss

    # scale each term by its hyperparameter weight
    lbox *= h['giou']
    lobj *= h['obj']
    lcls *= h['cls']
    if red == 'sum':
        bs = tobj.shape[0]  # batch size
        lobj *= 3 / (6300 * bs) * 2  # 3 / np * 2
        if ng:
            lcls *= 3 / ng / model.nc
            lbox *= 3 / ng

    loss = lbox + lobj + lcls
    return loss, torch.cat((lbox, lobj, lcls, loss)).detach()
def __init__(self, alpha=0.05):
    """Set up the per-element BCE-on-logits criterion and the blur factor.

    Args:
        alpha (float): blur strength applied by the loss later.
    """
    super(BCEBlurWithLogitsLoss, self).__init__()
    self.alpha = alpha
    # reduction='none' keeps per-element losses; must be nn.BCEWithLogitsLoss()
    self.loss_fcn = nn.BCEWithLogitsLoss(reduction='none')
def main(args):
    """Train and evaluate the LSTM wake-word model, checkpointing the best
    test-accuracy epoch.

    Args:
        args: parsed argparse namespace (data paths, batch sizes, lr, epochs,
            hidden_size, save_checkpoint_path, model_name, ...).
    """
    use_cuda = not args.no_cuda and torch.cuda.is_available()
    torch.manual_seed(1)
    device = torch.device('cuda' if use_cuda else 'cpu')

    train_dataset = WakeWordData(data_json=args.train_data_json,
                                 sample_rate=args.sample_rate,
                                 valid=False)
    test_dataset = WakeWordData(data_json=args.test_data_json,
                                sample_rate=args.sample_rate,
                                valid=True)

    kwargs = {
        'num_workers': args.num_workers,
        'pin_memory': True
    } if use_cuda else {}
    train_loader = data.DataLoader(dataset=train_dataset,
                                   batch_size=args.batch_size,
                                   shuffle=True,
                                   collate_fn=collate_fn,
                                   **kwargs)
    test_loader = data.DataLoader(dataset=test_dataset,
                                  batch_size=args.eval_batch_size,
                                  shuffle=True,
                                  collate_fn=collate_fn,
                                  **kwargs)

    model_params = {
        "num_classes": 1,
        "feature_size": 40,
        "hidden_size": args.hidden_size,
        "num_layers": 1,
        "dropout": 0.1,
        "bidirectional": False
    }
    model = LSTMWakeWord(**model_params, device=device)
    model = model.to(device)
    optimizer = optim.AdamW(model.parameters(), lr=args.lr)
    # single-logit binary classification -> BCE on logits
    loss_fn = nn.BCEWithLogitsLoss()
    # halve the LR when the monitored accuracy plateaus
    scheduler = optim.lr_scheduler.ReduceLROnPlateau(optimizer,
                                                     mode='max',
                                                     factor=0.5,
                                                     patience=2)

    best_train_acc, best_train_report = 0, None
    best_test_acc, best_test_report = 0, None
    best_epoch = 0
    for epoch in range(args.epochs):
        print("\nstarting training with learning rate",
              optimizer.param_groups[0]['lr'])
        train_acc, train_report = train(train_loader, model, optimizer,
                                        loss_fn, device, epoch)
        test_acc, test_report = test(test_loader, model, device, epoch)

        # record best train and test
        if train_acc > best_train_acc:
            best_train_acc = train_acc
        if test_acc > best_test_acc:
            best_test_acc = test_acc

        # saves checkpoint if metrics are better than last
        if args.save_checkpoint_path and test_acc >= best_test_acc:
            checkpoint_path = os.path.join(args.save_checkpoint_path,
                                           args.model_name + ".pt")
            print("found best checkpoint. saving model as", checkpoint_path)
            save_checkpoint(
                checkpoint_path,
                model,
                optimizer,
                scheduler,
                model_params,
                notes="train_acc: {}, test_acc: {}, epoch: {}".format(
                    best_train_acc, best_test_acc, epoch),
            )
            best_train_report = train_report
            best_test_report = test_report
            best_epoch = epoch

        table = [["Train ACC", train_acc], ["Test ACC", test_acc],
                 ["Best Train ACC", best_train_acc],
                 ["Best Test ACC", best_test_acc],
                 ["Best Epoch", best_epoch]]
        # print("\ntrain acc:", train_acc, "test acc:", test_acc, "\n",
        #       "best train acc", best_train_acc, "best test acc", best_test_acc)
        print(tabulate(table))
        # NOTE(review): the scheduler monitors train accuracy, not test
        # accuracy — confirm this is intended.
        scheduler.step(train_acc)
    print("Done Training...")
    # NOTE(review): checkpoint_path is only bound inside the save branch;
    # this line raises NameError if args.save_checkpoint_path is falsy or no
    # checkpoint was ever saved — confirm callers always set it.
    print("Best Model Saved to", checkpoint_path)
    print("Best Epoch", best_epoch)
    print("\nTrain Report \n")
    print(best_train_report)
    print("\nTest Report\n")
    print(best_test_report)
def __init__(self):
    """Initialize the wrapper with a logit-space binary cross-entropy."""
    super(BCELoss, self).__init__()
    # sigmoid + BCE fused into one numerically stable op
    self.criterion = nn.BCEWithLogitsLoss()
def main(params: dict):
    """Train a SAKT transformer on the riiid dataset and log metrics to mlflow.

    Relies on module-level flags/globals: is_debug, load_pickle,
    is_make_feature_factory, epochs, dropout, device, output_dir.

    Args:
        params (dict): hyperparameters — expects keys "max_seq", "batch_size",
            "embed_dim", "lr", "num_warmup_steps".
    """
    import mlflow
    print("start params={}".format(params))
    logger = get_logger()
    # df = pd.read_pickle("../input/riiid-test-answer-prediction/split10/train_0.pickle").sort_values(["user_id", "timestamp"]).reset_index(drop=True)
    df = pd.read_pickle(
        "../input/riiid-test-answer-prediction/train_merged.pickle")
    if is_debug:
        df = df.head(30000)
    column_config = {
        ("content_id", "content_type_id"): {"type": "category"},
        "user_answer": {"type": "category"},
        "part": {"type": "category"},
        "prior_question_elapsed_time_bin300": {"type": "category"},
        "duration_previous_content_bin300": {"type": "category"}
    }
    # build engineered features unless a prebuilt pickle will be loaded below
    if not load_pickle or is_debug:
        feature_factory_dict = {"user_id": {}}
        feature_factory_dict["user_id"]["DurationPreviousContent"] = DurationPreviousContent()
        feature_factory_dict["user_id"]["ElapsedTimeBinningEncoder"] = ElapsedTimeBinningEncoder()
        feature_factory_manager = FeatureFactoryManager(
            feature_factory_dict=feature_factory_dict,
            logger=logger,
            split_num=1,
            model_id="all",
            load_feature=not is_debug,
            save_feature=not is_debug)
        print("all_predict")
        df = feature_factory_manager.all_predict(df)
        df = df[["user_id", "content_id", "content_type_id", "part",
                 "user_answer", "answered_correctly",
                 "prior_question_elapsed_time_bin300",
                 "duration_previous_content_bin300"]]
        print(df.head(10))

    print("data preprocess")
    # per-user split: ~1% of users go entirely to validation; for the rest,
    # the last 5% of each user's interactions are held out
    train_idx = []
    val_idx = []
    np.random.seed(0)
    for _, w_df in df[df["content_type_id"] == 0].groupby("user_id"):
        if np.random.random() < 0.01:
            # all val
            val_idx.extend(w_df.index.tolist())
        else:
            train_num = int(len(w_df) * 0.95)
            train_idx.extend(w_df[:train_num].index.tolist())
            val_idx.extend(w_df[train_num:].index.tolist())

    ff_for_transformer = FeatureFactoryForTransformer(
        column_config=column_config,
        dict_path="../feature_engineering/",
        sequence_length=params["max_seq"],
        logger=logger)
    ff_for_transformer.make_dict(df=pd.DataFrame())
    n_skill = len(ff_for_transformer.embbed_dict[("content_id",
                                                  "content_type_id")])

    if not load_pickle or is_debug:
        df["is_val"] = 0
        # NOTE(review): chained indexing (`df["is_val"].loc[...] = 1`)
        # triggers SettingWithCopy behavior in pandas; `df.loc[val_idx,
        # "is_val"] = 1` would be the safe form — confirm before changing.
        df["is_val"].loc[val_idx] = 1
        w_df = df[df["is_val"] == 0]
        # chunk each user's history into max_seq-sized groups, newest last
        w_df["group"] = (
            w_df.groupby("user_id")["user_id"].transform("count") -
            w_df.groupby("user_id").cumcount()) // params["max_seq"]
        w_df["user_id"] = w_df["user_id"].astype(str) + "_" + w_df["group"].astype(str)
        group = ff_for_transformer.all_predict(w_df)
        dataset_train = SAKTDataset(group,
                                    n_skill=n_skill,
                                    max_seq=params["max_seq"])
        del w_df
        gc.collect()

    ff_for_transformer = FeatureFactoryForTransformer(
        column_config=column_config,
        dict_path="../feature_engineering/",
        sequence_length=params["max_seq"],
        logger=logger)
    if not load_pickle or is_debug:
        group = ff_for_transformer.all_predict(df[df["content_type_id"] == 0])
        dataset_val = SAKTDataset(group,
                                  is_test=True,
                                  n_skill=n_skill,
                                  max_seq=params["max_seq"])

    # cache / restore the built datasets
    os.makedirs("../input/feature_engineering/model051", exist_ok=True)
    if not is_debug and not load_pickle:
        with open(f"../input/feature_engineering/model051/train.pickle", "wb") as f:
            pickle.dump(dataset_train, f)
        with open(f"../input/feature_engineering/model051/val.pickle", "wb") as f:
            pickle.dump(dataset_val, f)
    if not is_debug and load_pickle:
        with open(f"../input/feature_engineering/model051/train.pickle", "rb") as f:
            dataset_train = pickle.load(f)
        with open(f"../input/feature_engineering/model051/val.pickle", "rb") as f:
            dataset_val = pickle.load(f)
        print("loaded!")

    dataloader_train = DataLoader(dataset_train,
                                  batch_size=params["batch_size"],
                                  shuffle=True,
                                  num_workers=1)
    dataloader_val = DataLoader(dataset_val,
                                batch_size=params["batch_size"],
                                shuffle=False,
                                num_workers=1)

    model = SAKTModel(n_skill,
                      embed_dim=params["embed_dim"],
                      max_seq=params["max_seq"],
                      dropout=dropout)
    # no weight decay on biases / LayerNorm parameters
    param_optimizer = list(model.named_parameters())
    no_decay = ['bias', 'LayerNorm.bias', 'LayerNorm.weight']
    optimizer_grouped_parameters = [
        {'params': [p for n, p in param_optimizer
                    if not any(nd in n for nd in no_decay)],
         'weight_decay': 0.01},
        {'params': [p for n, p in param_optimizer
                    if any(nd in n for nd in no_decay)],
         'weight_decay': 0.0}
    ]
    optimizer = AdamW(optimizer_grouped_parameters,
                      lr=params["lr"],
                      weight_decay=0.01,
                      )
    num_train_optimization_steps = int(len(dataloader_train) * epochs)
    scheduler = get_linear_schedule_with_warmup(
        optimizer,
        num_warmup_steps=params["num_warmup_steps"],
        num_training_steps=num_train_optimization_steps)
    criterion = nn.BCEWithLogitsLoss()

    model.to(device)
    criterion.to(device)

    for epoch in range(epochs):
        loss, acc, auc, auc_val = train_epoch(model, dataloader_train,
                                              dataloader_val, optimizer,
                                              criterion, scheduler, device)
        print("epoch - {} train_loss - {:.3f} auc - {:.4f} auc-val: {:.4f}".format(epoch, loss, auc, auc_val))

    # final validation pass: collect sigmoid probabilities of the last
    # position of each sequence
    preds = []
    labels = []
    for item in tqdm(dataloader_val):
        x = item["x"].to(device).long()
        target_id = item["target_id"].to(device).long()
        part = item["part"].to(device).long()
        label = item["label"].to(device).float()
        elapsed_time = item["elapsed_time"].to(device).long()
        duration_previous_content = item["duration_previous_content"].to(device).long()
        output = model(x, target_id, part, elapsed_time,
                       duration_previous_content)
        preds.extend(torch.nn.Sigmoid()(output[:, -1]).view(-1).data.cpu().numpy().tolist())
        labels.extend(label[:, -1].view(-1).data.cpu().numpy().tolist())
    auc_transformer = roc_auc_score(labels, preds)
    print("single transformer: {:.4f}".format(auc_transformer))
    df_oof = pd.DataFrame()
    # df_oof["row_id"] = df.loc[val_idx].index
    df_oof["predict"] = preds
    df_oof["target"] = df.loc[val_idx]["answered_correctly"].values
    df_oof.to_csv(f"{output_dir}/transformers1.csv", index=False)
    """
    df_oof2 = pd.read_csv("../output/ex_237/20201213110353/oof_train_0_lgbm.csv")
    df_oof2.columns = ["row_id", "predict_lgbm", "target"]
    df_oof2 = pd.merge(df_oof, df_oof2, how="inner")
    auc_lgbm = roc_auc_score(df_oof2["target"].values, df_oof2["predict_lgbm"].values)
    print("lgbm: {:.4f}".format(auc_lgbm))
    print("ensemble")
    max_auc = 0
    max_nn_ratio = 0
    for r in np.arange(0, 1.05, 0.05):
        auc = roc_auc_score(df_oof2["target"].values, df_oof2["predict_lgbm"].values*(1-r) + df_oof2["predict"].values*r)
        print("[nn_ratio: {:.2f}] AUC: {:.4f}".format(r, auc))
        if max_auc < auc:
            max_auc = auc
            max_nn_ratio = r
    print(len(df_oof2))
    """
    if not is_debug:
        mlflow.start_run(experiment_id=10,
                         run_name=os.path.basename(__file__))
        for key, value in params.items():
            mlflow.log_param(key, value)
        mlflow.log_metric("auc_val", auc_transformer)
        mlflow.end_run()
    torch.save(model.state_dict(), f"{output_dir}/transformers.pth")
    del model
    with open(f"{output_dir}/transformer_param.json", "w") as f:
        json.dump(params, f)
    if is_make_feature_factory:
        # feature factory: refit the full pipeline on the complete dataset
        # and serialize it for inference
        feature_factory_dict = {"user_id": {}}
        feature_factory_dict["user_id"]["DurationPreviousContent"] = DurationPreviousContent(is_partial_fit=True)
        feature_factory_dict["user_id"]["ElapsedTimeBinningEncoder"] = ElapsedTimeBinningEncoder()
        feature_factory_manager = FeatureFactoryManager(
            feature_factory_dict=feature_factory_dict,
            logger=logger,
            split_num=1,
            model_id="all",
            load_feature=not is_debug,
            save_feature=not is_debug)
        ff_for_transformer = FeatureFactoryForTransformer(
            column_config=column_config,
            dict_path="../feature_engineering/",
            sequence_length=params["max_seq"],
            logger=logger)
        df = pd.read_pickle(
            "../input/riiid-test-answer-prediction/train_merged.pickle")
        if is_debug:
            df = df.head(10000)
        df = df.sort_values(["user_id", "timestamp"]).reset_index(drop=True)
        feature_factory_manager.fit(df)
        df = feature_factory_manager.all_predict(df)
        # drop unpicklable logger references before serializing
        for dicts in feature_factory_manager.feature_factory_dict.values():
            for factory in dicts.values():
                factory.logger = None
        feature_factory_manager.logger = None
        with open(f"{output_dir}/feature_factory_manager.pickle", "wb") as f:
            pickle.dump(feature_factory_manager, f)
        ff_for_transformer.fit(df)
        ff_for_transformer.logger = None
        with open(f"{output_dir}/feature_factory_manager_for_transformer.pickle", "wb") as f:
            pickle.dump(ff_for_transformer, f)
def train(net, dataloader, device, config):
    """Train a MinkowskiEngine generative network for ``config.max_iter`` iterations.

    Args:
        net: model called as ``net(sin, target_key)`` returning
            ``(out_cls, targets, sout)`` — per-depth classification outputs,
            per-depth targets, and the final sparse output.
        dataloader: yields dicts with ``'labels'`` (class indices) and
            ``'xyzs'`` (target point coordinates).
        device: torch device to run on.
        config: namespace with ``lr``, ``momentum``, ``weight_decay``,
            ``batch_size``, ``resolution``, ``max_iter``, ``stat_freq``,
            ``val_freq`` and ``weights`` (checkpoint path) attributes.
    """
    # NOTE(review): the input channel count is the dataset *length* — presumably
    # a one-hot "which sample" conditioning vector; confirm against the dataset.
    in_nchannel = len(dataloader.dataset)

    optimizer = optim.SGD(
        net.parameters(),
        lr=config.lr,
        momentum=config.momentum,
        weight_decay=config.weight_decay)
    scheduler = optim.lr_scheduler.ExponentialLR(optimizer, 0.95)
    crit = nn.BCEWithLogitsLoss()

    net.train()
    train_iter = iter(dataloader)
    # val_iter = iter(val_dataloader)
    logging.info(f'LR: {scheduler.get_lr()}')
    for i in range(config.max_iter):
        s = time()
        # BUG FIX: `train_iter.next()` is Python 2 only — use next(). Also
        # restart the iterator when the dataloader is exhausted so max_iter
        # may exceed one epoch instead of dying with StopIteration.
        try:
            data_dict = next(train_iter)
        except StopIteration:
            train_iter = iter(dataloader)
            data_dict = next(train_iter)
        d = time() - s

        optimizer.zero_grad()

        # One coordinate per batch element (batch index in column 0), at the
        # coarsest tensor stride; features are a one-hot label encoding.
        init_coords = torch.zeros((config.batch_size, 4), dtype=torch.int)
        init_coords[:, 0] = torch.arange(config.batch_size)

        in_feat = torch.zeros((config.batch_size, in_nchannel))
        in_feat[torch.arange(config.batch_size), data_dict['labels']] = 1

        sin = ME.SparseTensor(
            feats=in_feat,
            coords=init_coords,
            allow_duplicate_coords=True,  # for classification, it doesn't matter
            tensor_stride=config.resolution,
        ).to(device)

        # Generate target sparse tensor
        cm = sin.coords_man
        target_key = cm.create_coords_key(
            ME.utils.batched_coordinates(data_dict['xyzs']),
            force_creation=True,
            allow_duplicate_coords=True)

        # Generate from a dense tensor
        out_cls, targets, sout = net(sin, target_key)
        num_layers, loss = len(out_cls), 0
        # Average the per-depth BCE losses (unused `losses` list removed).
        for out_cl, target in zip(out_cls, targets):
            loss += crit(out_cl.F.squeeze(),
                         target.type(out_cl.F.dtype).to(device)) / num_layers

        loss.backward()
        optimizer.step()
        t = time() - s

        if i % config.stat_freq == 0:
            logging.info(
                f'Iter: {i}, Loss: {loss.item():.3e}, Depths: {len(out_cls)} Data Loading Time: {d:.3e}, Tot Time: {t:.3e}'
            )

        if i % config.val_freq == 0 and i > 0:
            torch.save(
                {
                    'state_dict': net.state_dict(),
                    'optimizer': optimizer.state_dict(),
                    'scheduler': scheduler.state_dict(),
                    'curr_iter': i,
                }, config.weights)

            # LR decays once per validation interval, matching the original.
            scheduler.step()
            logging.info(f'LR: {scheduler.get_lr()}')

            net.train()
def __init__(self, args):
    """Build the VAE (encoder/decoder) plus latent-space discriminators,
    their optimizers/schedulers, and bookkeeping state from CLI ``args``.
    """
    # Directory where reconstructions are written; create it on first run.
    self.reconstruction_path = args.reconstruction_path
    if not os.path.exists(self.reconstruction_path):
        os.makedirs(self.reconstruction_path)

    # Training hyper-parameters copied from args.
    self.beta = args.beta
    self.train_batch_size = args.train_batch_size
    self.test_batch_size = args.test_batch_size
    self.epochs = args.epochs
    self.early_stop = args.early_stop
    self.early_stop_observation_period = args.early_stop_observation_period
    self.use_scheduler = False
    self.print_training = args.print_training

    self.z_dim = args.z_dim
    # self.disc_input_dim = int(self.z_dim / 2)
    # Latent layout is hard-coded: dims [0,22) = class part, [22,44) =
    # membership part, [44,64) = style part.
    # NOTE(review): assumes z_dim == 64 — confirm against the arg parser.
    self.disc_input_dim = 22
    self.class_idx = range(0, 22)
    self.membership_idx = range(22, 44)
    self.style_idx = range(44, 64)
    # self.class_idx = range(0, self.disc_input_dim)
    # self.membership_idx = range(self.disc_input_dim, self.z_dim)

    # Encoder/decoder architecture depends on the dataset family:
    # image datasets get conv nets, tabular datasets get FC nets.
    self.nets = dict()
    if args.dataset in ['MNIST', 'Fashion-MNIST', 'CIFAR-10', 'SVHN']:
        if args.dataset in ['MNIST', 'Fashion-MNIST']:
            self.num_channels = 1
        elif args.dataset in ['CIFAR-10', 'SVHN']:
            self.num_channels = 3
        self.nets['encoder'] = module.VAEConvEncoder(self.z_dim, self.num_channels)
        self.nets['decoder'] = module.VAEConvDecoder(self.z_dim, self.num_channels)
    elif args.dataset in ['adult', 'location']:
        self.nets['encoder'] = module.VAEFCEncoder(args.encoder_input_dim, self.z_dim)
        self.nets['decoder'] = module.FCDecoder(args.encoder_input_dim, self.z_dim)

    # Discriminators over the full latent (fz), class part (cz) and
    # membership part (mz), for both class and membership prediction.
    self.discs = {
        'class_fz': module.ClassDiscriminator(self.z_dim, args.class_num),
        'class_cz': module.ClassDiscriminator(self.disc_input_dim, args.class_num),
        'class_mz': module.ClassDiscriminator(self.disc_input_dim, args.class_num),
        'membership_fz': module.MembershipDiscriminator(self.z_dim, 1),
        'membership_cz': module.MembershipDiscriminator(self.disc_input_dim, 1),
        'membership_mz': module.MembershipDiscriminator(self.disc_input_dim, 1),
    }

    # Losses: reconstruction (dataset-dependent), classification, membership.
    self.recon_loss = self.get_loss_function()
    self.class_loss = nn.CrossEntropyLoss(reduction='sum')
    self.membership_loss = nn.BCEWithLogitsLoss(reduction='sum')

    # optimizer — one Adam per network and per discriminator.
    self.optimizer = dict()
    for net_type in self.nets:
        self.optimizer[net_type] = optim.Adam(self.nets[net_type].parameters(),
                                              lr=args.recon_lr, betas=(0.5, 0.999))
    self.discriminator_lr = args.disc_lr
    for disc_type in self.discs:
        self.optimizer[disc_type] = optim.Adam(self.discs[disc_type].parameters(),
                                               lr=self.discriminator_lr, betas=(0.5, 0.999))

    # Relative weights of the loss terms in the total objective.
    self.weights = {
        'recon': args.recon_weight,
        'class_cz': args.class_cz_weight,
        'class_mz': args.class_mz_weight,
        'membership_cz': args.membership_cz_weight,
        'membership_mz': args.membership_mz_weight,
    }

    self.scheduler_enc = StepLR(self.optimizer['encoder'], step_size=50, gamma=0.1)
    self.scheduler_dec = StepLR(self.optimizer['decoder'], step_size=50, gamma=0.1)

    # to device
    self.device = torch.device("cuda:{}".format(args.gpu_id))
    for net_type in self.nets:
        self.nets[net_type] = self.nets[net_type].to(self.device)
    for disc_type in self.discs:
        self.discs[disc_type] = self.discs[disc_type].to(self.device)

    # Disentanglement training is active iff any adversarial weight is non-zero.
    self.disentangle = (self.weights['class_cz'] + self.weights['class_mz']
                        + self.weights['membership_cz'] + self.weights['membership_mz'] > 0)

    # Bookkeeping for resuming, early stopping and accuracy tracking.
    self.start_epoch = 0
    self.best_valid_loss = float("inf")
    # self.train_loss = 0
    self.early_stop_count = 0

    self.acc_dict = {
        'class_fz': 0, 'class_cz': 0, 'class_mz': 0,
        'membership_fz': 0, 'membership_cz': 0, 'membership_mz': 0,
    }
    self.best_acc_dict = {}

    if 'cuda' in str(self.device):
        cudnn.benchmark = True

    if args.resume:
        print('==> Resuming from checkpoint..')
        try:
            self.load()
        except FileNotFoundError:
            print('There is no pre-trained model; Train model from scratch')
# In[ ]:

# Resume from a previous checkpoint when one exists; otherwise start fresh.
# BUG FIX: the bare `except:` swallowed *every* exception, including
# KeyboardInterrupt/SystemExit and genuine bugs; catch only the failures
# checkpoint loading can actually produce (missing/corrupt file, mismatched
# state dict).
# NOTE(review): torch.load unpickles arbitrary data — only load trusted files.
try:
    model_ft.load_state_dict(torch.load('models/{}.pt'.format(filename)))  # load weights if already completed
except (OSError, RuntimeError):
    print('Starting from scratch..')
model_ft = model_ft.to(device)

# Label matrix: columns 5+ hold per-class labels; map "uncertain" (-1) to 0.
df = dframe['train'].iloc[:, 5:].copy()
df = df.replace(-1, 0)
# Per-class positive frequency, used as BCE pos_weight.
# NOTE(review): BCEWithLogitsLoss's pos_weight is conventionally the
# negative/positive count ratio, not the positive rate — verify intended.
pos_weight = torch.Tensor([df[cl].sum() / df.shape[0] for cl in class_names])

if u_approach == 'ignore':  # Use masked binary cross-entropy for first run
    criterion = nn.BCEWithLogitsLoss(reduction='none', pos_weight=pos_weight).to(device)
else:
    criterion = nn.BCEWithLogitsLoss(reduction='sum', pos_weight=pos_weight).to(device)

# Observe that all parameters are being optimized
optimizer_ft = optim.SGD(model_ft.parameters(), lr=0.001, momentum=0.9)
# # Pick AdamW optimizer - https://github.com/mpyrozhok/adamwr
# optimizer = adamw.AdamW(model.parameters(), lr=1e-3, weight_decay=1e-5)

# Decay LR by a factor of 0.1 every 7 epochs
exp_lr_scheduler = lr_scheduler.StepLR(optimizer_ft, step_size=7, gamma=0.1)

# epoch_size = np.round(num_samples / batch_size) # number of training examples/batch size
# #Cosine annealing: adjusting on batch update rather than epoch - https://github.com/mpyrozhok/adamwr
# scheduler = cosine_scheduler.CosineLRWithRestarts(optimizer, batch_size, epoch_size, restart_period=5, t_mult=1.2)
def train_model(self, generator, discriminator, dataloader, num_epochs=300):
    """Adversarially train a GAN generator/discriminator pair.

    Args:
        generator: maps noise of shape (B, self.latent_dim, 1, 1) to images.
        discriminator: maps images to a real/fake logit per sample.
        dataloader: yields (images, labels) mini-batches.
        num_epochs: number of passes over the dataloader.

    Returns:
        The (generator, discriminator) pair after training.
    """
    # Use the GPU when one is available.
    device = torch.device("cuda:0" if torch.cuda.is_available() else "cpu")
    print("Device:", device)

    # Optimizers — TTUR-style: discriminator LR 4x the generator LR.
    g_lr, d_lr = 0.0001, 0.0004
    beta1, beta2 = 0.0, 0.9
    g_optimizer = torch.optim.Adam(generator.parameters(), g_lr, [beta1, beta2])
    d_optimizer = torch.optim.Adam(discriminator.parameters(), d_lr, [beta1, beta2])

    # Loss function (binary real/fake with logits).
    criterion = nn.BCEWithLogitsLoss(reduction='mean')

    # Hard-coded default; overwritten by the actual batch size below.
    mini_batch_size = 64

    # Move networks to the device and switch to training mode.
    generator.to(device)
    discriminator.to(device)
    generator.train()
    discriminator.train()

    # Speeds things up when the network input size is fixed.
    torch.backends.cudnn.benchmark = True

    # Number of training images and configured batch size.
    num_train_imgs = len(dataloader.dataset)
    batch_size = dataloader.batch_size

    # Iteration counter and (unused here) per-epoch log accumulator.
    iteration = 1
    logs = []

    for epoch in range(num_epochs):
        # Record epoch start time and reset running losses.
        t_epoch_start = time.time()
        epoch_g_loss = 0.0
        epoch_d_loss = 0.0

        print('-------------')
        print('Epoch {}/{}'.format(epoch, num_epochs))
        print('-------------')
        print('(train)')

        for imges, y in dataloader:
            # --------------------
            # 1. Train the discriminator
            # --------------------
            # Skip size-1 mini-batches: they break batch normalization.
            if imges.size()[0] == 1:
                continue

            imges = imges.to(device)

            # Real/fake target labels; the last batch of an epoch may be
            # smaller than the configured batch size.
            # BUG FIX: fill value must be float — BCEWithLogitsLoss rejects
            # integer (Long) targets on current PyTorch.
            mini_batch_size = imges.size()[0]
            label_real = torch.full((mini_batch_size,), 1.0).to(device)
            label_fake = torch.full((mini_batch_size,), 0.0).to(device)

            # Score real images.
            d_out_real = discriminator(imges)

            # Generate and score fake images.
            input_z = torch.randn(mini_batch_size, self.latent_dim).to(device)
            input_z = input_z.view(input_z.size(0), input_z.size(1), 1, 1)
            fake_images = generator(input_z)
            d_out_fake = discriminator(fake_images)

            # Discriminator loss: real scored as real + fake scored as fake.
            d_loss_real = criterion(d_out_real.view(-1), label_real)
            d_loss_fake = criterion(d_out_fake.view(-1), label_fake)
            d_loss = d_loss_real + d_loss_fake

            # Backpropagation (discriminator step only).
            g_optimizer.zero_grad()
            d_optimizer.zero_grad()
            d_loss.backward()
            d_optimizer.step()

            # --------------------
            # 2. Train the generator
            # --------------------
            # BUG FIX: the original used an undefined name `z_dim` here
            # (NameError); the noise dimension is self.latent_dim, as in
            # the discriminator step above.
            input_z = torch.randn(mini_batch_size, self.latent_dim).to(device)
            input_z = input_z.view(input_z.size(0), input_z.size(1), 1, 1)
            fake_images = generator(input_z)
            d_out_fake = discriminator(fake_images)

            # Generator loss: fakes should be scored as real.
            g_loss = criterion(d_out_fake.view(-1), label_real)

            # Backpropagation (generator step only).
            g_optimizer.zero_grad()
            d_optimizer.zero_grad()
            g_loss.backward()
            g_optimizer.step()

            # --------------------
            # 3. Record
            # --------------------
            epoch_d_loss += d_loss.item()
            epoch_g_loss += g_loss.item()
            iteration += 1

        # Per-epoch losses and timing.
        # NOTE(review): losses are divided by batch_size, not by the number
        # of batches — kept as in the original reporting convention.
        t_epoch_finish = time.time()
        print('-------------')
        print(
            'epoch {} || Epoch_D_Loss:{:.4f} ||Epoch_G_Loss:{:.4f}'.format(
                epoch, epoch_d_loss / batch_size, epoch_g_loss / batch_size))
        print('timer: {:.4f} sec.'.format(t_epoch_finish - t_epoch_start))
        t_epoch_start = time.time()

    return generator, discriminator