def main(self):
    """Run the full training loop for ``self.num_epochs`` epochs.

    Trains ``self.model`` on ``self.train_data_loader``, optionally stepping
    ``self.lr_scheduler`` and validating once per epoch when
    ``self.val_dataset`` is set.

    Returns:
        tuple: ``(train_losses, precisions, validation_losses)`` — the
        per-iteration training losses and the per-epoch validation metrics
        (the latter two are empty when there is no validation dataset).

    Raises:
        ValueError: if the training loss becomes NaN (a gradient-flow plot is
        produced first to aid debugging).
    """
    print('Starting Training...')
    loss_hist = Averager()
    # Renamed from `loss`: the old name was shadowed by the generator
    # variable in the sum() below.
    train_losses = []
    validation_losses = []
    precisions = []
    itr = 1
    for epoch in range(self.num_epochs):
        self.model.train()
        loss_hist.reset()
        for images, targets in self.train_data_loader:
            images = [image.to(self.device) for image in images]
            # NOTE(review): .long() casts EVERY target tensor to int64,
            # including any float box coordinates — confirm this is intended.
            targets = [{k: v.long().to(self.device) for k, v in t.items()}
                       for t in targets]
            loss_dict = self.model(images, targets)
            losses = sum(loss for loss in loss_dict.values())
            loss_value = losses.item()
            # Check for NaN BEFORE backward/step so NaN gradients are never
            # applied (the original checked after optimizer.step()).
            if math.isnan(loss_value):
                plot_grad_flow(self.model.named_parameters())
                raise ValueError('Loss is nan')
            loss_hist.send(loss_value)
            train_losses.append(loss_value)
            self.optimizer.zero_grad()
            losses.backward()
            self.optimizer.step()
            # (Removed a stray per-batch self.validate() call whose result was
            # discarded — it ran a full validation pass on every batch.)
            if itr % 50 == 0:
                print(f"Iteration #{itr} loss: {loss_value}")
            itr += 1
        # update the learning rate
        if self.lr_scheduler is not None:
            self.lr_scheduler.step()
        if self.val_dataset:
            precision, validation_loss = self.validate()
            precisions.append(precision)
            validation_losses.append(validation_loss)
            print(f'Mean Precision for Validation Data: {precision}')
            print(f'Validation Loss: {validation_loss}')
        print(f"Epoch #{epoch} loss: {loss_hist.value}")
    print('Finished!')
    return train_losses, precisions, validation_losses
def validate(val_loader, model, device):
    """Compute the running-average loss over a validation loader.

    Args:
        val_loader: iterable yielding ``(images, targets, image_ids)`` batches.
        model: detection model returning a dict of loss tensors.
        device: torch device to move batches onto.

    Returns:
        float: the mean validation loss accumulated by ``Averager``.
    """
    model.eval()
    # NOTE(review): torchvision detection models return predictions (not a
    # loss dict) in eval() mode — confirm this model still yields losses here.
    itr = 1
    loss_hist = Averager()
    loss_hist.reset()
    # No gradients are needed during validation.
    with torch.no_grad():
        for images, targets, image_ids in val_loader:
            images = [image.to(device) for image in images]
            targets = [{k: v.to(device) for k, v in t.items()} for t in targets]
            loss_dict = model(images, targets)
            # BUG FIX: iterate the dict's VALUES — the original iterated the
            # keys (strings), so sum(...) could never produce a tensor with
            # .item().
            losses = sum(loss for loss in loss_dict.values())
            loss_value = losses.item()
            loss_hist.send(loss_value)
            if itr % 20 == 0:
                print(f"Iteration: {itr} loss: {loss_hist.value}")
            itr += 1
    return loss_hist.value
def train_fn(start_epochs, epochs, train_loader, val_loader, model, device,
             optimizer, best_loss, checkpoint_path, best_model_path,
             lr_scheduler=None):
    """Train ``model`` for epochs ``start_epochs`` through ``epochs``
    (inclusive), saving a checkpoint after every epoch.

    Validation is currently disabled (its calls are commented out upstream),
    so only training losses are tracked.

    Returns:
        tuple: ``(model, train_loss)`` — the trained model and the list of
        per-epoch mean training losses.
    """
    print("Starting Training")
    model.train()
    averager = Averager()
    step = 1
    train_loss = []
    validation_loss = []  # kept for parity; unused while validation is off
    for epoch in range(start_epochs, epochs + 1):
        averager.reset()
        for images, targets, image_ids in train_loader:
            # Move the batch onto the target device.
            images = [img.to(device) for img in images]
            targets = [{key: val.to(device) for key, val in tgt.items()}
                       for tgt in targets]
            batch_losses = model(images, targets)
            total = sum(component for component in batch_losses.values())
            current = total.item()
            averager.send(current)
            # Standard optimization step.
            optimizer.zero_grad()
            total.backward()
            optimizer.step()
            if step % 10 == 0:
                print(f"Iteration #{step} loss: {current}")
            step += 1
        # update the learning rate
        if lr_scheduler is not None:
            lr_scheduler.step()
        # val_loss = validate(val_loader, model, device)
        print(
            f"Epoch #{epoch} Train loss: {averager.value}, Validation Loss : Commented"
        )
        train_loss.append(averager.value)
        # validation_loss.append(val_loss)
        checkpoint = {
            'epoch': epoch + 1,
            # 'best_loss': val_loss,
            'state_dict': model.state_dict(),
            'optimizer': optimizer.state_dict(),
        }
        # Always saved as a regular (non-best) checkpoint while validation
        # is disabled.
        save_ckp(checkpoint, False, checkpoint_path, best_model_path)
        # if best_loss <= val_loss:
        #     print('Validation loss decreased ({:.6f} --> {:.6f}). Saving model ...'.format(best_loss, val_loss))
        #     save_ckp(checkpoint, True, checkpoint_path, best_model_path)
        #     best_loss = val_loss
    return model, train_loss  # , validation_loss
# Precompiled once: matches each numeric token inside a bbox string like
# "[10.0, 20.5, 30.0, 40.0]".
_BBOX_RE = re.compile(r"([0-9]+[.]?[0-9]*)")


def expand_bbox_columns(df):
    """Replace the string ``bbox`` column with float ``x, y, w, h`` columns.

    Each bbox string is parsed exactly once (the original code ran the same
    regex four times per row). The ``bbox`` column is dropped in place.

    Args:
        df: DataFrame with a ``bbox`` column of strings containing at least
            four numeric tokens each.

    Returns:
        The same DataFrame, modified in place, for chaining.
    """
    coords = df['bbox'].apply(
        lambda s: [float(v) for v in _BBOX_RE.findall(s)[:4]])
    df[['x', 'y', 'w', 'h']] = pd.DataFrame(coords.tolist(), index=df.index)
    df.drop(['bbox'], inplace=True, axis=1)
    return df


def run(train_path):
    """End-to-end training entry point.

    Loads the annotation CSV, expands bbox strings into coordinate columns,
    splits images into train/validation sets, builds datasets and loaders,
    and trains the detection model for ``config.EPOCHS`` epochs on CUDA.
    """
    df = pd.read_csv(train_path)
    print(df.shape)
    df = expand_bbox_columns(df)
    # split the data: hold out the last 665 image ids for validation
    image_ids = df['image_id'].unique()
    valid_ids = image_ids[-665:]
    train_ids = image_ids[:-665]
    train_df = df[df['image_id'].isin(train_ids)]
    valid_df = df[df['image_id'].isin(valid_ids)]
    train_dataset = WheatDatasetTrain(train_df, config.DIR_TRAIN,
                                      get_train_transform())
    valid_dataset = WheatDatasetTrain(valid_df, config.DIR_TRAIN,
                                      get_valid_transform())
    # NOTE(review): shuffle=False on the TRAINING loader is unusual —
    # training batches are normally shuffled; confirm this is intentional.
    train_data_loader = torch.utils.data.DataLoader(
        train_dataset,
        batch_size=config.TRAIN_BS,
        shuffle=False,
        num_workers=config.NUM_WORKERS,
        collate_fn=collate_fn
    )
    valid_data_loader = torch.utils.data.DataLoader(
        valid_dataset,
        batch_size=config.VALID_BS,
        shuffle=False,
        num_workers=config.NUM_WORKERS,
        collate_fn=collate_fn
    )
    # Device used is cuda
    device = torch.device('cuda')
    model = obtain_model()
    model.to(device)
    params = [p for p in model.parameters() if p.requires_grad]
    optimizer = torch.optim.SGD(params, lr=0.005, momentum=0.9,
                                weight_decay=0.0005)
    # lr_scheduler = torch.optim.lr_scheduler.StepLR(optimizer, step_size=3, gamma=0.1)
    lr_scheduler = None
    loss_hist = Averager()
    itr = 1
    for epoch in range(config.EPOCHS):
        loss_hist.reset()
        for images, targets, image_ids in train_data_loader:
            images = [image.to(device) for image in images]
            targets = [{k: v.to(device) for k, v in t.items()}
                       for t in targets]
            loss_dict = model(images, targets)
            losses = sum(loss for loss in loss_dict.values())
            loss_value = losses.item()
            loss_hist.send(loss_value)
            optimizer.zero_grad()
            losses.backward()
            optimizer.step()
            if itr % 50 == 0:
                print(f"Iteration #{itr} loss: {loss_value}")
            itr += 1
        # update the learning rate
        if lr_scheduler is not None:
            lr_scheduler.step()
        print(f"Epoch #{epoch} loss: {loss_hist.value}")