def train_step(model_with_loss: Loss, optimizer: Optimizer, x_batch, y_batch) -> Dict[str, tf.Tensor]:
    # Run the forward pass inside the tape so gradients can be taken w.r.t. the model parameters.
    with tf.GradientTape(persistent=True) as tape:  # type: ignore
        _ = model_with_loss(x_batch, y_batch)
        loss_value = model_with_loss.metric_values["_loss"]
        # print('watched variables')
        # print(tape.watched_variables())
        loss_mean = tf.reduce_mean(loss_value)
        # encoded_mean = tf.reduce_mean(tape.watched_variables()[-1])

    # Collect the metrics and the per-example classification error for this batch.
    metric_values = model_with_loss.metric_values
    error = tf.cast(tf.argmax(metric_values["outputs"], axis=1) != tf.argmax(y_batch, axis=1), tf.float32)
    metric_values["error"] = error
    model_with_loss.reset()

    # grads = tape.gradient(tf.reduce_mean(loss_value), self.encoder_decoder.parameters())
    # print('Encoder Decoder Params')
    # print(self.encoder_decoder.parameters())
    # print(tape.gradient(tf.reduce_mean(loss_value), tape.watched_variables()))
    # print('dLoss / dWatched')
    # print(tape.gradient(loss_mean, tape.watched_variables()))
    # for var in tape.watched_variables():
    #     print(f'd{var.name}/dWatched')
    #     print(tape.gradient(var, tape.watched_variables()))
    # grads = None
    # print(grads)

    grads = tape.gradient(loss_mean, model_with_loss.parameters())
    optimizer.apply_gradients(zip(grads, model_with_loss.parameters()))
    del tape  # the tape is persistent, so release it explicitly
    return metric_values
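# Illustrative only: a minimal outer loop that could drive train_step above, assuming
# `model_with_loss` (a Loss instance), `optimizer`, and a tf.data.Dataset named `train_ds`
# yielding (x_batch, y_batch) pairs already exist; these names are placeholders, not part
# of the original code.
def fit_one_epoch(model_with_loss: Loss, optimizer: Optimizer, train_ds) -> None:
    for step, (x_batch, y_batch) in enumerate(train_ds):
        metrics = train_step(model_with_loss, optimizer, x_batch, y_batch)
        if step % 100 == 0:
            # report the mean classification error returned by train_step
            print(f"step {step}: error={tf.reduce_mean(metrics['error']).numpy():.4f}")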
def main(args):
    # get datasets
    source_train, source_test = chainer.datasets.get_svhn()
    target_train, target_test = chainer.datasets.get_mnist(ndim=3, rgb_format=True)
    source = source_train, source_test

    # resize mnist to 32x32
    def transform(in_data):
        img, label = in_data
        img = resize(img, (32, 32))
        return img, label

    target_train = TransformDataset(target_train, transform)
    target_test = TransformDataset(target_test, transform)
    target = target_train, target_test

    # load pretrained source, or perform pretraining
    pretrained = os.path.join(args.output, args.pretrained_source)
    if not os.path.isfile(pretrained):
        source_cnn = pretrain_source_cnn(source, args)
    else:
        source_cnn = Loss(num_classes=10)
        serializers.load_npz(pretrained, source_cnn)

    # how well does this perform on target domain?
    test_pretrained_on_target(source_cnn, target, args)

    # initialize the target cnn (do not use source_cnn.copy)
    target_cnn = Loss(num_classes=10)
    # copy parameters from source cnn to target cnn
    target_cnn.copyparams(source_cnn)

    train_target_cnn(source, target, source_cnn, target_cnn, args)
def __init__(self, train_dl, val_dl):
    self.device = ('cuda:0' if torch.cuda.is_available() else 'cpu')
    self.train_dl = train_dl
    self.val_dl = val_dl
    self.loss = Loss()
    self.net = UNet(1).to(self.device)
    self.net.apply(Model._init_weights)
    self.criterion = self.loss.BCEDiceLoss
    self.optim = None
    self.scheduler = None
    self._init_optim(LR, BETAS)
    self.cycles = 0
    self.hist = {'train': [], 'val': [], 'loss': []}
    utils.create_dir('./pt')
    utils.log_data_to_txt('train_log', f'\nUsing device {self.device}')
def pretrain_source_cnn(data, args, epochs=1000):
    print(":: pretraining source encoder")
    source_cnn = Loss(num_classes=10)
    if args.device >= 0:
        source_cnn.to_gpu()

    optimizer = chainer.optimizers.Adam()
    optimizer.setup(source_cnn)

    train_iterator, test_iterator = data2iterator(data, args.batchsize, multiprocess=False)
    # train_iterator = chainer.iterators.MultiprocessIterator(data, args.batchsize, n_processes=4)

    updater = chainer.training.StandardUpdater(iterator=train_iterator, optimizer=optimizer, device=args.device)
    trainer = chainer.training.Trainer(updater, (epochs, 'epoch'), out=args.output)

    # learning rate decay
    # trainer.extend(extensions.ExponentialShift("alpha", rate=0.9, init=args.learning_rate, target=args.learning_rate*10E-5))
    trainer.extend(extensions.Evaluator(test_iterator, source_cnn, device=args.device))
    # trainer.extend(extensions.snapshot(filename='snapshot_epoch_{.updater.epoch}'), trigger=(10, "epoch"))
    trainer.extend(extensions.snapshot_object(optimizer.target, "source_model_epoch_{.updater.epoch}"),
                   trigger=(epochs, "epoch"))

    trainer.extend(extensions.ProgressBar(update_interval=10))
    trainer.extend(extensions.LogReport(trigger=(1, "epoch")))
    trainer.extend(extensions.PrintReport([
        'epoch', 'main/loss', 'validation/main/loss',
        'main/accuracy', 'validation/main/accuracy', 'elapsed_time'
    ]))

    trainer.run()
    return source_cnn
def main():
    parser = argparse.ArgumentParser()
    arg = parser.add_argument
    arg('--size', type=str, default='512X512', help='Input size, for example 512X512. Must be multiples of 2')
    arg('--num_workers', type=int, default=4, help='Enter the number of workers')
    arg('--batch_size', type=int, default=16, help='Enter batch size')
    arg('--n_epochs', type=int, default=52, help='Enter number of epochs to run training for')
    arg('--report_each', type=int, default=10, help='Enter the span of last readings of running loss to report')
    arg('--lr', type=float, default=0.0001, help='Enter learning rate')
    arg('--fold_no', type=int, default=0, help='Enter the fold no')
    arg('--to_augment', type=bool, default=False, help='Augmentation flag')
    args = parser.parse_args()

    local_data_path = Path('.').absolute()
    local_data_path.mkdir(exist_ok=True)

    # mention the fold path here
    train_path = local_data_path / '..' / 'input' / 'train'
    a = CoinDataset(train_path, to_augment=args.to_augment)
    n_classes = get_n_classes(train_path)
    print(n_classes)

    def make_loader(ds_root: Path, to_augment=False, shuffle=False):
        return DataLoader(
            dataset=CoinDataset(ds_root, to_augment=to_augment),
            shuffle=shuffle,
            num_workers=args.num_workers,
            batch_size=args.batch_size,
            pin_memory=True
        )

    # creating the dataloaders (mention the fold path here)
    train_path = local_data_path / '..' / 'input' / 'train'
    train_loader = make_loader(train_path, to_augment=args.to_augment, shuffle=True)
    validation_path = local_data_path / '..' / 'input' / 'validation'
    validation_loader = make_loader(validation_path, to_augment=args.to_augment, shuffle=True)
    test_path = local_data_path / '..' / 'input' / 'test'
    test_loader = make_loader(test_path, to_augment=args.to_augment, shuffle=True)

    # define model, and handle gpus
    print('device is', device)
    model_name = 'resnet50'
    model = get_model(model_name=model_name, pretrained_status=True, n_classes=n_classes).to(device)
    if device.type == "cuda":
        # model = nn.DataParallel(model, device_ids=device_list)
        print('cuda devices', device_list)

    # define optimizer and learning rate
    init_optimizer = lambda lr: Adam(model.parameters(), lr=lr)
    lr = args.lr
    optimizer = init_optimizer(lr)
    criterion = Loss()
    # print(model)
    report_each = args.report_each

    # model save implementation
    model_path = local_data_path / 'model_checkpoints'
    model_path.mkdir(exist_ok=True)
    model_path = local_data_path / 'model_checkpoints' / '{model_name}_{fold}.pt'.format(model_name=model_name, fold=args.fold_no)
    best_model_path = local_data_path / 'best_model_checkpoints'
    best_model_path.mkdir(exist_ok=True)
    best_model_path = local_data_path / 'best_model_checkpoints' / '{model_name}_{fold}.pt'.format(model_name=model_name, fold=args.fold_no)

    # updated fold checkpoint here
    save = lambda ep: torch.save({
        'model': model.state_dict(),
        'epoch': ep,
        'best_valid_loss': best_valid_loss
    }, str(model_path))

    best_valid_loss = float('inf')
    valid_losses = []
    test_losses = []
    valid_accuracy = []
    test_accuracy = []

    for epoch in range(0, args.n_epochs):
        model.train()
        tq = tqdm(total=(len(train_loader) * args.batch_size))
        tq.set_description('Epoch {}, lr {}'.format(epoch, lr))
        losses = []
        for i, (inputs, _, _, targets) in enumerate(train_loader):
            inputs = inputs.to(device)
            outputs = model(inputs)
            _, preds = torch.max(outputs, 1)
            targets = targets.to(device) - 1
            loss = criterion(outputs, targets)
            optimizer.zero_grad()
            batch_size = inputs.size(0)
            tq.update(batch_size)
            losses.append(loss.item())
            mean_loss = np.mean(losses[-report_each:])
            tq.set_postfix(loss='{:.5f}'.format(mean_loss))
            (batch_size * loss).backward()
            optimizer.step()
        tq.close()
        save(epoch)

        valid_metrics = validation(model, criterion, validation_loader)
        valid_loss = valid_metrics['valid_loss']
        valid_losses.append(valid_loss)
        test_metrics = test(model, criterion, test_loader)
        test_loss = test_metrics['test_loss']
        test_losses.append(test_loss)
        if valid_loss < best_valid_loss:
            print('found better val loss model')
            best_valid_loss = valid_loss
            shutil.copy(str(model_path), str(best_model_path))
if cfg.data == 'Structured3D':
    dataset = Structured3D(cfg.Dataset.Structured3D, 'test')
elif cfg.data == 'NYU303':
    dataset = NYU303(cfg.Dataset.NYU303, 'test', exam=cfg.exam)
elif cfg.data == 'CUSTOM':
    dataset = CustomDataset(cfg.Dataset.CUSTOM, 'test')
else:
    raise NotImplementedError
dataloader = torch.utils.data.DataLoader(dataset, num_workers=cfg.num_workers)

# create network
model = Detector()
# compute loss
criterion = Loss(cfg.Weights)

# set data parallel
# if cfg.num_gpus > 1 and torch.cuda.is_available():
#     model = torch.nn.DataParallel(model)

# reload weights
if cfg.pretrained:
    state_dict = torch.load(cfg.pretrained, map_location=torch.device('cpu'))
    model.load_state_dict(state_dict)

# set device
device = torch.device('cuda' if torch.cuda.is_available() else 'cpu')
model.to(device)
criterion.to(device)
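# Illustrative only: if a checkpoint was saved from a DataParallel-wrapped model (see the
# commented-out block above), its keys carry a 'module.' prefix; a common workaround is to
# strip that prefix before calling load_state_dict. This helper is an assumption and is not
# part of the original code.
def strip_data_parallel_prefix(state_dict):
    # remove the 'module.' prefix that torch.nn.DataParallel adds to parameter names
    return {k[len('module.'):] if k.startswith('module.') else k: v for k, v in state_dict.items()}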
def main():
    yolov5l = YOLOv5_large((608, 608, 3), 80)
    loss1 = Loss((608, 608, 3), 0, 80)
    loss2 = Loss((608, 608, 3), 1, 80)
    loss3 = Loss((608, 608, 3), 2, 80)
    if exists('./checkpoints/ckpt'):
        yolov5l.load_weights('./checkpoints/ckpt/variables/variables')
    optimizer = tf.keras.optimizers.Adam(1e-4)
    yolov5l.compile(optimizer=optimizer,
                    loss={
                        'output1': lambda labels, outputs: loss1([outputs, labels]),
                        'output2': lambda labels, outputs: loss2([outputs, labels]),
                        'output3': lambda labels, outputs: loss3([outputs, labels])
                    })

    class SummaryCallback(tf.keras.callbacks.Callback):

        def __init__(self, eval_freq=100):
            super().__init__()
            self.eval_freq = eval_freq
            testset = tf.data.TFRecordDataset(testset_filenames).map(parse_function).repeat(-1)
            self.iter = iter(testset)
            self.train_loss = tf.keras.metrics.Mean(name='train loss', dtype=tf.float32)
            self.log = tf.summary.create_file_writer('./checkpoints')

        def on_batch_begin(self, batch, logs=None):
            pass

        def on_batch_end(self, batch, logs=None):
            self.train_loss.update_state(logs['loss'])
            if batch % self.eval_freq == 0:
                image, bbox, labels = next(self.iter)
                image = image.numpy().astype('uint8')
                predictor = Predictor(yolov5l=yolov5l)
                boundings = predictor.predict(image)
                color_map = dict()
                for bounding in boundings:
                    if bounding[5].numpy().astype('int32') not in color_map:
                        color_map[bounding[5].numpy().astype('int32')] = tuple(
                            np.random.randint(low=0, high=256, size=(3,)).tolist())
                    clr = color_map[bounding[5].numpy().astype('int32')]
                    cv2.rectangle(image, tuple(bounding[0:2].numpy().astype('int32')),
                                  tuple(bounding[2:4].numpy().astype('int32')), clr, 1)
                    cv2.putText(image,
                                predictor.getClsName(bounding[5].numpy().astype('int32')),
                                tuple(bounding[0:2].numpy().astype('int32')),
                                cv2.FONT_HERSHEY_PLAIN, 1, clr, 2)
                image = tf.expand_dims(image, axis=0)
                with self.log.as_default():
                    tf.summary.scalar('train loss', self.train_loss.result(), step=optimizer.iterations)
                    tf.summary.image('detect', image[..., ::-1], step=optimizer.iterations)
                self.train_loss.reset_states()

        def on_epoch_begin(self, epoch, logs=None):
            pass

        def on_epoch_end(self, batch, logs=None):
            pass

    # load downloaded dataset
    trainset_filenames = [join('trainset', filename) for filename in listdir('trainset')]
    testset_filenames = [join('testset', filename) for filename in listdir('testset')]
    trainset = tf.data.TFRecordDataset(trainset_filenames).map(
        parse_function_generator(80)).shuffle(batch_size).batch(batch_size).prefetch(tf.data.experimental.AUTOTUNE)
    testset = tf.data.TFRecordDataset(testset_filenames).map(
        parse_function_generator(80)).shuffle(batch_size).batch(batch_size).prefetch(tf.data.experimental.AUTOTUNE)
    callbacks = [
        tf.keras.callbacks.TensorBoard(log_dir='./checkpoints'),
        tf.keras.callbacks.ModelCheckpoint(filepath='./checkpoints/ckpt', save_freq=10000),
        SummaryCallback(),
    ]
    yolov5l.fit(trainset, epochs=100, validation_data=testset, callbacks=callbacks)
    yolov5l.save('yolov5l.h5')
class Model:

    def __init__(self, train_dl, val_dl):
        self.device = ('cuda:0' if torch.cuda.is_available() else 'cpu')
        self.train_dl = train_dl
        self.val_dl = val_dl
        self.loss = Loss()
        self.net = UNet(1).to(self.device)
        self.net.apply(Model._init_weights)
        self.criterion = self.loss.BCEDiceLoss
        self.optim = None
        self.scheduler = None
        self._init_optim(LR, BETAS)
        self.cycles = 0
        self.hist = {'train': [], 'val': [], 'loss': []}
        utils.create_dir('./pt')
        utils.log_data_to_txt('train_log', f'\nUsing device {self.device}')

    def _init_optim(self, lr, betas):
        self.optim = optim.Adam(utils.filter_gradients(self.net), lr=lr)
        self.scheduler = optim.lr_scheduler.StepLR(self.optim, step_size=100, gamma=.75)

    def _save_models(self):
        utils.save_state_dict(self.net, 'model', './pt')
        utils.save_state_dict(self.optim, 'optim', './pt')
        utils.save_state_dict(self.scheduler, 'scheduler', './pt')

    def train(self, epochs):
        self.net.train()
        for epoch in range(epochs):
            self.net.train()
            for idx, data in enumerate(self.train_dl):
                batch_time = time.time()
                self.cycles += 1
                print(self.cycles)
                image = data['MRI'].to(self.device)
                target = data['Mask'].to(self.device)

                output = self.net(image)
                # threshold the predictions at 0.5 to compute the training F1 score
                output_rounded = np.copy(output.data.cpu().numpy())
                output_rounded[np.nonzero(output_rounded < 0.5)] = 0.
                output_rounded[np.nonzero(output_rounded >= 0.5)] = 1.
                train_f1 = self.loss.F1_metric(output_rounded, target.data.cpu().numpy())

                loss = self.criterion(output, target)
                self.hist['train'].append(train_f1)
                self.hist['loss'].append(loss.item())

                self.optim.zero_grad()
                loss.backward()
                self.optim.step()
                self.scheduler.step()

                if self.cycles % 100 == 0:
                    self._save_models()
                    val_f1 = self.evaluate()
                    utils.log_data_to_txt(
                        'train_log',
                        f'\nEpoch: {epoch}/{epochs} - Batch: {idx * BATCH_SIZE}/{len(self.train_dl.dataset)}'
                        f'\nLoss: {loss.mean().item():.4f}'
                        f'\nTrain F1: {train_f1:.4f} - Val F1: {val_f1}'
                        f'\nTime taken: {time.time() - batch_time:.4f}s')

    def evaluate(self):
        # model.eval()
        loss_v = 0
        with torch.no_grad():
            for idx, data in enumerate(self.val_dl):
                image, target = data['MRI'], data['Mask']
                image = image.to(self.device)
                target = target.to(self.device)
                outputs = self.net(image)
                out_thresh = np.copy(outputs.data.cpu().numpy())
                out_thresh[np.nonzero(out_thresh < .3)] = 0.0
                out_thresh[np.nonzero(out_thresh >= .3)] = 1.0
                loss = self.loss.F1_metric(out_thresh, target.data.cpu().numpy())
                loss_v += loss
        # average over the number of validation batches
        return loss_v / len(self.val_dl)

    @classmethod
    def _init_weights(cls, layer: nn.Module):
        name = layer.__class__.__name__
        if name.find('Conv') != -1 and name.find('2d') != -1:
            nn.init.normal_(layer.weight.data, .0, 2e-2)
        if name.find('BatchNorm') != -1:
            nn.init.normal_(layer.weight.data, 1.0, 2e-2)
            nn.init.constant_(layer.bias.data, .0)
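# Illustrative only: one way the Model wrapper above might be driven, assuming `train_dl`
# and `val_dl` are torch DataLoaders yielding {'MRI': ..., 'Mask': ...} batches as the class
# expects; the loader construction itself is not shown in the original code.
def run_training(train_dl, val_dl, epochs=50):
    trainer = Model(train_dl, val_dl)
    trainer.train(epochs)
    # return the final validation F1 averaged over the validation batches
    return trainer.evaluate()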
def train_cptn():
    detector = TextDetector()
    loss = Loss()
    optimizer = tf.keras.optimizers.Adam(
        tf.keras.optimizers.schedules.ExponentialDecay(1e-5, decay_steps=30000, decay_rate=0.9))

    # load dataset
    trainset = tf.data.TFRecordDataset(join('datasets', 'trainset.tfrecord')).repeat(-1).map(
        ctpn_parse_function).batch(1).prefetch(tf.data.experimental.AUTOTUNE)

    # restore from existing checkpoint
    if not exists('checkpoints'):
        mkdir('checkpoints')
    checkpoint = tf.train.Checkpoint(model=detector.ctpn, optimizer=optimizer)
    checkpoint.restore(tf.train.latest_checkpoint('checkpoints'))

    # create log
    log = tf.summary.create_file_writer('checkpoints')

    # train model
    avg_loss = tf.keras.metrics.Mean(name="loss", dtype=tf.float32)
    for image, labels in trainset:
        if labels.shape[1] == 0:
            print("skip sample without labels")
            continue
        with tf.GradientTape() as tape:
            bbox_pred = detector.ctpn(image)
            l = loss([bbox_pred, labels])
        avg_loss.update_state(l)

        # write log
        if tf.equal(optimizer.iterations % 100, 0):
            with log.as_default():
                tf.summary.scalar('loss', avg_loss.result(), step=optimizer.iterations)
                # draw text detection results
                text_lines, _, _ = detector.detect(image, False)
                image = image[0, ...].numpy().astype('uint8')
                for text_line in text_lines:
                    cv2.rectangle(image, (int(text_line[0]), int(text_line[1])),
                                  (int(text_line[2]), int(text_line[3])), (0, 255, 0), 2)
                image = tf.expand_dims(image, axis=0)
                tf.summary.image('text lines', image, step=optimizer.iterations)
            print('Step #%d Loss: %.6f lr: %.6f' %
                  (optimizer.iterations, avg_loss.result(),
                   optimizer._hyper['learning_rate'](optimizer.iterations)))
            if avg_loss.result() < 0.01:
                break
            avg_loss.reset_states()

        grads = tape.gradient(l, detector.ctpn.trainable_variables)
        if tf.reduce_any([tf.reduce_any(tf.math.is_nan(grad)) for grad in grads]) == True:
            print("NaN was detected in gradients, skip gradient apply!")
            continue
        optimizer.apply_gradients(zip(grads, detector.ctpn.trainable_variables))

        # save model
        if tf.equal(optimizer.iterations % 2000, 0):
            checkpoint.save(join('checkpoints', 'ckpt'))

    # save the network structure with weights
    if not exists('model'):
        mkdir('model')
    detector.ctpn.save(join('model', 'ctpn.h5'))
def main():
    gpus = tf.config.experimental.list_physical_devices('GPU')
    [tf.config.experimental.set_memory_growth(gpu, True) for gpu in gpus]
    # yolov5l model
    yolov5l = YOLOv5_large((608, 608, 3), 80)
    loss1 = Loss((608, 608, 3), 0, 80)
    loss2 = Loss((608, 608, 3), 1, 80)
    loss3 = Loss((608, 608, 3), 2, 80)
    # optimizer = tf.keras.optimizers.Adam(tf.keras.optimizers.schedules.ExponentialDecay(1e-5, decay_steps=110000, decay_rate=0.99))
    optimizer = tf.keras.optimizers.Adam(1e-5)
    checkpoint = tf.train.Checkpoint(model=yolov5l, optimizer=optimizer)
    train_loss = tf.keras.metrics.Mean(name='train loss', dtype=tf.float32)
    test_loss = tf.keras.metrics.Mean(name='test loss', dtype=tf.float32)

    # load downloaded dataset
    trainset_filenames = [join('trainset', filename) for filename in listdir('trainset')]
    testset_filenames = [join('testset', filename) for filename in listdir('testset')]
    trainset = tf.data.TFRecordDataset(trainset_filenames).map(
        parse_function_generator(80)).repeat(-1).shuffle(batch_size).batch(batch_size).prefetch(tf.data.experimental.AUTOTUNE)
    testset = tf.data.TFRecordDataset(testset_filenames).map(
        parse_function_generator(80)).repeat(-1).shuffle(batch_size).batch(batch_size).prefetch(tf.data.experimental.AUTOTUNE)
    validationset = tf.data.TFRecordDataset(testset_filenames).map(parse_function).repeat(-1)
    trainset_iter = iter(trainset)
    testset_iter = iter(testset)
    validationset_iter = iter(validationset)

    # restore from existing checkpoint
    if not exists('checkpoints'):
        mkdir('checkpoints')
    checkpoint.restore(tf.train.latest_checkpoint('checkpoints'))

    # tensorboard summary
    log = tf.summary.create_file_writer('checkpoints')

    # train model
    while True:
        images, labels = next(trainset_iter)
        labels1, labels2, labels3 = labels
        with tf.GradientTape() as tape:
            outputs1, outputs2, outputs3 = yolov5l(images)
            loss = loss1([outputs1, labels1]) + loss2([outputs2, labels2]) + loss3([outputs3, labels3])
        # check whether the loss numeric is correct
        if tf.math.reduce_any(tf.math.is_nan(loss)) == True:
            print("NaN was detected in loss, skip the following steps!")
            continue
        grads = tape.gradient(loss, yolov5l.trainable_variables)
        # check whether the grad numerics are correct
        if tf.math.reduce_any([tf.math.reduce_any(tf.math.is_nan(grad)) for grad in grads]) == True:
            print("NaN was detected in gradients, skip gradient apply!")
            continue
        optimizer.apply_gradients(zip(grads, yolov5l.trainable_variables))
        train_loss.update_state(loss)

        # save model
        if tf.equal(optimizer.iterations % 10000, 0):
            # save checkpoint every 10000 steps
            checkpoint.save(join('checkpoints', 'ckpt'))
            yolov5l.save('yolov5l.h5')
        if tf.equal(optimizer.iterations % 100, 0):
            # evaluate
            for i in range(10):
                images, labels = next(testset_iter)  # images.shape = (b, h, w, 3)
                outputs1, outputs2, outputs3 = yolov5l(images)
                labels1, labels2, labels3 = labels
                loss = loss1([outputs1, labels1]) + loss2([outputs2, labels2]) + loss3([outputs3, labels3])
                test_loss.update_state(loss)
            # visualize
            image, bbox, labels = next(validationset_iter)  # image.shape = (h, w, 3)
            image = image.numpy().astype('uint8')
            predictor = Predictor(yolov5l=yolov5l)
            boundings = predictor.predict(image)
            color_map = dict()
            for bounding in boundings:
                if bounding[5].numpy().astype('int32') not in color_map:
                    color_map[bounding[5].numpy().astype('int32')] = tuple(
                        np.random.randint(low=0, high=256, size=(3,)).tolist())
                clr = color_map[bounding[5].numpy().astype('int32')]
                cv2.rectangle(image, tuple(bounding[0:2].numpy().astype('int32')),
                              tuple(bounding[2:4].numpy().astype('int32')), clr, 1)
                cv2.putText(image,
                            predictor.getClsName(bounding[5].numpy().astype('int32')),
                            tuple(bounding[0:2].numpy().astype('int32')),
                            cv2.FONT_HERSHEY_PLAIN, 1, clr, 2)
            image = tf.expand_dims(image, axis=0)
            # write log
            with log.as_default():
                tf.summary.scalar('train loss', train_loss.result(), step=optimizer.iterations)
                tf.summary.scalar('test loss', test_loss.result(), step=optimizer.iterations)
                tf.summary.image('detect', image[..., ::-1], step=optimizer.iterations)
            print('Step #%d Train Loss: %.6f Test Loss: %.6f' %
                  (optimizer.iterations, train_loss.result(), test_loss.result()))
            # break condition
            # if train_loss.result() < 0.001: break
            # reset
            train_loss.reset_states()
            test_loss.reset_states()
    yolov5l.save('yolov5l.h5')
def main():
    parser = argparse.ArgumentParser()
    arg = parser.add_argument
    arg('--size', type=str, default='512X512', help='Input size, for example 512X512. Must be multiples of 2')
    arg('--num_workers', type=int, default=4, help='Enter the number of workers')
    arg('--batch_size', type=int, default=16, help='Enter batch size')
    arg('--n_epochs', type=int, default=52, help='Enter number of epochs to run training for')
    arg('--report_each', type=int, default=10, help='Enter the span of last readings of running loss to report')
    arg('--lr', type=float, default=0.0001, help='Enter learning rate')
    arg('--fold_no', type=int, default=0, help='Enter the fold no')
    arg('--to_augment', type=bool, default=False, help='Augmentation flag')
    arg('--model_name', type=str, default='resnet18', help='enter model name')
    args = parser.parse_args()

    local_data_path = Path('.').absolute()
    local_data_path.mkdir(exist_ok=True)

    # mention the fold path here
    train_path = local_data_path / '..' / 'input' / 'train'
    a = CoinDataset(train_path, to_augment=args.to_augment)
    n_classes = get_n_classes(train_path)
    print(n_classes)

    def make_loader(ds_root: Path, to_augment=False, shuffle=False):
        return DataLoader(dataset=CoinDataset(ds_root, to_augment=to_augment),
                          shuffle=shuffle,
                          num_workers=args.num_workers,
                          batch_size=args.batch_size,
                          pin_memory=True)

    # creating the dataloaders (mention the fold path here)
    train_path = local_data_path / '..' / 'input' / 'train'
    train_loader = make_loader(train_path, to_augment=args.to_augment, shuffle=True)
    validation_path = local_data_path / '..' / 'input' / 'validation'
    validation_loader = make_loader(validation_path, to_augment=args.to_augment, shuffle=True)
    test_path = local_data_path / '..' / 'input' / 'test'
    test_loader = make_loader(test_path, to_augment=args.to_augment, shuffle=True)

    # define model, and handle gpus
    print('device is', device)
    model_name = args.model_name
    model = get_model(model_name=model_name, pretrained_status=True, n_classes=n_classes).to(device)
    if device.type == "cuda":
        # model = nn.DataParallel(model, device_ids=device_list)
        print('cuda devices', device_list)

    # define optimizer and learning rate
    init_optimizer = lambda lr: Adam(model.parameters(), lr=lr)
    lr = args.lr
    optimizer = init_optimizer(lr)
    criterion = Loss()
    # print(model)
    report_each = args.report_each

    # model save implementation
    model_path = local_data_path / 'model_checkpoints'
    model_path.mkdir(exist_ok=True)
    model_path = local_data_path / 'model_checkpoints' / '{model_name}_{fold}.pt'.format(
        model_name=model_name, fold=args.fold_no)
    best_model_path = local_data_path / 'best_model_checkpoints'
    best_model_path.mkdir(exist_ok=True)
    best_model_path = local_data_path / 'best_model_checkpoints' / '{model_name}_{fold}.pt'.format(
        model_name=model_name, fold=args.fold_no)

    # updated fold checkpoint here
    save = lambda ep: torch.save(
        {
            'model': model.state_dict(),
            'epoch': ep,
            'best_valid_loss': best_valid_loss
        }, str(model_path))

    best_valid_loss = float('inf')
    valid_losses = []
    test_losses = []
    valid_accuracy = []
    test_accuracy = []

    # define the dataframe for dumping outputs
    n_trials = 1
    modes = ['train', 'validation', 'test']
    metrics = ['loss', 'accuracy']

    def get_column_name(trial, mode, metric):
        return 'trial= ' + str(trial) + ' mode=' + mode + ' metric=' + metric

    col_list = []
    for trial in range(n_trials):
        for mode in modes:
            for metric in metrics:
                col_list.append(get_column_name(trial, mode, metric))

    # create the dataframe before saving the results
    df = pd.DataFrame(0.0, index=np.arange(args.n_epochs), columns=col_list)

    # training loop begins; start the trial of the experiment here
    for trial in range(n_trials):
        for epoch in range(0, args.n_epochs):
            model.train()
            tq = tqdm(total=(len(train_loader) * args.batch_size))
            tq.set_description('Epoch {}, lr {}'.format(epoch, lr))
            losses = []
            for i, (inputs, _, _, targets) in enumerate(train_loader):
                inputs = inputs.to(device)
                outputs = model(inputs)
                _, preds = torch.max(outputs, 1)
                targets = targets.to(device) - 1
                loss = criterion(outputs, targets)
                optimizer.zero_grad()
                batch_size = inputs.size(0)
                tq.update(batch_size)
                losses.append(loss.item())
                mean_loss = np.mean(losses[-report_each:])
                tq.set_postfix(loss='{:.5f}'.format(mean_loss))
                (batch_size * loss).backward()
                optimizer.step()
            tq.close()
            save(epoch)

            # epoch completes here, dump the data to the dataframe
            train_loss = np.mean(losses)
            valid_metrics = validation(model, criterion, validation_loader)
            valid_loss = valid_metrics['valid_loss']
            valid_accuracy = valid_metrics['valid_accuracy']
            valid_losses.append(valid_loss)
            test_metrics = test(model, criterion, test_loader)
            test_loss = test_metrics['test_loss']
            test_accuracy = test_metrics['test_accuracy']
            test_losses.append(test_loss)

            # update the data in the data frame (test accuracy not needed, leave it)
            df.loc[epoch, get_column_name(trial, 'train', 'loss')] = train_loss
            df.loc[epoch, get_column_name(trial, 'validation', 'loss')] = valid_loss
            df.loc[epoch, get_column_name(trial, 'validation', 'accuracy')] = valid_accuracy
            df.loc[epoch, get_column_name(trial, 'test', 'loss')] = test_loss
            df.loc[epoch, get_column_name(trial, 'test', 'accuracy')] = test_accuracy
            # just check if one field was updated correctly
            print('just checking if one field updated',
                  df.loc[epoch, get_column_name(trial, 'test', 'accuracy')])
            # save the incomplete dataframe so far
            df.to_csv('results.csv')

            if valid_loss < best_valid_loss:
                print('found better val loss model')
                best_valid_loss = valid_loss
                shutil.copy(str(model_path), str(best_model_path))

    # save the complete dataframe here
    df.to_csv('results.csv')
def main():
    gpus = tf.config.experimental.list_physical_devices('GPU')
    [tf.config.experimental.set_memory_growth(gpu, True) for gpu in gpus]
    # yolov3 model
    yolov3 = YOLOv3((416, 416, 3), 80)
    yolov3_loss = Loss((416, 416, 3), 80)
    # optimizer = tf.keras.optimizers.Adam(tf.keras.optimizers.schedules.ExponentialDecay(1e-5, decay_steps=110000, decay_rate=0.99))
    optimizer = tf.keras.optimizers.Adam(1e-5)
    checkpoint = tf.train.Checkpoint(model=yolov3, optimizer=optimizer)
    train_loss = tf.keras.metrics.Mean(name='train loss', dtype=tf.float32)
    test_loss = tf.keras.metrics.Mean(name='test loss', dtype=tf.float32)

    # load downloaded dataset
    trainset_filenames = [join('trainset', filename) for filename in listdir('trainset')]
    testset_filenames = [join('testset', filename) for filename in listdir('testset')]
    trainset = tf.data.TFRecordDataset(trainset_filenames).map(
        parse_function_generator(80)).repeat(-1).shuffle(batch_size).batch(batch_size).prefetch(tf.data.experimental.AUTOTUNE)
    testset = tf.data.TFRecordDataset(testset_filenames).map(
        parse_function_generator(80)).repeat(-1).shuffle(batch_size).batch(batch_size).prefetch(tf.data.experimental.AUTOTUNE)
    validationset = tf.data.TFRecordDataset(testset_filenames).map(parse_function).repeat(-1)
    trainset_iter = iter(trainset)
    testset_iter = iter(testset)
    validationset_iter = iter(validationset)

    # restore from existing checkpoint
    if not exists('checkpoints'):
        mkdir('checkpoints')
    checkpoint.restore(tf.train.latest_checkpoint('checkpoints'))

    # tensorboard summary
    log = tf.summary.create_file_writer('checkpoints')

    # train model
    while True:
        images, labels = next(trainset_iter)
        with tf.GradientTape() as tape:
            outputs = yolov3(images)
            loss = yolov3_loss([*outputs, *labels])
        # check whether the loss numeric is correct
        if tf.math.reduce_any(tf.math.is_nan(loss)) == True:
            print("NaN was detected in loss, skip the following steps!")
            continue
        grads = tape.gradient(loss, yolov3.trainable_variables)
        # check whether the grad numerics are correct
        if tf.math.reduce_any([tf.math.reduce_any(tf.math.is_nan(grad)) for grad in grads]) == True:
            print("NaN was detected in gradients, skip gradient apply!")
            continue
        optimizer.apply_gradients(zip(grads, yolov3.trainable_variables))
        train_loss.update_state(loss)

        # save model
        if tf.equal(optimizer.iterations % 10000, 0):
            # save checkpoint every 10000 steps
            checkpoint.save(join('checkpoints', 'ckpt'))
            yolov3.save('yolov3.h5')
        if tf.equal(optimizer.iterations % 100, 0):
            # evaluate
            for i in range(10):
                images, labels = next(testset_iter)  # images.shape = (b, h, w, 3)
                outputs = yolov3(images)
                loss = yolov3_loss([*outputs, *labels])
                test_loss.update_state(loss)
            # visualize
            image, bbox, labels = next(validationset_iter)  # image.shape = (h, w, 3)
            image = image.numpy().astype('uint8')
            predictor = Predictor(yolov3=yolov3)
            boundings = predictor.predict(image)
            color_map = dict()
            for bounding in boundings:
                if bounding[5].numpy().astype('int32') in color_map:
                    clr = color_map[bounding[5].numpy().astype('int32')]
                else:
                    color_map[bounding[5].numpy().astype('int32')] = tuple(
                        np.random.randint(low=0, high=256, size=(3,)).tolist())
                    clr = color_map[bounding[5].numpy().astype('int32')]
                cv2.rectangle(image, tuple(bounding[0:2].numpy().astype('int32')),
                              tuple(bounding[2:4].numpy().astype('int32')), clr, 5)
            image = tf.expand_dims(image, axis=0)
            # write log
            with log.as_default():
                tf.summary.scalar('train loss', train_loss.result(), step=optimizer.iterations)
                tf.summary.scalar('test loss', test_loss.result(), step=optimizer.iterations)
                tf.summary.image('detect', image[..., ::-1], step=optimizer.iterations)
            print('Step #%d Train Loss: %.6f Test Loss: %.6f' %
                  (optimizer.iterations, train_loss.result(), test_loss.result()))
            # break condition
            # if train_loss.result() < 0.001: break
            # reset
            train_loss.reset_states()
            test_loss.reset_states()
    yolov3.save('yolov3.h5')
def loss(labels, outputs):
    return Loss((416, 416, 3), 80)([outputs[0], outputs[1], outputs[2], labels[0], labels[1], labels[2]])
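# Illustrative only: calling the wrapper above inside a manual training step. It assumes
# `yolov3` returns the three YOLO feature maps and `labels` is the matching three-element
# label structure, as in the training loops earlier in this file; the function name and
# its parameters are placeholders, not part of the original code.
def train_step_with_wrapper(yolov3, optimizer, images, labels):
    with tf.GradientTape() as tape:
        outputs = yolov3(images)
        total_loss = loss(labels, outputs)
    grads = tape.gradient(total_loss, yolov3.trainable_variables)
    optimizer.apply_gradients(zip(grads, yolov3.trainable_variables))
    return total_loss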