Exemplo n.º 1
0
    def get_callbacks(self):
        """
        Build the Keras callbacks used for training.

        Returns a tuple ``(model_filename, callbacks)``: the checkpoint path
        (``self.inference_filename`` joined onto ``self.output_path``) and
        the list ``[ModelCheckpoint, TensorBoard]``.

        Side effects: prints the checkpoint path and passes the TensorBoard
        log directory to ``foundations.set_tensorboard_logdir``.
        """

        checkpoint_path = os.path.join(self.output_path,
                                       self.inference_filename)
        print("Writing model to '{}'".format(checkpoint_path))

        # Only overwrite the saved model when the validation loss improves.
        checkpoint_cb = K.callbacks.ModelCheckpoint(
            checkpoint_path,
            verbose=1,
            monitor="val_loss",
            save_best_only=True)

        # NOTE(review): the "inter" slot receives num_threads and the "intra"
        # slot receives num_inter_threads; the labels may be swapped. Confirm
        # before changing, because it alters the log directory name.
        run_dir = "unet_block{}_inter{}_intra{}".format(
            self.blocktime, self.num_threads, self.num_inter_threads)

        # The log directory name records whether use_upsampling is set.
        if self.use_upsampling:
            subdir_template = "keras_tensorboard_upsampling_batch{}/{}"
        else:
            subdir_template = "keras_tensorboard_transposed_batch{}/{}"
        log_dir = os.path.join(
            self.output_path,
            subdir_template.format(self.batch_size, run_dir))

        tensorboard_cb = K.callbacks.TensorBoard(
            log_dir=log_dir, write_graph=True, write_images=True)

        foundations.set_tensorboard_logdir(log_dir)

        return checkpoint_path, [checkpoint_cb, tensorboard_cb]
Exemplo n.º 2
0
    def __init__(self, train_dl, val_dl, test_dl, model: torch.nn.Module, optimizer, scheduler, criterion, params):
        """
        Store the training components, set up logging, and call ``self.train()``.

        ``params`` must provide the keys ``"num_epochs"`` and ``"max_lr"``.
        The directories ``checkpoints`` and ``tensorboard`` are created in
        the current working directory if they do not exist.
        """
        # Data loaders; a standing iterator over the validation loader is kept.
        self.train_dl, self.val_dl, self.test_dl = train_dl, val_dl, test_dl
        self.visual_iter = iter(val_dl)
        self.unnorm = Unnormalize(training_mean, training_std)

        # Model and optimisation components.
        self.model = model
        self.optimizer = optimizer
        self.num_epochs = params["num_epochs"]
        self.lr = params["max_lr"]
        self.scheduler = scheduler
        self.criterion = criterion

        # Output directories and TensorBoard logging.
        for out_dir in ('checkpoints', 'tensorboard'):
            os.makedirs(out_dir, exist_ok=True)
        if settings.USE_FOUNDATIONS:
            foundations.set_tensorboard_logdir('tensorboard')
        self.writer = SummaryWriter("tensorboard")
        self.meter_train = Meter(self.writer, 'train', 0)
        self.meter_val = Meter(self.writer, 'val', 0)

        # Bookkeeping for the run.
        self.current_epoch = 0
        # presumably a lower-is-better metric, hence the large start value - confirm
        self.best_metric = 1e9
        if torch.cuda.is_available():
            self.device = torch.device("cuda:0")
        else:
            self.device = torch.device("cpu")
        self.phase = 'train'

        # Constructing the object immediately invokes train().
        self.train()
Exemplo n.º 3
0
    def __init__(self, train_dl, val_dl, test_dl, model: torch.nn.Module, optimizer, scheduler, criterion, params):
        """
        Store the training components, set up logging, and call ``self.train()``.

        ``params`` must provide the keys ``"num_epochs"``, ``"max_lr"``,
        ``"clip_gradient"`` and ``"batch_repeat"``. The directories
        ``checkpoints`` and ``tensorboard`` are created in the current
        working directory if they do not exist.
        """
        self.train_dl = train_dl
        self.val_dl = val_dl
        self.test_dl = test_dl
        self.visual_iter = iter(val_dl)
        self.unnorm = Unnormalize(model.input_mean, model.input_std)

        # NOTE: the model is deliberately not wrapped in DataParallel; the
        # original author saw serious bugs with it (possibly caused by
        # BatchNorm and apex).
        self.model = model
        self.optimizer = optimizer
        self.num_epochs = params["num_epochs"]
        self.lr = params["max_lr"]
        self.clip_gradient = params["clip_gradient"]
        self.scheduler = scheduler
        self.criterion = criterion
        self.batch_repeat = params["batch_repeat"]

        os.makedirs('checkpoints', exist_ok=True)
        os.makedirs('tensorboard', exist_ok=True)
        if settings.USE_FOUNDATIONS:
            foundations.set_tensorboard_logdir('tensorboard')
        self.writer = SummaryWriter("tensorboard")
        self.meter_train = Classification_Meter(self.writer, 'train', 0)
        self.meter_val = Classification_Meter(self.writer, 'val', 0)
        self.current_epoch = 0
        # presumably a lower-is-better metric, hence the large start value - confirm
        self.best_metric = 1e9
        self.device = torch.device("cuda:0" if torch.cuda.is_available() else "cpu")
        self.phase = 'train'
        # Integer bounds are required: random.randint rejects float arguments
        # such as 2e9 on Python 3.12+.
        seed_upper_bound = int(2e9)
        self.seeds = [np.random.randint(0, seed_upper_bound),
                      random.randint(0, seed_upper_bound)]
        # Must exist before train() runs; assigning it afterwards would leave
        # it undefined during training and discard anything train() stored.
        self.history_best = {}
        self.train()
Exemplo n.º 4
0
    def train(self, xtrain, ytrain, xval, yval):
        """
        Fit ``self.model`` on the training data and reload the saved model.

        Training and validation data are each wrapped in a ``DataGenerator``.
        The callbacks used are TensorBoard (logging to ``tflogs``), early
        stopping on ``val_loss``, learning-rate reduction on plateau, and
        ``f1_score_callback``, which is given ``self.model_save_filename``.
        After fitting, ``self.model`` is replaced by the model loaded from
        ``self.model_save_filename``.

        The number of epochs is read from the module-level ``model_params``.
        Calls into ``foundations`` are best-effort: a failure is reported on
        stdout and training continues.
        """
        tb = TensorBoard(log_dir='tflogs', write_graph=True, write_grads=False)

        # Best-effort: catch Exception rather than using a bare except so
        # that KeyboardInterrupt and SystemExit still propagate.
        try:
            foundations.set_tensorboard_logdir('tflogs')
        except Exception:
            print("foundations command not found")

        es = EarlyStopping(monitor='val_loss', mode='min', patience=5, min_delta=0.0001,
                           verbose=1)

        rp = ReduceLROnPlateau(monitor='val_loss', factor=0.6, patience=2,
                               verbose=1)

        f1_callback = f1_score_callback(xval, yval, model_save_filename=self.model_save_filename)

        # Each callback is registered exactly once; the TensorBoard callback
        # was previously appended twice.
        callbacks = [tb, es, rp, f1_callback]

        # Class 1 is weighted five times as heavily as class 0.
        class_weights = {1: 5, 0: 1}

        train_generator = DataGenerator(xtrain, ytrain)
        validation_generator = DataGenerator(xval, yval)
        self.model.fit_generator(train_generator,
                                 steps_per_epoch=len(train_generator),
                                 epochs=model_params['epochs'],
                                 validation_data=validation_generator,
                                 callbacks=callbacks,
                                 shuffle=False,
                                 use_multiprocessing=True,
                                 verbose=1,
                                 class_weight=class_weights)

        # Continue with the model stored on disk rather than the in-memory
        # weights from the final epoch.
        self.model = load_model(self.model_save_filename, custom_objects={'customPooling': customPooling})

        try:
            foundations.save_artifact(self.model_save_filename, key='trained_model.h5')
        except Exception:
            print("foundations command not found")
Exemplo n.º 5
0
# Banner marking the start of the data-loading stage.
print("____________________________________________",
      "_________________DATA_______________________",
      "____________________________________________",
      sep="\n")

import DessaCallback as dc
import PyArrowDataExtraction as de

# TensorBoard logging, plus one CSV log each for training and testing.
tensorboard_callback = keras.callbacks.TensorBoard(log_dir=logdir)
csv_callback = keras.callbacks.CSVLogger(
    "experiment_training.log",
    separator=",",
    append=False,
)
csv_callback_test = keras.callbacks.CSVLogger(
    "experiment_testing.log",
    separator=",",
    append=False,
)
# Pass the same log directory to foundations.
foundations.set_tensorboard_logdir(logdir)

# First dataset: read the parquet files and extract X and Y.
pds = pq.ParquetDataset(parquet_files)
pds.split_row_groups = True
table = pds.read()
print(table.num_rows)
xy = de.getXandYFromPyArrow(table)

# Second dataset: same procedure, falling back to the first table when the
# second one has no rows.
pds2 = pq.ParquetDataset(parquet_files2)
pds2.split_row_groups = True
table2 = pds2.read()
print(table2.num_rows)
if table2.num_rows <= 0:
    table2 = table
xy2 = de.getXandYFromPyArrow(table2)
from foundations import load_parameters, log_params

print("using atlas framework")

# Load the run parameters, seed with params['seed'], and log the raw
# parameters before they are parsed.
params = load_parameters()
seed_everything(params['seed'])
log_params(params)

params = parse_params(params)
print(params)

model = CIFAR_Module(params).cuda()
lr_logger = LearningRateLogger()
logger = TensorBoardLogger("../logs", name=params["backbone"])
if USE_FOUNDATIONS:
    from foundations import set_tensorboard_logdir

    # Same location the TensorBoardLogger above is configured with.
    set_tensorboard_logdir(f'../logs/{params["backbone"]}')

# Keep only the single best checkpoint according to 'acc'; the seed is used
# as the checkpoint prefix.
checkpoint_callback = ModelCheckpoint(
    save_top_k=1,
    monitor='acc',
    prefix=str(params["seed"]),
)
t_params = get_trainer_params(params)
trainer = Trainer(
    callbacks=[lr_logger],
    logger=logger,
    checkpoint_callback=checkpoint_callback,
    **t_params
)
trainer.fit(model)

# Save the best checkpoint as an artifact when one was written.
if USE_FOUNDATIONS:
    if checkpoint_callback.best_model_path != "":
        from foundations import log_metric, save_artifact

        save_artifact(
            checkpoint_callback.best_model_path,
            key='best_model_checkpoint',
        )
Exemplo n.º 7
0
def train(train_dl, val_dl, test_dl, val_dl_iter, model, optimizer, scheduler,
          criterion, params, train_sampler, val_sampler, rank):
    """Run the training loop and keep the checkpoint with the best accuracy.

    For each of ``params['n_epochs']`` epochs this calls ``train_one_epoch``.
    Every ``params['val_rate']`` epochs it calls ``validate`` and reads the
    ``"acc"`` entry of the validation records as the selection metric. When
    that metric is at least the best seen so far, rank 0 saves the model
    state dict and ``params`` to ``checkpoints/best_model.pth`` and, if
    ``settings.USE_FOUNDATIONS`` is set, stores it as a foundations artifact.

    Only rank 0 creates the output directories, owns a ``SummaryWriter`` and
    reports the final best metrics (through ``foundations.log_metric`` or
    ``print``). ``test_dl`` and ``val_dl_iter`` are accepted but not used here.

    Args:
        train_dl, val_dl: data loaders passed to ``train_one_epoch`` and
            ``validate``.
        model: model to train; when it has a ``module`` attribute, that
            inner module's state dict is what gets saved.
        optimizer, scheduler, criterion: passed through to the helpers.
        params: dict providing ``n_epochs``, ``max_lr``, ``val_rate``,
            ``batch_repeat`` and ``gpus``.
        train_sampler, val_sampler: if truthy, ``set_epoch(epoch)`` is called
            on them before the corresponding phase.
        rank: process rank; rank 0 performs all logging and saving.

    Returns:
        None.
    """
    n_epochs = params['n_epochs']
    max_lr = params['max_lr']
    val_rate = params['val_rate']
    batch_repeat = params['batch_repeat']
    history_best = {}
    best_metric = 0

    if rank == 0:
        os.makedirs('checkpoints', exist_ok=True)
        os.makedirs('tensorboard', exist_ok=True)
        if settings.USE_FOUNDATIONS:
            foundations.set_tensorboard_logdir('tensorboard')
        writer = SummaryWriter("tensorboard")
    else:
        writer = None

    for epoch in range(n_epochs):
        train_records = DistributedClassificationMeter(writer=writer,
                                                       phase="train",
                                                       epoch=epoch,
                                                       workers=params["gpus"],
                                                       criterion=criterion)
        if train_sampler:
            train_sampler.set_epoch(epoch)
        train_one_epoch(epoch, model, train_dl, max_lr, optimizer, criterion,
                        scheduler, train_records, batch_repeat, rank, writer,
                        params)
        if epoch % val_rate == 0:
            val_records = DistributedClassificationMeter(
                writer=writer,
                phase="validation",
                epoch=epoch,
                workers=params["gpus"],
                criterion=criterion)
            if val_sampler:
                val_sampler.set_epoch(epoch)
            validate(model, val_dl, criterion, val_records, rank)

            # When switching the selection metric, remember to flip the
            # comparison below (greater-than vs. less-than) and to change
            # the initial value of best_metric accordingly.
            info = val_records.log_metric(write_scalar=False)
            selection_metric = info["acc"]

            if selection_metric >= best_metric and rank == 0:
                # Report before updating so "previous best" really is the
                # previous value.
                print(
                    f'>>> Saving best model metric={selection_metric:.4f} compared to previous best {best_metric:.4f}'
                )
                best_metric = selection_metric
                # Save the inner module when the model is wrapped (it has a
                # .module attribute); otherwise save the model itself.
                checkpoint = {
                    'model': getattr(model, 'module', model).state_dict(),
                    'params': params
                }
                history_best = {
                    "train_" + key: value
                    for key, value in train_records.get_metric().items()
                }
                for key, value in val_records.get_metric().items():
                    history_best["val_" + key] = value

                torch.save(checkpoint, 'checkpoints/best_model.pth')
                if settings.USE_FOUNDATIONS:
                    foundations.save_artifact('checkpoints/best_model.pth',
                                              key='best_model_checkpoint')

    # Log metrics to GUI
    if rank == 0:
        for metric, value in history_best.items():
            if settings.USE_FOUNDATIONS:
                foundations.log_metric(metric, float(value))
            else:
                print(metric, float(value))
import foundations

# Pass 'tensorboard_files/' to foundations as the TensorBoard log directory.
foundations.set_tensorboard_logdir('tensorboard_files/')