Example #1
    def fit(self, model, train_loader, val_loader, test_loader):
        optimizer = torch.optim.Adam(filter(lambda p: p.requires_grad,
                                            model.parameters()),
                                     lr=1e-4,
                                     weight_decay=1e-5)
        stopper = EarlyStopping(self.model_path,
                                self.tasks,
                                patience=self.patience)

        self._train(model, train_loader, val_loader, self.loss_fn, optimizer,
                    stopper)
        stopper.load_checkpoint(model)
        test_results_dict = self._eval(model, test_loader)
        for metric in self.metrics:
            print(f"test {metric}:{test_results_dict[metric]['mean']}")
        return model, test_results_dict
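A minimal sketch of the kind of EarlyStopping helper Example #1 assumes: the constructor signature mirrors the call above, but the step() method, the metric convention (higher is better), and the checkpoint format are assumptions rather than the original implementation.

import torch

class EarlyStopping:
    def __init__(self, model_path, tasks, patience=10):
        self.model_path = model_path   # file the best checkpoint is written to
        self.tasks = tasks
        self.patience = patience
        self.counter = 0
        self.best_score = None
        self.early_stop = False

    def step(self, score, model):
        # assumes a higher validation score is better; checkpoint on improvement
        if self.best_score is None or score > self.best_score:
            self.best_score = score
            torch.save({'model_state_dict': model.state_dict()}, self.model_path)
            self.counter = 0
        else:
            self.counter += 1
            self.early_stop = self.counter >= self.patience
        return self.early_stop

    def load_checkpoint(self, model):
        # restore the best weights seen during training
        model.load_state_dict(torch.load(self.model_path)['model_state_dict'])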
Example #2
def trainNet(model, train_loader, val_loader, device, static_map, start_epoch=0, globaliter_=0):
    # Print all of the hyperparameters of the training run:
    print("===== HYPERPARAMETERS =====")
    print("batch_size=", config['dataloader']['batch_size'])
    print("epochs=", config['num_epochs'])
    print('starting from epoch %i' % start_epoch)
    print("learning_rate=", config['optimizer']['lr'])
    print("network_depth=", config['model']['depth'])
    print("=" * 30)

    # define the optimizer & learning rate
    optim = torch.optim.SGD(model.parameters(), **config['optimizer'])

    scheduler = StepLR(optim,
                       step_size=config['lr_step_size'],
                       gamma=config['lr_gamma'])

    if config['cont_model_path'] is not None:
        log_dir = config['cont_model_path']
    else:
        log_dir = 'runs/Unet-' + datetime.now().strftime("%Y-%m-%d-%H-%M-%S-") + \
                  '-'.join(config['dataset']['cities'])
    writer = Visualizer(log_dir)

    # dump config file
    with open(os.path.join(log_dir, 'config.json'), 'w') as fp:
        json.dump(config, fp)

    # Time for printing
    training_start_time = time.time()
    globaliter = globaliter_

    # initialize the early_stopping object
    early_stopping = EarlyStopping(log_dir, patience=config['patience'], verbose=True)

    # Loop for n_epochs
    for epoch_idx, epoch in enumerate(range(start_epoch, config['num_epochs'])):
        writer.write_lr(optim, epoch)

        # train for one epoch
        globaliter = train(model, train_loader, static_map, optim, device, writer, epoch, globaliter)

        # At the end of the epoch, do a pass on the validation set
        val_loss = validate(model, val_loader, static_map, device, writer, globaliter)

        # At the end of the epoch, do a pass on the validation set only considering the test times
        # val_loss_testtimes = validate(model, val_loader_ttimes, device, writer, globaliter, if_testtimes=True)

        # early_stopping needs the validation loss to check if it has decreased,
        # and if it has, it will make a checkpoint of the current model
        early_stopping(val_loss, model, epoch+1, globaliter)

        if early_stopping.early_stop:
            print("Early stopping")
            break

        if config['debug'] and epoch_idx >= 0:
            break

        scheduler.step(epoch)

    print("Training finished, took {:.2f}s".format(time.time() - training_start_time))

    # remember to close tensorboard writer
    writer.close()
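Example #2 calls its stopper as early_stopping(val_loss, model, epoch, globaliter) and then checks the early_stop flag. A minimal loss-based sketch compatible with that interface; the checkpoint filename and the delta handling are assumptions, not the original class.

import os
import torch

class EarlyStopping:
    def __init__(self, log_dir, patience=7, verbose=False, delta=0.0):
        self.log_dir = log_dir
        self.patience = patience
        self.verbose = verbose
        self.delta = delta
        self.counter = 0
        self.best_loss = float('inf')
        self.early_stop = False

    def __call__(self, val_loss, model, epoch, globaliter):
        if val_loss < self.best_loss - self.delta:
            # validation loss decreased: checkpoint the model and reset the counter
            if self.verbose:
                print(f'Validation loss improved ({self.best_loss:.6f} -> {val_loss:.6f}); saving model')
            torch.save(model.state_dict(), os.path.join(self.log_dir, 'checkpoint.pt'))
            self.best_loss = val_loss
            self.counter = 0
        else:
            self.counter += 1
            if self.counter >= self.patience:
                self.early_stop = True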
Example #3
    def train_test(self):

        # load the model if a checkpoint exists, otherwise initialize the weights
        if self.config.load_model is True:
            self.model.load_model()
        else:
            self.model.weight_init()
            print('weights are initialized')

        # optimizer
        self.momentum = 0.9
        self.optimizer = optim.Adam(self.model.parameters(), lr=self.config.lr, weight_decay=1.0)

        # scheduler = lr_scheduler.StepLR(self.optimizer, step_size=70, gamma=0.01)
        scheduler = lr_scheduler.ExponentialLR(self.optimizer, gamma=0.9)

        # loss function
        if self.config.gpu_mode:
            self.model.cuda()
            self.MSE_loss = nn.MSELoss().cuda()
        else:
            self.MSE_loss = nn.MSELoss()

        print('---------- Networks architecture -------------')
        utils.print_network(self.model)
        print('----------------------------------------------')

        # load dataset
        train_data_loader = self.data_train
        test_data_loader = self.data_test

        ################# Train #################
        print('Training is started.')
        avg_loss = []
        avg_loss_test = []
        avg_loss_log_test = []
        step = 0

        es = EarlyStopping(patience=8)

        self.model.train()  # just sets training mode; call model.eval() to switch to evaluation mode
        for epoch in range(self.config.num_epochs):
            scheduler.step()
            epoch_loss = 0
            for iter, (input, target, _) in enumerate(train_data_loader):
                # input data (low resolution image)
                if self.config.gpu_mode:
                    x_ = Variable(input.cuda())
                    y_ = Variable(target.cuda())
                else:
                    x_ = Variable(input)
                    y_ = Variable(target)

                # update network
                self.optimizer.zero_grad()
                model_out = self.model(x_)
                loss = torch.sqrt(self.MSE_loss(model_out, y_))
                loss.backward()  # the result is a tensor
                self.optimizer.step()

                # log
                epoch_loss += loss
                print("Epoch: [%2d] [%4d/%4d] loss: %.8f" % ((epoch + 1), (iter + 1), len(train_data_loader), loss))

                # tensorboard logging
                self.logger.scalar_summary('loss', loss, step + 1)
                step += 1

            # avg. loss per epoch
            avg_loss.append((epoch_loss / len(train_data_loader)).detach().cpu().numpy())

            if (epoch + 1) % self.config.save_epochs == 0:
                self.model.save_model(epoch + 1)

            # calculate test loss
            with torch.no_grad():
                loss_test, loss_log_test = self.test(test_data_loader)

            epoch_loss_test = loss_test / len(test_data_loader)
            epoch_loss_log_test = loss_log_test / len(test_data_loader)

            avg_loss_test.append(float(epoch_loss_test))
            avg_loss_log_test.append(float(epoch_loss_log_test))

            # if es.step(float(epoch_loss_test)):
            #     self.model.save_model(epoch=None)
            #     print('Early stop at %2d epoch' % (epoch + 1))
            #     break

        # Plot avg. loss
        utils.plot_loss(self.config, [avg_loss, avg_loss_log_test])
        utils.plot_loss(self.config, [avg_loss_test], origin=True)

        print('avg_loss: ', avg_loss[-1])
        print('avg_loss_log with original data: ', avg_loss_test[-1])
        print('avg_loss_log with log data: ', avg_loss_log_test[-1])
        print("Training and test is finished.")

        # Save final trained parameters of model
        self.model.save_model(epoch=None)
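The es object above is built with only a patience value, and the commented-out lines query it as es.step(loss), expecting True once the loss has stopped improving. A minimal sketch of that interface, assuming a pure bookkeeping class with no checkpointing:

class EarlyStopping:
    def __init__(self, patience=8):
        self.patience = patience
        self.counter = 0
        self.best = None
        self.early_stop = False

    def step(self, metric):
        # returns True once `metric` has not improved for `patience` consecutive calls
        if self.best is None or metric < self.best:
            self.best = metric
            self.counter = 0
        else:
            self.counter += 1
            self.early_stop = self.counter >= self.patience
        return self.early_stop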
Example #4
writer = Visualizer(log_dir)
with open(os.path.join(log_dir, 'config.json'), 'w') as fp:
    json.dump(config, fp)

# define device
device = torch.device('cuda' if torch.cuda.is_available() else 'cpu')

# model
model = diff_pool_net2(dataset, **config['model']).to(device)
data = data.to(device)
lr = config['optimizer']['lr']
optimizer = torch.optim.Adam(model.parameters(), lr=lr)

# initialize the early_stopping object
early_stopping = EarlyStopping(log_dir,
                               patience=config['patience'],
                               verbose=False)

best_val_acc = test_acc = 0
for epoch in range(1, config['epochs']):
    output_dict = train()

    accs, s = test()
    train_acc, val_acc, tmp_test_acc = accs

    writer.write_lr(optimizer, epoch)
    if val_acc > best_val_acc:
        best_val_acc = val_acc
        test_acc = tmp_test_acc

    writer.write_acc_train(train_acc, epoch)
Example #5
def bcn(config, data_file, embeddings, device, dataset, embeddings_type):
    # extensions: add 2 languages, use a combination of CoVe embeddings (like ELMo)

    name = "test_model"
    torch.manual_seed(123)

    inputs = data.Field(lower=True, include_lengths=True, batch_first=True)
    labels = data.Field(sequential=False, unk_token=None)

    print('Generating train, dev, test splits')

    if dataset == 'IWSLT':
        # using the IWSLT 2016 TED talk translation task
        train, dev, test = datasets.IWSLT.splits(root=data_file,
                                                 exts=['.en', '.de'],
                                                 fields=[inputs, inputs])
    elif dataset == 'SST-2':
        train, dev, test = datasets.SST.splits(
            text_field=inputs,
            label_field=labels,
            root=data_file,
            fine_grained=False,
            train_subtrees=True,
            filter_pred=lambda ex: ex.label != 'neutral')
    elif dataset == 'SST-5':
        train, dev, test = datasets.SST.splits(text_field=inputs,
                                               label_field=labels,
                                               root=data_file,
                                               fine_grained=True,
                                               train_subtrees=True)
    elif dataset == 'IMDB':
        train, test = datasets.IMDB.splits(text_field=inputs,
                                           label_field=labels,
                                           root=data_file)
        train, dev = train.split(
            split_ratio=0.9,
            stratified=True)  # 0.9 in order to be close to the paper
    elif dataset == 'TREC-6':
        train, test = datasets.TREC.splits(text_field=inputs,
                                           label_field=labels,
                                           root=data_file,
                                           fine_grained=False)
        train, dev = train.split(split_ratio=0.9, stratified=True)
    elif dataset == 'TREC-50':
        train, test = datasets.TREC.splits(text_field=inputs,
                                           label_field=labels,
                                           root=data_file,
                                           fine_grained=True)
        train, dev = train.split()
    elif dataset == 'SNLI':
        train, dev, test = datasets.SNLI.splits(text_field=inputs,
                                                label_field=labels,
                                                root=data_file)
    else:
        print('Invalid dataset name detected...')
        return

    print('Building vocabulary')
    inputs.build_vocab(train, dev, test)
    inputs.vocab.load_vectors(
        vectors=GloVe(name='840B', dim=300, cache=embeddings))

    labels.build_vocab(train, dev, test)

    train_iter, dev_iter, test_iter = data.BucketIterator.splits(
        (train, dev, test),
        batch_size=config["train_batch_size"],
        device=torch.device(device) if device >= 0 else None,
        sort_within_batch=True)

    model = BCN(config=config,
                n_vocab=len(inputs.vocab),
                vocabulary=inputs.vocab.vectors,
                embeddings=embeddings,
                num_labels=len(labels.vocab.freqs),
                embeddings_type=embeddings_type)

    bcn_params = [
        p for n, p in model.named_parameters()
        if "mtlstm" not in n and p.requires_grad
    ]

    criterion = nn.CrossEntropyLoss()
    optimizer = Adam(bcn_params, lr=0.001)

    if device != -1:
        model.to(device)
    print(model)
    total_params = sum(p.numel() for p in model.parameters())
    total_trainable_params = sum(p.numel() for p in bcn_params
                                 if p.requires_grad)

    print("Total Params:", number_h(total_params))
    print("Total Trainable Params:", number_h(total_trainable_params))

    #####################################
    # Training Pipeline
    #####################################
    trainer = BCNTrainer(model=model,
                         train_loader=train_iter,
                         valid_loader=dev_iter,
                         criterion=criterion,
                         device="cpu" if device == -1 else 'cuda',
                         config=config,
                         optimizers=[optimizer])

    print('Generating CoVe')

    ####################################################################
    # Experiment: logging and visualizing the training process
    ####################################################################
    exp = Experiment(name, config, src_dirs=None, output_dir=EXP_DIR)
    exp.add_metric("ep_loss", "line", "epoch loss class", ["TRAIN", "VAL"])
    exp.add_metric("ep_f1", "line", "epoch f1", ["TRAIN", "VAL"])
    exp.add_metric("ep_acc", "line", "epoch accuracy", ["TRAIN", "VAL"])

    exp.add_value("epoch", title="epoch summary")
    exp.add_value("progress", title="training progress")

    ####################################################################
    # Training Loop
    ####################################################################
    best_loss = None
    early_stopping = EarlyStopping("min", config["patience"])

    for epoch in range(1, config["epochs"] + 1):
        train_loss = trainer.train_epoch()
        print(model.w, model.gama)
        val_loss, y, y_pred = trainer.eval_epoch()

        # Calculate accuracy and f1-macro on the evaluation set
        exp.update_metric("ep_loss", train_loss.item(), "TRAIN")
        exp.update_metric("ep_loss", val_loss.item(), "VAL")
        exp.update_metric("ep_f1", 0, "TRAIN")
        exp.update_metric("ep_f1", f1_macro(y, y_pred), "VAL")
        exp.update_metric("ep_acc", 0, "TRAIN")
        exp.update_metric("ep_acc", acc(y, y_pred), "VAL")

        print()
        epoch_log = exp.log_metrics(["ep_loss", "ep_f1", "ep_acc"])
        print(epoch_log)
        exp.update_value("epoch", epoch_log)

        # Save the model if the val loss is the best we've seen so far.
        if not best_loss or val_loss < best_loss:
            best_loss = val_loss
            trainer.best_acc = acc(y, y_pred)
            trainer.best_f1 = f1_macro(y, y_pred)
            trainer.checkpoint(name=name)

        if early_stopping.stop(val_loss):
            print("Early Stopping (according to cls loss)....")
            break

        print("\n" * 2)

    return best_loss, trainer.best_acc, trainer.best_f1
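Example #5 constructs its stopper with a mode string and checks early_stopping.stop(val_loss) once per epoch. A minimal sketch of a mode-aware stopper compatible with that call; the exact semantics of the original class are assumed, not reproduced.

class EarlyStopping:
    def __init__(self, mode='min', patience=5):
        assert mode in ('min', 'max')
        self.mode = mode
        self.patience = patience
        self.counter = 0
        self.best = None

    def stop(self, value):
        improved = (self.best is None
                    or (self.mode == 'min' and value < self.best)
                    or (self.mode == 'max' and value > self.best))
        if improved:
            self.best = value
            self.counter = 0
            return False
        self.counter += 1
        return self.counter >= self.patience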
Example #6
def train(config, model, train_loader, val_loader, optimizer):
    if not os.path.exists('./runs'):
        os.mkdir('./runs')
    if not os.path.exists('./checkpoints'):
        os.mkdir('./checkpoints')
    writer = SummaryWriter('./runs/{}'.format(config.exp_name))
    early_stopping = EarlyStopping(save_dir=config.save_dir,
                                   model_type=config.exp_name,
                                   patience=config.patience,
                                   verbose=True)

    for epoch in tqdm(range(1, config.n_epochs + 1)):
        highlight_loss = []
        # Training
        model.train()
        for batch_idx, data in enumerate(train_loader):
            # zero the grads
            optimizer.zero_grad()
            # handle the case of history vs. non-history training
            if len(data) == 4:  # is_history = False
                vid_feat_tensor, gt_strided_binary, user_path, nframes = data
                # convert data to cuda
                vid_feat_tensor, gt_strided_binary = vid_feat_tensor.unsqueeze(
                    dim=2).transpose(1, 3).cuda(), gt_strided_binary.view(
                        1, 1, -1).cuda()
                # forward to model
                output = model(vid_feat_tensor)
            else:  # is_history = True i.e. len(data) = 5
                vid_feat_tensor, gt_strided_binary, usr_hist_list, usr_path, nframes = data
                # skip batches whose user-history list is empty
                if len(usr_hist_list) == 0:
                    continue
                # convert data to cuda
                vid_feat_tensor, gt_strided_binary, usr_hist_list = vid_feat_tensor.unsqueeze(
                    dim=2).transpose(1, 3).cuda(), gt_strided_binary.view(
                        1, 1, -1).cuda(), [
                            hist.float().cuda() for hist in usr_hist_list
                        ]
                # forward to the model with history
                output = model(vid_feat_tensor, usr_hist_list)

            # compute loss
            loss = cross_entropy2d(output, gt_strided_binary)

            # backward and update the model
            loss.backward()
            optimizer.step()

            highlight_loss.append(loss.item())

            if batch_idx % config.print_interval == 0:
                print('Train Epoch: {} [{}/{}]\tLoss: {:.6f}'.format(
                    epoch, batch_idx + 1, len(train_loader), loss.item()))

        mean_highlight_loss = np.average(highlight_loss)
        writer.add_scalar('Train/loss', mean_highlight_loss, epoch)

        # Validation
        if config.is_validate and epoch % config.validate_interval == 0:
            avg_map, avg_val_loss = validate(config, model, val_loader)

            # val avg_map for early stopping
            early_stopping(avg_map, model, epoch)

            if early_stopping.early_stop:
                print("Early stopping")
                break

            writer.add_scalar('Val/mAP', avg_map, epoch)
            writer.add_scalar('Val/Loss', avg_val_loss, epoch)

    # close summary writer
    writer.close()
    return
Example #7
from torchsummary import summary

from models.resnet import *
from models.resnext import *
from models.densenet import *
from utils.arg_utils import *
from utils.data_utils import *
from utils.progress_utils import progress_bar
from utils.earlystopping import EarlyStopping
"""
arguments
"""
args = fetch_args()
device = 'cuda' if torch.cuda.is_available() else 'cpu'
early_stopping = EarlyStopping(args['patience'],
                               verbose=True,
                               delta=args['delta'])
"""
loading data-set....
"""
print("==> loading data-set...")
train_loader, classes = gen_train_loader(args['train_path'],
                                         args['input_size'],
                                         args['train_batch_size'])
test_loader, _ = gen_test_loader(args['test_path'], args['input_size'],
                                 args['test_batch_size'])
print('Task classes are: ', classes)
num_classes = len(classes)
print(num_classes)
"""
model
Example #8
    def train(
        self, train_loader, val_loader=None, max_epochs=1000, enable_early_stopping=True
    ):
        if val_loader is None:
            enable_early_stopping = False

        print()
        print("-" * 2, "Training Setup", "-" * 2)
        print(f"Maximum Epochs: {max_epochs}")
        print(f"Enable Early Stoping: {enable_early_stopping}")
        print("-" * 20)
        print("*Start Training.")

        # model setup
        self.model.train().to(self.device)
        if self.multi_gpus and torch.cuda.device_count() > 1:
            print(f"*Using {torch.cuda.device_count()} GPUs!")
            self.model = nn.DataParallel(self.model)

        # early stopping instance
        if enable_early_stopping:
            if self.early_stopping is None:
                self.early_stopping = EarlyStopping(patience=5)
            else:
                self.early_stopping.reset_counter()

        # training start!
        for epoch in range(1, max_epochs + 1):
            running_loss = 0.0

            for step, data in enumerate(train_loader, start=1):
                inputs, labels = data
                inputs, labels = inputs.to(self.device), labels.to(self.device)

                # Zero the parameter gradients
                self.optimizer.zero_grad()
                # forward + backward + optimize
                outputs = self.model(inputs)
                loss = self.loss_func(outputs, labels)
                loss.backward()
                self.optimizer.step()
                # print statistics
                running_loss += loss.item()

                if step % 100 == 0 or step == len(train_loader):
                    print(
                        f"[{epoch}/{max_epochs}, {step}/{len(train_loader)}] loss: {running_loss / step :.3f}"
                    )

            # train & validation loss
            train_loss = running_loss / len(train_loader)
            if val_loader is None:
                print(f"train loss: {train_loss:.3f}")
            else:
                # FIXME: the first validation result is not correct
                val_loss = self.validation(val_loader)
                print(f"train loss: {train_loss:.3f}, val loss: {val_loss:.3f}")

                if enable_early_stopping:
                    self.early_stopping(self.model, val_loss, self.optimizer)
                    if self.early_stopping.get_early_stop():
                        print("*Early Stopping.")
                        break

        print("*Finished Training!")
        if enable_early_stopping:
            checkpoint = self.early_stopping.get_checkpoint()
        else:
            checkpoint = Checkpoint()
            # val_loss is undefined when no val_loader was given, so fall back to the train loss
            last_loss = val_loss if val_loader is not None else train_loss
            checkpoint.tmp_save(self.model, self.optimizer, epoch, last_loss)
        self.checkpoint = checkpoint
        self.model = checkpoint.load(self.model, self.optimizer)["model"]
        return self.model
Example #9
class ModelWrapper(DefaultSetting):
    def __init__(
        self,
        model,
        loss_func=None,
        optimizer=None,
        device=None,
        multi_gpus=True,
        log=100,
    ):
        super().__init__(device, loss_func)
        self.model = model
        if optimizer is None:
            self.optimizer = self.default_optimizer(model)
        else:
            self.optimizer = optimizer
        self.multi_gpus = multi_gpus
        self.log = log
        self.checkpoint = None
        self.early_stopping = None

    # TODO: haven't checked this function (__call__) yet
    # update model setting
    def __call__(
        self,
        model=None,
        loss_func=None,
        optimizer=None,
        device=None,
        multi_gpus=None,
        log=None,
    ):
        if model is not None:
            self.model = model

        if optimizer is None:
            self.optimizer = self.default_optimizer(self.model)
        else:
            self.optimizer = optimizer

        if loss_func is not None:
            self.loss_func = loss_func

        if device is not None:
            self.device = device

        if multi_gpus is not None:
            self.multi_gpus = multi_gpus

        if log is not None:
            self.log = log

        self.checkpoint = None
        self.early_stopping = None

    # train model
    def train(
        self, train_loader, val_loader=None, max_epochs=1000, enable_early_stopping=True
    ):
        if val_loader is None:
            enable_early_stopping = False

        print()
        print("-" * 2, "Training Setup", "-" * 2)
        print(f"Maximum Epochs: {max_epochs}")
        print(f"Enable Early Stoping: {enable_early_stopping}")
        print("-" * 20)
        print("*Start Training.")

        # model setup
        self.model.train().to(self.device)
        if self.multi_gpus and torch.cuda.device_count() > 1:
            print(f"*Using {torch.cuda.device_count()} GPUs!")
            self.model = nn.DataParallel(self.model)

        # early stopping instance
        if enable_early_stopping:
            if self.early_stopping is None:
                self.early_stopping = EarlyStopping(patience=5)
            else:
                self.early_stopping.reset_counter()

        # training start!
        for epoch in range(1, max_epochs + 1):
            running_loss = 0.0

            for step, data in enumerate(train_loader, start=1):
                inputs, labels = data
                inputs, labels = inputs.to(self.device), labels.to(self.device)

                # Zero the parameter gradients
                self.optimizer.zero_grad()
                # forward + backward + optimize
                outputs = self.model(inputs)
                loss = self.loss_func(outputs, labels)
                loss.backward()
                self.optimizer.step()
                # print statistics
                running_loss += loss.item()

                if step % 100 == 0 or step == len(train_loader):
                    print(
                        f"[{epoch}/{max_epochs}, {step}/{len(train_loader)}] loss: {running_loss / step :.3f}"
                    )

            # train & validation loss
            train_loss = running_loss / len(train_loader)
            if val_loader is None:
                print(f"train loss: {train_loss:.3f}")
            else:
                # FIXME: the first validation result is not correct
                val_loss = self.validation(val_loader)
                print(f"train loss: {train_loss:.3f}, val loss: {val_loss:.3f}")

                if enable_early_stopping:
                    self.early_stopping(self.model, val_loss, self.optimizer)
                    if self.early_stopping.get_early_stop():
                        print("*Early Stopping.")
                        break

        print("*Finished Training!")
        if enable_early_stopping:
            checkpoint = self.early_stopping.get_checkpoint()
        else:
            checkpoint = Checkpoint()
            # val_loss is undefined when no val_loader was given, so fall back to the train loss
            last_loss = val_loss if val_loader is not None else train_loss
            checkpoint.tmp_save(self.model, self.optimizer, epoch, last_loss)
        self.checkpoint = checkpoint
        self.model = checkpoint.load(self.model, self.optimizer)["model"]
        return self.model

    # %% validation
    @torch.no_grad()
    def validation(self, val_loader):
        self.model.eval().to(self.device)
        running_loss = 0.0
        for data in val_loader:
            inputs, labels = data
            inputs, labels = inputs.to(self.device), labels.to(self.device)
            outputs = self.model(inputs)
            loss = self.loss_func(outputs, labels)
            running_loss += loss.item()
        return running_loss / len(val_loader)

    # classification report of the model on test data
    @torch.no_grad()
    def classification_report(
        self, test_loader, target_names=None, binary=False, visualize=False
    ):
        print("-" * 10, "Classification Report", "-" * 10)
        print(f"loss: {self.validation(test_loader)}")
        model = self.model
        model.eval().to(self.device)

        y_pred, y_true = [], []
        for data in test_loader:
            inputs, labels = data
            inputs, labels = inputs.to(self.device), labels.to(self.device).long()
            outputs = model(inputs)
            if not binary:
                _, predicted = torch.max(outputs, 1)
            else:
                predicted = torch.round(outputs)

            y_true += labels.squeeze().cpu().tolist()
            y_pred += predicted.squeeze().cpu().tolist()

        if visualize:
            vis = Visualization(y_true, y_pred, target_names)
            vis.confusion_matrix()
            vis.classification_report()
            vis.show()
        report = classification_report(y_true, y_pred, target_names=target_names)
        print(report)
        return report
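Examples #8 and #9 expect a stopper that owns its own Checkpoint: it is called as early_stopping(model, val_loss, optimizer), exposes get_early_stop() and get_checkpoint(), and is reused across runs via reset_counter(). A minimal sketch of that interface built on an in-memory Checkpoint; both class bodies below are assumptions, not the original code.

import copy

class Checkpoint:
    def __init__(self):
        self.state = None

    def tmp_save(self, model, optimizer, epoch, val_loss):
        # keep a deep copy of the current weights and optimizer state in memory
        self.state = {'model_state': copy.deepcopy(model.state_dict()),
                      'optimizer_state': copy.deepcopy(optimizer.state_dict()),
                      'epoch': epoch,
                      'val_loss': val_loss}

    def load(self, model, optimizer):
        if self.state is not None:
            model.load_state_dict(self.state['model_state'])
            optimizer.load_state_dict(self.state['optimizer_state'])
        return {'model': model, 'optimizer': optimizer}

class EarlyStopping:
    def __init__(self, patience=5):
        self.patience = patience
        self.counter = 0
        self.best_loss = None
        self.early_stop = False
        self.checkpoint = Checkpoint()

    def reset_counter(self):
        self.counter = 0
        self.early_stop = False

    def __call__(self, model, val_loss, optimizer, epoch=None):
        if self.best_loss is None or val_loss < self.best_loss:
            self.best_loss = val_loss
            self.checkpoint.tmp_save(model, optimizer, epoch, val_loss)
            self.counter = 0
        else:
            self.counter += 1
            self.early_stop = self.counter >= self.patience

    def get_early_stop(self):
        return self.early_stop

    def get_checkpoint(self):
        return self.checkpoint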
Example #10
def trainNet(model, train_loader, val_loader, device, adj, nn_ixs, edge_index, config, log_dir, coords=None):
    """

    Args:
        model:
        train_loader:
        val_loader:
        device:
        adj:
        nn_ixs:
        edge_index:
        config:
        log_dir:
        coords:

    Returns:

    """

    # define the optimizer & learning rate
    optim = torch.optim.Adam(model.parameters(), **config['optimizer'])

    # scheduler = StepLR(optim, step_size=config['lr_step_size'], gamma=config['lr_gamma'])

    writer = Visualizer(log_dir)

    # dump config file
    with open(os.path.join(log_dir, 'config.json'), 'w') as fp:
        json.dump(config, fp)

    # Time for printing
    training_start_time = time.time()
    globaliter = 0

    # initialize the early_stopping object
    early_stopping = EarlyStopping(log_dir, patience=config['patience'], verbose=True)
    #    adj = adj.to(device)
    batch_size = config['dataloader']['batch_size']
    print_every_step = config['print_every_step']
    # Loop for n_epochs
    for epoch_idx, epoch in enumerate(range(config['num_epochs'])):

        writer.write_lr(optim, globaliter)

        # train for one epoch
        globaliter = train(model=model, train_loader=train_loader, optim=optim, device=device, writer=writer,
                           epoch=epoch, globaliter=globaliter, adj=adj, nn_ixs=nn_ixs, edge_index=edge_index,
                           batch_size=batch_size, coords=coords, print_every_step=print_every_step)

        # At the end of the epoch, do a pass on the validation set
        # val_loss = validate(model, val_loader, device, writer, globaliter, adj, nn_ixs, edge_index)
        val_loss = validate(model=model, val_loader=val_loader, device=device, adj=adj, nn_ixs=nn_ixs,
                            edge_index=edge_index, batch_size=batch_size, coords=coords,
                            writer=writer, globaliter=globaliter)

        # early_stopping needs the validation loss to check if it has decreased,
        # and if it has, it will make a checkpoint of the current model
        early_stopping(val_loss, model)

        if early_stopping.early_stop:
            print("Early stopping")
            break

        if config['debug'] and epoch_idx >= 0:
            break

        # scheduler.step()

    print("Training finished, took {:.2f}s".format(time.time() - training_start_time))

    # remember to close writer
    writer.close()
Example #11
    def train(self,
              optimizer,
              criterion,
              batch_size=1,
              epochs=1,
              kfold=2,
              iteration=1,
              shuffle=True,
              random_state=None,
              filepath=None,
              patience=7):
        best_state = []
        best_accuracy = 0.

        _kfold = KFold(n_splits=kfold,
                       shuffle=shuffle,
                       random_state=random_state)
        _data = self.dataset.data.numpy() if isinstance(
            self.dataset.data, torch.Tensor) else self.dataset.data
        _label = self.dataset.label

        minimum_early_stopping_epochs = 10
        result = np.zeros((iteration, kfold), dtype=float)  # np.float was removed from NumPy; use the builtin float
        for iter_index in range(iteration):
            for fold_index, (train_idx,
                             test_idx) in enumerate(_kfold.split(_data)):
                print("=" * 12)
                print("Iter {} Fold {}".format(iter_index, fold_index))
                print("=" * 12)
                _model = self.model
                _model.load_state_dict(self.reset_state)
                x_train_fold = torch.from_numpy(_data[train_idx]).float()
                x_test_fold = torch.from_numpy(_data[test_idx]).float()
                y_train_fold = torch.from_numpy(_label[train_idx])
                y_test_fold = torch.from_numpy(_label[test_idx])

                train_data = TensorDataset(x_train_fold, y_train_fold)
                test_data = TensorDataset(x_test_fold, y_test_fold)

                train_loader = DataLoader(train_data,
                                          batch_size=batch_size,
                                          shuffle=False)
                test_loader = DataLoader(test_data,
                                         batch_size=batch_size,
                                         shuffle=False)

                early_stopping = EarlyStopping(patience=patience)
                for epoch in range(epochs):
                    _model.train()
                    for index, (data, label) in enumerate(train_loader):
                        data, label = data.to(self.DEVICE), label.to(
                            self.DEVICE)

                        optimizer.zero_grad()
                        output = _model(data)
                        loss = criterion(output, label)
                        loss.backward()
                        optimizer.step()

                        print(
                            "Epoch{} Training {:5.2f}% | Loss: {:.4f}".format(
                                epoch, (index + 1) * batch_size /
                                len(train_loader.dataset) * 100., loss.item()),
                            end='\r')

                    #print(_model.output_layer.weight.grad)
                    _model.eval()
                    test_loss = 0.
                    correct = 0
                    with torch.no_grad():
                        for index, (data, label) in enumerate(test_loader):
                            data, label = data.to(self.DEVICE), label.to(
                                self.DEVICE)
                            output = _model(data)
                            loss = criterion(output, label)

                            test_loss += loss.item()
                            # Loss history?
                            pred = output.data.max(1, keepdim=True)[1]
                            correct += pred.eq(
                                label.data.view_as(pred)).cpu().sum()
                            print("Testing... {:5.2f}%".format(
                                (index + 1) * batch_size /
                                len(test_loader.dataset)),
                                  end='\r')

                    test_loss /= len(test_loader.dataset)
                    accuracy = correct / float(len(test_loader.dataset))
                    result[iter_index, fold_index] = accuracy
                    print(
                        "Epoch{} Test Result: loss {:.4f} | accuracy {:.5f}({}/{})"
                        .format(epoch, test_loss, accuracy, correct,
                                len(test_loader.dataset)))

                    if filepath is not None:
                        if not os.path.isdir(filepath):
                            os.mkdir(filepath)
                        torch.save(
                            _model.state_dict(),
                            os.path.join(
                                filepath, f"model{iter_index}_{fold_index}_" +
                                datetime.datetime.now().strftime(
                                    "%m%d_%H:%M:%S")))

                    if epoch >= minimum_early_stopping_epochs:
                        early_stopping(test_loss)
                    if early_stopping.early_stop:
                        print("Early stopping")
                        break

            iter_accuracy = result[iter_index].mean()
            if (iter_accuracy > best_accuracy):
                best_state = _model.state_dict()
                best_accuracy = iter_accuracy
            print('=' * 12)
            print(
                "Iteration {} complete with {:5.2f}% average accuracy".format(
                    iter_index, iter_accuracy * 100.))
            print('=' * 12)

        print("Training complete with {:5.2f}%".format(result.mean()))
        self.model.load_state_dict(best_state)
        return result
Example #12
train_dataset = CTScanDataset(train_path, transform=composed)
trainloader = DataLoader(train_dataset, batch_size=batch_size, shuffle=True)
val_dataset = CTScanDataset(val_path, transform=composed)
valloader = DataLoader(val_dataset, batch_size=val_batch_size)

dev = torch.device(
    "cuda") if torch.cuda.is_available() else torch.device("cpu")

model = Register3d(trainloader[0].size, device=dev, linear=affine_transform)
model.to(dev)

loss_func = nn.MSELoss()
optimizer = optim.Adam(model.parameters(), lr=1e-3)
scheduler = optim.lr_scheduler.ReduceLROnPlateau(optimizer, factor=0.1,
                                                 patience=50, verbose=True)
early_stop = EarlyStopping(patience=100, verbose=True)


for epoch in range(num_epochs):
    train_bar = tqdm(trainloader)
    for source, target in train_bar:
        source, target = source.to(dev), target.to(dev)
        optimizer.zero_grad()
        model.train()
        if affine_transform:
            output, deform_grads, theta = model(source, target)
        else:
            output, deform_grads = model(source, target)
        loss = loss_func(output, target)
        if affine_transform:
            loss += alpha*torch.sum(torch.abs(theta-torch.eye(3, 4)))
Example #13
    def run(self):

        while self.processed_steps < self.num_of_steps:

            self.esti_variance_every_steps = _data_segment(self.corpus)
            print('*'*80)

            need_reload = False
            self.corpus.invoke_train_batches_making()

            variance_earlystopper = EarlyStopping(
                tolerance=self.tolerance, scorers=[VarianceScorer()]
            )

            if len(self.replicas) < self.num_of_devices:
                self.replicas.clear()
                self.parallel_model()

            while self.processed_steps < self.num_of_steps and not need_reload:
                next_batches = self.corpus.get_train_batches(self.buffer_every_steps)

                for batch in next_batches:
                    time_start = time.time()

                    self.train_step(batch)          # worker training
                    self.update()                   # worker collection and sync

                    self.time_sum += time.time() - time_start

                    # estimate variance begin
                    if self.processed_steps % self.esti_variance_every_steps == 0:

                        with torch.no_grad():
                            print('*' * 80)
                            print('Variance Estimating...')
                            torch.cuda.empty_cache()
                            variance = self.esti_variance_step()
                            torch.cuda.empty_cache()

                            # Run variance converge computer (use patience mechanism)
                            variance_earlystopper(variance, self.processed_steps)
                            # If the patience has reached the limit, upgrade the model competence level
                            if variance_earlystopper.has_stopped():
                                self.upgrade_competence()
                                need_reload = True
                                break

                            print('Training')
                            print(self.annotate)
                    # estimate variance end

                    if self.processed_steps % self.report_every_steps == 0:
                        self.report()

                    if self.processed_steps % self.eval_every_steps == 0:
                        with torch.no_grad():
                            print('*' * 80)
                            print('Evaluating')
                            torch.cuda.empty_cache()
                            for model in self.replicas:
                                model.eval()
                            self.eval_step()
                            self.save()
                            torch.cuda.empty_cache()
                            for model in self.replicas:
                                model.train()
                            print('Training')
                            print(self.annotate)

                    if self.processed_steps >= self.num_of_steps:
                        print('End of train.')
                        return

        return
Example #14
    def train_test(self):

        # load the model if a checkpoint exists, otherwise initialize the weights
        if self.config.load_model is True:
            self.load_model()
            # self.load_spec_model()
        else:
            self.weight_init()

        # loss function
        if self.config.gpu_mode:
            self.model.cuda()
            self.MSE_loss = nn.MSELoss().cuda()  # by default the loss is averaged over the samples
        else:
            self.MSE_loss = nn.MSELoss()

        # optimizer
        self.momentum = 0.9
        self.optimizer = optim.Adam(self.model.parameters(),
                                    lr=self.config.lr,
                                    weight_decay=1.0)

        scheduler = lr_scheduler.StepLR(self.optimizer,
                                        step_size=100,
                                        gamma=0.1)
        # scheduler = lr_scheduler.ExponentialLR(self.optimizer, gamma=0.9)

        print('---------- Networks architecture -------------')
        utils.print_network(self.model)
        print('----------------------------------------------')

        # load dataset
        train_data_loader = self.data_train
        test_data_loader = self.data_test

        ################# Train #################
        print('Training is started.')
        avg_loss = []
        avg_loss_test = []
        avg_loss_log_test = []
        # step = 0

        es = EarlyStopping(patience=50)

        self.model.train()  # just sets training mode; call model.eval() to switch to evaluation mode
        for epoch in range(self.config.num_epochs):
            scheduler.step()
            epoch_loss = 0
            for iter, (input, target,
                       groundtruth) in enumerate(train_data_loader):
                # input data (low resolution image)
                if self.config.gpu_mode:
                    x_ = Variable(input.cuda())
                    y_ = Variable(groundtruth.cuda())
                else:
                    x_ = Variable(input)
                    y_ = Variable(groundtruth)

                # if scale is 10, x_.shape is (batchsize, 1, 300)
                # if scale is 100, x_.shape is (batchsize, 1, 30)

                # update network
                self.optimizer.zero_grad()
                model_out = self.model(x_)
                loss = torch.sqrt(self.MSE_loss(model_out, y_))
                loss.backward()  # the result is a tensor
                self.optimizer.step()
                epoch_loss += loss

                # Note: len(train_data_loader) is #train samples / batch size, i.e. the number of batches iterated per epoch
                print("Epoch: [%2d] [%4d/%4d] loss: %.8f" %
                      ((epoch + 1), (iter + 1), len(train_data_loader), loss))

                # tensorboard logging
                # self.logger.scalar_summary('loss', loss, step + 1)
                # step += 1

            # avg. loss per epoch
            # dividing by len(train_data_loader) gives the average loss per sample
            avg_loss.append(
                (epoch_loss / len(train_data_loader)).detach().cpu().numpy())

            if (epoch + 1) % self.config.save_epochs == 0:
                self.save_model(epoch + 1)

            # calculate test loss
            with torch.no_grad():
                loss_test, _ = self.test(test_data_loader)

            epoch_loss_test = loss_test / len(test_data_loader)

            avg_loss_test.append(float(epoch_loss_test))

            #nni.report_intermediate_result(
            #    {"default": float(epoch_loss_test), "epoch_loss": float(avg_loss[-1])})

            # if es.step(avg_loss[-1]):
            #     self.save_model(epoch=None)
            #     print('Early stop at %2d epoch' % (epoch + 1))
            #     break

            if epoch % 10 == 0 and epoch != 0:
                utils.plot_loss(self.config, [avg_loss, avg_loss_test])

        #nni.report_final_result({"default": float(avg_loss_test[-1]), "epoch_loss": float(avg_loss[-1])})

        # Plot avg. loss
        utils.plot_loss(self.config, [avg_loss, avg_loss_test])

        with torch.no_grad():
            _, dtw_test = self.test(test_data_loader, True)
            avg_dtw_test = dtw_test / len(test_data_loader)

        print('avg_loss: ', avg_loss[-1])
        print('avg_loss_log with original data: ', avg_loss_test[-1])
        print('dtw with original data: ', avg_dtw_test)
        print("Training and test is finished.")

        # Save final trained parameters of model
        self.save_model(epoch=None)
Example #15
## Training the model--------------------------------------------------------------------------------
n_epochs = 150
patience = 5  #used for early stopping
# optimizer = torch.optim.Adam(model.parameters(), lr = 0.0001)
optimizer = RAdam(model.parameters(),
                  lr=1e-3,
                  betas=(0.9, 0.999),
                  eps=1e-8,
                  weight_decay=1e-5,
                  degenerated_to_sgd=True)  #Rectified Adam
# optimizer =  Lookahead(base_optimizer,1e-3 ,k = 6)
train_losses = []
val_losses = []
early_stopping = EarlyStopping(patience=patience,
                               verbose=True,
                               delta=0.005,
                               diff=0.05)
valid_loss_min = np.Inf
epoch_tqdm = tqdm(total=n_epochs, desc='epochs')
for epoch in range(n_epochs):
    train_tqdm = tqdm(total=len(train_loader), desc='training batch')
    ###################
    # train the model #
    ###################
    model.train()
    for batch_idx, (image, boxes, label) in enumerate(train_loader):
        if train_on_gpu:
            image = image.cuda()
            model = model.cuda()
        optimizer.zero_grad()
        output = model.forward(image)