Example #1
    def __init__(self,
                 use_cuda,
                 load_model,
                 model_folder,
                 train_directory,
                 validation_directory,
                 builder,
                 loss_fn,
                 args,
                 multi_gpu=True):
        self.use_cuda = use_cuda
        self.load_model = load_model
        self.model_folder = model_folder
        self.validation_directory = validation_directory
        self.train_directory = train_directory
        self.args = args

        self.builder = builder
        self.loss_fn = loss_fn
        self.logdir = join(model_folder, 'logs')
        self.writer = SummaryWriter(self.logdir)
        self.logger = Logger(self.args.log_file)
        self.itr = 0

        # Create Model
        self.model = self.create_model()
        if multi_gpu:
            self.model = torch.nn.DataParallel(self.model,
                                               device_ids=range(
                                                   torch.cuda.device_count()))

        # Build validation set
        validation_builder = builder(self.args.n_views,
                                     validation_directory,
                                     IMAGE_SIZE,
                                     self.args,
                                     toRot=True,
                                     sample_size=SAMPLE_SIZE)
        validation_set = [
            validation_builder.build_set() for i in range(VAL_SEQS)
        ]
        validation_set = ConcatDataset(validation_set)
        self.len_validation_set = len(validation_set)
        del validation_builder
        self.validation_loader = DataLoader(
            validation_set,
            batch_size=8,
            shuffle=False,
            pin_memory=self.use_cuda,
        )
        self.validation_calls = 0
        # Build Training Set
        self.triplet_builder = builder(self.args.n_views,
                                       train_directory,
                                       IMAGE_SIZE,
                                       self.args,
                                       toRot=True,
                                       sample_size=SAMPLE_SIZE)
        self.training_queue = multiprocessing.Queue(1)
        dataset_builder_process = multiprocessing.Process(
            target=self.build_set,
            args=(self.training_queue, self.triplet_builder, self.logger),
            daemon=True)
        dataset_builder_process.start()

        # Get Logger

        # Model specific setup
        # self.optimizer = optim.SGD(self.model.parameters(), lr=self.args.lr_start, momentum=0.9)
        self.optimizer = optim.Adam(self.model.parameters(),
                                    lr=0.001,
                                    betas=(0.9, 0.999),
                                    eps=1e-08)
        # Reduce the learning rate when the validation loss plateaus (a fixed milestone schedule with factors 0.1, 0.01, 0.001 could be used instead of this automatic scheduler)
        self.learning_rate_scheduler = lr_scheduler.ReduceLROnPlateau(
            self.optimizer, 'min')
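
Note that the constructor above only builds the ReduceLROnPlateau scheduler; somewhere in the training loop it still has to be stepped with the monitored metric. A minimal sketch of that call inside a method of this class, assuming a hypothetical validate() helper that returns the validation loss:

        val_loss = self.validate()                   # hypothetical helper, not part of the original class
        self.learning_rate_scheduler.step(val_loss)  # ReduceLROnPlateau expects the metric, not the epoch index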
Example #2

# Loop over epochs.
lr = args.lr
best_val_loss = []
stored_loss = 100000000

# At any point you can hit Ctrl + C to break out of training early.
try:
    optimizer = None
    # Ensure the optimizer is optimizing params, which includes both the model's weights as well as the criterion's weight (i.e. Adaptive Softmax)
    if args.optimizer == 'sgd':
        optimizer = torch.optim.SGD(params, lr=args.lr, weight_decay=args.wdecay)
    if args.optimizer == 'adam':
        optimizer = torch.optim.Adam(params, lr=args.lr, betas=(0, 0.999), eps=1e-9, weight_decay=args.wdecay)
        scheduler = lr_scheduler.ReduceLROnPlateau(optimizer, 'min', 0.5, patience=2, threshold=0)
    for epoch in range(1, args.epochs + 1):
        epoch_start_time = time.time()
        train()
        if 't0' in optimizer.param_groups[0]:
            tmp = {}
            for prm in model.parameters():
                tmp[prm] = prm.data.clone()
                prm.data = optimizer.state[prm]['ax'].clone()

            val_loss2 = evaluate(val_data, eval_batch_size)
            print('-' * 89)
            print('| end of epoch {:3d} | time: {:5.2f}s | valid loss {:5.2f} | '
                  'valid ppl {:8.2f} | valid bpc {:8.3f}'.format(
                epoch, (time.time() - epoch_start_time), val_loss2, math.exp(val_loss2), val_loss2 / math.log(2)))
            print('-' * 89)
Example #3

    elif model_name == 'densenet':
        model.classifier = torch.nn.Linear(2208, 3)

    if torch.cuda.device_count() > 1:
        model = torch.nn.DataParallel(model)

    model.to(device)
    return model


model = get_model("resnet18")
criterion = torch.nn.CrossEntropyLoss()
optimizer = torch.optim.Adam(model.parameters(),
                             lr=LEARNING_RATE,
                             weight_decay=WEIGHT_DECAY)
scheduler = lr_scheduler.ReduceLROnPlateau(optimizer, 'min', patience=1)

# prepare_dataset()

train_transforms = transforms.Compose([
    transforms.Resize((224, 224)),
    transforms.ToTensor(),
    transforms.Normalize(mean=mean, std=std)
])
augment_transforms = transforms.Compose([
    transforms.Resize((224, 224)),
    torchvision.transforms.RandomHorizontalFlip(p=1),
    torchvision.transforms.RandomRotation(20, resample=Image.BILINEAR),
    transforms.ToTensor(),
    transforms.Normalize(mean=mean, std=std)
])
Example #4
def get_scheduler(optimizer, opt):
    print('opt.lr_policy = [{}]'.format(opt.lr_policy))
    if opt.lr_policy == 'lambda':

        def lambda_rule(epoch):
            lr_l = 1.0 - max(0, epoch + 1 + opt.epoch_count -
                             opt.niter) / float(opt.niter_decay + 1)
            return lr_l

        scheduler = lr_scheduler.LambdaLR(optimizer, lr_lambda=lambda_rule)
    elif opt.lr_policy == 'step':
        scheduler = lr_scheduler.StepLR(optimizer,
                                        step_size=opt.lr_decay_iters,
                                        gamma=0.5)
    elif opt.lr_policy == 'step2':
        scheduler = lr_scheduler.StepLR(optimizer,
                                        step_size=opt.lr_decay_iters,
                                        gamma=0.1)
    elif opt.lr_policy == 'plateau':
        print('scheduler=plateau')
        scheduler = lr_scheduler.ReduceLROnPlateau(optimizer,
                                                   mode='min',
                                                   factor=0.1,
                                                   threshold=0.01,
                                                   patience=5)
    elif opt.lr_policy == 'plateau2':
        scheduler = lr_scheduler.ReduceLROnPlateau(optimizer,
                                                   mode='min',
                                                   factor=0.2,
                                                   threshold=0.01,
                                                   patience=5)
    elif opt.lr_policy == 'step_warmstart':

        def lambda_rule(epoch):
            #print(epoch)
            if epoch < 5:
                lr_l = 0.1
            elif 5 <= epoch < 100:
                lr_l = 1
            elif 100 <= epoch < 200:
                lr_l = 0.1
            elif 200 <= epoch:
                lr_l = 0.01
            return lr_l

        scheduler = lr_scheduler.LambdaLR(optimizer, lr_lambda=lambda_rule)
    elif opt.lr_policy == 'step_warmstart2':

        def lambda_rule(epoch):
            #print(epoch)
            if epoch < 5:
                lr_l = 0.1
            elif 5 <= epoch < 50:
                lr_l = 1
            elif 50 <= epoch < 100:
                lr_l = 0.1
            elif 100 <= epoch:
                lr_l = 0.01
            return lr_l

        scheduler = lr_scheduler.LambdaLR(optimizer, lr_lambda=lambda_rule)
    else:
        raise NotImplementedError(
            'learning rate policy [%s] is not implemented' % opt.lr_policy)
    return scheduler
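
For context, a minimal sketch of how a factory like this might be driven; the opt fields below are assumptions that simply mirror the attributes the function reads, not values from the original project:

from types import SimpleNamespace

import torch
from torch import optim
from torch.optim import lr_scheduler

opt = SimpleNamespace(lr_policy='plateau', lr_decay_iters=50,
                      epoch_count=1, niter=100, niter_decay=100)
model = torch.nn.Linear(10, 2)                       # stand-in model
optimizer = optim.Adam(model.parameters(), lr=1e-3)
scheduler = get_scheduler(optimizer, opt)
scheduler.step(0.5)   # 'plateau' policies are stepped with the monitored metric; the others take no argument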
Example #5
def train(train_iter, dev_iter, test_iter, model, args):
    if args.cuda:
        model.cuda()
        # torch.cuda.seed()
        torch.cuda.manual_seed(hyperparams.seed_num)

    # optimizer = torch.optim.Adam(model.parameters(), lr=args.lr, weight_decay=1e-8)
    # optimizer = torch.optim.Adam(model.parameters(), lr=args.lr, weight_decay=args.init_weight_decay)
    # optimizer = torch.optim.SGD(model.parameters(), lr=args.lr,momentum=)
    # optimizer = torch.optim.Adam(filter(lambda p: p.requires_grad, model.parameters()), lr=args.lr)

    if args.Adam is True:
        print("Adam Training......")
        optimizer = torch.optim.Adam(model.parameters(), lr=args.lr, weight_decay=args.init_weight_decay)
    elif args.SGD is True:
        print("SGD Training.......")
        optimizer = torch.optim.SGD(model.parameters(), lr=args.lr, weight_decay=args.init_weight_decay,
                                    momentum=args.momentum_value)
    elif args.Adadelta is True:
        print("Adadelta Training.......")
        optimizer = torch.optim.Adadelta(model.parameters(), lr=args.lr, weight_decay=args.init_weight_decay)

    # lambda1 = lambda epoch: epoch // 30
    # lambda2 = lambda epoch: 0.99 ** epoch
    # print("lambda1 {} lambda2 {} ".format(lambda1, lambda2))
    # scheduler = lr_scheduler.LambdaLR(optimizer, lr_lambda=[lambda2])

    # scheduler = lr_scheduler.StepLR(optimizer, step_size=3, gamma=0.9)

    scheduler = lr_scheduler.ReduceLROnPlateau(optimizer, 'min')


    steps = 0
    epoch_step = 0
    model_count = 0
    model.train()
    for epoch in range(1, args.epochs+1):
        print("\n## 第{} 轮迭代,共计迭代 {} 次 !##\n".format(epoch, args.epochs))
        # scheduler.step()
        # print("now lr is {} \n".format(scheduler.get_lr()))
        print("now lr is {} \n".format(optimizer.param_groups[0].get("lr")))
        for batch in train_iter:
            feature, target = batch.text, batch.label
            feature.data.t_(), target.data.sub_(1)  # batch first, index align
            if args.cuda:
                feature, target = feature.cuda(), target.cuda()

            optimizer.zero_grad()
            model.zero_grad()

            logit = model(feature)
            loss = F.cross_entropy(logit, target)
            loss.backward()
            if args.init_clip_max_norm is not None:
                utils.clip_grad_norm(model.parameters(), max_norm=args.init_clip_max_norm)
            optimizer.step()

            steps += 1
            if steps % args.log_interval == 0:
                train_size = len(train_iter.dataset)
                corrects = (torch.max(logit, 1)[1].view(target.size()).data == target.data).sum()
                accuracy = float(corrects)/batch.batch_size * 100.0
                sys.stdout.write(
                    '\rBatch[{}/{}] - loss: {:.6f}  acc: {:.4f}%({}/{})'.format(steps,
                                                                            train_size,
                                                                             loss.data[0], 
                                                                             accuracy,
                                                                             corrects,
                                                                             batch.batch_size))
            if steps % args.test_interval == 0:
                eval(dev_iter, model, args)
            if steps % args.save_interval == 0:
                if not os.path.isdir(args.save_dir):
                    os.makedirs(args.save_dir)
                save_prefix = os.path.join(args.save_dir, 'snapshot')
                save_path = '{}_steps{}.pt'.format(save_prefix, steps)
                torch.save(model, save_path)
                print("\n", save_path, end=" ")
                test_model = torch.load(save_path)
                model_count += 1
                test_eval(test_iter, test_model, save_path, args, model_count)
    return model_count
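
Note that the ReduceLROnPlateau scheduler built above is never stepped inside the loop shown (the epoch-based scheduler.step() call is commented out). A hedged sketch of the missing call, assuming eval() is adapted to return the dev-set loss:

    dev_loss = eval(dev_iter, model, args)   # assumption: eval is changed to return the dev loss
    scheduler.step(dev_loss)                 # reduce the LR once the dev loss stops improving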
Example #6
if torch.cuda.device_count() > 1:
    print("Let's use", torch.cuda.device_count(), "GPUs!")
    #dim = 0 [30, xxx] -> [10, ...], [10, ...], [10, ...] on 3 GPUs
    model = nn.DataParallel(model)
model.load_state_dict(torch.load('plantclef_imagenet_true_6.pth'))
num_ftrs = model.module.fc.in_features
model.module.fc = nn.Linear(num_ftrs, len(classes))  # replace the head inside the DataParallel wrapper, not on the wrapper itself
model = model.to(device)

criterion = nn.CrossEntropyLoss()

# Observe that all parameters are being optimized
optimizer = optim.SGD(model.parameters(), lr=lr, momentum=momentum)

# Reduce LR by a factor of 0.1 once the monitored metric has stopped improving for 5 epochs
exp_lr_scheduler = lr_scheduler.ReduceLROnPlateau(optimizer, patience=5)
if resume:
    if os.path.isfile(resume):
        print("=> loading checkpoint '{}'".format(resume))
        checkpoint = torch.load(resume)
        start_epoch = checkpoint['epoch']
        best_acc = checkpoint['best_acc']
        model.load_state_dict(checkpoint['state_dict'])
        optimizer.load_state_dict(checkpoint['optimizer'])
        train_acc = checkpoint['train_acc']
        best_prec1 = checkpoint['best_prec1']
        best_prec5 = checkpoint['best_prec5']
        lr = checkpoint['lr']
        top1_acc = checkpoint['top1_acc']
        top5_acc = checkpoint['top5_acc']
        train_losses = checkpoint['train_losses']
Example #7
)
proba_t = np.zeros((len(test_data), NUM_CLASSES))
folds = 5
train_data.stratifiedKFold(folds)
for fold in range(folds):
    # Split into training and validation folds and return the validation data
    model = Model(num_classes=NUM_CLASSES)
    save_dir = os.path.join(SAVE_DIR, "fold_{}".format(fold))
    agent = Agent(model=model, device_info=DEVICE_INFO, save_dir=save_dir)
    earlyStopping = None

    LOSS = {"celoss": CELoss()}
    OPTIM = Adam(model.parameters(), lr=0.001, weight_decay=0.001)
    reduceLR = lr_scheduler.ReduceLROnPlateau(OPTIM,
                                              mode="max",
                                              factor=0.5,
                                              patience=8,
                                              verbose=True)
    agent.compile(loss_dict=LOSS, optimizer=OPTIM, metrics=METRICS)
    agent.summary()
    valid_X, valid_Y = train_data.get_valid_data(fold)
    valid_Y = one_hot(valid_Y, NUM_CLASSES)
    valid_data = [(valid_X[i], valid_Y[i]) for i in range(valid_X.shape[0])]

    train_generator = DataLoader(train_data,
                                 batch_size=BATCH_SIZE,
                                 shuffle=True,
                                 num_workers=0)
    agent.fit_generator(train_generator,
                        epochs=EPOCH,
                        validation_data=valid_data,
Example #8
 def _get_scheduler(self, optimizer, config):
     return lr_scheduler.ReduceLROnPlateau(optimizer=optimizer,
                                           factor=config['factor'],
                                           patience=config['patience'])
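
A hypothetical call, assuming trainer is an instance of the class this method belongs to and the config is a plain dict:

scheduler = trainer._get_scheduler(optimizer, {'factor': 0.5, 'patience': 3})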
Example #9
def train(args):
    global DEBUG
    DEBUG = args.debug

    # get timestamp for model id
    dt = datetime.datetime.now()
    timestamp = '{}-{}/{:02d}-{:02d}-{:02d}'.format(dt.strftime("%b"), dt.day, dt.hour, dt.minute, dt.second)
    model_dir = os.path.join(EXP_DIR, timestamp)
    os.makedirs(model_dir)

    # configure logging
    logging.basicConfig(filename=os.path.join(model_dir, 'log.txt'),level=logging.INFO, format='%(asctime)s %(message)s', datefmt='%m/%d/%Y %I:%M:%S %p')
    if args.verbosity >= 1:
        root = logging.getLogger()
        root.setLevel(logging.DEBUG)
        ch = logging.StreamHandler(sys.stdout)
        ch.setLevel(logging.DEBUG)
        formatter = logging.Formatter('%(asctime)s %(message)s', datefmt='%m/%d/%Y %I:%M:%S %p')
        ch.setFormatter(formatter)
        root.addHandler(ch)

    # set device (if using CUDA)
    seed = 12345
    if torch.cuda.is_available():
        torch.cuda.set_device(args.gpu)  # torch.cuda.device() without a with-block has no effect
        torch.cuda.manual_seed(seed)
    else:
        torch.manual_seed(seed)
    # write the args to outfile
    for k, v in sorted(vars(args).items()): logging.info('{} : {}\n'.format(k, v))

    # load data
    training_set, validation_set = load_data(args)

    logging.info('Loaded data: {} training examples, {} validation examples\n'.format(
        len(training_set), len(validation_set)))

    # get config
    experiment_config = get_experiment_config(args, training_set, validation_set)

    # initialize model
    if args.load is None:
        logging.info('Initializing model...\n')
        model = experiment_config.model_generator(experiment_config.model_config)
    else:
        logging.info('Loading model from {}\n'.format(args.load))
        model = torch.load(os.path.join(EXP_DIR, args.load, 'model.ckpt'))
    if torch.cuda.is_available():
        training_set.cuda()
        validation_set.cuda()
        model.cuda()
    logging.info(model)
    logging.info('Number of trainable parameters: {}\n'.format(model.number_of_parameters()))
    logging.info('Training loss: {}\n'.format(experiment_config.loss_fn))

    # optimizer
    lr = args.lr
    optimizer = optim.Adam(model.parameters(), lr=lr, weight_decay=args.weight_decay)
    scheduler = lr_scheduler.ReduceLROnPlateau(optimizer, 'min', verbose=True, factor=0.5, patience=3, min_lr=lr/32)
    logging.info(optimizer)
    logging.info(scheduler)

    # Start Training
    for epoch in range(1, args.epochs + 1):
        if args.randomize_nodes:
            training_set.randomize_nodes()

        train_results = train_one_epoch(model, training_set, experiment_config.loss_fn, optimizer, experiment_config.monitors, args.debug)
        logging.info(results_str(epoch, train_results, 'train'))

        if epoch % 5 == 0:
            results = evaluate_one_epoch(model, validation_set, experiment_config.loss_fn, experiment_config.monitors)
            logging.info(results_str(epoch, results, 'eval'))

            torch.save(model, os.path.join(model_dir, 'model.ckpt'))
            logging.info("Saved model to {}\n".format(os.path.join(model_dir, 'model.ckpt')))

            logging.info("Training: processed {:.1f} graphs per second".format(len(training_set) / train_results['time']))

            with Capturing() as output:
                scheduler.step(results['loss'])
            if len(output) > 0:
                logging.info(output[0])

    return model
Example #10
def get_optimizer(args, net):
    """
    Decide Optimizer
    """
    def poly_schd(epoch):
        return math.pow(1 - epoch / args.num_steps, args.poly_exp)

    param_groups = net.parameters()

    if args.optim.lower() == "sgd":
        optimizer = optim.SGD(param_groups,
                              lr=args.lr,
                              weight_decay=args.weight_decay,
                              momentum=args.momentum,
                              nesterov=False)
        logger.info(
            f"[*] Using The SGD Optimizer with lr {args.lr} and weight decay {args.weight_decay} "
            f"and momentum {args.momentum}.")

    elif args.optim.lower() == "adam":
        optimizer = optim.Adam(param_groups,
                               lr=args.lr,
                               weight_decay=args.weight_decay)
        logger.info(
            f"[*] Using The Adam Optimizer with lr {args.lr} and weight decay {args.weight_decay}"
        )
    else:
        raise NotImplementedError

    if args.lr_schedule == "step":
        # step_size 30 gamma 0.2
        scheduler = lr_scheduler.StepLR(optimizer,
                                        step_size=args.step_size,
                                        gamma=args.gamma,
                                        last_epoch=-1)
        logger.info(
            f"[*] Using `Step` LR Scheduler with step size {args.step_size} and gamma {args.gamma}"
        )

    elif args.lr_schedule == "multi_step":
        if isinstance(args.milestones, str):
            args.milestones = eval(args.milestones)
        scheduler = lr_scheduler.MultiStepLR(optimizer,
                                             milestones=args.milestones,
                                             gamma=args.gamma,
                                             last_epoch=args.last_epoch)
        logger.info(
            f"[*] Using `Multi Step` LR Scheduler with milestones {args.milestones} and gamma {args.gamma}"
        )

    elif args.lr_schedule == "reduce_lr_on_plateau":
        patience, threshold = 8, 0.0005
        scheduler = lr_scheduler.ReduceLROnPlateau(optimizer,
                                                   mode="max",
                                                   factor=0.1,
                                                   patience=patience,
                                                   threshold=threshold,
                                                   threshold_mode="rel",
                                                   cooldown=0,
                                                   min_lr=0)
        logger.info(
            f"[*] Using `Reduce Lr On Plateau` LR Scheduler with patience {8} and threshold {threshold}"
        )

    elif args.lr_schedule == "poly":
        scheduler = lr_scheduler.LambdaLR(optimizer, lr_lambda=poly_schd)
        logger.info(f"[*] Using `Poly` LR Scheduler with poly {args.poly_exp}")

    elif args.lr_schedule == "CosineAnnealingLR":

        scheduler = lr_scheduler.CosineAnnealingLR(optimizer,
                                                   T_max=args.T_max,
                                                   eta_min=args.min_lr)
        logger.info(
            f"[*] Using `CosineAnnealingLR` LR Scheduler with T_max: {args.T_max}, eta_min: {args.min_lr}"
        )
    else:
        raise NotImplementedError

    return optimizer, scheduler
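
A sketch of how the returned pair might be consumed in a training loop; train_one_epoch and validate are hypothetical helpers, not part of the original code:

optimizer, scheduler = get_optimizer(args, net)
for epoch in range(args.epochs):
    train_one_epoch(net, optimizer)          # hypothetical training step
    metric = validate(net)                   # hypothetical validation metric
    if args.lr_schedule == "reduce_lr_on_plateau":
        scheduler.step(metric)               # plateau scheduling needs the monitored value
    else:
        scheduler.step()                     # step/multi_step/poly/cosine advance once per epoch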
Example #11
def train():
    alex_net = AlexNet()
    alex_net = alex_net.cuda()
    alex_net_optimal = AlexNet()
    alex_net_optimal = alex_net_optimal.cuda()
    criterion = nn.CrossEntropyLoss()
    optimizer = optim.SGD(alex_net.parameters(),
                          lr=0.01,
                          weight_decay=0.0005,
                          momentum=0.9)
    scheduler = ls.ReduceLROnPlateau(optimizer,
                                     mode='min',
                                     factor=0.1,
                                     patience=2)

    #stopping criteria parameters
    wait = 0
    best_acc = 0.0
    min_delta = 1e-3
    p = 10
    #for epoch in range(2):  # loop over the dataset multiple times

    epoch = 0
    j = 0

    train_error = []
    iter_train = []
    while epoch < p:
        # for epoch in range(1):
        j = j + 1
        running_loss = 0.0
        for i, data in enumerate(trainloader, 0):
            # get the inputs
            inputs, labels = data

            # wrap them in Variable
            inputs, labels = Variable(inputs.cuda()), Variable(labels.cuda())

            # zero the parameter gradients
            optimizer.zero_grad()

            # forward + backward + optimize
            outputs = alex_net(inputs)
            loss = criterion(outputs, labels)
            loss.backward()
            optimizer.step()

            # # print statistics
            running_loss += loss.data[0]
            # if i % 2000 == 0:    # print every 2000 mini-batches
            # 	print('[%d, %5d] loss: %.3f' %(epoch + 1, i + 1, running_loss / 2000))
            # 	running_loss = 0.0

        train_error.append(100 - test(trainloader, alex_net))
        iter_train.append(j)
        # acc = test(valloader,alex_net)
        # val_acc=0.0
        val_loss = 0.0
        correct = 0
        total = 0
        for i, data in enumerate(valloader, 0):
            val_input, val_label = data
            val_input, val_label = Variable(val_input.cuda()), val_label.cuda()
            val_output = alex_net(val_input)
            loss = criterion(val_output, Variable(val_label))
            val_loss += loss.data[0]
            _, predicted = torch.max(val_output.data, 1)
            total += val_label.size(0)
            correct += (predicted == val_label).sum()

        val_acc = 100 * (correct / total)
        print(
            'Accuracy of the network on the validation set: %.5f %% and validation loss: %.3f'
            % (val_acc, val_loss))
        scheduler.step(100 - val_acc)

        if (val_acc - best_acc) > min_delta:
            best_acc = val_acc
            epoch = 0
            alex_net_optimal.load_state_dict(alex_net.state_dict())
            # alex_net_optimal = copy.deepcopy(alex_net)
        else:
            epoch = epoch + 1

    #print('Finished Training')
    # plt.plot(iter_train, train_error, label='Train')
    # plt.xlabel('Epoch')
    # plt.ylabel('Cross-Entropy Loss')
    # plt.legend()
    print(train_error)
    return alex_net_optimal
Example #12
def main():
    ''' Main function '''
    parser = argparse.ArgumentParser()

    parser.add_argument('-data', type=str, default='')
    parser.add_argument('-epoch', type=int, default=100)
    parser.add_argument('-batch_size', type=int, default=128)
    parser.add_argument('-dropout', type=float, default=0.5)
    parser.add_argument('-d_model', type=int, default=200)
    parser.add_argument('-brn', type=int, default=3)
    parser.add_argument('-log', default=None)
    parser.add_argument('-save_model', default='model/')
    parser.add_argument('-save_mode',
                        type=str,
                        choices=['all', 'best'],
                        default='best')
    parser.add_argument('-no_cuda', action='store_true')
    parser.add_argument('-label_smoothing', action='store_true')
    parser.add_argument('--grained', type=int, default=8)  ##### class
    parser.add_argument('-train_src', default='data/udp_any_data_8.txt')
    parser.add_argument('-save_data', default='')
    parser.add_argument('-max_word_seq_len', type=int, default=128)
    parser.add_argument('-min_word_count', type=int, default=0)
    parser.add_argument('-keep_case', action='store_true')
    parser.add_argument('-share_vocab', action='store_true')
    parser.add_argument('-vocab', default=None)
    parser.add_argument('-fold_num', type=int, default=0)  #############fold
    parser.add_argument('-CUDA_VISIBLE_DEVICES', type=str, default='0')
    parser.add_argument('-lr', type=float, default=1e-3)

    opt = parser.parse_args()
    opt.cuda = not opt.no_cuda
    #opt.cuda = opt.no_cuda

    os.environ['CUDA_VISIBLE_DEVICES'] = opt.CUDA_VISIBLE_DEVICES

    os.makedirs(opt.save_model, exist_ok=True)

    #========= Processing Dataset =========#
    if (opt.data == ''):
        datamanager = DataManager(opt.train_src, opt.grained,
                                  opt.max_word_seq_len, opt.keep_case,
                                  opt.fold_num, opt.min_word_count,
                                  opt.save_data)
        data = datamanager.getdata()

    # #========= Loading Dataset =========#
    else:
        data = torch.load(opt.data)

    print('now seq len:', opt.max_word_seq_len)
    training_data, validation_data, testing_data = prepare_dataloaders(
        data, opt)

    #========= Preparing Model =========#

    print(opt)
    device = torch.device('cuda' if opt.cuda else 'cpu')
    nixae = Nixae(opt.max_word_seq_len,
                  brn=opt.brn,
                  label=opt.grained,
                  d_model=opt.d_model,
                  dropout=opt.dropout).to(device)

    learningrate = opt.lr

    optimizer = optim.Adam(filter(lambda x: x.requires_grad,
                                  nixae.parameters()),
                           lr=learningrate,
                           weight_decay=0.0003)
    scheduler = lr_scheduler.ReduceLROnPlateau(optimizer,
                                               'min',
                                               factor=0.1,
                                               patience=20,
                                               verbose=True,
                                               min_lr=1e-5)

    print(nixae)
    train(nixae, training_data, validation_data, testing_data, optimizer,
          device, opt, scheduler)
Example #13

def train():
    opt = parse_opts()
    print(opt)
    opt.arch = '{}-{}'.format(opt.model, opt.model_depth)
    torch.manual_seed(opt.manual_seed)

    print("Preprocessing train data ...")
    train_data = globals()['{}_test'.format(opt.dataset)](split=opt.split, train=1, opt=opt)
    print("Length of train data = ", len(train_data))

    print("Preprocessing validation data ...")
    val_data = globals()['{}_test'.format(opt.dataset)](split=opt.split, train=2, opt=opt)
    print("Length of validation data = ", len(val_data))

    if opt.modality=='RGB': opt.input_channels = 3
    elif opt.modality=='Flow': opt.input_channels = 2

    print("Preparing datatloaders ...")
    train_dataloader = DataLoader(train_data, batch_size = opt.batch_size, shuffle=True, num_workers = opt.n_workers, pin_memory = True, drop_last=True)
    val_dataloader   = DataLoader(val_data, batch_size = opt.batch_size, shuffle=True, num_workers = opt.n_workers, pin_memory = True, drop_last=True)
    print("Length of train datatloader = ",len(train_dataloader))
    print("Length of validation datatloader = ",len(val_dataloader))

    log_path = os.path.join(opt.result_path, opt.dataset)
    if not os.path.exists(log_path):
        os.makedirs(log_path)

    result_path = "{}/{}/".format(opt.result_path, opt.dataset)
    if not os.path.exists(result_path):
        os.makedirs(result_path)
    if opt.log == 1:
        epoch_logger = Logger_MARS(os.path.join(log_path, 'Fusion_{}_{}_train_batch{}_sample{}_clip{}_lr{}_nesterov{}_manualseed{}_model{}{}_ftbeginidx{}_alpha{}.log'
            .format(opt.dataset, opt.split, opt.batch_size, opt.sample_size, opt.sample_duration, opt.learning_rate, opt.nesterov, opt.manual_seed, opt.model, opt.model_depth, opt.ft_begin_index
                    , opt.MARS_alpha))
                        ,['epoch', 'loss', 'acc', 'lr'], opt.MARS_resume_path, opt.begin_epoch)
        val_logger   = Logger_MARS(os.path.join(log_path, 'Fusion_{}_{}_val_batch{}_sample{}_clip{}_lr{}_nesterov{}_manualseed{}_model{}{}_ftbeginidx{}_alpha{}.log'
                        .format(opt.dataset,opt.split,  opt.batch_size, opt.sample_size, opt.sample_duration, opt.learning_rate, opt.nesterov, opt.manual_seed, opt.model, opt.model_depth, opt.ft_begin_index,
                             opt.MARS_alpha))
                        ,['epoch', 'loss', 'acc'], opt.MARS_resume_path, opt.begin_epoch)

    if opt.nesterov: dampening = 0
    else: dampening = opt.dampening

    # define the model 
    print("Loading models... ", opt.model, opt.model_depth)
    model1, parameters1 = generate_model(opt)

    # if testing RGB+Flow streams change input channels
    opt.input_channels = 2
    model2, parameters2 = generate_model(opt)
    model_fusion = new_fusion_model(opt.n_finetune_classes)
    model_fusion = model_fusion.cuda()
    model_fusion = nn.DataParallel(model_fusion)

    if opt.resume_path1:
        print('Loading MARS model {}'.format(opt.resume_path1))
        checkpoint = torch.load(opt.resume_path1)
        assert opt.arch == checkpoint['arch']
        model1.load_state_dict(checkpoint['state_dict'])
    if opt.resume_path2:
        print('Loading Flow model {}'.format(opt.resume_path2))
        checkpoint = torch.load(opt.resume_path2)
        assert opt.arch == checkpoint['arch']
        model2.load_state_dict(checkpoint['state_dict'])

    if opt.resume_path3:
        print('Loading Fusion model {}'.format(opt.resume_path3))
        checkpoint = torch.load(opt.resume_path3)
        assert opt.arch == checkpoint['arch']
        model_fusion.load_state_dict(checkpoint['state_dict'])

    model1.eval()
    model2.eval()
    model_fusion.train()
    for p in model1.parameters():
        # if p.requires_grad:
        #     print("Need to freeze the parameters")
        p.requires_grad = False
    for p in model2.parameters():
        # if p.requires_grad:
        #     print("Need to freeze the parameters..")
        p.requires_grad = False

    print("Initializing the optimizer ...")

    print("lr = {} \t momentum = {} \t dampening = {} \t weight_decay = {}, \t nesterov = {}"
          .format(opt.learning_rate, opt.momentum, dampening, opt.weight_decay, opt.nesterov))
    print("LR patience = ", opt.lr_patience)

    optimizer = optim.SGD(
        model_fusion.parameters(),
        lr=opt.learning_rate,
        momentum=opt.momentum,
        dampening=dampening,
        weight_decay=opt.weight_decay,
        nesterov=opt.nesterov)
    scheduler = lr_scheduler.ReduceLROnPlateau(optimizer, 'min', patience=opt.lr_patience)

    criterion = nn.CrossEntropyLoss().cuda()
    print('run')
    for epoch in range(opt.begin_epoch, opt.n_epochs + 1):
        batch_time = AverageMeter()
        data_time  = AverageMeter()
        losses = AverageMeter()
        accuracies = AverageMeter()
        weights=AverageMeter()
        end_time = time.time()
        for i, (inputs, targets) in enumerate(train_dataloader):
            data_time.update(time.time() - end_time)
            inputs_MARS = inputs[:, 0:3, :, :, :]
            inputs_Flow = inputs[:, 3:, :, :, :]

            targets = targets.cuda(non_blocking=True)
            inputs_MARS = Variable(inputs_MARS)
            inputs_Flow = Variable(inputs_Flow)
            targets = Variable(targets)
            outputs_MARS = model1(inputs_MARS)
            outputs_Flow = model2(inputs_Flow)

            weight,outputs_var =model_fusion(outputs_MARS.detach(),outputs_Flow.detach())
            loss=criterion(outputs_var,targets)
            acc = calculate_accuracy(outputs_var, targets)

            losses.update(loss.data, inputs.size(0))
            accuracies.update(acc, inputs.size(0))
            weights.update(weight[0][0].data, inputs.size(0))
            optimizer.zero_grad()
            loss.backward()
            optimizer.step()
            batch_time.update(time.time() - end_time)
            end_time = time.time()

            print('Epoch: [{0}][{1}/{2}]\t'
                  'Time {batch_time.val:.3f} ({batch_time.avg:.3f})\t'
                  'Data {data_time.val:.3f} ({data_time.avg:.3f})\t'
                  'Loss {loss.val:.4f} ({loss.avg:.4f})\t'
                  'Acc {acc.val:.3f} ({acc.avg:.3f})\t'
                  'Weight {weight.val:.3f} ({weight.avg:.3f})'.format(
                epoch,
                i + 1,
                len(train_dataloader),
                batch_time=batch_time,
                data_time=data_time,
                loss=losses,
                acc=accuracies,
                weight=weights
            ))

        if opt.log == 1:
            epoch_logger.log({
                'epoch': epoch,
                'loss': losses.avg,
                'acc': accuracies.avg,
                'lr': optimizer.param_groups[0]['lr']
            })

        if epoch % opt.checkpoint == 0:
            if opt.pretrain_path != '':
                save_file_path = os.path.join(log_path,
                                              'Fusion_{}_{}_train_batch{}_sample{}_clip{}_lr{}_nesterov{}_manualseed{}_model{}{}_ftbeginidx{}_alpha{}_{}.pth'
                                              .format(opt.dataset, opt.split, opt.batch_size, opt.sample_size,
                                                      opt.sample_duration, opt.learning_rate, opt.nesterov,
                                                      opt.manual_seed, opt.model, opt.model_depth,
                                                      opt.ft_begin_index,
                                                      opt.MARS_alpha, epoch))
            else:
                save_file_path = os.path.join(log_path,
                                              'Fusion_{}_{}_train_batch{}_sample{}_clip{}_lr{}_nesterov{}_manualseed{}_model{}{}_ftbeginidx{}_alpha{}_{}.pth'
                                              .format(opt.dataset, opt.split, opt.batch_size, opt.sample_size,
                                                      opt.sample_duration, opt.learning_rate, opt.nesterov,
                                                      opt.manual_seed, opt.model, opt.model_depth,
                                                      opt.ft_begin_index,
                                                      opt.MARS_alpha, epoch))
            states = {
                'epoch': epoch + 1,
                'arch': opt.arch,
                'state_dict': model_fusion.state_dict(),
                'optimizer': optimizer.state_dict(),
            }
            torch.save(states, save_file_path)

        model_fusion.eval()
        batch_time = AverageMeter()
        data_time = AverageMeter()
        losses = AverageMeter()
        accuracies = AverageMeter()

        end_time = time.time()
        with torch.no_grad():
            for i, (inputs, targets) in enumerate(val_dataloader):

                data_time.update(time.time() - end_time)
                inputs_MARS = inputs[:, 0:3, :, :, :]
                inputs_Flow = inputs[:, 3:, :, :, :]

                targets = targets.cuda(non_blocking=True)
                inputs_MARS = Variable(inputs_MARS)
                inputs_Flow=Variable(inputs_Flow)
                targets = Variable(targets)

                outputs_MARS = model1(inputs_MARS)
                outputs_Flow = model2(inputs_Flow)
                _,outputs_var=model_fusion(outputs_MARS,outputs_Flow)
                loss = criterion(outputs_var, targets)
                acc = calculate_accuracy(outputs_var, targets)

                losses.update(loss.data, inputs.size(0))
                accuracies.update(acc, inputs.size(0))

                batch_time.update(time.time() - end_time)
                end_time = time.time()

                print('Val_Epoch: [{0}][{1}/{2}]\t'
                      'Time {batch_time.val:.3f} ({batch_time.avg:.3f})\t'
                      'Data {data_time.val:.3f} ({data_time.avg:.3f})\t'
                      'Loss {loss.val:.4f} ({loss.avg:.4f})\t'
                      'Acc {acc.val:.3f} ({acc.avg:.3f})'.format(
                    epoch,
                    i + 1,
                    len(val_dataloader),
                    batch_time=batch_time,
                    data_time=data_time,
                    loss=losses,
                    acc=accuracies))

        if opt.log == 1:
            val_logger.log({'epoch': epoch, 'loss': losses.avg, 'acc': accuracies.avg})
        scheduler.step(losses.avg)
Example #14
def train(paths_dict, model, transformation, criterion, device, save_path,
          opt):

    since = time.time()

    dataloaders = dict()
    # Define transforms for data normalization and augmentation
    for domain in ['source', 'target']:
        subjects_domain_train = ImagesDataset(
            paths_dict[domain]['training'],
            transform=transformation['training'][domain])

        subjects_domain_val = ImagesDataset(
            paths_dict[domain]['validation'],
            transform=transformation['validation'][domain])

        # Number of workers
        workers = 10

        batch_loader_domain_train = infinite_iterable(
            DataLoader(subjects_domain_train, batch_size=batch_size))
        batch_loader_domain_val = infinite_iterable(
            DataLoader(subjects_domain_val, batch_size=batch_size))

        dataloaders_domain = dict()
        dataloaders_domain['training'] = batch_loader_domain_train
        dataloaders_domain['validation'] = batch_loader_domain_val
        dataloaders[domain] = dataloaders_domain

    # Training parameters are saved
    df_path = os.path.join(opt.model_dir, 'log.csv')
    if os.path.isfile(df_path):  # If the training already started
        df = pd.read_csv(df_path, index_col=False)
        epoch = df.iloc[-1]['epoch']
        best_epoch = df.iloc[-1]['best_epoch']

        val_eval_criterion_MA = df.iloc[-1]['MA']
        best_val_eval_criterion_MA = df.iloc[-1]['best_MA']

        initial_lr = df.iloc[-1]['lr']

        model.load_state_dict(torch.load(save_path.format('best')))

    else:  # If training from scratch
        df = pd.DataFrame(
            columns=['epoch', 'best_epoch', 'MA', 'best_MA', 'lr'])
        val_eval_criterion_MA = None
        best_epoch = 0
        epoch = 0
        initial_lr = opt.learning_rate

    model = model.to(device)

    # Optimisation policy
    optimizer = torch.optim.Adam(model.parameters(),
                                 initial_lr,
                                 weight_decay=weight_decay,
                                 amsgrad=True)
    lr_s = lr_scheduler.ReduceLROnPlateau(optimizer,
                                          mode='min',
                                          factor=0.2,
                                          patience=patience_lr,
                                          verbose=True,
                                          threshold=1e-3,
                                          threshold_mode="abs")

    # Loop parameters
    continue_training = True
    ind_batch_train = np.arange(
        0, samples_per_volume * len(paths_dict['source']['training']),
        batch_size)
    ind_batch_val = np.arange(
        0,
        samples_per_volume * max(len(paths_dict['source']['validation']),
                                 len(paths_dict['target']['validation'])),
        batch_size)
    ind_batch = dict()
    ind_batch['training'] = ind_batch_train
    ind_batch['validation'] = ind_batch_val

    # Loss initialisation
    crf_l = CRFLoss(alpha=opt.alpha, beta=opt.beta, is_da=False)
    crf_l_da = CRFLoss(alpha=0, beta=opt.beta_da, is_da=True)

    while continue_training:
        epoch += 1
        print('-' * 10)
        print('Epoch {}'.format(epoch))
        for param_group in optimizer.param_groups:
            print("Current learning rate is: {}".format(param_group['lr']))

        # Each epoch has a training and validation phase
        for phase in ['training', 'validation']:
            print(phase)
            if phase == 'training':
                model.train()  # Set model to training mode
            else:
                model.eval()  # Set model to evaluate mode

            running_loss = 0.0
            running_loss_target = 0.0
            running_loss_source = 0.0
            epoch_samples = 0

            # Iterate over data
            for _ in tqdm(ind_batch[phase]):
                # Next source batch
                batch_source = next(dataloaders['source'][phase])
                labels_source = batch_source['label'][DATA].to(device).type(
                    torch.cuda.IntTensor)
                inputs_source = torch.cat(
                    [batch_source[k][DATA] for k in MODALITIES_SOURCE],
                    1).to(device)

                # Next target batch
                batch_target = next(dataloaders['target'][phase])
                scribbles_target = batch_target['scribble'][DATA].to(device)
                inputs_target = torch.cat(
                    [batch_target[k][DATA] for k in MODALITIES_TARGET],
                    1).to(device)

                # zero the parameter gradients
                optimizer.zero_grad()

                # track history if only in train
                with torch.set_grad_enabled(phase == 'training'):

                    outputs, features = model(
                        torch.cat([inputs_source, inputs_target], 0), 'source')

                    outputs_source, features_source = outputs[:batch_size,
                                                              ...], features[:
                                                                             batch_size,
                                                                             ...]
                    outputs_target, features_target = outputs[
                        batch_size:, ...], features[batch_size:, ...]

                    # Loss Source with full Labels
                    loss_source = criterion(outputs_source, labels_source)

                    # Loss Target on Scribbles
                    loss_target = scribble_loss(outputs_target,
                                                scribbles_target, criterion)

                    # Within scans regularisation (target only)
                    if (opt.beta > 0 or opt.alpha > 0) and phase == 'training':
                        reg_target = opt.weight_crf / nb_voxels[
                            'target'] * crf_l(inputs_target, outputs_target)
                    else:
                        reg_target = 0.0

                    # Pairwise scans regularisation (DA)
                    if opt.beta_da > 0 and phase == 'training' and opt.warmup > epoch:
                        index = torch.LongTensor(2).random_(
                            0, features_source.shape[1])

                        features_crf = [
                            features_source[:, index, ...],
                            features_target[:, index, ...]
                        ]
                        features_crf = torch.cat(features_crf,
                                                 0).detach().cuda()

                        prob = [
                            onehot(labels_source, outputs_source.shape),
                            torch.nn.Softmax(1)(outputs_target)
                        ]
                        prob = torch.cat(prob, 0)

                        reg_da = opt.weight_crf / nb_voxels[
                            'target'] * crf_l_da(I=features_crf, U=prob)
                    else:
                        reg_da = 0.0

                    if phase == 'training':
                        loss = loss_source + loss_target + reg_target + reg_da
                    else:
                        loss = loss_source + loss_target

                    # backward + optimize only if in training phase
                    if phase == 'training':
                        loss.backward()
                        optimizer.step()

                # statistics
                epoch_samples += 1
                running_loss += loss.item()
                running_loss_source += loss_source.item()
                running_loss_target += loss_target.item()

            epoch_loss = running_loss / epoch_samples
            epoch_loss_source = running_loss_source / epoch_samples
            epoch_loss_target = running_loss_target / epoch_samples

            print('{}  Loss Seg Source: {:.4f}'.format(phase,
                                                       epoch_loss_source))
            print('{}  Loss Seg Target: {:.4f}'.format(phase,
                                                       epoch_loss_target))

            if phase == 'validation':
                if val_eval_criterion_MA is None:  # first iteration
                    val_eval_criterion_MA = epoch_loss
                    best_val_eval_criterion_MA = val_eval_criterion_MA

                else:  #update criterion
                    val_eval_criterion_MA = val_eval_criterion_alpha * val_eval_criterion_MA + (
                        1 - val_eval_criterion_alpha) * epoch_loss

                df = df.append(
                    {
                        'epoch': epoch,
                        'best_epoch': best_epoch,
                        'MA': val_eval_criterion_MA,
                        'best_MA': best_val_eval_criterion_MA,
                        'lr': param_group['lr']
                    },
                    ignore_index=True)
                df.to_csv(df_path, index=False)

                lr_s.step(val_eval_criterion_MA)

                if val_eval_criterion_MA < best_val_eval_criterion_MA:
                    best_val_eval_criterion_MA = val_eval_criterion_MA
                    best_epoch = epoch
                    torch.save(model.state_dict(), save_path.format('best'))

                else:
                    if epoch - best_epoch > nb_patience:
                        continue_training = False

                if epoch == opt.warmup:
                    torch.save(model.state_dict(), save_path.format('warmup'))

    time_elapsed = time.time() - since
    print('Training completed in {:.0f}m {:.0f}s'.format(
        time_elapsed // 60, time_elapsed % 60))
    print('Best epoch is {}'.format(best_epoch))
Example #15

def train_reinforcement(grammar=True,
                        model=None,
                        EPOCHS=None,
                        BATCH_SIZE=None,
                        lr=2e-4,
                        main_dataset=None,
                        new_datasets=None,
                        plot_ignore_initial=0,
                        save_file=None,
                        plot_prefix='',
                        dashboard='main',
                        preload_weights=False):

    root_location = os.path.dirname(
        os.path.abspath(inspect.getfile(inspect.currentframe())))
    root_location = root_location + '/../'
    if save_file is not None:
        save_path = root_location + 'pretrained/' + save_file
    else:
        save_path = None
    molecules = True  # checking for validity only makes sense for molecules
    settings = get_settings(molecules=molecules, grammar=grammar)

    # TODO: separate settings for this?
    if EPOCHS is not None:
        settings['EPOCHS'] = EPOCHS
    if BATCH_SIZE is not None:
        settings['BATCH_SIZE'] = BATCH_SIZE

    if preload_weights:
        try:
            model.load(save_path)
        except:
            pass
    nice_params = filter(lambda p: p.requires_grad, model.parameters())
    optimizer = optim.Adam(nice_params, lr=lr)

    # # create the composite loaders
    # train_loader, valid_loader = train_valid_loaders(main_dataset,
    #                                                  valid_fraction=0.1,
    #                                                  batch_size=BATCH_SIZE,
    #                                                  pin_memory=use_gpu)
    train_l = []
    valid_l = []
    for ds in new_datasets:
        train_loader, valid_loader = SamplingWrapper(ds)\
                        .get_train_valid_loaders(BATCH_SIZE,
                                                 valid_batch_size = 1+int(BATCH_SIZE/5),
                            dataset_name=['actions','seq_len','valid','sample_seq_ind'],
                                                 window=1000)
        train_l.append(train_loader)
        valid_l.append(valid_loader)
    train_gen = CombinedLoader(train_l, num_batches=90)
    valid_gen = CombinedLoader(valid_l, num_batches=10)

    scheduler = lr_scheduler.ReduceLROnPlateau(optimizer,
                                               factor=0.2,
                                               patience=3,
                                               min_lr=0.0001,
                                               eps=1e-08)
    #scheduler = lr_scheduler.StepLR(optimizer, step_size=10, gamma=0.9)
    loss_obj = ReinforcementLoss()

    fitter = fit(train_gen=train_gen,
                 valid_gen=valid_gen,
                 model=model,
                 optimizer=optimizer,
                 scheduler=scheduler,
                 epochs=settings['EPOCHS'],
                 loss_fn=loss_obj,
                 save_path=save_path,
                 save_always=True,
                 dashboard_name=dashboard,
                 plot_ignore_initial=plot_ignore_initial,
                 plot_prefix=plot_prefix,
                 loss_display_cap=200)

    return fitter
Example #16
 def __init__(self,
              model,
              train_loader,
              val_loader,
              optimizer,
              log_dir="./cache/logs/",
              log_level=logging.INFO,
              checkpoint_dir="./cache/model_cache/",
              echo=False,
              device="cuda:0",
              use_tensorboard=False,
              use_amp=False,
              seed=12321,
              n_gpus=1,
              patience=20):
     super(BaseBot, self).__init__()
     self.criterion = torch.nn.CrossEntropyLoss()
     self.model = model
     self.train_loader = train_loader
     self.val_loader = val_loader
     self.patience = patience
     self.optimizer = optimizer
     self.lr = self.optimizer.param_groups[0]['lr']
     self.log_dir = log_dir
     self.log_level = log_level
     self.checkpoint_dir = checkpoint_dir
     self.checkpoint_path = os.path.join(self.checkpoint_dir,
                                         "checkpoint.pt")
     self.echo = echo
     self.device = device
     self.use_tensorboard = use_tensorboard
     self.use_amp = use_amp
     self.seed = seed
     self.n_gpus = n_gpus
     self.step = 0
     self.gradient_accumulation_steps = 1
     self.clip_grad = 0
     self.batch_dim = 0
     self.y_task = 2
     ###########################################################
     self.scheduler = lr_scheduler.ReduceLROnPlateau(self.optimizer,
                                                     factor=0.1,
                                                     patience=int(
                                                         self.patience / 2),
                                                     verbose=True)
     ###########################################################
     for path in [self.log_dir, self.checkpoint_dir]:
         if not os.path.exists(path) or not os.path.isdir(path):
             try:
                 os.makedirs(path)
             except:
                 print(f"make {path} failed!")
     ###########################################################
     if self.use_amp:
         try:
             from apex import amp
         except ImportError:
             raise ImportError(
                 "Please install apex from https://www.github.com/nvidia/apex to use fp16 training."
             )
         self.model, self.optimizer = amp.initialize(self.model,
                                                     self.optimizer,
                                                     opt_level="O1")
     if self.n_gpus > 1:
         self.model = torch.nn.DataParallel(self.model)
     self.model.to(self.device)
     ###########################################################
     # self.logger = Logger(
     #     self.name, str(self.log_dir), self.log_level,
     #     use_tensorboard=self.use_tensorboard, echo=self.echo)
     self.logger = logging.getLogger()
     self.logger.setLevel(logging.INFO)
     self.logger.info("SEED: %s", self.seed)
     ###########################################################
     self.count_model_parameters()
     ###########################################################
     self.set_seed(self.seed)
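
The scheduler created in this constructor would typically be stepped with the bot's validation metric after each evaluation; a minimal sketch of that pattern, where bot and evaluate are hypothetical names:

val_loss = evaluate(bot.model, bot.val_loader)   # hypothetical evaluation helper
bot.scheduler.step(val_loss)                     # cuts the LR by 10x after patience/2 stagnant evaluations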
Example #17
def train_network():
    print('')
    print('')
    # Start measuring time - to evaluate performance of the training function
    start = timeit.default_timer()

    # Set seeds
    set_seed(args)

    # Make folders if not yet exist
    try:
        os.makedirs('save')
    except FileExistsError:
        pass

    # Save relevant arguments from a and set hardcoded arguments
    lr = args.lr  # learning rate
    batch_size = args.batch_size  # Mini-batch size
    num_epochs = args.num_epochs  # Number of epochs to train the network
    seq_len = args.seq_len

    # Network architecture:
    rnn_name = args.rnn_name
    inputs_list = args.inputs_list
    outputs_list = args.outputs_list

    load_rnn = args.load_rnn  # If specified this is the name of pretrained RNN which should be loaded
    path_save = args.path_save

    # Create rnn instance and update lists of input, outputs and its name (if pretraind net loaded)
    net, rnn_name, inputs_list, outputs_list \
        = create_rnn_instance(rnn_name, inputs_list, outputs_list, load_rnn, path_save, device)

    # Create log for this RNN and determine its full name
    rnn_full_name = create_log_file(rnn_name, inputs_list, outputs_list, path_save)
    net.rnn_full_name = rnn_full_name

    ########################################################
    # Create Dataset
    ########################################################

    train_dfs, _ = load_data(args, args.train_file_name)

    normalization_info =  calculate_normalization_info(train_dfs, args.path_save, rnn_full_name)

    test_dfs, time_axes_dev = load_data(args, args.val_file_name)

    train_dfs_norm = normalize_df(train_dfs, normalization_info)
    test_dfs_norm = normalize_df(test_dfs, normalization_info)

    del train_dfs, test_dfs

    train_set = Dataset(train_dfs_norm, args)
    dev_set = Dataset(test_dfs_norm, args, time_axes=time_axes_dev)
    print('Number of samples in training set: {}'.format(train_set.number_of_samples))
    print('The training set sizes are: {}'.format(train_set.df_lengths))
    print('Number of samples in validation set: {}'.format(dev_set.number_of_samples))
    print('')


    plot_results(net=net, args=args, dataset=dev_set, seq_len=1024,
                 comment='This is the network at the beginning of the training',
                 inputs_list=inputs_list, outputs_list=outputs_list,
                 save=True,
                 closed_loop_enabled=True)

    # Create PyTorch dataloaders for train and dev set
    train_generator = data.DataLoader(dataset=train_set, batch_size=batch_size, shuffle=True,
                                      num_workers=args.num_workers)
    dev_generator = data.DataLoader(dataset=dev_set, batch_size=512, shuffle=False, num_workers=args.num_workers)

    # Print parameter count
    print_parameter_count(net)  # Seems not to function well

    # Select Optimizer
    optimizer = optim.Adam(net.parameters(), amsgrad=True, lr=lr)

    # TODO: Verify that the scheduler is working. Try tweaking the parameters of the scheduler below and try a cyclic LR scheduler

    # scheduler = lr_scheduler.CyclicLR(optimizer, base_lr=lr, max_lr=0.1)
    # scheduler = lr_scheduler.StepLR(optimizer, step_size=200, gamma=0.5)
    scheduler = lr_scheduler.ReduceLROnPlateau(optimizer, 'min', patience=1, verbose=True)

    # Select Loss Function
    criterion = nn.MSELoss()  # Mean square error loss function
    '''
    Init Tensorboard
    '''
    comment = f' batch_size={batch_size} lr={lr} seq_len={seq_len}'
    tb = SummaryWriter(comment=comment)
    ########################################################
    # Training
    ########################################################
    print("Starting training...")
    print('')
    time.sleep(0.001)

    # Create dictionary to store training history
    dict_history = {}
    dict_history['epoch'] = []
    dict_history['time'] = []
    dict_history['lr'] = []
    dict_history['train_loss'] = []
    dict_history['dev_loss'] = []
    dict_history['dev_gain'] = []
    dict_history['test_loss'] = []
    dev_gain = 1

    # The epoch_saved variable indicates the epoch of the last RNN model
    # that was good enough to be saved
    epoch_saved = -1
    for epoch in range(num_epochs):

        ###########################################################################################################
        # Training - Iterate batches
        ###########################################################################################################
        # Set RNN in training mode
        net = net.train()
        # Define variables accumulating training loss and counting training batches
        train_loss = 0
        train_batches = 0

        # Iterate training over available batches
        # tqdm() is just a function which displays the progress bar
        # Otherwise the line below is the same as "for batch, labels in train_generator:"
        for batch, labels in tqdm(train_generator):  # Iterate through batches

            # Reset the network (internal states of hidden layers and output history, not the weights!)
            net.reset()

            # Further modifying the input and output form to fit RNN requirements
            # If GPU available we send tensors to GPU (cuda)
            if torch.cuda.is_available():
                batch = batch.float().cuda().transpose(0, 1)
                labels = labels.float().cuda()
            else:
                batch = batch.float().transpose(0, 1)
                labels = labels.float()

            # # Reset memory of gradients
            # optimizer.zero_grad()

            # Warm-up (open loop prediction) to settle the internal state of RNN hidden layers
            net(rnn_input=batch[:args.warm_up_len, :, :])

            # Reset memory of gradients
            optimizer.zero_grad()

            # Forward propagation - These are the results from which we calculate the update to RNN weights
            # GRU Input size must be (seq_len, batch, input_size)
            net(rnn_input=batch[args.warm_up_len:, :, :])
            out = net.return_outputs_history()

            # Get loss
            loss = criterion(out[:, args.warm_up_len:, :],
                             labels[:, args.warm_up_len:, :])

            # Backward propagation
            loss.backward()

            # Gradient clipping - prevent gradient from exploding
            torch.nn.utils.clip_grad_norm_(net.parameters(), 100)

            # Update parameters
            optimizer.step()
            # scheduler.step()
            # Update variables for loss calculation
            batch_loss = loss.detach()
            train_loss += batch_loss  # Accumulate loss
            train_batches += 1  # Accumulate count so we can calculate mean later

        ###########################################################################################################
        # Validation - Iterate batches
        ###########################################################################################################

        # Set the network in evaluation mode
        net = net.eval()

        # Define variables accumulating evaluation loss and counting evaluation batches
        dev_loss = 0
        dev_batches = 0

        for (batch, labels) in tqdm(dev_generator):

            # Reset the network (internal states of hidden layers and output history, not the weights!)
            net.reset()

            # Further modifying the input and output form to fit RNN requirements
            # If GPU available we send tensors to GPU (cuda)
            if torch.cuda.is_available():
                batch = batch.float().cuda().transpose(0, 1)
                labels = labels.float().cuda()
            else:
                batch = batch.float().transpose(0, 1)
                labels = labels.float()

            # Warm-up (open loop prediction) to settle the internal state of RNN hidden layers
            net(rnn_input=batch)
            out = net.return_outputs_history()


            # Get loss
            # For evaluation we always calculate the loss over the whole maximal prediction period
            # This allows us to compare RNN models from different epochs
            loss = criterion(out[:, args.warm_up_len: args.seq_len],
                             labels[:, args.warm_up_len: args.seq_len])

            # Update variables for loss calculation
            batch_loss = loss.detach()
            dev_loss += batch_loss  # Accumulate loss
            dev_batches += 1  # Accumulate count so we can calculate mean later

        # Reset the network (internal states of hidden layers and output history, not the weights!)
        net.reset()
        # Get current learning rate
        # TODO (fixed, it does change now): previously the learning rate did not seem to change during training, or this was not the right way to read it.

        for param_group in optimizer.param_groups:
            lr_curr = param_group['lr']

        scheduler.step(dev_loss)
        '''
        Add data for tensorboard
        TODO : Add network graph and I/O to tensorboard
        '''
        # tb.add_graph(net)
        tb.add_scalar('Train Loss', train_loss / train_batches, epoch)
        tb.add_scalar('Dev Loss', dev_loss / dev_batches, epoch)

        # Add the first sample of batch to tensorboard. Prediction is represented by Dotted line
        # TODO: Concatenate such graphs. But they are not continuous
        # for i in range(labels.shape[2]):
        #     time_label = np.arange(0, labels.shape[1], 1)
        #     time_out = np.arange(0, out.shape[1], 1)
        #     true_data = labels[1, :, i]
        #     predicted_data = out[1, :, i]
        #     fig_tb = plt.figure(5)
        #     plt.plot(time_label, true_data.detach().cpu())
        #     plt.plot(time_out, predicted_data.detach().cpu(), linestyle='dashed')
        #     tb.add_figure(tag=str(args.outputs_list[i]), figure=fig_tb, global_step=epoch)

        for name, param in net.named_parameters():
            tb.add_histogram(name, param, epoch)
            tb.add_histogram(f'{name}.grad', param.grad, epoch)
        tb.close()

        # Write the summary information about the training for the just completed epoch to a dictionary

        dict_history['epoch'].append(epoch)
        dict_history['lr'].append(lr_curr)
        dict_history['train_loss'].append(
            train_loss.detach().cpu().numpy() / train_batches / (args.seq_len - args.warm_up_len))
        dict_history['dev_loss'].append(
            dev_loss.detach().cpu().numpy() / dev_batches / (args.seq_len - args.warm_up_len))

        # Get relative loss gain for network evaluation
        if epoch >= 1:
            dev_gain = (dict_history['dev_loss'][epoch - 1] - dict_history['dev_loss'][epoch]) / \
                       dict_history['dev_loss'][epoch - 1]
        dict_history['dev_gain'].append(dev_gain)

        # Print the summary information about the training for the just completed epoch
        print('\nEpoch: %3d of %3d | '
              'LR: %1.5f | '
              'Train-L: %6.4f | '
              'Val-L: %6.4f | '
              'Val-Gain: %3.2f |' % (dict_history['epoch'][epoch], num_epochs - 1,
                                     dict_history['lr'][epoch],
                                     dict_history['train_loss'][epoch],
                                     dict_history['dev_loss'][epoch],
                                     dict_history['dev_gain'][epoch] * 100))
        print('')

        # Save the best model with the lowest dev loss
        # Always save the model from epoch 0
        # TODO: this is a bug: you should only save the model from epoch 0 if there is no pretrained network
        if epoch == 0:
            min_dev_loss = dev_loss
        # If the current loss is smaller than or equal to the minimum loss achieved so far,
        # save the current RNN model and record its loss as the new minimum
        if dev_loss <= min_dev_loss:
            epoch_saved = epoch
            min_dev_loss = dev_loss
            torch.save(net.state_dict(), args.path_save + rnn_full_name + '.pt', _use_new_zipfile_serialization=False)
            print('>>> saving best model from epoch {}'.format(epoch))
            print('')

            plot_string = 'This is the network after {} training epoch'.format(epoch + 1)
            plot_results(net=net, args=args, dataset=dev_set, seq_len=1024,
                         comment=plot_string,
                         inputs_list=inputs_list, outputs_list=outputs_list, save=True,
                         closed_loop_enabled=True)
        else:
            print('>>> We keep model from epoch {}'.format(epoch_saved))
            print('')

        # Evaluate the performance of the current network
        # by checking its predictions on a randomly generated CartPole experiment
        # open_loop_prediction_experiment(net, a, val_file)

    # When finished the training print the final message
    print("Training Completed...                                               ")
    print(" ")

    # Calculate the total time it took to run the function
    stop = timeit.default_timer()
    total_time = stop - start

    # Return the total time it took to run the function
    return total_time
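
The loop above boils down to the usual ReduceLROnPlateau pattern: compute one scalar validation loss per epoch and hand it to scheduler.step(). Below is a minimal hedged sketch of just that pattern; `model`, `num_epochs`, `train_one_epoch` and `evaluate` are hypothetical placeholders, not functions from this example.

import torch.optim as optim
from torch.optim import lr_scheduler

# Hedged sketch; `model`, `num_epochs`, `train_one_epoch` and `evaluate` are placeholders.
optimizer = optim.Adam(model.parameters(), lr=1e-3)
scheduler = lr_scheduler.ReduceLROnPlateau(optimizer, 'min', patience=1, verbose=True)

for epoch in range(num_epochs):
    train_one_epoch(model, optimizer)
    val_loss = evaluate(model)            # one scalar validation loss per epoch
    scheduler.step(val_loss)              # reduce the LR when val_loss stops improving
    print('epoch {}: val_loss={:.4f}, lr={:.2e}'.format(
        epoch, val_loss, optimizer.param_groups[0]['lr']))
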
Exemplo n.º 18
0
def main():
    fold = 0
    # 4.1 mkdirs
    if not os.path.exists(config.submit):
        os.makedirs(config.submit)
    if not os.path.exists(config.weights + config.model_name + os.sep +str(fold)):
        os.makedirs(config.weights + config.model_name + os.sep +str(fold))
    if not os.path.exists(config.best_models):
        os.mkdir(config.best_models)
    if not os.path.exists("./logs/"):
        os.mkdir("./logs/")
    
    #4.2 get model
    model = MultiModalNet("se_resnext50_32x4d","dpn26",0.5)  #se_resnext101_32x4d

    #4.3 optim & criterion
    optimizer = optim.SGD(model.parameters(),lr = config.lr,momentum=0.9,weight_decay=1e-4)
    # criterion = FocalLoss(alpha=[1,1,1,1,1,1,1,1,1]).to(device)
    criterion = nn.CrossEntropyLoss().to(device)

    start_epoch = 0
    best_acc=0
    best_loss = np.inf
    best_f1 = 0
    best_results = [0,np.inf,0]
    val_metrics = [0,np.inf,0]
    resume = False
    if resume:
        checkpoint_path = r'./checkpoints/best_models/multimodal_fold_0_model_best_loss.pth.tar'
        if not os.path.isfile(checkpoint_path):
            raise RuntimeError("=> no checkpoint found at '{}'".format(checkpoint_path))
        checkpoint = torch.load(checkpoint_path,map_location=device)
        best_acc = checkpoint['best_acc']
        best_loss = checkpoint['best_loss']
        best_f1 = checkpoint['best_f1']
        start_epoch = checkpoint['epoch']

        #args.cuda
        # if torch.cuda.is_available():
        #     model.module.load_state_dict(checkpoint['state_dict'])
        # else:
        #     model.load_state_dict(checkpoint['state_dict'])
        model.load_state_dict(checkpoint['state_dict'])
        # ft = True
        # if ft:
        #     optimizer.load_state_dict(checkpoint['optimizer'])

    # # Clear start epoch if fine-tuning
    # if args.ft:
    #     args.start_epoch = 0

    muti_gpu = False
    if torch.cuda.device_count() > 1 and muti_gpu == True:
        model = nn.DataParallel(model)
    model.to(device)

    all_files = pd.read_csv("/data/BaiDuBigData19-URFC/data/train_oversampling.csv")
    test_files = pd.read_csv("/data/BaiDuBigData19-URFC/data/test.csv")
    train_data_list,val_data_list = train_test_split(all_files, test_size=0.1, random_state = 2050)

    # load dataset
    train_gen = MultiModalDataset(train_data_list,config.train_data,config.train_vis,mode="train")
    train_loader = DataLoader(train_gen,batch_size=config.batch_size,shuffle=True,pin_memory=True,num_workers=16) #num_worker is limited by shared memory in Docker!

    val_gen = MultiModalDataset(val_data_list,config.train_data,config.train_vis,augument=False,mode="train")
    val_loader = DataLoader(val_gen,batch_size=config.batch_size,shuffle=False,pin_memory=True,num_workers=16)

    test_gen = MultiModalDataset(test_files,config.test_data,config.test_vis,augument=False,mode="test")
    test_loader = DataLoader(test_gen,1,shuffle=False,pin_memory=True,num_workers=16)

    #scheduler = lr_scheduler.StepLR(optimizer,step_size=10,gamma=0.1)
    # If tracking best_acc use mode="max"; if tracking best_loss use mode="min"
    scheduler = lr_scheduler.ReduceLROnPlateau(optimizer)
    #n_batches = int(len(train_loader.dataset) // train_loader.batch_size)
    #scheduler = CosineAnnealingLR(optimizer, T_max=n_batches*2)
    start = timer()

    #train
    for epoch in range(0,config.epochs):#config.epochs
        # train
        train_metrics = train(train_loader,model,criterion,optimizer,epoch,val_metrics,best_results,start)
        # val
        val_metrics = evaluate(val_loader,model,criterion,epoch,train_metrics,best_results,start)
        # step the plateau scheduler on the validation loss (default mode is "min")
        scheduler.step(val_metrics[1])
        # check results
        is_best_acc = val_metrics[0] > best_results[0]
        best_results[0] = max(val_metrics[0],best_results[0])
        is_best_loss = val_metrics[1] < best_results[1]
        best_results[1] = min(val_metrics[1],best_results[1])
        is_best_f1 = val_metrics[2] > best_results[2]
        best_results[2] = max(val_metrics[2],best_results[2])
        # save model
        save_checkpoint({
                    "epoch":epoch + 1,
                    "model_name":config.model_name,
                    "state_dict":model.state_dict(),
                    "best_acc":best_results[0],
                    "best_loss":best_results[1],
                    "optimizer":optimizer.state_dict(),
                    "fold":fold,
                    "best_f1":best_results[2],
        },is_best_acc,is_best_loss,is_best_f1,fold)
        # print logs
        print('\r',end='',flush=True)
        log.write('%s  %5.1f %6.1f      |   %0.3f   %0.3f   %0.3f     |  %0.3f   %0.3f    %0.3f    |   %s  %s  %s | %s' % (\
                "best", epoch, epoch,
                train_metrics[0], train_metrics[1],train_metrics[2],
                val_metrics[0],val_metrics[1],val_metrics[2],
                str(best_results[0])[:8],str(best_results[1])[:8],str(best_results[2])[:8],
                time_to_str((timer() - start),'min'))
            )
        log.write("\n")
        time.sleep(0.01)

    best_model = torch.load("%s/%s_fold_%s_model_best_loss.pth.tar"%(config.best_models,config.model_name,str(fold)))
    model.load_state_dict(best_model["state_dict"])
    evaluation(test_loader,model,fold)
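
As the comment in this example points out, the mode argument has to match the monitored quantity: 'min' when stepping on a loss, 'max' when stepping on an accuracy or F1 score. A short hedged illustration follows; `optimizer`, `val_loss` and `val_acc` are placeholders.

from torch.optim import lr_scheduler

# Monitoring a loss: lower is better, so mode='min'
loss_scheduler = lr_scheduler.ReduceLROnPlateau(optimizer, mode='min', factor=0.1, patience=3)
loss_scheduler.step(val_loss)

# Monitoring an accuracy or F1 score: higher is better, so mode='max'
acc_scheduler = lr_scheduler.ReduceLROnPlateau(optimizer, mode='max', factor=0.1, patience=3)
acc_scheduler.step(val_acc)
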
Exemplo n.º 19
0
def do_test():

    device = "cuda:0"

    #set device

    sr_destination_base = 'Z:/SuperResolution/Labeled_Tiled_Datasets_Fix/BSDS200/Scale_3/'

    test_base_200 = 'Z:/SuperResolution/Labeled_Tiled_Datasets_Fix/BSDS100/Scale_3/'

    out_base = 'Z:/SuperResolution/Outputs/DRCNN_Baisc/'

    checkFolder(out_base)

    #training online for a whole day
    batch_size = 32
    epochs = 500
    momentum = 0.9
    decay = 0.0001

    workers = 4

    sr_dataset = ImageLabelDataset(sr_destination_base,
                                   transform=transforms.ToTensor(),
                                   resize=False)

    sr_dataloader = DataLoader(sr_dataset,
                               batch_size=batch_size,
                               shuffle=True,
                               num_workers=workers,
                               drop_last=True)

    test_dataset_200 = ImageLabelDataset(test_base_200,
                                         transform=transforms.ToTensor(),
                                         resize=False)

    test_dataloader_200 = DataLoader(test_dataset_200,
                                     batch_size=batch_size,
                                     shuffle=False,
                                     num_workers=workers,
                                     drop_last=True)

    model = Basic_DRC(n_recursions=8, n_channels=3)
    model = model.to(device)

    mse = nn.MSELoss()

    #SGD optimizer where each layer has their own weights
    opt = torch.optim.SGD(params=[
        {
            'params': model.parameters(),
            'lr': 0.01
        },
    ],
                          momentum=momentum,
                          weight_decay=decay)

    sched = lr_scheduler.ReduceLROnPlateau(opt,
                                           'min',
                                           factor=0.001,
                                           patience=5,
                                           min_lr=10e-6)

    avg_loss = 0
    avg_test_loss = 0

    train_loss_list = []
    test_loss_list = []
    test_psnr_list = []
    test_ssim_list = []

    for e in range(epochs):
        #print("Train Epoch: " + str(e))
        for i, sample in tqdm(enumerate(sr_dataloader, 0),
                              total=len(sr_dataloader)):
            model.train()

            x = sample['input'].to(device)
            y = sample['label'].to(device)

            opt.zero_grad()

            out = model(x)

            loss = mse(out, y).to(device)

            avg_loss += loss.item()

            loss.backward()
            opt.step()

        epoch_train_loss = avg_loss / len(sr_dataloader)
        train_loss_list.append(epoch_train_loss)
        print("Train Loss: " + str(epoch_train_loss))
        avg_loss = 0
        avg_psnr = 0
        avg_ssim = 0

        force_test = False
        if e % 10 == 0 or force_test:
            with torch.no_grad():
                print("Testing Epoch: " + str(e))
                for i, sample in tqdm(enumerate(test_dataloader_200, 0),
                                      total=len(test_dataloader_200)):
                    model.eval()

                    x = sample['input'].to(device)
                    y = sample['label'].to(device)

                    opt.zero_grad()

                    out = model(x)

                    test_loss = mse(out, y).to(device)
                    sched.step(test_loss)

                    if out.dtype != y.dtype:
                        print("Dtype mixmatch")
                    if out.shape != y.shape:
                        print("shape mismatch")

                    avg_test_loss += test_loss.item()

                    avg_ssim += ssim(y.permute(0, 2, 3,
                                               1).detach().cpu().numpy(),
                                     out.permute(0, 2, 3,
                                                 1).detach().cpu().numpy(),
                                     multichannel=True)
                    avg_psnr += psnr(y.detach().cpu().numpy(),
                                     out.detach().cpu().numpy())

                    if i == 50:
                        t_o = out[0].permute(1, 2, 0).detach().cpu().numpy()

                        t_y = y[0].permute(1, 2, 0).detach().cpu().numpy()
                        t_x = x[0].permute(1, 2, 0).detach().cpu().numpy()

                epoch_test_loss = avg_test_loss / len(test_dataloader_200)

                avg_ssim /= len(test_dataloader_200)
                avg_psnr /= len(test_dataloader_200)

                test_loss_list.append(epoch_test_loss)
                test_psnr_list.append(avg_psnr)
                test_ssim_list.append(avg_ssim)

                print("Test Loss: " + str(epoch_test_loss))
                print("Avg SSIM: " + str(avg_ssim))
                print("Avg PSNR: " + str(avg_psnr))

                avg_test_loss = 0

                fig, ax = plt.subplots(3)

                ax[0].imshow(t_y)
                ax[1].imshow(t_x)
                ax[2].imshow(t_o)

                nb_out = len(os.listdir(out_base))
                fig.savefig(out_base + str(nb_out) + '.png', dpi=800)

                fig_l, ax_l = plt.subplots(4)

                ax_l[0].plot(train_loss_list, color='blue')
                ax_l[0].set_title("Train Loss")

                ax_l[1].plot(test_loss_list, color='red')
                ax_l[1].set_title("Test Loss")

                ax_l[2].plot(test_psnr_list)
                ax_l[2].set_title("Test Avg PSNR")

                ax_l[3].plot(test_ssim_list)
                ax_l[3].set_title("Test Avg SSIM")

                fig_l.tight_layout()

                fig_l.savefig(out_base + "test_metrics" + '.png', dpi=800)
Exemplo n.º 20
0
def train(clean_dir, adv_dir, attack_type):
	'''
	clean_dir:
		(str) path of the root folder of all clean images
	adv_dir:
		(str) path to the root folder of all attacked images
	attack_type:
		(str) type of attack name
	'''
	# Ignore all warnings
	import warnings
	warnings.filterwarnings("ignore")

	# Set up model hyper-parameters
	z_size = 2048
	hidden_dim = 64
	drop_p = 0.5
	image_size = 224
	channel_num = 3
	is_res = True

	# Set up training hyper-parameters
	lr = 1e-3
	weight_decay = 1e-5
	batch_size = 64
	num_epochs = 50
	beta = 1
	visual_interval = 2
	best_loss = math.inf
	loss_record = {'train': {'total_loss': [], 'rec_loss':[], 'kl_loss':[]},
 				   'val':   {'total_loss': [], 'rec_loss':[], 'kl_loss':[]}}

	dataset = {x: ImageDataset(clean_dir, adv_dir, attack_type, x) for x in ['train', 'val']}
	dataset_sizes = {x: len(dataset[x]) for x in ['train', 'val']}
	print('Dataset size: train {}, val {}'.format(dataset_sizes['train'], dataset_sizes['val']))

	dataloaders = {'train': DataLoader(dataset['train'], batch_size=batch_size, shuffle=True,  num_workers=0),
                   'val'  : DataLoader(dataset['val'],   batch_size=batch_size, shuffle=False, num_workers=0)}

    # Initialize VAE model, optimizer and scheduler
	model = VAE(image_size, channel_num, hidden_dim, z_size, is_res, drop_p).to(device)
	optimizer = optim.Adam(model.parameters(), lr=lr, betas=(0.9, 0.999), weight_decay=weight_decay)
	scheduler = lr_scheduler.ReduceLROnPlateau(optimizer, factor=0.1, patience=5, threshold=1e-7)

    # Training
	print ('Start training on {}...'.format(device))
	since = time.time()

	counter = 0
	for epoch in range(num_epochs):
		print('\nEpoch {}/{}, lr: {}, wd: {}'.format(epoch + 1, num_epochs,
			  optimizer.param_groups[0]['lr'], weight_decay))
		print('-' * 30)

		# early stop counter
		if optimizer.param_groups[0]['lr'] < 1e-6:
			counter += 1
		if counter >= 5:
			break

		for phase in ['train', 'val']:
			if phase == 'train':
				model.train()
			else:
				model.eval()

            # Initialize running losses
			running_total_loss = 0.0
			running_rec_loss = 0.0
			running_kl_loss = 0.0

			for inputs, targets in tqdm(dataloaders[phase], desc='{} iterations'.format(phase), leave=False):
				inputs  = inputs.to(device)
				targets = targets.to(device)
            	# forward-prop
				with torch.set_grad_enabled(phase == 'train'):
					(mean, logvar), reconstructed = model(inputs)
					rec_loss = model.reconstruction_loss(reconstructed, targets)
					kl_loss = model.kl_divergence_loss(mean, logvar)
					total_loss = rec_loss + beta * kl_loss

                    # backward + optimize only if in training phase
					if phase == 'train':
						# zero the parameter gradients
						optimizer.zero_grad()
						# backward-prop
						total_loss.backward()
						optimizer.step()

				# compute loss for running loss
				running_kl_loss += kl_loss.item() * inputs.size(0)
				running_rec_loss += rec_loss.item() * inputs.size(0)
				running_total_loss += total_loss.item() * inputs.size(0)

			# Compute epoch loss
			epoch_kl_loss = running_kl_loss / dataset_sizes[phase]
			epoch_rec_loss = running_rec_loss / dataset_sizes[phase]
			epoch_total_loss = running_total_loss / dataset_sizes[phase]

			# Update loss records
			loss_record[phase]['total_loss'].append(epoch_total_loss)
			loss_record[phase]['rec_loss'].append(epoch_rec_loss)
			loss_record[phase]['kl_loss'].append(epoch_kl_loss)

			# Output training/val results
			print('{} Loss: total: {:.4f}, rec_loss: {:.4f}, kl_loss: {:.4f}'
				.format(phase, epoch_total_loss, epoch_rec_loss, epoch_kl_loss))

			# Save images
			if (epoch+1) % visual_interval == 0 and epoch > 0 and phase == 'val':
				rndIdx = random.randint(0, inputs.size(0)-1)
				print ('Save reconstructed images, random index={} in the last batch'.format(rndIdx))
				visualResults(inputs[rndIdx], reconstructed[rndIdx], targets[rndIdx], epoch+1)

			# Step optimizer scheduler
			if phase == 'val':
				scheduler.step(epoch_total_loss)

			# Copy best model
			if phase == 'val' and epoch_total_loss < best_loss:
				best_loss = epoch_total_loss
				best_model_wts = copy.deepcopy(model.state_dict())

	# End of training, return the best model
	time_elapsed = time.time() - since
	print('\nTraining complete in {:.0f}m {:.0f}s'.format(time_elapsed // 60, time_elapsed % 60))
	print('Best val loss: {}'.format(best_loss))

	# Save the best weights and loss_records
	save_path = './trained_weights/'
	if not os.path.isdir(save_path):
		os.mkdir(save_path)
	weight_fname = 'vae_{}_zdim{}_hdim{}_e{}_lr{}.torch'.format(attack_type, z_size, hidden_dim, num_epochs, str(lr).split('.')[-1])
	s_path = os.path.join(save_path, weight_fname)
	torch.save(best_model_wts, s_path)
	print ('Best weights saved to:', s_path)

	save_path = './trained_records/'
	if not os.path.isdir(save_path):
		os.mkdir(save_path)
	weight_fname = 'vae_{}_zdim{}_hdim{}_e{}_lr{}.pkl'.format(attack_type, z_size, hidden_dim, num_epochs, str(lr).split('.')[-1])
	s_path = os.path.join(save_path, weight_fname)
	torch.save(loss_record, s_path)
	print ('Training records saved to:', s_path)
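
The early-stop counter in this example keys off the learning rate itself: once ReduceLROnPlateau has driven it below a floor, further training is unlikely to help. A compact hedged sketch of that idea follows; `model`, `validate`, `num_epochs` and the optimizer are placeholders.

from torch.optim import lr_scheduler

# Hedged sketch: stop training once the plateau scheduler has collapsed the LR.
scheduler = lr_scheduler.ReduceLROnPlateau(optimizer, factor=0.1, patience=5, threshold=1e-7)
for epoch in range(num_epochs):
    val_loss = validate(model)                 # placeholder returning a scalar loss
    scheduler.step(val_loss)
    if optimizer.param_groups[0]['lr'] < 1e-6:
        print('Learning rate fell below 1e-6, stopping early')
        break
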
Exemplo n.º 21
0
def main(args):
    model_path = args.model_path
    save_dir = args.save_dir
    vec_dim = 128

    data_type = ['validation'
                 ] if args.phase == 'test' else ['train', 'validation']
    img_list, base_path, item_dict = read_data("DeepFashion2",
                                               bbox_gt=True,
                                               type_list=data_type)

    # model = ResNetbasedNet(vec_dim=vec_dim, max_pool=True, load_path=model_path, clf2_num=2, adv_eta=1e-4)
    model = ResNetbasedNet(vec_dim=vec_dim,
                           max_pool=True,
                           load_path=model_path,
                           clf2_num=2)

    domain_adap = args.domain_adap
    adv_train = args.adv_train
    is_cud = torch.cuda.is_available()
    device = torch.device("cuda" if is_cud else "cpu")
    if is_cud:
        if torch.cuda.device_count() > 1:
            model = nn.DataParallel(model)
        model.to(device)
    kwargs = {'num_workers': 8, 'pin_memory': True} if is_cud else {}

    if args.phase == 'train':
        train_dataset = DeepFashionDataset(img_list['train'],
                                           root=base_path,
                                           augment=True)
        train_batch_sampler = BalancedBatchSampler(train_dataset.labels,
                                                   train_dataset.source,
                                                   n_classes=64,
                                                   n_samples=4)
        online_train_loader = torch.utils.data.DataLoader(
            train_dataset, batch_sampler=train_batch_sampler, **kwargs)

        test_dataset = DeepFashionDataset(img_list['validation'],
                                          root=base_path)
        test_batch_sampler = BalancedBatchSampler(test_dataset.labels,
                                                  test_dataset.source,
                                                  n_classes=64,
                                                  n_samples=4)
        online_test_loader = torch.utils.data.DataLoader(
            test_dataset, batch_sampler=test_batch_sampler, **kwargs)

        margin = 0.2
        loss_fn = OnlineTripletLoss(margin,
                                    HardestNegativeTripletSelector(margin),
                                    domain_adap)
        # loss_fn = AllTripletLoss(margin)
        criterion = nn.CrossEntropyLoss()
        optimizer = optim.Adam(model.parameters(), lr=1e-5, weight_decay=5e-4)
        # scheduler = lr_scheduler.ReduceLROnPlateau(optimizer, mode="max", patience=4, threshold=0.001, cooldown=2, min_lr=1e-4 / (10 * 2),)
        scheduler = lr_scheduler.ReduceLROnPlateau(
            optimizer,
            mode="max",
            patience=4,
            threshold=1,
            cooldown=2,
            min_lr=1e-5 / (10 * 2),
        )
        n_epochs = 300
        log_interval = 200

        fit(online_train_loader,
            online_test_loader,
            model,
            loss_fn,
            optimizer,
            scheduler,
            n_epochs,
            is_cud,
            log_interval,
            save_dir,
            metrics=[AverageNonzeroTripletsMetric()],
            start_epoch=200,
            criterion=criterion,
            domain_adap=domain_adap,
            adv_train=adv_train)
        # fit(online_train_loader, online_test_loader, model, loss_fn, optimizer, scheduler, n_epochs, is_cud, log_interval,
        #     save_dir, metrics=[AverageNonzeroTripletsMetric()], start_epoch=0, criterion=criterion,
        #     adv_train=True, adv_epsilon=0.01, adv_alph=0.007, adv_iter=1)

    else:
        with torch.no_grad():
            model.eval()
            test_dataset = DeepFashionDataset(img_list['validation'],
                                              root=base_path)
            test_loader = torch.utils.data.DataLoader(test_dataset,
                                                      batch_size=256,
                                                      shuffle=False,
                                                      num_workers=4)
            embedding_mtx = torch.zeros((len(test_dataset), vec_dim))
            labels = np.zeros(len(test_dataset))
            top_k = 500
            idx_ = 0
            start_time = time.time()
            cf_mtx = np.zeros(
                4, dtype=float
            )  # predict_user_real_user / predict_user_real_shop / predict_shop_real_user / predict_shop_real_shop

            for idx, (data, target, _, source) in enumerate(test_loader):
                emb_vecs = model(data.cuda())
                embedding_mtx[idx_:idx_ + len(data)] = emb_vecs[0]
                predict = torch.argmax(emb_vecs[1], dim=1).cpu().numpy()
                real = source.cpu().numpy()
                cf_mtx[0] += np.sum((predict == 0) & (real == 0))
                cf_mtx[1] += np.sum((predict == 0) & (real == 1))
                cf_mtx[2] += np.sum((predict == 1) & (real == 0))
                cf_mtx[3] += np.sum((predict == 1) & (real == 1))
                labels[idx_:idx_ + len(data)] = np.asarray(target)
                idx_ += len(data)
                if idx % 20 == 0:
                    print('processing {}/{}... elapsed time {}s'.format(
                        idx + 1, len(test_loader),
                        time.time() - start_time))

        print('Total: {}, Domain Classification Acc: {:.5f}'.format(
            np.sum(cf_mtx), (cf_mtx[0] + cf_mtx[3]) / np.sum(cf_mtx)))
        print('Recall User Photo: {:.5f}'.format(cf_mtx[0] /
                                                 (cf_mtx[0] + cf_mtx[2])))
        print('Recall Shop Photo: {:.5f}'.format(cf_mtx[3] /
                                                 (cf_mtx[1] + cf_mtx[3])))

        np.save(os.path.join(save_dir, 'emb_mtx.npy'), embedding_mtx)
        with open(os.path.join(save_dir, 'file_info.txt'), 'w') as f:
            for i in range(len(test_dataset)):
                f.write('{},{},{},{}\n'.format(img_list['validation'][i][0],
                                               test_dataset[i][1],
                                               test_dataset[i][2],
                                               test_dataset[i][3]))
        print('save files!')

        distance_mtx = pdist(embedding_mtx)
        sorted_idx = torch.argsort(distance_mtx, dim=1).cpu().numpy()
        result_arr = np.zeros((sorted_idx.shape[0], top_k))
        for idx in range(sorted_idx.shape[0]):
            result_arr[idx] = sorted_idx[idx][sorted_idx[idx] != idx][:top_k]
            result_arr[idx] = labels[result_arr[idx].astype(int)] == labels[idx]
            if idx % 1000 == 0:
                print(idx)

        for k in [1, 5, 10, 20, 100, 200, 500]:
            topk_accuracy = np.sum(
                np.sum(result_arr[:, :k], axis=1) > 0) / result_arr.shape[0]
            print('Top-{} Accuracy: {:.5f}'.format(k, topk_accuracy))
Exemplo n.º 22
0
def main():
    args = parse_command()
    print(args)

    # if setting gpu id, the using single GPU
    if args.gpu:
        print('Single GPU Mode.')
        os.environ["CUDA_VISIBLE_DEVICES"] = args.gpu

    best_result = Result()
    best_result.set_to_worst()

    # set random seed
    torch.manual_seed(args.manual_seed)
    torch.cuda.manual_seed(args.manual_seed)
    np.random.seed(args.manual_seed)

    if torch.cuda.device_count() > 1:
        print("Let's use", torch.cuda.device_count(), "GPUs!")
        args.batch_size = args.batch_size * torch.cuda.device_count()
    else:
        print("Let's use GPU ", torch.cuda.current_device())

    train_loader, val_loader = create_loader(args)

    if args.resume:
        assert os.path.isfile(args.resume), \
            "=> no checkpoint found at '{}'".format(args.resume)
        print("=> loading checkpoint '{}'".format(args.resume))
        checkpoint = torch.load(args.resume)

        start_epoch = checkpoint['epoch'] + 1
        best_result = checkpoint['best_result']
        optimizer = checkpoint['optimizer']

        # solve 'out of memory'
        model = checkpoint['model']

        print("=> loaded checkpoint (epoch {})".format(checkpoint['epoch']))

        # clear memory
        del checkpoint
        # del model_dict
        torch.cuda.empty_cache()
    else:
        print("=> creating Model")
        model = get_models(args)
        print("=> model created.")
        start_epoch = 0

        # different modules have different learning rate
        train_params = [{
            'params': model.get_1x_lr_params(),
            'lr': args.lr
        }, {
            'params': model.get_10x_lr_params(),
            'lr': args.lr * 10
        }]

        optimizer = torch.optim.SGD(train_params,
                                    lr=args.lr,
                                    momentum=args.momentum,
                                    weight_decay=args.weight_decay)

        # You can use DataParallel() whether you use Multi-GPUs or not
        model = nn.DataParallel(model).cuda()

    # when training, use reduceLROnPlateau to reduce learning rate
    scheduler = lr_scheduler.ReduceLROnPlateau(optimizer,
                                               'min',
                                               patience=args.lr_patience)

    # loss function
    criterion = criteria._CrossEntropyLoss2d(size_average=True,
                                             batch_average=True)

    # create directory path
    output_directory = utils.get_output_directory(args)
    if not os.path.exists(output_directory):
        os.makedirs(output_directory)
    best_txt = os.path.join(output_directory, 'best.txt')
    config_txt = os.path.join(output_directory, 'config.txt')

    # write training parameters to config file
    if not os.path.exists(config_txt):
        with open(config_txt, 'w') as txtfile:
            args_ = vars(args)
            args_str = ''
            for k, v in args_.items():
                args_str = args_str + str(k) + ':' + str(v) + ',\t\n'
            txtfile.write(args_str)

    # create log
    log_path = os.path.join(
        output_directory, 'logs',
        datetime.now().strftime('%b%d_%H-%M-%S') + '_' + socket.gethostname())
    if os.path.isdir(log_path):
        shutil.rmtree(log_path)
    os.makedirs(log_path)
    logger = SummaryWriter(log_path)

    start_iter = len(train_loader) * start_epoch + 1
    max_iter = len(train_loader) * (args.epochs - start_epoch + 1) + 1
    iter_save = len(train_loader)
    # iter_save = 1

    # train
    model.train()
    if args.freeze:
        model.module.freeze_backbone_bn()
    output_directory = utils.get_output_directory(args, check=True)

    average_meter = AverageMeter()
    train_meter = AverageMeter()

    for it in tqdm(range(start_iter, max_iter + 1),
                   total=max_iter,
                   leave=False,
                   dynamic_ncols=True):
        optimizer.zero_grad()

        loss = 0

        data_time = 0
        gpu_time = 0

        for _ in range(args.iter_size):

            end = time.time()

            try:
                samples = next(loader_iter)
            except (NameError, StopIteration):
                loader_iter = iter(train_loader)
                samples = next(loader_iter)

            input = samples['image'].cuda()
            target = samples['label'].cuda()

            torch.cuda.synchronize()
            data_time_ = time.time()
            data_time += data_time_ - end

            with torch.autograd.detect_anomaly():
                preds = model(input)  # @wx: note the model outputs here

                # print('#train preds size:', len(preds))
                # print('#train preds[0] size:', preds[0].size())
                iter_loss = 0
                if args.msc:
                    for pred in preds:
                        # Resize labels for {100%, 75%, 50%, Max} logits
                        target_ = utils.resize_labels(target,
                                                      shape=(pred.size()[-2],
                                                             pred.size()[-1]))
                        # print('#train pred size:', pred.size())
                        iter_loss += criterion(pred, target_)
                else:
                    pred = preds
                    target_ = utils.resize_labels(target,
                                                  shape=(pred.size()[-2],
                                                         pred.size()[-1]))
                    # print('#train pred size:', pred.size())
                    # print('#train target size:', target.size())
                    iter_loss += criterion(pred, target_)

                # Backpropagate (just compute gradients wrt the loss)
                iter_loss /= args.iter_size
                iter_loss.backward()

                loss += float(iter_loss)

            gpu_time += time.time() - data_time_

        torch.cuda.synchronize()

        # Update weights with accumulated gradients
        optimizer.step()

        # measure accuracy and record loss
        result = Result()
        pred = F.softmax(pred, dim=1)

        result.evaluate(pred.data.cpu().numpy(),
                        target.data.cpu().numpy(),
                        n_class=21)
        average_meter.update(result, gpu_time, data_time, input.size(0))
        train_meter.update(result, gpu_time, data_time, input.size(0))

        if it % args.print_freq == 0:
            print('=> output: {}'.format(output_directory))
            print('Train Iter: [{0}/{1}]\t'
                  't_Data={data_time:.3f}({average.data_time:.3f}) '
                  't_GPU={gpu_time:.3f}({average.gpu_time:.3f})\n\t'
                  'Loss={Loss:.5f} '
                  'MeanAcc={result.mean_acc:.3f}({average.mean_acc:.3f}) '
                  'MIOU={result.mean_iou:.3f}({average.mean_iou:.3f}) '.format(
                      it,
                      max_iter,
                      data_time=data_time,
                      gpu_time=gpu_time,
                      Loss=loss,
                      result=result,
                      average=average_meter.average()))
            logger.add_scalar('Train/Loss', loss, it)
            logger.add_scalar('Train/mean_acc', result.mean_acc, it)
            logger.add_scalar('Train/mean_iou', result.mean_iou, it)

        if it % iter_save == 0:
            epoch = it // iter_save
            result, img_merge = validate(args,
                                         val_loader,
                                         model,
                                         epoch=epoch,
                                         logger=logger)

            # when the validation mean IoU stops improving, reduce the learning rate
            scheduler.step(result.mean_iou)

            # save the change of learning_rate
            for i, param_group in enumerate(optimizer.param_groups):
                old_lr = float(param_group['lr'])
                logger.add_scalar('Lr/lr_' + str(i), old_lr, it)

            # vis the change between train and test
            train_avg = train_meter.average()
            logger.add_scalars(
                'TrainVal/mean_acc', {
                    'train_mean_acc': train_avg.mean_acc,
                    'test_mean_acc': result.mean_acc
                }, epoch)
            logger.add_scalars(
                'TrainVal/mean_iou', {
                    'train_mean_iou': train_avg.mean_iou,
                    'test_mean_iou': result.mean_iou
                }, epoch)
            train_meter.reset()
            # remember the best result and save a checkpoint
            is_best = result.mean_iou < best_result.mean_iou
            if is_best:
                best_result = result
                with open(best_txt, 'w') as txtfile:
                    txtfile.write("epoch={}, mean_iou={:.3f}, mean_acc={:.3f}"
                                  "t_gpu={:.4f}".format(
                                      epoch, result.mean_iou, result.mean_acc,
                                      result.gpu_time))
                if img_merge is not None:
                    img_filename = output_directory + '/comparison_best.png'
                    utils.save_image(img_merge, img_filename)

            # save checkpoint for each epoch
            utils.save_checkpoint(
                {
                    'args': args,
                    'epoch': epoch,
                    'model': model,
                    'best_result': best_result,
                    'optimizer': optimizer,
                }, is_best, it, output_directory)

            # change to train mode
            model.train()
            if args.freeze:
                model.module.freeze_backbone_bn()

    logger.close()
Exemplo n.º 23
0
            nn.BatchNorm1d(64),
            nn.ReLU(),            
            nn.Linear(64, 10),
        )       
    def forward(self,x):
        out = self.layer(x)
        out = out.view(out.size(0), -1)  # flatten per sample, robust to a smaller final batch
        out = self.fc_layer(out)
        return out

# Machine learning training loop
device = torch.device("cuda:0" if torch.cuda.is_available() else "cpu")
model = CNN().to(device)
loss_func = nn.CrossEntropyLoss().to(device)
optimizer = optim.Adam(model.parameters(), lr=learning_rate)
scheduler = lr_scheduler.ReduceLROnPlateau(optimizer, threshold=0.1, patience=1, mode='min')
for i in range(1, num_epoch+1):
    for _,[image,label] in enumerate(train_loader):
        
        x = image.to(device)
        y_= label.to(device)

        optimizer.zero_grad()
        output = model.forward(x)
        loss = loss_func(output, y_)
        loss.backward()
        optimizer.step()
    scheduler.step(loss)      
    print('Epoch: {}, Loss: {}, LR: {}'.format(i, loss.item(), scheduler.optimizer.state_dict()['param_groups'][0]['lr']))

# Accuracy calculation
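
The print statement above reads the learning rate back through scheduler.optimizer.state_dict(); reading it straight from the optimizer's param_groups gives the same value with less ceremony. A small hedged illustration using the names from this example:

# Both expressions return the LR that ReduceLROnPlateau is currently using.
lr_from_state_dict = scheduler.optimizer.state_dict()['param_groups'][0]['lr']
lr_from_param_groups = optimizer.param_groups[0]['lr']
assert lr_from_state_dict == lr_from_param_groups
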
Exemplo n.º 24
0
    def __init__(self, argpath, mode='cn'):
        super(TRADE, self).__init__()

        self.init_session()

        self.crosswoz_root = os.path.dirname(os.path.abspath(__file__))
        self.download_model()
        self.download_data()

        directory = argpath.split("/")
        HDD = directory[2].split('HDD')[1].split('BSZ')[0]
        decoder = directory[1].split('-')[0]
        BSZ = int(args['batch']) if args['batch'] else int(directory[2].split('BSZ')[1].split('DR')[0])
        args["decoder"] = decoder


        train, dev, test, test_special, lang, SLOTS_LIST, gating_dict, max_word = prepare_data_seq_cn(False, 'dst',
                                                                                                      False,
                                                                                                      batch_size=4)
        self.slot_list = SLOTS_LIST
        self.test_set = test
        hidden_size = int(HDD)
        lang = lang
        path = argpath
        lr=0
        task = 'dst'
        dropout = 0
        slots = SLOTS_LIST
        gating_dict = gating_dict
        nb_train_vocab = max_word

        self.mode = mode
        self.name = "TRADE"
        self.task = task
        self.hidden_size = hidden_size
        self.lang = lang[0]
        self.mem_lang = lang[1]
        self.lr = lr
        self.dropout = dropout
        self.slots = slots[0]
        self.slot_temp = slots[2]
        self.gating_dict = gating_dict
        self.nb_gate = len(gating_dict)
        self.cross_entorpy = nn.CrossEntropyLoss()

        self.encoder = EncoderRNN(self.lang.n_words, hidden_size, self.dropout, mode=mode)
        self.decoder = Generator(self.lang, self.encoder.embedding, self.lang.n_words, hidden_size, self.dropout,
                                 self.slots, self.nb_gate)
        model_root = os.path.dirname(os.path.abspath(__file__))
        if path:
            path = os.path.join(model_root, path)
            # if USE_CUDA:
            #     print("MODEL {} LOADED".format(str(path)))
            #     trained_encoder = torch.load(str(path) + '/enc.th')
            #     trained_decoder = torch.load(str(path) + '/dec.th')
            # else:
            #     print("MODEL {} LOADED".format(str(path)))
            #     trained_encoder = torch.load(str(path) + '/enc.th', lambda storage, loc: storage)
            #     trained_decoder = torch.load(str(path) + '/dec.th', lambda storage, loc: storage)

            self.encoder.load_state_dict(torch.load(str(path) + '/enc.pr'))
            self.decoder.load_state_dict(torch.load(str(path) + '/dec.pr'))

        # Initialize optimizers and criterion
        self.optimizer = optim.Adam(self.parameters(), lr=lr)
        self.scheduler = lr_scheduler.ReduceLROnPlateau(self.optimizer, mode='max', factor=0.5, patience=1,
                                                        min_lr=0.0001, verbose=True)

        self.reset()
        if USE_CUDA:
            self.encoder.cuda()
            self.decoder.cuda()
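
The scheduler in this example caps the decay with min_lr=0.0001, so once the learning rate reaches that floor further plateau events leave it unchanged. A hedged illustration of how the knobs interact; the model behind the optimizer and the metric values are placeholders.

import torch.optim as optim
from torch.optim import lr_scheduler

# Hedged sketch: starting from lr=0.01, each detected plateau halves the LR until min_lr.
optimizer = optim.Adam(model.parameters(), lr=0.01)
scheduler = lr_scheduler.ReduceLROnPlateau(optimizer, mode='max', factor=0.5,
                                           patience=1, min_lr=0.0001, verbose=True)
for joint_accuracy in [0.30, 0.29, 0.28, 0.28, 0.28]:  # illustrative stagnating metric
    scheduler.step(joint_accuracy)  # after `patience` non-improving steps the LR is halved
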
Exemplo n.º 25
0
def train(args):
    device = torch.device(args.device)

    text_field = TextField()
    label_field = LabelField()
    train_dataset, valid_dataset, test_dataset = load_data(
        root='data', text_field=text_field, label_field=label_field)
    # Our model will be run in 'open-vocabulary' mode.
    text_field.build_vocab(train_dataset, valid_dataset, test_dataset)
    label_field.build_vocab(train_dataset)
    text_field.vocab.load_vectors(args.word_vector)

    # Trim training data to make them shorter than the max length
    trim_dataset(train_dataset, max_length=args.max_length)

    train_loader, valid_loader, test_loader = data.Iterator.splits(
        datasets=(train_dataset, valid_dataset, test_dataset),
        batch_size=args.batch_size,
        device=device)

    config_path = os.path.join(args.save_dir, 'config.yml')
    with open(config_path, 'r') as f:
        config = yaml.safe_load(f)
    model = QuoraModel(num_words=len(text_field.vocab),
                       num_classes=len(label_field.vocab),
                       **config['model'])
    model.word_embedding.weight.data.set_(text_field.vocab.vectors)
    model.word_embedding.weight.requires_grad = args.tune_word_embeddings
    print(model)
    model.to(device)

    num_params = sum(p.numel() for p in model.parameters())
    num_intrinsic_params = num_params - model.word_embedding.weight.numel()
    logger.info(f'* # of params: {num_params}')
    logger.info(f'  - Intrinsic: {num_intrinsic_params}')
    logger.info(f'  - Word embedding: {num_params - num_intrinsic_params}')

    trainable_params = [p for p in model.parameters() if p.requires_grad]
    optimizer = optim.Adam(trainable_params)
    assert not args.warm_restart or args.cosine_lr
    if args.cosine_lr:
        if not args.warm_restart:
            scheduler = lr_scheduler.CosineAnnealingLR(
                optimizer=optimizer, T_max=len(train_loader) * args.max_epoch)
        else:
            scheduler = lr_scheduler.CosineAnnealingLR(optimizer=optimizer,
                                                       T_max=len(train_loader))
    else:
        scheduler = lr_scheduler.ReduceLROnPlateau(optimizer=optimizer,
                                                   mode='max',
                                                   factor=0.5,
                                                   patience=10,
                                                   verbose=True)
    criterion = nn.CrossEntropyLoss()

    def run_iter(batch):
        pre_text, pre_length = batch.text1
        hyp_text, hyp_length = batch.text2
        label = batch.label
        logit = model(pre_inputs=pre_text,
                      pre_length=pre_length,
                      hyp_inputs=hyp_text,
                      hyp_length=hyp_length)
        clf_loss = criterion(input=logit, target=label)
        pred = logit.max(1)[1]
        accuracy = torch.eq(pred, label).float().mean()
        if model.training:
            if args.l2_weight > 0:
                l2_norm = sum(p.pow(2).sum() for p in trainable_params).sqrt()
            else:
                l2_norm = 0
            loss = clf_loss + args.l2_weight * l2_norm
            optimizer.zero_grad()
            loss.backward()
            nn.utils.clip_grad_norm_(trainable_params, max_norm=5)
            optimizer.step()
        return clf_loss.item(), accuracy.item()

    def validate(loader):
        model.eval()
        clf_loss_sum = accuracy_sum = 0
        num_valid_data = len(loader.dataset)
        with torch.no_grad():
            for valid_batch in loader:
                clf_loss, accuracy = run_iter(valid_batch)
                clf_loss_sum += clf_loss * valid_batch.batch_size
                accuracy_sum += accuracy * valid_batch.batch_size
        clf_loss = clf_loss_sum / num_valid_data
        accuracy = accuracy_sum / num_valid_data
        return clf_loss, accuracy

    train_summary_writer = SummaryWriter(
        os.path.join(args.save_dir, 'log', 'train'))
    valid_summary_writer = SummaryWriter(
        os.path.join(args.save_dir, 'log', 'valid'))

    validate_every = len(train_loader) // args.verbosity
    best_valid_accuracy = 0
    global_step = 0
    logger.info('Training starts!')
    for train_batch in train_loader:
        if not model.training:
            model.train()
        train_clf_loss, train_accuracy = run_iter(train_batch)
        global_step += 1
        if args.cosine_lr:
            if not args.warm_restart:
                scheduler.step()
            else:
                if scheduler.last_epoch == scheduler.T_max:
                    scheduler.T_max = scheduler.T_max * 2
                    scheduler.step(0)
                    logger.info('Warm-restarted the learning rate!')
                else:
                    scheduler.step()

        train_summary_writer.add_scalar(tag='clf_loss',
                                        scalar_value=train_clf_loss,
                                        global_step=global_step)
        train_summary_writer.add_scalar(tag='accuracy',
                                        scalar_value=train_accuracy,
                                        global_step=global_step)

        if global_step % validate_every == 0:
            progress = train_loader.iterations / len(train_loader)
            logger.info(f'* Epoch {progress:.2f}')
            logger.info(f'  - lr = {optimizer.param_groups[0]["lr"]:.6f}')
            logger.info(f'  - Validation starts')
            valid_clf_loss, valid_accuracy = validate(valid_loader)
            _, test_accuracy = validate(test_loader)
            if not args.cosine_lr:
                scheduler.step(valid_accuracy)
            valid_summary_writer.add_scalar(tag='clf_loss',
                                            scalar_value=valid_clf_loss,
                                            global_step=global_step)
            valid_summary_writer.add_scalar(tag='accuracy',
                                            scalar_value=valid_accuracy,
                                            global_step=global_step)
            valid_summary_writer.add_scalar(
                tag='lr',
                scalar_value=optimizer.param_groups[0]['lr'],
                global_step=global_step)
            logger.info(f'  - Valid clf loss: {valid_clf_loss:.5f}')
            logger.info(f'  - Valid accuracy: {valid_accuracy:.5f}')
            logger.info(f'  - Test accuracy: {test_accuracy:.5f}')
            if valid_accuracy > best_valid_accuracy:
                best_valid_accuracy = valid_accuracy
                model_filename = (f'best-{progress:.2f}'
                                  f'-{valid_clf_loss:.5f}'
                                  f'-{valid_accuracy:.5f}.pt')
                model_path = os.path.join(args.save_dir, model_filename)
                torch.save(model.state_dict(), model_path)
                logger.info(f'  - Saved the new best model to: {model_path}')
            elif args.save_every_epoch and global_step % (validate_every *
                                                          10) == 0:
                model_filename = (f'model-{progress:.2f}'
                                  f'-{valid_clf_loss:.5f}'
                                  f'-{valid_accuracy:.5f}.pt')
                model_path = os.path.join(args.save_dir, model_filename)
                torch.save(model.state_dict(), model_path)
                logger.info(f'  - Saved the new model to: {model_path}')

        if train_loader.epoch > args.max_epoch:
            break
Exemplo n.º 26
0
def main():
    fold = 0
    # 4.1 mkdirs
    if not os.path.exists(config.submit):
        os.makedirs(config.submit)
    if not os.path.exists(config.weights + config.model_name + os.sep +
                          str(fold)):
        os.makedirs(config.weights + config.model_name + os.sep + str(fold))
    if not os.path.exists(config.best_models):
        os.mkdir(config.best_models)
    if not os.path.exists("./logs/"):
        os.mkdir("./logs/")

    #4.2 get model
    # model=MultiModalNet("se_resnext101_32x4d","dpn107",0.5)
    model = MultiModalNet("se_resnext50_32x4d", "dpn26", 0.5)

    #4.3 optim & criterion
    #optimizer = optim.SGD(model.parameters(),lr = config.lr,momentum=0.9,weight_decay=1e-4)
    criterion = nn.CrossEntropyLoss().to(device)  # multi-task classification, so cross-entropy is chosen

    # optimizer = optim.SGD([{'params': model.base.parameters()},
    #                        {'params': model.classifier.parameters(), 'lr': config.lr*0.1}], lr=1e-5,momentum=0.9,weight_decay=1e-4)
    #betas = (0.9,0.999), eps = 1e-08,
    optimizer = optim.Adam(
        model.parameters(),
        lr=config.lr,
        betas=(0.9, 0.999),
        weight_decay=1e-4)  #betas = (0.9,0.999), eps = 1e-08,

    # class SGD(Optimizer):
    #     def __init__(self, params, lr=required, momentum=0, dampening=0, weight_decay1=0, weight_decay2=0, nesterov=False):
    #         defaults = dict(lr=lr, momentum=momentum, dampening=dampening,
    #                         weight_decay1=weight_decay1, weight_decay2=weight_decay2, nesterov=nesterov)
    #         if nesterov and (momentum <= 0 or dampening != 0):
    #             raise ValueError("Nesterov momentum requires a momentum and zero dampening")
    #         super(SGD, self).__init__(params, defaults)

    #     def __setstate__(self, state):
    #         super(SGD, self).__setstate__(state)
    #         for group in self.param_groups:
    #             group.setdefault('nesterov', False)

    #     def step(self, closure=None):
    #         """Performs a single optimization step. Arguments: closure (callable, optional): A closure that reevaluates the model and returns the loss. """
    #         loss = None
    #         if closure is not None:
    #             loss = closure()

    #         for group in self.param_groups:
    #             weight_decay1 = group['weight_decay1']
    #             weight_decay2 = group['weight_decay2']
    #             momentum = group['momentum']
    #             dampening = group['dampening']
    #             nesterov = group['nesterov']

    #             for p in group['params']:
    #                 if p.grad is None:
    #                     continue
    #                 d_p = p.grad.data
    #                 if weight_decay1 != 0:
    #                     d_p.add_(weight_decay1, torch.sign(p.data))
    #                 if weight_decay2 != 0:
    #                     d_p.add_(weight_decay2, p.data)
    #                 if momentum != 0:
    #                     param_state = self.state[p]
    #                     if 'momentum_buffer' not in param_state:
    #                         buf = param_state['momentum_buffer'] = torch.zeros_like(p.data)
    #                         buf.mul_(momentum).add_(d_p)
    #                     else:
    #                         buf = param_state['momentum_buffer']
    #                         buf.mul_(momentum).add_(1 - dampening, d_p)
    #                     if nesterov:
    #                         d_p = d_p.add(momentum, buf)
    #                     else:
    #                         d_p = buf

    #                 p.data.add_(-group['lr'], d_p)

    #         return loss

    start_epoch = 0
    best_acc = 0
    best_loss = np.inf
    best_f1 = 0
    best_results = [0, np.inf, 0]
    val_metrics = [0, np.inf, 0]
    resume = False
    if resume:
        checkpoint = torch.load(
            r'./checkpoints/best_models/seresnext101_dpn107_defrog_multimodal_fold_0_model_best_loss.pth.tar'
        )
        best_acc = checkpoint['best_acc']
        best_loss = checkpoint['best_loss']
        best_f1 = checkpoint['best_f1']
        start_epoch = checkpoint['epoch']

    if torch.cuda.device_count() > 1:
        model = nn.DataParallel(model)
    model.to(device)

    all_files = pd.read_csv("./train.csv")
    test_files = pd.read_csv("./test.csv")
    train_data_list, val_data_list = train_test_split(all_files,
                                                      test_size=0.1,
                                                      random_state=2050)

    # load dataset
    train_gen = MultiModalDataset(train_data_list,
                                  config.train_data,
                                  config.train_vis,
                                  mode="train")
    train_loader = DataLoader(
        train_gen,
        batch_size=config.batch_size,
        shuffle=True,
        pin_memory=True,
        num_workers=1)  #num_worker is limited by shared memory in Docker!

    val_gen = MultiModalDataset(val_data_list,
                                config.train_data,
                                config.train_vis,
                                augument=False,
                                mode="train")
    val_loader = DataLoader(val_gen,
                            batch_size=config.batch_size,
                            shuffle=False,
                            pin_memory=True,
                            num_workers=1)

    test_gen = MultiModalDataset(test_files,
                                 config.test_data,
                                 config.test_vis,
                                 augument=False,
                                 mode="test")
    test_loader = DataLoader(test_gen,
                             1,
                             shuffle=False,
                             pin_memory=True,
                             num_workers=1)

    #scheduler = lr_scheduler.StepLR(optimizer,step_size=10,gamma=0.1,last_epoch = -1)
    scheduler = lr_scheduler.ReduceLROnPlateau(optimizer)
    #n_batches = int(len(train_loader.dataset) // train_loader.batch_size)
    #scheduler = CosineAnnealingLR(optimizer, T_max=n_batches*2)
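    # Note: ReduceLROnPlateau is created with its defaults above (mode='min',
    # factor=0.1, patience=10, threshold=1e-4), so it expects to be stepped
    # once per epoch with a quantity that should decrease, e.g. the
    # validation loss val_metrics[1] computed below.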
    start = timer()

    #train
    for epoch in range(0, config.epochs):
        # train
        train_metrics = train(train_loader, model, criterion, optimizer, epoch,
                              val_metrics, best_results, start)
        # val
        val_metrics = evaluate(val_loader, model, criterion, epoch,
                               train_metrics, best_results, start)
        # step the plateau scheduler on the monitored validation loss,
        # not the epoch index
        scheduler.step(val_metrics[1])
        # check results
        is_best_acc = val_metrics[0] > best_results[0]
        best_results[0] = max(val_metrics[0], best_results[0])
        is_best_loss = val_metrics[1] < best_results[1]
        best_results[1] = min(val_metrics[1], best_results[1])
        is_best_f1 = val_metrics[2] > best_results[2]
        best_results[2] = max(val_metrics[2], best_results[2])
        # save model
        save_checkpoint(
            {
                "epoch": epoch + 1,
                "model_name": config.model_name,
                "state_dict": model.state_dict(),
                "best_acc": best_results[0],
                "best_loss": best_results[1],
                "optimizer": optimizer.state_dict(),
                "fold": fold,
                "best_f1": best_results[2],
            }, is_best_acc, is_best_loss, is_best_f1, fold)
        # print logs
        print('\r', end='', flush=True)
        log.write('%s  %5.1f %6.1f      |   %0.3f   %0.3f   %0.3f     |  %0.3f   %0.3f    %0.3f    |   %s  %s  %s | %s' % (\
                "best", epoch, epoch,
                train_metrics[0], train_metrics[1],train_metrics[2],
                val_metrics[0],val_metrics[1],val_metrics[2],
                str(best_results[0])[:8],str(best_results[1])[:8],str(best_results[2])[:8],
                time_to_str((timer() - start),'min'))
            )
        log.write("\n")
        time.sleep(0.01)

    best_model = torch.load("%s/%s_fold_%s_model_best_loss.pth.tar" %
                            (config.best_models, config.model_name, str(fold)))
    model.load_state_dict(best_model["state_dict"])
    test(test_loader, model, fold)
Exemplo n.º 27
0
# ## Load a model

# In[19]:

model_conv = models.inception_v3(pretrained=True)
model_conv.aux_logits = False
num_ftrs = model_conv.fc.in_features
model_conv.fc = nn.Linear(num_ftrs, 3)
model_conv = model_conv.to(device)
criterion = nn.CrossEntropyLoss()

# In[20]:

optimizer_ft = optim.Adam(model_conv.parameters(), lr=0.001)
exp_lr_scheduler = lr_scheduler.ReduceLROnPlateau(optimizer_ft,
                                                  patience=5,
                                                  verbose=True)

# ## Run the model

# In[21]:

model_conv, train_loss, train_acc, val_loss, val_acc, true_train_labels, predicted_train_labels = train_model(
    model_conv, criterion, optimizer_ft, exp_lr_scheduler, num_epochs=10)

# In[24]:

plt.hist(train_labels['level3'])

# In[25]:
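train_model is defined outside this excerpt; the sketch below shows how such a loop would typically drive the ReduceLROnPlateau scheduler, assuming it monitors the epoch validation loss (val_loader, device and the single return value are placeholder assumptions, not the actual interface used above):

def train_model_sketch(model, criterion, optimizer, scheduler, num_epochs=10):
    for epoch in range(num_epochs):
        model.train()
        # ... one pass over the training data, optimizer.step() per batch ...
        model.eval()
        val_loss = 0.0
        with torch.no_grad():
            for inputs, labels in val_loader:  # placeholder validation loader
                outputs = model(inputs.to(device))
                val_loss += criterion(outputs, labels.to(device)).item()
        val_loss /= len(val_loader)
        scheduler.step(val_loss)  # plateau scheduler reacts to the epoch metric
    return model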
Exemplo n.º 28
0
def main(args):
    source_train_set = custom_dataset(args.train_data_path, args.train_gt_path)
    target_train_set = custom_dataset(args.target_data_path,
                                      args.target_gt_path)
    valid_train_set = valid_dataset(args.val_data_path, args.val_gt_path)

    source_train_loader = data.DataLoader(source_train_set,
                                          batch_size=args.batch_size,
                                          shuffle=True,
                                          num_workers=args.num_workers,
                                          drop_last=True)
    target_train_loader = data.DataLoader(target_train_set,
                                          batch_size=args.batch_size,
                                          shuffle=True,
                                          num_workers=args.num_workers,
                                          drop_last=True)
    valid_loader = data.DataLoader(valid_train_set,
                                   batch_size=args.batch_size,
                                   shuffle=False,
                                   num_workers=args.num_workers,
                                   drop_last=False)

    criterion = Loss().to(device)
    # domain loss
    loss_domain = torch.nn.CrossEntropyLoss().to(device)

    best_loss = 1000
    best_num = 0

    model = EAST()
    if args.pretrained_model_path:
        model.load_state_dict(torch.load(args.pretrained_model_path))

    # resume
    if args.resume:
        checkpoint = torch.load(args.resume)
        model.load_state_dict(checkpoint['state_dict'])
        best_loss = checkpoint['best_loss']
        current_epoch_num = checkpoint['epoch']

    data_parallel = False
    if torch.cuda.device_count() > 1:
        model = nn.DataParallel(model)
        data_parallel = True

    model.to(device)

    total_epoch = args.epochs
    optimizer = torch.optim.Adam(model.parameters(), lr=args.lr)
    # scheduler = lr_scheduler.MultiStepLR(optimizer, milestones=[total_epoch // 3, total_epoch * 2 // 3], gamma=0.1)
    scheduler = lr_scheduler.ReduceLROnPlateau(optimizer,
                                               mode='min',
                                               factor=0.1,
                                               patience=6,
                                               threshold=args.lr / 100)
    # keep the epoch restored from the checkpoint when resuming
    if not args.resume:
        current_epoch_num = 0

    # resume: restore the scheduler state from the checkpoint loaded above
    if args.resume:
        scheduler.load_state_dict(checkpoint['scheduler'])

    for epoch in range(current_epoch_num, total_epoch):
        each_epoch_start = time.time()
        # scheduler.step(epoch)
        # add lr in tensorboardX
        writer.add_scalar('epoch/lr', get_learning_rate(optimizer), epoch)

        train(source_train_loader, target_train_loader, model, criterion,
              loss_domain, optimizer, epoch)

        val_loss = eval(model, valid_loader, criterion, loss_domain, epoch)
        scheduler.step(val_loss)

        if val_loss < best_loss:
            best_num = epoch + 1
            best_loss = val_loss
            best_model_wts = copy.deepcopy(model.module.state_dict(
            ) if data_parallel else model.state_dict())
            # save best model

            torch.save(
                {
                    'epoch': epoch + 1,
                    'state_dict': best_model_wts,
                    'best_loss': best_loss,
                    'scheduler': scheduler.state_dict(),
                }, os.path.join(save_folder, "model_epoch_best.pth"))

            log.write('best model num:{}, best loss is {:.8f}'.format(
                best_num, best_loss))
            log.write('\n')

        if (epoch + 1) % int(args.save_interval) == 0:
            state_dict = model.module.state_dict(
            ) if data_parallel else model.state_dict()
            torch.save(
                {
                    'epoch': epoch + 1,
                    'state_dict': state_dict,
                    'best_loss': best_loss,
                    'scheduler': scheduler.state_dict(),
                },
                os.path.join(save_folder,
                             'model_epoch_{}.pth'.format(epoch + 1)))
            log.write('save model')
            log.write('\n')

        log.write('=' * 50)
        log.write('\n')
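get_learning_rate is not shown in this excerpt; presumably it just reads the current learning rate back from the optimizer, roughly:

def get_learning_rate(optimizer):
    # report the LR of the first parameter group (a single group is used here)
    return optimizer.param_groups[0]['lr']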
def main_worker():
    seed = 1
    random.seed(seed)
    np.random.seed(seed)
    torch.manual_seed(seed)

    opt = parse_opts()
    train_data = get_training_data(cfg)
    val_data = get_validation_data(cfg)
    train_loader = DataLoader(train_data,
                              num_workers=opt.num_workers,
                              collate_fn=collater,
                              batch_size=opt.batch_size,
                              shuffle=True)
    val_loader = DataLoader(val_data,
                            num_workers=opt.num_workers,
                            collate_fn=collater,
                            batch_size=opt.batch_size,
                            shuffle=True)

    print(f"Training dataset size : {len(train_loader.dataset)}")
    print(f"Validation dataset size : {len(val_loader.dataset)}")

    dataiterator = iter(train_loader)

    faster_rcnn = FasterRCNN()

    # if torch.cuda.device_count() > 1 and opt.multi_gpu :
    #     print("Let's use", torch.cuda.device_count(), "GPUs!")
    #     faster_rcnn = nn.DataParallel(faster_rcnn)

    # loading model from a ckpt
    if opt.weight_path:
        load_from_ckpt(opt, faster_rcnn)
    faster_rcnn.to(cfg.DEVICE)

    if opt.lr is not None:
        cfg.TRAIN.LEARNING_RATE = opt.lr
    lr = cfg.TRAIN.LEARNING_RATE
    print(f"Learning rate : {lr}")

    if opt.weight_decay is not None:
        cfg.TRAIN.WEIGHT_DECAY = opt.weight_decay
    print(f"Weight Decay : {cfg.TRAIN.WEIGHT_DECAY}")

    ### Optimizer ###
    # record backbone params, i.e., conv_body and box_head params
    backbone_bias_params = []
    backbone_bias_param_names = []
    prd_branch_bias_params = []
    prd_branch_bias_param_names = []
    backbone_nonbias_params = []
    backbone_nonbias_param_names = []
    prd_branch_nonbias_params = []
    prd_branch_nonbias_param_names = []
    for key, value in dict(faster_rcnn.named_parameters()).items():
        if value.requires_grad:
            if 'fpn' in key or 'box_head' in key or 'box_predictor' in key or 'rpn' in key:
                if 'bias' in key:
                    backbone_bias_params.append(value)
                    backbone_bias_param_names.append(key)
                else:
                    backbone_nonbias_params.append(value)
                    backbone_nonbias_param_names.append(key)
            else:
                if 'bias' in key:
                    prd_branch_bias_params.append(value)
                    prd_branch_bias_param_names.append(key)
                else:
                    prd_branch_nonbias_params.append(value)
                    prd_branch_nonbias_param_names.append(key)
    params = [
        {
            'params': backbone_nonbias_params,
            'lr': cfg.TRAIN.LEARNING_RATE,
            'weight_decay': cfg.TRAIN.WEIGHT_DECAY
        },
        {
            'params': backbone_bias_params,
            'lr': cfg.TRAIN.LEARNING_RATE * (cfg.TRAIN.DOUBLE_BIAS + 1),
            'weight_decay':
            cfg.TRAIN.WEIGHT_DECAY if cfg.TRAIN.BIAS_DECAY else 0
        },
        {
            'params': prd_branch_nonbias_params,
            'lr': cfg.TRAIN.LEARNING_RATE,
            'weight_decay': cfg.TRAIN.WEIGHT_DECAY
        },
        {
            'params': prd_branch_bias_params,
            'lr': cfg.TRAIN.LEARNING_RATE * (cfg.TRAIN.DOUBLE_BIAS + 1),
            'weight_decay':
            cfg.TRAIN.WEIGHT_DECAY if cfg.TRAIN.BIAS_DECAY else 0
        },
    ]

    if cfg.TRAIN.TYPE == "ADAM":
        optimizer = torch.optim.Adam(params)

    elif cfg.TRAIN.TYPE == "SGD":
        optimizer = torch.optim.SGD(params, momentum=cfg.TRAIN.MOMENTUM)

    # scheduler
    if opt.scheduler == "plateau":
        scheduler = lr_scheduler.ReduceLROnPlateau(optimizer,
                                                   'min',
                                                   patience=5)
    elif opt.scheduler == "multi_step":
        scheduler = lr_scheduler.MultiStepLR(optimizer,
                                             milestones=[83631, 111508])
    elif opt.scheduler == "step_lr":
        scheduler = lr_scheduler.StepLR(optimizer,
                                        step_size=5,
                                        gamma=0.1,
                                        last_epoch=-1)
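    # Note: the "plateau" scheduler above is stepped later with the validation
    # loss (scheduler.step(val_losses['total_loss']) every 2500 iterations),
    # while "multi_step" and "step_lr" are stepped once per training iteration,
    # so their milestones/step_size are measured in iterations, not epochs.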

    if opt.weight_path:
        opt.begin_iter = load_train_utils(opt, optimizer, scheduler)

    # lr of non-backbone parameters, for command line outputs.
    lr = optimizer.param_groups[0]['lr']
    # lr of backbone parameters, for command line outputs.
    # backbone_lr = optimizer.param_groups[0]['lr']

    summary_writer = Metrics(log_dir='tf_logs')

    losses_sbj = AverageMeter('Sbj loss: ', ':.2f')
    losses_obj = AverageMeter('Obj loss: ', ':.2f')
    losses_rel = AverageMeter('Rel loss: ', ':.2f')
    losses_total = AverageMeter('Total loss: ', ':.2f')
    progress = ProgressMeter(
        [losses_sbj, losses_obj, losses_rel, losses_total], prefix='Train: ')

    faster_rcnn.train()
    th = 10000
    for step in range(opt.begin_iter, opt.max_iter):
        try:
            input_data = next(dataiterator)
        except StopIteration:
            dataiterator = iter(train_loader)
            input_data = next(dataiterator)

        images, targets = input_data
        _, metrics = faster_rcnn(images, targets)
        final_loss = metrics["loss_objectness"] + metrics["loss_rpn_box_reg"] + \
            metrics["loss_classifier"] + metrics["loss_box_reg"] + \
            metrics["loss_sbj"] + metrics["loss_obj"] + metrics["loss_rlp"]

        optimizer.zero_grad()
        final_loss.backward()
        optimizer.step()

        losses_sbj.update(metrics["loss_sbj"].item(), len(images))
        losses_obj.update(metrics["loss_obj"].item(), len(images))
        losses_rel.update(metrics["loss_rlp"].item(), len(images))
        losses_total.update(final_loss.item(), len(images))

        if opt.scheduler != "plateau":
            scheduler.step()

        if (step) % 10 == 0:
            progress.display(step)

        if step % 2500 == 0:
            train_losses = {}
            train_losses['total_loss'] = losses_total.avg
            train_losses['sbj_loss'] = losses_sbj.avg
            train_losses['obj_loss'] = losses_obj.avg
            train_losses['rel_loss'] = losses_rel.avg
            val_losses = val_epoch(faster_rcnn, val_loader)

            if opt.scheduler == "plateau":
                scheduler.step(val_losses['total_loss'])

            lr = optimizer.param_groups[0]['lr']

            # if val_losses['total_loss'] < th:
            #     save_model(faster_rcnn, optimizer, scheduler, step)
            #     print(f"*** Saved model ***")
            #     th = val_losses['total_loss']
            save_model(faster_rcnn, optimizer, scheduler, step)

            # write summary
            summary_writer.log_metrics(train_losses, val_losses, step, lr)

            print(
                f"* Average training loss : {train_losses['total_loss']:.3f}")
            print(
                f"* Average validation loss : {val_losses['total_loss']:.3f}")

            losses_sbj.reset()
            losses_obj.reset()
            losses_rel.reset()
            losses_total.reset()
            faster_rcnn.train()
Exemplo n.º 30
0
def train(ref_only_df,
          cons_train_df,
          cons_val_df,
          label_encoder,
          torch_transform,
          labelcol,
          batch_size,
          _,
          args,
          n_epochs,
          results_dir=None,
          add_perspective=False
          ):
    dataloaders = create_dataloaders(
        ref_only_df,
        cons_train_df,
        cons_val_df,
        label_encoder,
        torch_transform,
        labelcol,
        batch_size,
        add_perspective=add_perspective
        )

    model, device = init_mod_dev(args, label_encoder)

    if args.optimizer == 'momentum':
        optimizer = optim.SGD(list(model.parameters()), lr=args.init_lr)
    elif args.optimizer == 'adamdelta':
        optimizer = optim.Adadelta(model.parameters(), lr=args.init_lr)
    else:
        optimizer = optim.Adam(model.parameters(), lr=args.init_lr)

    # Reduces the LR on plateaus
    exp_lr_scheduler = lr_scheduler.ReduceLROnPlateau(optimizer,
                                                      factor=args.lr_factor,
                                                      patience=args.lr_patience,
                                                      verbose=True)

    if results_dir is None:
        results_dir = os.path.join(args.results_dir, build_logid_string(args))

    print("Starting multihead training")
    loss_weights = {'ce': args.ce_w,
                    'arcface': args.arcface_w,
                    'contrastive': args.contrastive_w,
                    'triplet': args.triplet_w,
                    'focal': args.focal_w}
    print("loss_weights", loss_weights)
    onlinecriterion = MultiheadLoss(
        len(label_encoder.classes_),
        args.metric_margin, HardNegativePairSelector(),
        args.metric_margin, RandomNegativeTripletSelector(args.metric_margin),
        use_cosine=args.metric_evaluator_type == 'cosine',
        weights=loss_weights,
        focal_gamma=args.focal_gamma,
        use_side_labels=args.train_with_side_labels)

    # switch evaluator for monitoring validation performance
    val_evaluator = 'logit'
    if loss_weights['triplet'] > 0 or loss_weights['contrastive'] > 0 or loss_weights['arcface'] > 0:
        val_evaluator = 'metric'

    print(f'Will use {val_evaluator} evaluator for validation')

    model, best_val_metrics = hneg_train_model(
        model, optimizer, exp_lr_scheduler, device, dataloaders,
        results_dir, label_encoder, onlinecriterion,
        num_epochs=n_epochs,
        earlystop_patience=3 * (args.lr_patience + 1),
        simul_sidepairs=args.metric_simul_sidepairs_eval,
        sidepairs_agg=args.sidepairs_agg,
        train_with_side_labels=args.train_with_side_labels,
        metric_evaluator_type=args.metric_evaluator_type,
        val_evaluator=val_evaluator)

    return model, best_val_metrics
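hneg_train_model is defined elsewhere; the choice earlystop_patience=3 * (args.lr_patience + 1) presumably gives the plateau scheduler room to cut the learning rate roughly three times before early stopping triggers. A minimal sketch of that interaction, using hypothetical stand-in values:

import torch
from torch.optim import lr_scheduler

model = torch.nn.Linear(8, 2)  # stand-in model
optimizer = torch.optim.Adam(model.parameters(), lr=1e-3)
lr_patience = 2  # stand-in for args.lr_patience
scheduler = lr_scheduler.ReduceLROnPlateau(optimizer, factor=0.1,
                                           patience=lr_patience)
earlystop_patience = 3 * (lr_patience + 1)

best_loss, bad_epochs = float('inf'), 0
for val_loss in [1.0] * 12:  # a validation loss that never improves
    scheduler.step(val_loss)  # LR is cut after every lr_patience+1 bad epochs
    bad_epochs = 0 if val_loss < best_loss else bad_epochs + 1
    best_loss = min(best_loss, val_loss)
    if bad_epochs >= earlystop_patience:  # stop after roughly 3 LR reductions
        break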