Example No. 1
def train_model(sym_net,
                model_prefix,
                dataset,
                input_conf,
                clip_length=16,
                train_frame_interval=2,
                resume_epoch=-1,
                batch_size=4,
                save_frequency=1,
                lr_base=0.01,
                lr_factor=0.1,
                lr_steps=[400000, 800000],
                end_epoch=1000,
                distributed=False,
                fine_tune=False,
                **kwargs):

    assert torch.cuda.is_available(), "Currently, we only support CUDA version"

    # data iterator
    iter_seed = torch.initial_seed() + 100 + max(0, resume_epoch) * 100
    train_iter = iter_fac.creat(name=dataset,
                                batch_size=batch_size,
                                clip_length=clip_length,
                                train_interval=train_frame_interval,
                                mean=input_conf['mean'],
                                std=input_conf['std'],
                                seed=iter_seed)
    # wrapper (dynamic model)
    net = model(
        net=sym_net,
        criterion=nn.CrossEntropyLoss().cuda(),
        model_prefix=model_prefix,
        step_callback_freq=50,
        save_checkpoint_freq=save_frequency,
        opt_batch_size=batch_size,
    )
    net.net.cuda()

    # config optimization
    param_base_layers = []
    param_new_layers = []
    name_base_layers = []
    for name, param in net.net.named_parameters():
        if fine_tune:
            if ('classifier' in name) or ('fc' in name):
                param_new_layers.append(param)
            else:
                param_base_layers.append(param)
                name_base_layers.append(name)
        else:
            param_new_layers.append(param)

    if name_base_layers:
        out = "[\'" + '\', \''.join(name_base_layers) + "\']"
        logging.info(
            "Optimizer:: >> recuding the learning rate of {} params: {}".
            format(
                len(name_base_layers),
                out if len(out) < 300 else out[0:150] + " ... " + out[-150:]))

    net.net = torch.nn.DataParallel(net.net).cuda()

    optimizer = torch.optim.SGD([{
        'params': param_base_layers,
        'lr_mult': 0.2
    }, {
        'params': param_new_layers,
        'lr_mult': 1.0
    }],
                                lr=lr_base,
                                momentum=0.9,
                                weight_decay=0.0001,
                                nesterov=True)

    # load params from pretrained 3d network
    if resume_epoch > 0:
        logging.info("Initializer:: resuming model from previous training")

    # resume training: model and optimizer
    if resume_epoch < 0:
        epoch_start = 0
        step_counter = 0
    else:
        net.load_checkpoint(epoch=resume_epoch, optimizer=optimizer)
        epoch_start = resume_epoch
        step_counter = epoch_start * len(train_iter)

    # set learning rate scheduler
    num_worker = dist.get_world_size() if torch.distributed.is_initialized() else 1
    lr_scheduler = MultiFactorScheduler(
        base_lr=lr_base,
        steps=[int(x / (batch_size * num_worker)) for x in lr_steps],
        factor=lr_factor,
        step_counter=step_counter)
    # define evaluation metric
    metrics = metric.MetricList(
        metric.Loss(name="loss-ce"),
        metric.Accuracy(name="top1", topk=1),
        metric.Accuracy(name="top5", topk=5),
    )

    net.fit(
        train_iter=train_iter,
        optimizer=optimizer,
        lr_scheduler=lr_scheduler,
        metrics=metrics,
        epoch_start=epoch_start,
        epoch_end=end_epoch,
    )
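A note on the lr_mult entries in the SGD parameter groups above: torch.optim.SGD keeps unknown group keys but its update rule never reads them, so they only take effect if the training loop (presumably net.fit together with MultiFactorScheduler in this codebase) applies them when refreshing the learning rate. A minimal sketch of how such per-group multipliers are commonly applied; the scheduler call at the end is hypothetical:

def apply_lr(optimizer, base_lr):
    # scale the scheduler's current base lr by each group's lr_mult (default 1.0)
    for group in optimizer.param_groups:
        group['lr'] = base_lr * group.get('lr_mult', 1.0)

# inside the training loop, once per step (hypothetical scheduler API):
# apply_lr(optimizer, lr_scheduler.update())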
Example No. 2
def train_model(sym_net, model_prefix, dataset, input_conf,
                clip_length=16, train_frame_interval=2, val_frame_interval=2,
                resume_epoch=-1, batch_size=4, save_frequency=1,
                lr_base=0.01, lr_factor=0.1, lr_steps=[400000, 800000],
                end_epoch=1000, distributed=False, 
                pretrained_3d=None, fine_tune=False,
                load_from_frames=False, use_flow=False, triplet_loss=False,
                **kwargs):

    assert torch.cuda.is_available(), "Currently, we only support CUDA version"

    # data iterator
    iter_seed = torch.initial_seed() \
                + (torch.distributed.get_rank() * 10 if distributed else 100) \
                + max(0, resume_epoch) * 100
    train_iter, eval_iter = iterator_factory.creat(name=dataset,
                                                   batch_size=batch_size,
                                                   clip_length=clip_length,
                                                   train_interval=train_frame_interval,
                                                   val_interval=val_frame_interval,
                                                   mean=input_conf['mean'],
                                                   std=input_conf['std'],
                                                   seed=iter_seed,
                                                   load_from_frames=load_from_frames,
                                                   use_flow=use_flow)
    # wrapper (dynamic model)
    if use_flow:
        class LogNLLLoss(torch.nn.Module):
            def __init__(self):
                super(LogNLLLoss, self).__init__()
                self.loss = torch.nn.NLLLoss()

            def forward(self, output, target):
                output = torch.log(output)
                loss = self.loss(output, target)
                return loss
        # criterion = LogNLLLoss().cuda()
        criterion = torch.nn.CrossEntropyLoss().cuda()
    elif triplet_loss:
        logging.info("Using triplet loss")
        criterion = torch.nn.MarginRankingLoss().cuda()
    else:
        criterion = torch.nn.CrossEntropyLoss().cuda()
    net = model(net=sym_net,
                criterion=criterion,
                triplet_loss=triplet_loss,
                model_prefix=model_prefix,
                step_callback_freq=50,
                save_checkpoint_freq=save_frequency,
                opt_batch_size=batch_size, # optional
                )
    net.net.cuda()

    # config optimization
    param_base_layers = []
    param_new_layers = []
    name_base_layers = []
    for name, param in net.net.named_parameters():
        if fine_tune:
            # if name.startswith('classifier'):
            if 'classifier' in name or 'fc' in name:
                param_new_layers.append(param)
            else:
                param_base_layers.append(param)
                name_base_layers.append(name)
        else:
            param_new_layers.append(param)

    if name_base_layers:
        out = "[\'" + '\', \''.join(name_base_layers) + "\']"
        logging.info("Optimizer:: >> recuding the learning rate of {} params: {}".format(len(name_base_layers),
                     out if len(out) < 300 else out[0:150] + " ... " + out[-150:]))

    if distributed:
        net.net = torch.nn.parallel.DistributedDataParallel(net.net).cuda()
    else:
        net.net = torch.nn.DataParallel(net.net).cuda()

    optimizer = torch.optim.SGD([{'params': param_base_layers, 'lr_mult': 0.2},
                                 {'params': param_new_layers, 'lr_mult': 1.0}],
                                lr=lr_base,
                                momentum=0.9,
                                weight_decay=0.0001,
                                nesterov=True)

    # load params from pretrained 3d network
    if pretrained_3d:
        if resume_epoch < 0:
            if os.path.exists(pretrained_3d):
                # assert os.path.exists(pretrained_3d), "cannot locate: '{}'".format(pretrained_3d)
                logging.info("Initializer:: loading model states from: `{}'".format(pretrained_3d))
                checkpoint = torch.load(pretrained_3d)
                net.load_state(checkpoint['state_dict'], mode='ada')
            else:
                logging.warning("cannot locate: '{}'".format(pretrained_3d))
        else:
            logging.info("Initializer:: skip loading model states from: `{}'"
                + ", since it's going to be overwrited by the resumed model".format(pretrained_3d))

    # resume training: model and optimizer
    if resume_epoch < 0:
        epoch_start = 0
        step_counter = 0
    else:
        net.load_checkpoint(epoch=resume_epoch, optimizer=optimizer)
        epoch_start = resume_epoch
        step_counter = epoch_start * len(train_iter)

    # set learning rate scheduler
    num_worker = 1
    lr_scheduler = MultiFactorScheduler(base_lr=lr_base,
                                        steps=[int(x/(batch_size*num_worker)) for x in lr_steps],
                                        factor=lr_factor,
                                        step_counter=step_counter)
    # define evaluation metric
    if triplet_loss:
        metrics = metric.MetricList(metric.Loss(name="loss-triplet"),
                                    metric.TripletAccuracy(name="acc"), )
    else:
        metrics = metric.MetricList(metric.Loss(name="loss-ce"),
                                    metric.Accuracy(name="top1", topk=1),
                                    metric.Accuracy(name="top5", topk=5), )
    # enable cudnn tune
    cudnn.benchmark = True

    net.fit(train_iter=train_iter,
            eval_iter=eval_iter,
            optimizer=optimizer,
            lr_scheduler=lr_scheduler,
            metrics=metrics,
            epoch_start=epoch_start,
            epoch_end=end_epoch,)
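For the triplet_loss branch above, torch.nn.MarginRankingLoss takes two score tensors and a target of +/-1; a common way to get a triplet objective out of it is to feed the anchor-negative and anchor-positive distances with a target of +1, so the loss enforces d(anchor, negative) > d(anchor, positive) + margin. A minimal sketch under that assumption (how the wrapper model actually wires the three embeddings is not shown in this example):

import torch
import torch.nn.functional as F

criterion = torch.nn.MarginRankingLoss(margin=0.2)

def triplet_margin_loss(anchor, positive, negative):
    # Euclidean distances between embedding batches of shape (N, D)
    d_ap = F.pairwise_distance(anchor, positive)
    d_an = F.pairwise_distance(anchor, negative)
    # target = +1 asks the first argument (d_an) to be ranked higher than the second (d_ap)
    target = torch.ones_like(d_ap)
    return criterion(d_an, d_ap, target)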
Example No. 3
def train_model(net_name,
                sym_net,
                model_prefix,
                dataset,
                input_conf,
                modality='rgb',
                split=1,
                clip_length=16,
                train_frame_interval=2,
                val_frame_interval=2,
                resume_epoch=-1,
                batch_size=4,
                save_frequency=1,
                lr_base=0.01,
                lr_base2=0.01,
                lr_d=None,
                lr_factor=0.1,
                lr_steps=[400000, 800000],
                end_epoch=1000,
                distributed=False,
                pretrained_3d=None,
                fine_tune=False,
                iter_size=1,
                optim='sgd',
                accumulate=True,
                ds_factor=16,
                epoch_thre=1,
                score_dir=None,
                mv_minmaxnorm=False,
                mv_loadimg=False,
                detach=False,
                adv=0,
                new_classifier=False,
                **kwargs):

    assert torch.cuda.is_available(), "Currently, we only support CUDA version"
    torch.multiprocessing.set_sharing_strategy('file_system')
    import resource
    rlimit = resource.getrlimit(resource.RLIMIT_NOFILE)
    resource.setrlimit(resource.RLIMIT_NOFILE, (2048, rlimit[1]))
    # data iterator
    iter_seed = torch.initial_seed() \
                + (torch.distributed.get_rank() * 10 if distributed else 100) \
                + max(0, resume_epoch) * 100

    train_iter, eval_iter = iterator_factory.creat(
        name=dataset,
        batch_size=batch_size,
        clip_length=clip_length,
        train_interval=train_frame_interval,
        val_interval=val_frame_interval,
        mean=input_conf['mean'],
        std=input_conf['std'],
        seed=iter_seed,
        modality=modality,
        split=split,
        net_name=net_name,
        accumulate=accumulate,
        ds_factor=ds_factor,
        mv_minmaxnorm=mv_minmaxnorm,
        mv_loadimg=mv_loadimg)
    # define an instance of class model
    net = model(
        net=sym_net,
        criterion=torch.nn.CrossEntropyLoss().cuda(),
        model_prefix=model_prefix,
        step_callback_freq=50,
        save_checkpoint_freq=save_frequency,
        opt_batch_size=batch_size,  # optional
        criterion2=torch.nn.MSELoss().cuda()
        if modality == 'flow+mp4' else None,
        criterion3=torch.nn.CrossEntropyLoss().cuda() if adv > 0. else None,
        adv=adv,
    )
    net.net.cuda()
    print(torch.cuda.current_device(), torch.cuda.device_count())
    # config optimization
    param_base_layers = []
    param_new_layers = []
    name_base_layers = []
    params_gf = []
    params_d = []
    for name, param in net.net.named_parameters():
        if modality == 'flow+mp4':
            if name.startswith('gen_flow_model'):
                params_gf.append(param)
            elif name.startswith('discriminator'):
                params_d.append(param)
            else:
                if (name.startswith('conv3d_0c_1x1')
                        or name.startswith('classifier')):
                    #if name.startswith('classifier'):
                    param_new_layers.append(param)
                else:
                    param_base_layers.append(param)
                    name_base_layers.append(name)
            #else:
            #    #print(name)
            #    param_new_layers.append(param)
        else:
            if fine_tune:
                if name.startswith('classifier') or name.startswith(
                        'conv3d_0c_1x1'):
                    #if name.startswith('classifier'):
                    param_new_layers.append(param)
                else:
                    param_base_layers.append(param)
                    name_base_layers.append(name)
            else:
                param_new_layers.append(param)
    if modality == 'flow+mp4':
        if fine_tune:
            lr_mul = 0.2
        else:
            lr_mul = 0.5
    else:
        lr_mul = 0.2
    #print(params_d)
    if name_base_layers:
        out = "[\'" + '\', \''.join(name_base_layers) + "\']"
        logging.info(
            "Optimizer:: >> recuding the learning rate of {} params: {} by factor {}"
            .format(
                len(name_base_layers),
                out if len(out) < 300 else out[0:150] + " ... " + out[-150:],
                lr_mul))
    if net_name == 'I3D':
        weight_decay = 0.0001
    else:
        raise ValueError('UNKNOWN net_name', net_name)
    logging.info("Train_Model:: weight_decay: `{}'".format(weight_decay))
    if distributed:
        net.net = torch.nn.parallel.DistributedDataParallel(net.net).cuda()
    else:
        net.net = torch.nn.DataParallel(net.net).cuda()

    if optim == 'adam':
        optimizer = torch.optim.Adam([{
            'params': param_base_layers,
            'lr_mult': lr_mul
        }, {
            'params': param_new_layers,
            'lr_mult': 1.0
        }],
                                     lr=lr_base,
                                     weight_decay=weight_decay)
        optimizer_2 = torch.optim.Adam([{
            'params': param_base_layers,
            'lr_mult': lr_mul
        }, {
            'params': param_new_layers,
            'lr_mult': 1.0
        }],
                                       lr=lr_base2,
                                       weight_decay=weight_decay)
    else:
        optimizer = torch.optim.SGD([{
            'params': param_base_layers,
            'lr_mult': lr_mul
        }, {
            'params': param_new_layers,
            'lr_mult': 1.0
        }],
                                    lr=lr_base,
                                    momentum=0.9,
                                    weight_decay=weight_decay,
                                    nesterov=True)
        optimizer_2 = torch.optim.SGD([{
            'params': param_base_layers,
            'lr_mult': lr_mul
        }, {
            'params': param_new_layers,
            'lr_mult': 1.0
        }],
                                      lr=lr_base2,
                                      momentum=0.9,
                                      weight_decay=weight_decay,
                                      nesterov=True)
    if adv > 0.:
        optimizer_3 = torch.optim.Adam(params_d,
                                       lr=lr_base,
                                       weight_decay=weight_decay,
                                       eps=0.001)
    else:
        optimizer_3 = None
    if modality == 'flow+mp4':
        if optim == 'adam':
            optimizer_mse = torch.optim.Adam(params_gf,
                                             lr=lr_base,
                                             weight_decay=weight_decay,
                                             eps=1e-08)
            optimizer_mse_2 = torch.optim.Adam(params_gf,
                                               lr=lr_base2,
                                               weight_decay=weight_decay,
                                               eps=0.001)
        else:
            optimizer_mse = torch.optim.SGD(params_gf,
                                            lr=lr_base,
                                            momentum=0.9,
                                            weight_decay=weight_decay,
                                            nesterov=True)
            optimizer_mse_2 = torch.optim.SGD(params_gf,
                                              lr=lr_base2,
                                              momentum=0.9,
                                              weight_decay=weight_decay,
                                              nesterov=True)
    else:
        optimizer_mse = None
        optimizer_mse_2 = None
    # load params from pretrained 3d network
    if pretrained_3d and not pretrained_3d == 'False':
        if resume_epoch < 0:
            assert os.path.exists(pretrained_3d), "cannot locate: `{}'".format(
                pretrained_3d)
            logging.info(
                "Initializer:: loading model states from: `{}'".format(
                    pretrained_3d))
            if net_name == 'I3D':
                checkpoint = torch.load(pretrained_3d)
                keys = list(checkpoint.keys())
                state_dict = {}
                for name in keys:
                    state_dict['module.' + name] = checkpoint[name]
                del checkpoint
                net.load_state(state_dict, strict=False)
                if new_classifier:
                    checkpoint = torch.load(
                        './network/pretrained/model_flow.pth')
                    keys = list(checkpoint.keys())
                    state_dict = {}
                    for name in keys:
                        state_dict['module.' + name] = checkpoint[name]
                    del checkpoint
                    net.load_state(state_dict, strict=False)
            else:
                checkpoint = torch.load(pretrained_3d)
                net.load_state(checkpoint['state_dict'], strict=False)
        else:
            logging.info(
                "Initializer:: skip loading model states from: `{}', "
                "since it's going to be overwritten by the resumed model".format(
                    pretrained_3d))

    # resume training: model and optimizer
    if resume_epoch < 0:
        epoch_start = 0
        step_counter = 0
    else:
        net.load_checkpoint(epoch=resume_epoch,
                            optimizer=optimizer,
                            optimizer_mse=optimizer_mse)
        epoch_start = resume_epoch
        step_counter = epoch_start * len(train_iter)

    # set learning rate scheduler
    num_worker = dist.get_world_size() if torch.distributed.is_initialized() else 1
    lr_scheduler = MultiFactorScheduler(
        base_lr=lr_base,
        steps=[int(x / (batch_size * num_worker)) for x in lr_steps],
        factor=lr_factor,
        step_counter=step_counter)
    if modality == 'flow+mp4':
        lr_scheduler2 = MultiFactorScheduler(
            base_lr=lr_base2,
            steps=[int(x / (batch_size * num_worker)) for x in lr_steps],
            factor=lr_factor,
            step_counter=step_counter)
        if lr_d is None:
            # no separate discriminator lr was given: fall back to the base learning rate
            lr_scheduler3 = MultiFactorScheduler(
                base_lr=lr_base,
                steps=[int(x / (batch_size * num_worker)) for x in lr_steps],
                factor=lr_factor,
                step_counter=step_counter)
        else:
            print("_____________", lr_d)
            lr_scheduler3 = MultiFactorScheduler(
                base_lr=lr_d,
                steps=[int(x / (batch_size * num_worker)) for x in lr_steps],
                factor=lr_factor,
                step_counter=step_counter)
    else:
        lr_scheduler2 = None
        lr_scheduler3 = None
    # define evaluation metric
    metrics_D = None
    if modality == 'flow+mp4':
        metrics = metric.MetricList(
            metric.Loss(name="loss-ce"),
            metric.Loss(name="loss-mse"),
            metric.Accuracy(name="top1", topk=1),
            metric.Accuracy(name="top5", topk=5),
        )
        if adv > 0:
            metrics_D = metric.MetricList(metric.Loss(name="classi_D"),
                                          metric.Loss(name="adv_D"))

    else:
        metrics = metric.MetricList(
            metric.Loss(name="loss-ce"),
            metric.Accuracy(name="top1", topk=1),
            metric.Accuracy(name="top5", topk=5),
        )
    # enable cudnn tune
    cudnn.benchmark = True
    net.fit(train_iter=train_iter,
            eval_iter=eval_iter,
            optimizer=optimizer,
            lr_scheduler=lr_scheduler,
            metrics=metrics,
            epoch_start=epoch_start,
            epoch_end=end_epoch,
            iter_size=iter_size,
            optimizer_mse=optimizer_mse,
            optimizer_2=optimizer_2,
            optimizer_3=optimizer_3,
            optimizer_mse_2=optimizer_mse_2,
            lr_scheduler2=lr_scheduler2,
            lr_scheduler3=lr_scheduler3,
            metrics_D=metrics_D,
            epoch_thre=epoch_thre,
            score_dir=score_dir,
            detach=detach)
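Example No. 3 threads an iter_size argument through to net.fit. Its consumption is not visible here, but the usual meaning is gradient accumulation: run iter_size forward/backward passes before one optimizer step, emulating an effective batch of iter_size * batch_size. A minimal sketch of that pattern, reusing the names from the example above and assuming this is what net.fit does internally:

optimizer.zero_grad()
for i, (data, target) in enumerate(train_iter):
    output = net.net(data.cuda())
    loss = torch.nn.functional.cross_entropy(output, target.cuda())
    # divide so the accumulated gradient matches a single large batch
    (loss / iter_size).backward()
    if (i + 1) % iter_size == 0:
        optimizer.step()
        optimizer.zero_grad()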
Example No. 4
import datetime
from network.srtg_resnet import srtg_r2plus1d_50
from data import iterator_factory
from train import metric
from train.model import model

#%% Define params
# define path
dataset_path = '/nfs/s2/userhome/zhouming/workingdir/Video/HACS/dataset'
working_path = '/nfs/s2/userhome/zhouming/workingdir/Video/HACS/train_model'
output_path  = f'{working_path}/out'

#%% prepare model
net = srtg_r2plus1d_50(num_classes=200)
net = torch.nn.DataParallel(net, device_ids=[0])
model = model(net=net,
              criterion=torch.nn.CrossEntropyLoss().cuda())
info = torch.load(f'{working_path}/models/srtg_r2plus1d_50_best.pth')
model.load_state(info['state_dict'],strict=True)
del info

#%%  prepare dataset
# data iterator - randomisation based on date and time values
iter_seed = torch.initial_seed() + 100 
now = datetime.datetime.now()
iter_seed += now.year + now.month + now.day + now.hour + now.minute + now.second

# Create custom loaders for validation
eval_loader = iterator_factory.create(
    name='HACS',
    batch_size=8,
    val_clip_length=16,
Example No. 5
def train_model(sym_net, name, model_prefix, input_conf,
                clip_length=32, clip_size=224, train_frame_interval=2, val_frame_interval=2,
                resume_epoch=-1, batch_size=16, save_frequency=1,
                lr_base=0.01, lr_factor=0.1, lr_steps=[50,100,150],
                enable_long_cycles=True, enable_short_cycles=True, end_epoch=300,
                pretrained_3d=None, fine_tune=False, dataset_location='Kinetics', net_name='r3d_50', gpus=4,
                **kwargs):

    assert torch.cuda.is_available(), "Only support CUDA devices."

    # Make results directory for .csv files if it does not exist
    results_path = str('./results/'+str(name)+'/'+str(net_name))
    if not os.path.exists(results_path):
        os.makedirs(results_path)

    # data iterator - randomisation based on date and time values
    iter_seed = torch.initial_seed() + 100 + max(0, resume_epoch) * 100
    now = datetime.datetime.now()
    iter_seed += now.year + now.month + now.day + now.hour + now.minute + now.second

    # Get parent location
    # - `data` folder should include all the dataset examples.
    # - `labels` folder should include all labels in .csv format.
    # We use a global label format - you can have a look at the link in the `README.md` to download the files.
    data_location = dataset_location.split('/data/')[0]

    clip_length = int(clip_length)
    clip_size = int(clip_size)

    train_loaders = {}

    # Create custom loaders for train and validation
    train_data, eval_loader, train_length = iterator_factory.create(
        name=name,
        batch_size=batch_size,
        return_len=True,
        clip_length=clip_length,
        clip_size=clip_size,
        val_clip_length=clip_length,
        val_clip_size=clip_size,
        train_interval=train_frame_interval,
        val_interval=val_frame_interval,
        mean=input_conf['mean'],
        std=input_conf['std'],
        seed=iter_seed,
        data_root=data_location)

    # Create model
    net = model(net=sym_net,
                criterion=torch.nn.CrossEntropyLoss().cuda(),
                model_prefix=model_prefix,
                step_callback_freq=1,
                save_checkpoint_freq=save_frequency,
                opt_batch_size=batch_size, # optional
                )
    net.net.cuda()


    # Parameter LR configuration for optimiser
    # Base layers are based on the layers as loaded to the model
    param_base_layers = []
    base_layers_mult = 1.0

    # New layers are based on fine-tuning
    param_new_layers = []
    new_layers_mult = 1.0

    name_base_layers = []

    param_transpose_layers = []
    transpose_layers_mult = 1.0

    param_rec_layers = []
    rec_layers_mult = 1.0

    # Iterate over all parameters
    for name, param in net.net.named_parameters():
        if fine_tune:
            if 'transpose' in name.lower():
                param_transpose_layers.append(param)
                transpose_layers_mult = .2
            elif name.lower().startswith('classifier'):
                new_layers_mult = .1
                param_new_layers.append(param)
            elif 'lstm' in name.lower() or 'gru' in name.lower():
                param_rec_layers.append(param)
                rec_layers_mult = .5
            else:
                param_base_layers.append(param)
                base_layers_mult = .6
                name_base_layers.append(name)
        else:
            if 'transpose' in name.lower():
                param_transpose_layers.append(param)
                transpose_layers_mult = .8
            elif 'lstm' in name.lower() or 'gru' in name.lower():
                param_rec_layers.append(param)
                rec_layers_mult = 1.
            else:
                param_new_layers.append(param)


    # User feedback
    if name_base_layers:
        out = "[\'" + '\', \''.join(name_base_layers) + "\']"
        logging.info("Optimiser:: >> recuding the learning rate of {} params: {}".format(len(name_base_layers),
                     out if len(out) < 300 else out[0:150] + " ... " + out[-150:]))

    optimiser = torch.optim.SGD([
        {'params': param_base_layers, 'lr_mult': base_layers_mult},
        {'params': param_new_layers, 'lr_mult': new_layers_mult},
        {'params': param_rec_layers, 'lr_mult': rec_layers_mult},
        {'params': param_transpose_layers, 'lr_mult': transpose_layers_mult},],
        lr=lr_base,
        momentum=0.9,
        weight_decay=0.0001,
        nesterov=True
        )

    # Use Apex for mixed precision - Note: Please comment out any apex code in `train_model.py` and `model.py`
    # in case you wish to switch back to standard float32. "O0" opt_level still has some bugs when also using DataParallel
    net.net, optimiser = amp.initialize(net.net, optimiser, opt_level="O1")

    # Create DataParallel wrapper
    net.net = torch.nn.DataParallel(net.net, device_ids=[i for i in range(int(gpus))])

    # load params from pretrained 3d network
    if pretrained_3d:
        assert os.path.exists(pretrained_3d), "cannot locate: `{}'".format(pretrained_3d)
        logging.info("Initialiser:: loading model states from: `{}'".format(pretrained_3d))
        checkpoint = torch.load(pretrained_3d)
        net.load_state(checkpoint['state_dict'], strict=False)


    num_steps = train_length // batch_size

    # Long Cycle steps
    if (enable_long_cycles):

        count = 0
        index = 0
        iter_sizes = [8, 4, 2, 1]
        initial_num = num_steps

        # Find the number of batches that fits exactly into the number of iterations:
        # the sum of the following batch sizes should be less than or equal to the number of batches left.
        while sum(iter_sizes[index:]) <= num_steps:
            # Case 1: 8 x B
            if iter_sizes[index] == 8:
                count += 1
                index = 1
                num_steps -= 8
            # Case 2: 4 x B
            elif iter_sizes[index] == 4:
                count += 1
                index = 2
                num_steps -= 4
            # Case 3: 2 x B
            elif iter_sizes[index] == 2:
                count += 1
                index = 3
                num_steps -= 2
            # Base case
            elif iter_sizes[index] == 1:
                count += 1
                index = 0
                num_steps -= 1

        print ("New number of batches per epoch is {:d} being equivalent to {:1.3f} of original number of batches with Long cycles".format(count,float(count)/float(initial_num)))
        num_steps = count

    # Short Cycle steps
    if (enable_short_cycles):

        # Iterate for *every* batch
        i = 0

        while i <= num_steps:
            m = i % 3
            # Case 1: Base case
            if m == 0:
                num_steps -= 1
            # Case 2: b = 2 x B
            elif m == 1:
                num_steps -= 2
            # Case 3: b = 4 x B
            else:
                num_steps -= 4

            i += 1

        # Update new number of batches
        print ("New number of batches per epoch is {:d} being equivalent to {:1.3f} of original number of batches with Short cycles".format(i,float(i)/float(initial_num)))
        num_steps = i

    # Split the batch number to four for every change in the long cycles
    long_steps = None
    if (enable_long_cycles):
        step = num_steps//4
        long_steps = list(range(num_steps))[0::step]
        num_steps = long_steps[-1]

        # Create full list of long steps (for all batches)
        for epoch in range(1,end_epoch):
            end = long_steps[-1]
            long_steps = long_steps + [x + end for x in long_steps[-4:]]

        # Fool-proofing
        if (long_steps[0]==0):
            long_steps[0]=1


    # resume training: model and optimiser - (account of various batch sizes)
    if resume_epoch < 0:
        epoch_start = 0
        step_counter = 0
    else:
        # Try to load previous state dict in case `pretrained_3d` is None
        if not pretrained_3d:
            try:
                net.load_checkpoint(epoch=resume_epoch, optimizer=optimiser)
            except Exception:
                logging.warning('Initialiser:: No previous checkpoint found in the directory! You can specify the path explicitly with `pretrained_3d` argument.')
        epoch_start = resume_epoch
        step_counter = epoch_start * num_steps

    # Step dictionary creation
    iteration_steps = {'long_0':[],'long_1':[],'long_2':[],'long_3':[],'short_0':[],'short_1':[],'short_2':[]}
    #Populate dictionary
    for batch_i in range(0,num_steps):

        # Long cycle cases
        if batch_i>=0 and batch_i<num_steps//4:
            iteration_steps['long_0'].append(batch_i)
        elif batch_i>=num_steps//4 and batch_i<num_steps//2:
            iteration_steps['long_1'].append(batch_i)
        elif batch_i>=num_steps//2 and batch_i<(3*num_steps)//4:
            iteration_steps['long_2'].append(batch_i)
        else:
            iteration_steps['long_3'].append(batch_i)

        # Short cases
        if (batch_i%3==0):
            iteration_steps['short_0'].append(batch_i)
        elif (batch_i%3==1):
            iteration_steps['short_1'].append(batch_i)
        else:
            iteration_steps['short_2'].append(batch_i)



    # set learning rate scheduler
    lr_scheduler = MultiFactorScheduler(base_lr=lr_base,
                                        steps=[x*num_steps for x in lr_steps],
                                        iterations_per_epoch=num_steps,
                                        iteration_steps=iteration_steps,
                                        factor=lr_factor,
                                        step_counter=step_counter)
    # define evaluation metric
    metrics = metric.MetricList(metric.Loss(name="loss-ce"),
                                metric.Accuracy(name="top1", topk=1),
                                metric.Accuracy(name="top5", topk=5),
                                metric.BatchSize(name="batch_size"),
                                metric.LearningRate(name="lr"))
    # enable cudnn tune
    cudnn.benchmark = True

    # Main training happens here
    net.fit(train_iter=train_data,
            eval_iter=eval_loader,
            batch_shape=(int(batch_size),int(clip_length),int(clip_size),int(clip_size)),
            workers=8,
            no_cycles=(not(enable_long_cycles) and not(enable_short_cycles)),
            optimiser=optimiser,
            long_short_steps_dir=iteration_steps,
            lr_scheduler=lr_scheduler,
            metrics=metrics,
            iter_per_epoch=num_steps,
            epoch_start=epoch_start,
            epoch_end=end_epoch,
            directory=results_path)
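The long/short cycle bookkeeping above only recomputes how many optimizer steps one epoch takes; the actual per-step batch shapes are resolved inside net.fit via batch_shape and long_short_steps_dir. Going by the comments, the short cycle repeats a three-step pattern of roughly B, 2B and 4B samples per step. A small, purely illustrative sketch of that pattern, assuming the mapping implied by the comments:

def short_cycle_batch_size(step, base_batch_size):
    # three-step short cycle: base batch, then 2x, then 4x (per the comments above)
    multipliers = [1, 2, 4]
    return base_batch_size * multipliers[step % 3]

# e.g. with base_batch_size=16 the first six steps would use 16, 32, 64, 16, 32, 64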
Example No. 6
def train_model(Hash_center, sym_net, model_prefix, dataset, input_conf, hash_bit,
                clip_length=16, train_frame_interval=2, val_frame_interval=2,
                resume_epoch=-1, batch_size=4, save_frequency=1,
                lr_base=0.01, lr_factor=0.1, lr_steps=[400000, 800000],
                end_epoch=1000, distributed=False, 
                pretrained_3d=None, fine_tune=False,
                **kwargs):

    assert torch.cuda.is_available(), "Currently, we only support CUDA version"

    # data iterator
    iter_seed = torch.initial_seed()  \
                + (torch.distributed.get_rank() * 10 if distributed else 100) \
                + max(0, resume_epoch) * 100
    train_iter, eval_iter = iterator_factory.creat(name=dataset,
                                                   batch_size=batch_size,
                                                   clip_length=clip_length,
                                                   train_interval=train_frame_interval,
                                                   val_interval=val_frame_interval,
                                                   mean=input_conf['mean'],
                                                   std=input_conf['std'],
                                                   seed=iter_seed)
    print(len(train_iter))
    print(len(eval_iter))
    # wrapper (dynamic model)
    net = model(net=sym_net,
                criterion=torch.nn.BCELoss().cuda(),
                model_prefix=model_prefix,
                step_callback_freq=50,
                save_checkpoint_freq=save_frequency,
                opt_batch_size=batch_size, # optional
                dataset=dataset,  # dataset name
                hash_bit=hash_bit,
                )
    net.net.cuda()

    # config optimization
    param_base_layers = []
    param_new_layers = []
    name_base_layers = []
    for name, param in net.net.named_parameters():
        if fine_tune:
            #print(f'fine tune {fine_tune}')
            if name.startswith('hash'):
                param_new_layers.append(param)
            else:
                param_base_layers.append(param)
                name_base_layers.append(name)
        else:
            param_new_layers.append(param)

    if name_base_layers:
        out = "[\'" + '\', \''.join(name_base_layers) + "\']"
        logging.info("Optimizer:: >> recuding the learning rate of {} params: {}".format(len(name_base_layers),
                     out if len(out) < 300 else out[0:150] + " ... " + out[-150:]))

    if distributed:
        net.net = torch.nn.parallel.DistributedDataParallel(net.net).cuda()
    else:
        net.net = torch.nn.DataParallel(net.net).cuda()

    optimizer = torch.optim.SGD([{'params': param_base_layers, 'lr_mult': 0.2},
                                 {'params': param_new_layers, 'lr_mult': 1.0}],
                                lr=lr_base,
                                momentum=0.9,
                                weight_decay=0.0001,
                                nesterov=True)

    # load params from pretrained 3d network
    if pretrained_3d:
        if resume_epoch < 0:
            assert os.path.exists(pretrained_3d), "cannot locate: `{}'".format(pretrained_3d)
            logging.info("Initializer:: loading model states from: `{}'".format(pretrained_3d))
            checkpoint = torch.load(pretrained_3d)
            net.load_state(checkpoint['state_dict'], strict=False)
        else:
            logging.info("Initializer:: skip loading model states from: `{}'"
                + ", since it's going to be overwrited by the resumed model".format(pretrained_3d))

    # resume training: model and optimizer
    if resume_epoch < 0:
        epoch_start = 0
        step_counter = 0
    else:
        net.load_checkpoint(epoch=resume_epoch, optimizer=optimizer)
        epoch_start = resume_epoch
        step_counter = epoch_start * len(train_iter)

    # set learning rate scheduler
    num_worker = dist.get_world_size() if torch.distributed.is_initialized() else 1
    lr_scheduler = MultiFactorScheduler(base_lr=lr_base,
                                        steps=[int(x/(batch_size*num_worker)) for x in lr_steps],
                                        factor=lr_factor,
                                        step_counter=step_counter)
    # define evaluation metric
    metrics = metric.MetricList(metric.Loss(name="loss-ce"))
                        
    # enable cudnn tune
    cudnn.benchmark = True

    net.fit(train_iter=train_iter,
            eval_iter=eval_iter,
            optimizer=optimizer,
            lr_scheduler=lr_scheduler,
            metrics=metrics,
            epoch_start=epoch_start,
            epoch_end=end_epoch,
            Hash_center=Hash_center,)
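Example No. 6 trains a hashing network with BCELoss and hands per-class Hash_center codes to net.fit; how they are combined is not visible here, but a common setup (central-similarity style hashing) compares the sigmoid hash output of each clip with the binary hash center of its class. A minimal sketch under that assumption, with hypothetical tensor shapes:

import torch

criterion = torch.nn.BCELoss()

def hash_center_loss(hash_out, labels, hash_centers):
    # hash_out: (N, hash_bit) values in (0, 1), e.g. after a sigmoid
    # hash_centers: (num_classes, hash_bit) binary codes in {0, 1}
    # labels: (N,) long tensor of class indices
    targets = hash_centers[labels]  # pick each sample's class center
    return criterion(hash_out, targets.float())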
Example No. 7
def train_model(sym_net,
                model_prefix,
                dataset,
                fold,
                clip_length=8,
                train_frame_interval=2,
                val_frame_interval=2,
                resume_epoch=-1,
                batch_size=4,
                save_frequency=1,
                lr_base=0.01,
                lr_factor=0.1,
                lr_steps=[400000, 800000],
                end_epoch=1000,
                distributed=False,
                fine_tune=False,
                epoch_div_factor=4,
                precise_bn=False,
                **kwargs):

    assert torch.cuda.is_available(), "Currently, we only support CUDA version"

    # data iterator
    iter_seed = torch.initial_seed() \
                + (torch.distributed.get_rank() * 10 if distributed else 100) \
                + max(0, resume_epoch) * 100
    train_iter, eval_iter = iterator_factory_brats.create(
        name=dataset,
        batch_size=batch_size,
        fold=fold,
        # clip_length=clip_length,
        # train_interval=train_frame_interval,
        # val_interval=val_frame_interval,
        # mean=input_conf['mean'],
        # std=input_conf['std'],
        seed=iter_seed)
    # model (dynamic)
    net = model(
        net=sym_net,
        criterion=torch.nn.CrossEntropyLoss().cuda(),
        model_prefix=model_prefix,
        step_callback_freq=50,
        save_checkpoint_freq=save_frequency,
        opt_batch_size=batch_size,  # optional
        single_checkpoint=precise_bn,  # TODO: use shared filesystem to rsync running mean/var
    )
    # if True:
    #     for name, module in net.net.named_modules():
    #         if name.endswith("bn"): module.momentum = 0.005
    net.net.cuda()

    # config optimization, [[w/ wd], [w/o wd]]
    param_base_layers = [[[], []], [[], []]]
    param_new_layers = [[[], []], [[], []]]
    name_freeze_layers, name_base_layers = [], []
    for name, param in net.net.named_parameters():
        idx_wd = 0 if name.endswith('.bias') else 1
        idx_bn = 0 if name.endswith(('.bias', 'bn.weight')) else 1
        if fine_tune:
            if not name.startswith('classifier'):
                param_base_layers[idx_bn][idx_wd].append(param)
                name_base_layers.append(name)
            else:
                param_new_layers[idx_bn][idx_wd].append(param)
        else:
            if "conv_m2" in name:
                param_base_layers[idx_bn][idx_wd].append(param)
                name_base_layers.append(name)
            else:
                param_new_layers[idx_bn][idx_wd].append(param)

    if name_freeze_layers:
        out = "[\'" + '\', \''.join(name_freeze_layers) + "\']"
        logging.info("Optimizer:: >> freezing {} params: {}".format(
            len(name_freeze_layers),
            out if len(out) < 300 else out[0:150] + " ... " + out[-150:]))
    if name_base_layers:
        out = "[\'" + '\', \''.join(name_base_layers) + "\']"
        logging.info(
            "Optimizer:: >> recuding the learning rate of {} params: {}".
            format(
                len(name_base_layers),
                out if len(out) < 300 else out[0:150] + " ... " + out[-150:]))

    if distributed:
        net.net = torch.nn.parallel.DistributedDataParallel(net.net).cuda()
    else:
        net.net = torch.nn.DataParallel(net.net).cuda()

    # optimizer = torch.optim.SGD(sym_net.parameters(),
    wd = 0.0001
    optimizer = custom_optim.SGD(
        [
            {
                'params': param_base_layers[0][0],
                'lr_mult': 0.5,
                'weight_decay': 0.
            },
            {
                'params': param_base_layers[0][1],
                'lr_mult': 0.5,
                'weight_decay': wd
            },
            {
                'params': param_base_layers[1][0],
                'lr_mult': 0.5,
                'weight_decay': 0.,
                'name': 'precise.bn'
            },  # *.bias
            {
                'params': param_base_layers[1][1],
                'lr_mult': 0.5,
                'weight_decay': wd,
                'name': 'precise.bn'
            },  # bn.weight
            {
                'params': param_new_layers[0][0],
                'lr_mult': 1.0,
                'weight_decay': 0.
            },
            {
                'params': param_new_layers[0][1],
                'lr_mult': 1.0,
                'weight_decay': wd
            },
            {
                'params': param_new_layers[1][0],
                'lr_mult': 1.0,
                'weight_decay': 0.,
                'name': 'precise.bn'
            },  # *.bias
            {
                'params': param_new_layers[1][1],
                'lr_mult': 1.0,
                'weight_decay': wd,
                'name': 'precise.bn'
            }
        ],  # bn.weight
        lr=lr_base,
        momentum=0.9,
        nesterov=True)

    # resume: model and optimizer
    if resume_epoch < 0:
        epoch_start = 0
        step_counter = 0
    else:
        net.load_checkpoint(epoch=resume_epoch, optimizer=optimizer)
        epoch_start = resume_epoch
        step_counter = epoch_start * int(len(train_iter) / epoch_div_factor)

    num_worker = torch.distributed.get_world_size() if torch.distributed.is_initialized() else 1
    lr_scheduler = MultiFactorScheduler(
        base_lr=lr_base,
        steps=[int(x / (batch_size * num_worker)) for x in lr_steps],
        factor=lr_factor,
        step_counter=step_counter)

    metrics = metric.MetricList(
        metric.Loss(name="loss-ce"),
        metric.Accuracy(name="top1", topk=1),
        metric.Accuracy(name="top2", topk=2),
    )

    cudnn.benchmark = True
    # cudnn.fastest = False
    # cudnn.enabled = False

    net.fit(
        train_iter=train_iter,
        eval_iter=eval_iter,
        optimizer=optimizer,
        lr_scheduler=lr_scheduler,
        metrics=metrics,
        epoch_start=epoch_start,
        epoch_end=end_epoch,
        epoch_div_factor=epoch_div_factor,
        precise_bn=precise_bn,
    )
Example No. 8
def train_model(sym_net,
                dataset,
                input_conf,
                train_list,
                val_list,
                clip_length=16,
                resume_epoch=-1,
                batch_size=32,
                save_frequency=1,
                lr_base=0.1,
                lr_factor=0.1,
                lr_steps=[400000, 800000],
                end_epoch=120,
                distributed=False,
                pretrained_3d=False,
                fine_tune=False,
                **kwargs):
    import argparse
    parse = argparse.ArgumentParser(description="PyTorch resume checkpoint")
    parse.add_argument(
        '--resume',
        default=
        '/workspace/mnt/group/algorithm/kanghaidong/video_project/PyTorch-MFNet/checkpoint/best_model_mfnet_3d_120.pth.tar',
        type=str,
        metavar='PATH',
        help='path to latest checkpoint (default: none)')

    #parse.add_argument('--resume', default='', type=str, metavar='PATH',help='path to latest checkpoint (default: none)')
    args = parse.parse_args()
    assert torch.cuda.is_available(), "Currently, we only support CUDA version"

    # data iterator
    iter_seed = torch.initial_seed() \
                + (torch.distributed.get_rank() * 10 if distributed else 100) \
                + max(0, resume_epoch) * 100
    train_iter, eval_iter = iterator_fac.creat(
        name=dataset,  # #  enclosed DataLoader()
        train_list=train_list,
        val_list=val_list,
        clip_length=clip_length,
        batch_size=batch_size,
        mean=input_conf['mean'],
        std=input_conf['std'],
        seed=iter_seed)
    # wrapper (dynamic model)
    net = model(
        net=sym_net,
        criterion=torch.nn.CrossEntropyLoss().cuda(
        ),  # using CrossEntropyLoss;
        step_callback_freq=50,
        save_checkpoint_freq=save_frequency,
        opt_batch_size=batch_size,  # optional
    )
    net.net.cuda()

    # config optimization
    param_base_layers = []
    param_new_layers = []
    name_base_layers = []
    for name, param in net.net.named_parameters():
        if fine_tune:
            if name.startswith('classifier'):
                param_new_layers.append(param)
            else:
                param_base_layers.append(param)
                name_base_layers.append(name)
        else:
            param_new_layers.append(param)

    if name_base_layers:
        out = "[\'" + '\', \''.join(name_base_layers) + "\']"
        logging.info(
            "Optimizer:: >> recuding the learning rate of {} params: {}".
            format(
                len(name_base_layers),
                out if len(out) < 300 else out[0:150] + " ... " + out[-150:]))

    if distributed:
        net.net = torch.nn.parallel.DistributedDataParallel(net.net).cuda()
    else:
        net.net = torch.nn.DataParallel(net.net).cuda()

    optimizer = torch.optim.SGD([{
        'params': param_base_layers,
        'lr_mult': 0.2
    }, {
        'params': param_new_layers,
        'lr_mult': 1.0
    }],
                                lr=lr_base,
                                momentum=0.9,
                                weight_decay=0.0001,
                                nesterov=True)

    # load params from pretrained 3d network
    # One issue is easy to overlook: you cannot simply reuse all of the loaded model's parameters,
    # because the fc layer's shape differs when the number of class labels differs.
    # What to do? Drop the fc params, build a new fc layer, and load the remaining parameters;
    # when loading, compare with the current model's state dict and discard mismatched keys,
    # i.e. delete keys that differ from the current model. This is quite efficient!
    epoch_start = 0  # default in case no checkpoint is resumed below
    if args.resume:
        if os.path.isfile(args.resume):
            print("loading checkpoint:'{}".format(args.resume))
            checkpoint = torch.load(args.resume)
            #for key, v in checkpoint.items():
            #    print(key, v)
            if pretrained_3d:
                #pretrained_state = checkpoint['state_dict']
                model_state = net.net.state_dict()
                pretrained_state = {
                    k: v
                    for k, v in checkpoint.items() if k in model_state
                }  # discard keys that do not match the current model
                #for key, v in pretrained_state.items():
                #    print(key,v)
                model_state.update(pretrained_state)
                net.net.load_state_dict(model_state)
            else:
                print('loading succeeded')
                epoch_start = checkpoint['epoch']
                net.net.load_state_dict(checkpoint['state_dict'])
        else:
            epoch_start = 0
            print("training start new scratch !")
            print("no checkpoint found at '{}".format(args.resume))

    # define evaluation metric
    """
    metrics = metric.MetricList(metric.Loss(name="loss-ce"),
                                metric.Accuracy(name="top1", topk=1),
                                metric.Accuracy(name="top5", topk=5),)
    """
    # enable cudnn tune
    cudnn.benchmark = True

    net.fit(
        train_iter=train_iter,
        eval_iter=eval_iter,
        optimizer=optimizer,
        #lr_scheduler=lr_scheduler,
        #metrics=metrics,
        epoch_start=epoch_start,
        epoch_end=end_epoch,
    )