# ---- partition-wise training with optional continual-learning criteria ----
# Select the data-splitting strategy. Only the "incremental" instance
# splitter is supported here; anything else is rejected up front.
if args.split_method == "incremental":
    splitter = IncrementalInstanceSplitter(train_data, partitions=partitions)
else:
    raise ValueError("Provided split method does not exist")

train_partitions = splitter.split_data()
query_loader, gallery_loader = get_test_loaders(query_data, gallery_data)
metrics = initialize_metrics()

# BUG FIX: the original iterated over the undefined name `partitions_train`
# (NameError at runtime); the dict produced by split_data() is
# `train_partitions`, so iterate its keys instead.
for p_id in train_partitions:
    utils.print_log("Starting partition training for id: {}".format(p_id))

    # A fresh model/optimizer pair is created for every partition; only the
    # criterion carries state across partitions (see the wrapping below).
    model = initialize_model(args.model, args.embedding_dim)
    optimizer = torch.optim.Adam(model.parameters(), lr=args.lr)
    train_loader = get_train_loader(train_partitions[p_id])

    model_name = experiment_name + "_{}".format(p_id)
    train(model, criterion, train_loader, query_loader, gallery_loader,
          optimizer, model_name)
    utils.print_log(model_name)

    # Evaluate on the fixed query/gallery split and fold results into the
    # running metrics tuple.
    ks = evaluation(model, query_loader, gallery_loader)
    metrics = update_metrics(ks, *metrics)

    # Wrap the base triplet criterion with the chosen continual-learning
    # regularizer so the NEXT partition trains against it.
    # NOTE(review): the wrapping happens every iteration, so wrappers nest
    # over successive partitions — confirm this accumulation is intended.
    if args.continuous_learning_method == "lfl":
        criterion = Triplet_LFL(triplet_criterion=criterion, lamb=args.lambda_lfl)
    if args.continuous_learning_method == "lwf":
        criterion = Triplet_LWF(triplet_criterion=criterion, lamb=args.lambda_lwf)
    if args.continuous_learning_method == "ewc":
        criterion = Triplet_EWC(triplet_criterion=criterion, lamb=args.lambda_ewc)
# f = open('result/' + args.cv_dir + 'mAP.txt', 'a') # f.write('Epoch:'+str(epoch)+': it_mAP = '+str(it_mAP)+', ti_mAP = '+str(ti_mAP)+'\n') # f.close() if __name__ == '__main__': # setting opt = get_args() start_epoch = 0 total_tst_time = 0 test_cnt = 0 loss_print = 0 MODEL_UPDATE_ITER = 0 # get loader train_loader = get_train_loader(opt=opt) # define net imageNet = ImageNet() imageNet.cuda() # text net tokenizer = BertTokenizer.from_pretrained( '/home/poac/code/Multi_modal_Retrieval/experiments/pretrained_models/bert-base-uncased-vocab.txt' ) textNet = TextNet(code_length=opt.hashbits) textNet.cuda() # embedding net embNet = EmbNet(opt) embNet.cuda()
from torch.optim import lr_scheduler
import numpy as np
import pandas as pd
import time
import os
import metaData
import data
import argparse
from torch.autograd import Variable
import csv

# Data loader
from torchvision import transforms, datasets, utils

# true values
# Load train/validation loaders; the train loader also returns the
# normalization statistics (mean, std) computed on the training set.
trainloaders, mean, std = data.get_train_loader()
print("Training data loaded")
valloaders = data.get_val_loader()
print("Validation data loaded")

# Frozen ImageNet-pretrained ResNet-18 used as a fixed feature extractor:
# the final FC layer is stripped, leaving a 512-d pooled feature.
# NOTE(review): `models`, `nn` and `torch` must be imported above this
# excerpt — not visible here.
resnet18 = models.resnet18(pretrained=True)
modules = list(resnet18.children())[:-1]
resnet18 = nn.Sequential(*modules)
for param in resnet18.parameters():
    # Freeze backbone weights; only the regression head below is trained.
    param.requires_grad = False

# Regression head: 512 -> 128 -> 32 -> 4 with ReLU + Dropout(0.4) between
# the linear layers. The 4 outputs' meaning is defined by the dataset —
# presumably regression targets; confirm against data/metaData.
model_reg = torch.nn.Sequential(torch.nn.Linear(512, 128), torch.nn.ReLU(), torch.nn.Dropout(0.4), torch.nn.Linear(128, 32), torch.nn.ReLU(), torch.nn.Dropout(0.4), torch.nn.Linear(32, 4))
# ---- logging + data-loader setup fragment ----
# NOTE(review): this excerpt is cut off mid-call at the end (the gallery
# get_test_loader call is missing its trailing arguments/paren here).

# Log to a per-run file named after cfg.prefix, appending across runs.
logging.basicConfig(format="%(asctime)s %(message)s",
                    filename=log_dir + "/" + cfg.prefix + ".txt",
                    filemode="a")
logger = logging.getLogger()
logger.setLevel(logging.INFO)
stream_handler = logging.StreamHandler()
# Handler is set to DEBUG, but the root logger is INFO, so INFO remains the
# effective console threshold.
stream_handler.setLevel(logging.DEBUG)
logger.addHandler(stream_handler)
logger.info(pprint.pformat(customized_cfg))

# data loader
train_loader = get_train_loader(root=os.path.join(cfg.root, cfg.train),
                                batch_size=cfg.batch_size,
                                image_size=cfg.image_size,
                                random_crop=cfg.random_crop,
                                random_erase=cfg.random_erase,
                                random_mirror=cfg.random_mirror,
                                num_workers=4)

# Evaluation loaders are only built when periodic validation is enabled.
query_loader = None
gallery_loader = None
if cfg.validate_interval > 0:
    query_loader = get_test_loader(root=os.path.join(cfg.root, cfg.query),
                                   batch_size=512,
                                   image_size=cfg.image_size,
                                   num_workers=4)
    gallery_loader = get_test_loader(root=os.path.join(
        cfg.root, cfg.gallery),
        batch_size=512,
# ---- tail of a training-frame loader (function header is above this
# excerpt). NOTE(review): nesting below is reconstructed from a collapsed
# source — the first five statements appear to run inside the per-frame
# loop over j, the next two at the per-video level; confirm upstream.
        img_names = os.path.join(video_path, str(j) + '.jpg')
        im = Image.open(img_names)
        # Resize to a square image and scale pixel values into [0, 1].
        im_re = im.resize((image_width, image_width), Image.ANTIALIAS)
        imm = np.array(im_re).astype(np.float32) / 255
        img_array_train[j - ii_num] = imm
    array_train_all[array_id] = img_array_train
    array_id += 1
    print('\n')
    return array_train_all


# Driver: build the LSTM training loader for the cholec80 dataset.
if data_type == 'cholec80':
    seq_train = get_seq_path_train(data_root_80)
    train_lstm_loader = get_train_loader(seq_train, batch_size)


# --------- load a dataset ------------------------------------
def get_seq_path_test(data_root):
    # List video files and sort them numerically by the stem of their name.
    video_idx = os.listdir(data_root)
    video_idx.sort(key=lambda x: int(x.split('.')[0]))
    # Test split is the second half of the (sorted) videos.
    train_idx_2 = int(len(video_idx) * 0.5)
    stop_idx_2 = len(video_idx)
    print('loading test data lstm')
    # Pre-allocated buffer: (videos, frames, H, W, RGB) in float32.
    array_test_all = np.zeros(
        (num_sample, max_frames, image_width, image_width, 3),
        dtype=np.float32)
    array_id = 0
    # NOTE(review): this function is truncated here — its remainder lies
    # outside this excerpt.
def train():
    """Train the face-detection network over the full epoch schedule.

    Relies on module-level state defined outside this excerpt: `net`,
    `criterion`, `priors`, `optimizer`, `device`, `args`, `max_epoch`,
    `num_workers` and (when args.use_tensorboard) a tensorboard `logger`.
    Saves periodic checkpoints every 50 epochs and a final one at the end.
    """
    net.train()

    #load the two dataset for face rectangles and landmarks respectively
    print('Loading Dataset...')
    batch_size = args.batch_size
    train_loader = get_train_loader(
        imgs_root=os.path.join(args.dataset_dir, 'WIDER_train/images'),
        annos_file=os.path.join(args.dataset_dir, 'trainset.json'),
        batch_size=batch_size,
        num_workers=num_workers,
        device_id=0,
        local_seed=-1,
        shuffle=True,
        shuffle_after_epoch=False,
        num_gpus=1,
    )

    for epoch in range(args.resume_epoch, max_epoch):
        # Polynomial learning-rate decay, recomputed once per epoch.
        lr = adjust_learning_rate_poly(optimizer, args.lr, epoch, max_epoch)

        #for computing average losses in this epoch
        loss_bbox_epoch = []
        loss_iouhead_epoch = []
        loss_lm_epoch = []
        loss_cls_epoch = []
        loss_epoch = []

        # the start time
        load_t0 = time.time()

        # for each iteration in this epoch
        num_iter_in_epoch = len(train_loader)
        for iter_idx, one_batch_data in enumerate(train_loader):
            # load train data
            images, targets = one_batch_data
            images = images.to(device)
            targets = [anno.to(device) for anno in targets]

            # forward
            out = net(images)

            # loss: weighted sum of box-EIoU, IoU-head smooth-L1, landmark
            # smooth-L1 and classification cross-entropy terms.
            loss_bbox_eiou, loss_iouhead_smoothl1, loss_lm_smoothl1, loss_cls_ce = criterion(
                out, priors, targets)
            loss = args.lambda_bbox_eiou * loss_bbox_eiou + \
                args.lambda_iouhead_smoothl1 * loss_iouhead_smoothl1 + \
                args.lambda_lm_smoothl1 * loss_lm_smoothl1 + \
                args.lambda_cls_ce * loss_cls_ce

            # backprop
            optimizer.zero_grad()
            loss.backward()
            optimizer.step()

            # put losses to lists to average for printing
            loss_bbox_epoch.append(loss_bbox_eiou.item())
            loss_iouhead_epoch.append(loss_iouhead_smoothl1.item())
            loss_lm_epoch.append(loss_lm_smoothl1.item())
            loss_cls_epoch.append(loss_cls_ce.item())
            loss_epoch.append(loss.item())

            # Per-iteration tensorboard scalars, indexed by global step.
            if args.use_tensorboard:
                logger.add_scalar(tag='Iter/loss_bbox',
                                  scalar_value=loss_bbox_eiou.item(),
                                  global_step=iter_idx + epoch * num_iter_in_epoch)
                logger.add_scalar(tag='Iter/loss_iou',
                                  scalar_value=loss_iouhead_smoothl1.item(),
                                  global_step=iter_idx + epoch * num_iter_in_epoch)
                logger.add_scalar(tag='Iter/loss_landmark',
                                  scalar_value=loss_lm_smoothl1.item(),
                                  global_step=iter_idx + epoch * num_iter_in_epoch)
                logger.add_scalar(tag='Iter/loss_cls',
                                  scalar_value=loss_cls_ce.item(),
                                  global_step=iter_idx + epoch * num_iter_in_epoch)

            # print loss every 20 iterations and on the last iteration;
            # shows current(iteration) and running-mean(epoch) values.
            if (iter_idx % 20 == 0 or iter_idx == num_iter_in_epoch - 1):
                print(
                    'Epoch:{}/{} || iter: {}/{} || L: {:.2f}({:.2f}) IOU: {:.2f}({:.2f}) LM: {:.2f}({:.2f}) C: {:.2f}({:.2f}) All: {:.2f}({:.2f}) || LR: {:.8f}'
                    .format(epoch, max_epoch, iter_idx, num_iter_in_epoch,
                            loss_bbox_eiou.item(), np.mean(loss_bbox_epoch),
                            loss_iouhead_smoothl1.item(),
                            np.mean(loss_iouhead_epoch),
                            loss_lm_smoothl1.item(), np.mean(loss_lm_epoch),
                            loss_cls_ce.item(), np.mean(loss_cls_epoch),
                            loss.item(), np.mean(loss_epoch), lr))

        # Per-epoch tensorboard scalars: epoch means of each loss term.
        if args.use_tensorboard:
            logger.add_scalar(tag='Epoch/loss_bbox',
                              scalar_value=np.mean(loss_bbox_epoch),
                              global_step=epoch)
            logger.add_scalar(tag='Epoch/loss_iouhead',
                              scalar_value=np.mean(loss_iouhead_epoch),
                              global_step=epoch)
            logger.add_scalar(tag='Epoch/loss_landmark',
                              scalar_value=np.mean(loss_lm_epoch),
                              global_step=epoch)
            logger.add_scalar(tag='Epoch/loss_cls',
                              scalar_value=np.mean(loss_cls_epoch),
                              global_step=epoch)

        # Periodic checkpoint every 50 epochs, skipping epoch 0.
        if (epoch % 50 == 0 and epoch > 0):
            torch.save(
                net.state_dict(),
                args.weight_filename_prefix + '_epoch_' + str(epoch) + '.pth')

        #the end time
        load_t1 = time.time()
        epoch_time = (load_t1 - load_t0) / 60
        print('Epoch time: {:.2f} minutes; Time left: {:.2f} hours'.format(
            epoch_time, (epoch_time) * (max_epoch - epoch - 1) / 60))

    torch.save(net.state_dict(), args.weight_filename_prefix + '_final.pth')
def train(cfg):
    """Train a (possibly cross-domain) re-ID model described by `cfg`.

    Sets up optional NCCL distributed training, file+console logging,
    single- or cross-domain train loaders, optional evaluation loaders,
    the model, an SGD optimizer with separate LRs for backbone vs. new
    parameters, apex mixed precision, and then runs the training engine.
    """
    # One process per GPU; initialize NCCL only when multiple GPUs exist.
    num_gpus = torch.cuda.device_count()
    if num_gpus > 1:
        torch.distributed.init_process_group(backend="nccl",
                                             world_size=num_gpus)

    # set logger
    log_dir = os.path.join("logs/", cfg.source_dataset, cfg.prefix)
    if not os.path.isdir(log_dir):
        os.makedirs(log_dir, exist_ok=True)
    logging.basicConfig(format="%(asctime)s %(message)s",
                        filename=log_dir + "/" + "log.txt",
                        filemode="a")
    logger = logging.getLogger()
    logger.setLevel(logging.INFO)
    stream_handler = logging.StreamHandler()
    stream_handler.setLevel(logging.INFO)
    logger.addHandler(stream_handler)
    # writer = SummaryWriter(log_dir, purge_step=0)

    # Only rank 0 keeps a logger (and writer); other ranks stay silent.
    if dist.is_initialized() and dist.get_rank() != 0:
        logger = writer = None
    else:
        logger.info(pprint.pformat(cfg))

    # training data loader
    if not cfg.joint_training:  # single domain
        train_loader = get_train_loader(root=os.path.join(
            cfg.source.root, cfg.source.train),
            batch_size=cfg.batch_size,
            image_size=cfg.image_size,
            random_flip=cfg.random_flip,
            random_crop=cfg.random_crop,
            random_erase=cfg.random_erase,
            color_jitter=cfg.color_jitter,
            padding=cfg.padding,
            num_workers=4)
    else:  # cross domain
        source_root = os.path.join(cfg.source.root, cfg.source.train)
        target_root = os.path.join(cfg.target.root, cfg.target.train)
        train_loader = get_cross_domain_train_loader(
            source_root=source_root,
            target_root=target_root,
            batch_size=cfg.batch_size,
            random_flip=cfg.random_flip,
            random_crop=cfg.random_crop,
            random_erase=cfg.random_erase,
            color_jitter=cfg.color_jitter,
            padding=cfg.padding,
            image_size=cfg.image_size,
            num_workers=8)

    # evaluation data loader (target-domain query/gallery), only when
    # periodic evaluation is enabled.
    query_loader = None
    gallery_loader = None
    if cfg.eval_interval > 0:
        query_loader = get_test_loader(root=os.path.join(
            cfg.target.root, cfg.target.query),
            batch_size=512,
            image_size=cfg.image_size,
            num_workers=4)
        gallery_loader = get_test_loader(root=os.path.join(
            cfg.target.root, cfg.target.gallery),
            batch_size=512,
            image_size=cfg.image_size,
            num_workers=4)

    # model: classes come from the source domain; cameras span both domains.
    num_classes = cfg.source.num_id
    num_cam = cfg.source.num_cam + cfg.target.num_cam
    # In joint training the camera ids / instance count come from the
    # wrapped target dataset; otherwise from the plain dataset.
    cam_ids = train_loader.dataset.target_dataset.cam_ids if cfg.joint_training else train_loader.dataset.cam_ids
    num_instances = len(
        train_loader.dataset.target_dataset) if cfg.joint_training else None
    model = Model(num_classes=num_classes,
                  drop_last_stride=cfg.drop_last_stride,
                  joint_training=cfg.joint_training,
                  num_instances=num_instances,
                  cam_ids=cam_ids,
                  num_cam=num_cam,
                  neighbor_mode=cfg.neighbor_mode,
                  neighbor_eps=cfg.neighbor_eps,
                  scale=cfg.scale,
                  mix=cfg.mix,
                  alpha=cfg.alpha)
    model.cuda()

    # optimizer: pretrained backbone gets cfg.ft_lr, newly added layers get
    # cfg.new_params_lr.
    ft_params = model.backbone.parameters()
    new_params = [
        param for name, param in model.named_parameters()
        if not name.startswith("backbone.")
    ]
    param_groups = [{
        'params': ft_params,
        'lr': cfg.ft_lr
    }, {
        'params': new_params,
        'lr': cfg.new_params_lr
    }]
    optimizer = optim.SGD(param_groups, momentum=0.9, weight_decay=cfg.wd)

    # convert model for mixed precision distributed training
    model, optimizer = amp.initialize(model,
                                      optimizer,
                                      enabled=cfg.fp16,
                                      opt_level="O2")
    lr_scheduler = optim.lr_scheduler.MultiStepLR(optimizer=optimizer,
                                                  milestones=cfg.lr_step,
                                                  gamma=0.1)
    if dist.is_initialized():
        model = parallel.DistributedDataParallel(model, delay_allreduce=True)

    # engine
    checkpoint_dir = os.path.join("checkpoints", cfg.source_dataset,
                                  cfg.prefix)
    engine = get_trainer(
        model=model,
        optimizer=optimizer,
        lr_scheduler=lr_scheduler,
        logger=logger,
        # writer=writer,
        non_blocking=True,
        log_period=cfg.log_period,
        save_interval=10,
        save_dir=checkpoint_dir,
        prefix=cfg.prefix,
        eval_interval=cfg.eval_interval,
        query_loader=query_loader,
        gallery_loader=gallery_loader)

    # training
    engine.run(train_loader, max_epochs=cfg.num_epoch)

    if dist.is_initialized():
        dist.destroy_process_group()
# ---- video-captioning training setup (comments translated from Chinese) --
# NOTE(review): this excerpt is truncated at the end — the mini-batch body
# of the inner loop lies outside this view.
banet.load_state_dict(torch.load(banet_pth_path))
if use_cuda:
    banet.cuda()

# Initialize the loss function and the optimizer.
criterion = nn.CrossEntropyLoss()
optimizer = torch.optim.Adam(banet.parameters(), lr=learning_rate)
if os.path.exists(optimizer_pth_path) and use_checkpoint:
    optimizer.load_state_dict(torch.load(optimizer_pth_path))

# Print the training-environment parameter settings.
print('Learning rate: %.4f' % learning_rate)
print('Batch size: %d' % batch_size)

# Initialize the data loader.
train_loader = get_train_loader(train_caption_pkl_path, feature_h5_path,
                                batch_size)
total_step = len(train_loader)

# Prepare the ground-truth references used for validation scoring.
reference_json_path = '{0}.json'.format(test_reference_txt_path)
reference = COCO(reference_json_path)

# Start training the model.
best_meteor = 0
loss_count = 0
for epoch in range(num_epochs):
    # Scheduled-sampling probability: decays with epoch, floored at 0.6.
    epsilon = max(0.6, ss_factor / (ss_factor + np.exp(epoch / ss_factor)))
    print('epoch:%d\tepsilon:%.8f' % (epoch, epsilon))
    log_value('epsilon', epsilon, epoch)
    for i, (videos, captions, cap_lens, video_ids) in enumerate(train_loader, start=1):
        # Build Variables for the mini-batch.
# ---- corpus loading and Seq2Seq model setup ----
# Both "train" and "all" modes currently read the same file (see the
# commented alternatives below).
train_path = "processed_corpus/all.json"

special_symbols = code_intent_pair.get_special_symbols()
word_size = code_intent_pair.get_word_size()
code_size = code_intent_pair.get_code_size()
# print('word size : ', word_size)
# print('code size : ', code_size)

# train_path = 'processed_corpus/train.json'
# train_path = 'processed_corpus/all.json'
if args.data_mode == "train":
    train_entries = code_intent_pair.load_entries(train_path)
elif args.data_mode == "all":
    # "all" caps the number of entries loaded.
    train_entries = code_intent_pair.load_entries(train_path,
                                                  upper_bound=20000)
else:
    # BUG FIX: an unrecognized mode previously left `train_entries`
    # undefined and crashed later with a NameError; fail fast instead.
    raise ValueError("Unknown data_mode: {}".format(args.data_mode))
code_intent_pair.pad()
trainloader = get_train_loader(train_entries, special_symbols, hyperP)

# define model
model = Seq2Seq(word_size, code_size, hyperP)
if is_cuda:
    model.to(device)
if hyperP['load_pretrain_code_embed']:
    # Load a pretrained code-LM embedding into the decoder's first layer.
    model.decoder.embed[0].load_state_dict(
        torch.load('./pretrain_code_lm/embedding-1556211835.t7'))
if hyperP['freeze_embed']:
    # Freeze the pretrained embedding so training does not update it.
    for param in model.decoder.embed[0].parameters():
        param.requires_grad = False

#%% md
### Training
optimizer = optim.Adam(model.parameters(), lr=hyperP['lr'])
loss_f = torch.nn.CrossEntropyLoss()
def train(cfg):
    """Train the dual-path re-ID baseline described by `cfg`.

    Builds file+console logging, the train loader, optional test loaders,
    the Baseline model, an Adam/SGD optimizer, apex mixed precision and an
    LR scheduler, then delegates the training loop to get_trainer().
    """
    # set logger
    log_dir = os.path.join("logs/", cfg.dataset, cfg.prefix)
    if not os.path.isdir(log_dir):
        os.makedirs(log_dir, exist_ok=True)
    logging.basicConfig(format="%(asctime)s %(message)s",
                        filename=log_dir + "/" + "log.txt",
                        filemode="a")
    logger = logging.getLogger()
    logger.setLevel(logging.INFO)
    stream_handler = logging.StreamHandler()
    stream_handler.setLevel(logging.INFO)
    logger.addHandler(stream_handler)
    logger.info(pprint.pformat(cfg))

    # training data loader
    train_loader = get_train_loader(dataset=cfg.dataset,
                                    root=cfg.data_root,
                                    sample_method=cfg.sample_method,
                                    batch_size=cfg.batch_size,
                                    p_size=cfg.p_size,
                                    k_size=cfg.k_size,
                                    random_flip=cfg.random_flip,
                                    random_crop=cfg.random_crop,
                                    random_erase=cfg.random_erase,
                                    color_jitter=cfg.color_jitter,
                                    padding=cfg.padding,
                                    image_size=cfg.image_size,
                                    num_workers=8)

    # evaluation data loader — only built when periodic eval is enabled
    gallery_loader, query_loader = None, None
    if cfg.eval_interval > 0:
        gallery_loader, query_loader = get_test_loader(
            dataset=cfg.dataset,
            root=cfg.data_root,
            batch_size=512,
            image_size=cfg.image_size,
            num_workers=4)

    # model
    model = Baseline(num_classes=cfg.num_id,
                     dual_path=cfg.dual_path,
                     drop_last_stride=cfg.drop_last_stride,
                     triplet=cfg.triplet,
                     classification=cfg.classification)
    model.cuda()

    # optimizer
    assert cfg.optimizer in ['adam', 'sgd']
    if cfg.optimizer == 'adam':
        optimizer = optim.Adam(model.parameters(),
                               lr=cfg.lr,
                               weight_decay=cfg.wd)
    else:
        optimizer = optim.SGD(model.parameters(),
                              lr=cfg.lr,
                              momentum=0.9,
                              weight_decay=cfg.wd)

    # convert model for mixed precision training
    model, optimizer = amp.initialize(model,
                                      optimizer,
                                      enabled=cfg.fp16,
                                      opt_level="O2")
    # Step-decay LR schedule: multiply by 0.1 at each milestone epoch.
    lr_scheduler = optim.lr_scheduler.MultiStepLR(optimizer=optimizer,
                                                  milestones=cfg.lr_step,
                                                  gamma=0.1)

    # engine
    checkpoint_dir = os.path.join("checkpoints", cfg.dataset, cfg.prefix)
    engine = get_trainer(model=model,
                         optimizer=optimizer,
                         lr_scheduler=lr_scheduler,
                         logger=logger,
                         non_blocking=True,
                         log_period=cfg.log_period,
                         save_dir=checkpoint_dir,
                         prefix=cfg.prefix,
                         eval_interval=cfg.eval_interval,
                         gallery_loader=gallery_loader,
                         query_loader=query_loader,
                         dataset=cfg.dataset)

    # training
    engine.run(train_loader, max_epochs=cfg.num_epoch)
# ---- BERTLM model/optimizer construction with checkpoint restore ----
print('Learning rate: %.4f' % lr)

# Training bookkeeping, restored from pickled checkpoints when available.
infos = {}
infos_best = {}
histories = {}
if use_checkpoint and os.path.isfile(os.path.join(file_path, 'infos.pkl')):
    # BUG FIX: pickle files must be opened in binary mode ('rb'); the
    # original text-mode open fails under Python 3.
    with open(os.path.join(file_path, 'infos.pkl'), 'rb') as f:
        infos = pickle.load(f)
    if os.path.isfile(os.path.join(file_path, 'histories.pkl')):
        with open(os.path.join(file_path, 'histories.pkl'), 'rb') as f:
            histories = pickle.load(f)

model = BERTLM(feature_size).cuda()
train_loader = get_train_loader(train_feat_path,
                                sim_path,
                                batch_size,
                                shuffle=True)

# Resume iteration/epoch counters from the restored infos when resuming.
itera = 0
epoch = 0
if use_checkpoint:
    model.load_state_dict(torch.load(file_path + '/9288.pth'))
    itera = infos.get('iter', 0)
    epoch = infos.get('epoch', 0)

optimizer = Adam(model.parameters(), lr=lr)
optim_schedule = ScheduledOptim(optimizer, hidden_size, n_warmup_steps=10000)
# BUG FIX: guard on the file that is actually loaded (optimizer_pth_path);
# the original checked best_optimizer_pth_path, which could make torch.load
# fail when only the "best" file existed.
if os.path.exists(optimizer_pth_path) and use_checkpoint:
    optimizer.load_state_dict(torch.load(optimizer_pth_path))
def train(cfg):
    """Train the pattern/modality-attention re-ID baseline from `cfg`.

    Builds logging, train/test loaders, the Baseline model (with optional
    mutual learning, center loss, KL/sid/sep weighting), an Adam/SGD
    optimizer, apex O1 mixed precision, optional checkpoint resume, then
    runs the training engine.
    """
    # set logger — note filemode "w": the log file is overwritten per run.
    log_dir = os.path.join("logs/", cfg.dataset, cfg.prefix)
    if not os.path.isdir(log_dir):
        os.makedirs(log_dir, exist_ok=True)
    logging.basicConfig(format="%(asctime)s %(message)s",
                        filename=log_dir + "/" + "log.txt",
                        filemode="w")
    logger = logging.getLogger()
    logger.setLevel(logging.INFO)
    stream_handler = logging.StreamHandler()
    stream_handler.setLevel(logging.INFO)
    logger.addHandler(stream_handler)
    logger.info(pprint.pformat(cfg))

    # training data loader
    train_loader = get_train_loader(dataset=cfg.dataset,
                                    root=cfg.data_root,
                                    sample_method=cfg.sample_method,
                                    batch_size=cfg.batch_size,
                                    p_size=cfg.p_size,
                                    k_size=cfg.k_size,
                                    random_flip=cfg.random_flip,
                                    random_crop=cfg.random_crop,
                                    random_erase=cfg.random_erase,
                                    color_jitter=cfg.color_jitter,
                                    padding=cfg.padding,
                                    image_size=cfg.image_size,
                                    num_workers=8)

    # evaluation data loader — only built when periodic eval is enabled
    gallery_loader, query_loader = None, None
    if cfg.eval_interval > 0:
        gallery_loader, query_loader = get_test_loader(dataset=cfg.dataset,
                                                       root=cfg.data_root,
                                                       batch_size=64,
                                                       image_size=cfg.image_size,
                                                       num_workers=4)

    # model
    model = Baseline(num_classes=cfg.num_id,
                     pattern_attention=cfg.pattern_attention,
                     modality_attention=cfg.modality_attention,
                     mutual_learning=cfg.mutual_learning,
                     drop_last_stride=cfg.drop_last_stride,
                     triplet=cfg.triplet,
                     k_size=cfg.k_size,
                     center_cluster=cfg.center_cluster,
                     center=cfg.center,
                     margin=cfg.margin,
                     num_parts=cfg.num_parts,
                     weight_KL=cfg.weight_KL,
                     weight_sid=cfg.weight_sid,
                     weight_sep=cfg.weight_sep,
                     update_rate=cfg.update_rate,
                     classification=cfg.classification)

    def get_parameter_number(net):
        # Report total and trainable parameter counts for a quick sanity log.
        total_num = sum(p.numel() for p in net.parameters())
        trainable_num = sum(p.numel() for p in net.parameters()
                            if p.requires_grad)
        return {'Total': total_num, 'Trainable': trainable_num}

    print(get_parameter_number(model))
    model.cuda()

    # optimizer
    assert cfg.optimizer in ['adam', 'sgd']
    if cfg.optimizer == 'adam':
        optimizer = optim.Adam(model.parameters(),
                               lr=cfg.lr,
                               weight_decay=cfg.wd)
    else:
        optimizer = optim.SGD(model.parameters(),
                              lr=cfg.lr,
                              momentum=0.9,
                              weight_decay=cfg.wd)

    # convert model for mixed precision training
    model, optimizer = amp.initialize(model,
                                      optimizer,
                                      enabled=cfg.fp16,
                                      opt_level="O1")
    if cfg.center:
        # Keep center-loss centers in fp32 after amp conversion.
        model.center_loss.centers = model.center_loss.centers.float()
    lr_scheduler = optim.lr_scheduler.MultiStepLR(optimizer=optimizer,
                                                  milestones=cfg.lr_step,
                                                  gamma=0.1)

    # Optional resume from a full state-dict checkpoint.
    if cfg.resume:
        checkpoint = torch.load(cfg.resume)
        model.load_state_dict(checkpoint)

    # engine
    checkpoint_dir = os.path.join("checkpoints", cfg.dataset, cfg.prefix)
    engine = get_trainer(dataset=cfg.dataset,
                         model=model,
                         optimizer=optimizer,
                         lr_scheduler=lr_scheduler,
                         logger=logger,
                         non_blocking=True,
                         log_period=cfg.log_period,
                         save_dir=checkpoint_dir,
                         prefix=cfg.prefix,
                         eval_interval=cfg.eval_interval,
                         start_eval=cfg.start_eval,
                         gallery_loader=gallery_loader,
                         query_loader=query_loader,
                         rerank=cfg.rerank)

    # training
    engine.run(train_loader, max_epochs=cfg.num_epoch)