opt = main() ####################### Output path, logger, device and random seed configuration ################# exp_path = opt.read_model_path if opt.testing else hyperparam_pseudo_method( opt) if not os.path.exists(exp_path): os.makedirs(exp_path) logger = set_logger(exp_path, testing=opt.testing) logger.info("Parameters: " + str(json.dumps(vars(opt), indent=4))) logger.info("Experiment path: %s" % (exp_path)) sp_device, qg_device = set_torch_device(opt.deviceId[0]), set_torch_device( opt.deviceId[1]) set_random_seed(opt.seed, device='cuda') ################################ Vocab and Data Reader ########################### sp_copy, qg_copy = 'copy__' in opt.read_sp_model_path, 'copy__' in opt.read_qg_model_path sp_vocab, qg_vocab = Vocab(opt.dataset, task='semantic_parsing', copy=sp_copy), Vocab(opt.dataset, task='question_generation', copy=qg_copy) logger.info("Semantic Parsing model vocabulary ...") logger.info("Vocab size for input natural language sentence is: %s" % (len(sp_vocab.word2id))) logger.info("Vocab size for output logical form is: %s" % (len(sp_vocab.lf2id))) logger.info("Question Generation model vocabulary ...")
def main():
    """Train (or only evaluate) a person re-ID model.

    Loads data through ImageDataManager, builds the model, optionally loads
    pretrained weights for a validation-only pass, otherwise runs the
    xent+htri training loop with periodic validation and checkpointing.
    Reads configuration from the module-level ``args``.
    """
    # To check the mAP and rank-1 on the validation set of the repo-provided
    # model.pth.tar-9.  My own trained tar-9 only reaches mAP: 15.1%;
    # Rank-1: 23.3%, not sure why.
    # Change args.load_weights = '/model/caohw9/track3_model/model.pth.tar-9'
    global args
    print(args)
    set_random_seed(args.seed)
    if not args.use_avai_gpus:
        os.environ['CUDA_VISIBLE_DEVICES'] = args.gpu_devices
    use_gpu = torch.cuda.is_available()
    if args.use_cpu:
        use_gpu = False
    # Mirror stdout into a log file under the save dir.
    sys.stdout = Logger(osp.join(args.save_dir, "log.txt"))
    if use_gpu:
        print('Currently using GPU {}'.format(args.gpu_devices))
        cudnn.benchmark = True
    else:
        warnings.warn(
            'Currently using CPU, however, GPU is highly recommended')

    # Initialize the data loaders.
    print('Initializing image data manager')
    dm = ImageDataManager(use_gpu, **trainset_kwargs(args))
    trainloader, testloader_dict = dm.return_dataloaders(
    )  # trainloader is for training; testloader_dict holds the 'query' and gallery loaders
    print('suffessfully initialize loaders!')

    # Initialize the model.
    print('Initializing model: {}'.format(
        args.arch))  # args.arch default='resnet101'
    model = models.init_model(name=args.arch,
                              num_classes=dm.num_train_pids,
                              loss={'xent', 'htri'},
                              pretrained=not args.no_pretrained,
                              use_gpu=use_gpu)
    print('Model size: {:.3f} M'.format(count_num_param(model)))

    # Load pretrained weights.
    if args.load_weights and check_isfile(args.load_weights):
        load_pretrained_weights(model, args.load_weights)
        # After loading a trained model, run validation first.
        print('=> Validation')
        print('Evaluating {} ...'.format(
            args.test_set))  # args.test_set presumably refers to the validation set
        queryloader = testloader_dict['query']
        galleryloader = testloader_dict['test']
        model = nn.DataParallel(model).cuda() if use_gpu else model
        rank1 = test(model, queryloader, galleryloader, use_gpu)  # validation!
    # Multi-GPU training.
    # NOTE(review): the indentation below is reconstructed from collapsed
    # source — the whole training path is taken to live in this else branch,
    # so the load-weights path above is evaluate-only.  Confirm against the
    # original script.
    else:
        model = nn.DataParallel(model).cuda() if use_gpu else model

        # Define loss, optimizer, lr_scheduler.
        criterion_xent = CrossEntropyLoss(num_classes=dm.num_train_pids,
                                          use_gpu=use_gpu,
                                          label_smooth=args.label_smooth)
        criterion_htri = TripletLoss(margin=args.margin)
        optimizer = init_optimizer(model, **optimizer_kwargs(args))
        scheduler = init_lr_scheduler(optimizer, **lr_scheduler_kwargs(args))

        # Resuming an interrupted training run?
        if args.resume and check_isfile(args.resume):
            args.start_epoch = resume_from_checkpoint(
                args.resume, model,
                optimizer=optimizer)  # epoch count at the interruption point

        # Start training!
        time_start = time.time()
        print('=> Start training')
        for epoch in range(args.start_epoch, args.max_epoch):
            train(epoch, model, criterion_xent, criterion_htri, optimizer,
                  trainloader, use_gpu)  # train one epoch
            scheduler.step()  # update lr
            # Once past args.start_eval, every args.eval_freq epochs — or on the
            # final epoch — run validation and store a checkpoint.
            if (epoch + 1) > args.start_eval and args.eval_freq > 0 and (
                    epoch + 1) % args.eval_freq == 0 or (epoch +
                                                         1) == args.max_epoch:
                print('=> Validation')
                print('Evaluating {} ...'.format(
                    args.test_set))  # args.test_set presumably refers to the validation set
                queryloader = testloader_dict['query']
                galleryloader = testloader_dict['test']
                rank1 = test(model, queryloader, galleryloader,
                             use_gpu)  # validation!
                save_checkpoint(
                    {
                        'state_dict': model.state_dict(),  # model state dict
                        'rank1': rank1,
                        'epoch': epoch + 1,
                        'arch': args.arch,  # default='resnet101'
                        'optimizer': optimizer.state_dict(
                        ),  # optimizer state dict: holds optimizer state and hyperparameters (lr, momentum, weight_decay, ...)
                    },
                    args.save_dir)  # checkpoint saved alongside validation

        # Training finished!
        elapsed = round(time.time() - time_start)  # duration in seconds
        elapsed = str(datetime.timedelta(seconds=elapsed))
        print('Elapsed {}'.format(elapsed))
# NOTE(review): the next two statements are the tail of a function whose
# header lies outside this chunk — they cannot stand alone at module level.
assert opt.read_model_path
return opt


# Top-level bootstrap for the language-model experiment: experiment path,
# logger, device, seed, then the task vocabulary and dataset loading.
opt = main()

####################### Output path, logger, device and random seed configuration #################
exp_path = opt.read_model_path if opt.testing else hyperparam_lm(opt)
if not os.path.exists(exp_path):
    os.makedirs(exp_path)
logger = set_logger(exp_path, testing=opt.testing)
logger.info("Parameters: " + str(json.dumps(vars(opt), indent=4)))
logger.info("Experiment path: %s" % (exp_path))
opt.device = set_torch_device(opt.deviceId)
# Seed with the resolved device type (cpu/cuda).
set_random_seed(opt.seed, device=opt.device.type)

################################ Vocab and Data Reader ###########################
lm_vocab = Vocab(opt.dataset, task='language_model')
# The LM is trained on one side of the pair: natural-language questions or
# logical forms; pick the matching vocabulary.
if opt.side == 'question':
    word2id = lm_vocab.word2id
    logger.info("Vocab size for natural language sentence is: %s" %
                (len(word2id)))
else:
    word2id = lm_vocab.lf2id
    logger.info("Vocab size for logical form is: %s" % (len(word2id)))
logger.info("Read dataset %s starts at %s" %
            (opt.dataset, time.asctime(time.localtime(time.time()))))
Example.set_domain(opt.dataset)
if not opt.testing:
    train_dataset, dev_dataset = Example.load_dataset(choice='train')
# Tail of an argparse configuration (the parser itself is created above this
# chunk), followed by experiment-path / logger / device / seed setup and the
# QNLI data-loader construction.
parser.add_argument('--deviceId',
                    type=int,
                    default=-1,
                    help='train model on ith gpu. -1:cpu, 0:auto_select')
parser.add_argument('--seed', type=int, default=999)
args = parser.parse_args()

# Testing mode requires a saved model path to read from.
assert (not args.testing) or args.read_model_path
if args.testing:
    exp_path = args.read_model_path
else:
    exp_path = set_hyperparam_path(args)
if not os.path.exists(exp_path):
    os.makedirs(exp_path)
logger = set_logger(exp_path, testing=args.testing)
# set_torch_device returns the resolved device and normalizes args.deviceId.
device, args.deviceId = set_torch_device(args.deviceId)
set_random_seed(args.seed, device=device)
logger.info("Parameters:" + str(json.dumps(vars(args), indent=4)))
logger.info("Experiment path: %s" % (exp_path))
logger.info("Read dataset starts at %s" %
            (time.asctime(time.localtime(time.time()))))
start_time = time.time()
Example.set_tokenizer(args.bert)  # set bert tokenizer
if not args.testing:
    train_loader = DataLoader(QNLIDataset('train'),
                              batch_size=args.batch_size,
                              shuffle=True,
                              collate_fn=collate_fn_labeled)
    # NOTE(review): the statement below is truncated at the chunk boundary —
    # its remaining arguments continue outside this view.
    dev_loader = DataLoader(QNLIDataset('dev'), batch_size=args.batch_size,
def main():
    """Train a person re-ID model end to end.

    Prepares the environment (seed, GPU visibility, stdout logging), builds
    the data manager and model, optionally restores pretrained/resumed
    weights, then runs the xent+htri training loop with periodic validation
    and checkpointing.  All configuration comes from the module-level
    ``args``.
    """
    global args

    set_random_seed(args.seed)
    if not args.use_avai_gpus:
        os.environ['CUDA_VISIBLE_DEVICES'] = args.gpu_devices
    # GPU is used only when available and not explicitly disabled.
    use_gpu = torch.cuda.is_available() and not args.use_cpu
    # Mirror stdout into a log file under the save dir.
    sys.stdout = Logger(osp.join(args.save_dir, "log.txt"))
    if use_gpu:
        print('Currently using GPU {}'.format(args.gpu_devices))
        cudnn.benchmark = True
    else:
        warnings.warn(
            'Currently using CPU, however, GPU is highly recommended')

    # Data: one training loader plus query/gallery loaders for evaluation.
    print('Initializing image data manager')
    data_manager = ImageDataManager(use_gpu, **trainset_kwargs(args))
    trainloader, testloader_dict = data_manager.return_dataloaders()

    # Model.
    print('Initializing model: {}'.format(args.arch))
    model = models.init_model(name=args.arch,
                              num_classes=data_manager.num_train_pids,
                              loss={'xent', 'htri'},
                              pretrained=not args.no_pretrained,
                              use_gpu=use_gpu)
    print('Model size: {:.3f} M'.format(count_num_param(model)))

    if args.load_weights and check_isfile(args.load_weights):
        load_pretrained_weights(model, args.load_weights)

    if use_gpu:
        model = nn.DataParallel(model).cuda()

    # Losses, optimizer and LR schedule.
    criterion_xent = CrossEntropyLoss(num_classes=data_manager.num_train_pids,
                                      use_gpu=use_gpu,
                                      label_smooth=args.label_smooth)
    criterion_htri = TripletLoss(margin=args.margin)
    optimizer = init_optimizer(model, **optimizer_kwargs(args))
    scheduler = init_lr_scheduler(optimizer, **lr_scheduler_kwargs(args))

    # Resume from a checkpoint if one was given.
    if args.resume and check_isfile(args.resume):
        args.start_epoch = resume_from_checkpoint(args.resume,
                                                  model,
                                                  optimizer=optimizer)

    started_at = time.time()
    print('=> Start training')
    for epoch in range(args.start_epoch, args.max_epoch):
        train(epoch, model, criterion_xent, criterion_htri, optimizer,
              trainloader, use_gpu)
        scheduler.step()

        # Evaluate past args.start_eval every args.eval_freq epochs, and
        # always on the final epoch; a checkpoint is written alongside.
        due_eval = (epoch + 1) > args.start_eval and args.eval_freq > 0 and (
            epoch + 1) % args.eval_freq == 0
        if due_eval or (epoch + 1) == args.max_epoch:
            print('=> Validation')
            print('Evaluating {} ...'.format(args.test_set))
            rank1 = test(model, testloader_dict['query'],
                         testloader_dict['test'], use_gpu)
            save_checkpoint(
                {
                    'state_dict': model.state_dict(),
                    'rank1': rank1,
                    'epoch': epoch + 1,
                    'arch': args.arch,
                    'optimizer': optimizer.state_dict(),
                }, args.save_dir)

    elapsed = str(datetime.timedelta(seconds=round(time.time() - started_at)))
    print('Elapsed {}'.format(elapsed))