def main():
    # Read the config file
    with open('config/default.yml') as fin:
        config = yaml.load(fin, Loader=yaml.SafeLoader)

    # Pick the compute device (the original snippet left `device` undefined)
    device = torch.device('cuda' if torch.cuda.is_available() else 'cpu')

    # Build the train and valid datasets
    train_config = config['dataset']['train']
    train_df = pd.read_csv(train_config['data_path'], sep='\t')
    train_df = train_df.sample(frac=1)  # sample() returns a shuffled copy, so reassign
    train, valid = train_test_split(train_df, test_size=config['train_valid_split'])
    train_dataset = build_dataloader(train, train_config, device=device)
    valid_dataset = build_dataloader(valid, train_config, device=device)

    # Build the model
    model_config = config['model']
    model = BertClassifier(model_config)
    model.to(device)
    optimizer = build_optimizer(model, config['optimizer'])

    # Compute the number of training steps
    # (assumes len(train_dataset) gives the number of samples, not batches)
    num_train_steps = int(
        len(train_dataset) / train_dataset.batch_size * config['num_epochs'])
    num_warmup_steps = int(num_train_steps * config['optimizer']['warmup_proportion'])
    scheduler = build_scheduler(optimizer, num_train_steps, num_warmup_steps)

    # Train
    trainer.do_train(model,
                     train_loader=train_dataset,
                     valid_loader=valid_dataset,
                     optimizer=optimizer,
                     scheduler=scheduler,
                     cfg=config)
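# The snippet above derives num_train_steps and num_warmup_steps before calling
# build_scheduler. A minimal sketch of what such a helper commonly does, assuming
# linear warmup followed by linear decay on top of torch.optim.lr_scheduler.LambdaLR;
# the name build_scheduler_sketch and this behaviour are illustrative assumptions,
# not the project's actual implementation.
from torch.optim.lr_scheduler import LambdaLR


def build_scheduler_sketch(optimizer, num_train_steps, num_warmup_steps):
    def lr_lambda(current_step):
        if current_step < num_warmup_steps:
            # Linear warmup from 0 up to the base learning rate.
            return float(current_step) / float(max(1, num_warmup_steps))
        # Linear decay from the base learning rate down to 0.
        return max(0.0, float(num_train_steps - current_step) /
                   float(max(1, num_train_steps - num_warmup_steps)))

    return LambdaLR(optimizer, lr_lambda)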
def main():
    logger = get_logger()
    global_config = config['Global']

    # Initialize the device
    use_gpu = global_config['use_gpu']
    if global_config['local_rank'] == -1 or not use_gpu:
        device = torch.device(
            "cuda" if torch.cuda.is_available() and use_gpu else "cpu")
        global_config.update(
            {'n_gpu': torch.cuda.device_count() if use_gpu else 1})
    else:
        torch.cuda.set_device(global_config['local_rank'])
        device = torch.device('cuda', global_config['local_rank'])
        dist.init_process_group(backend='nccl')
        global_config.update({'n_gpu': 1})
    global_config.update({'device': device})
    logger.warning(
        f"\n\tProcess Rank: {global_config['local_rank']}\n"
        f"\tDevice: {device}\n"
        f"\tGpus: {global_config['n_gpu']}\n"
        f"\tDistributed: {bool(global_config['local_rank'] != -1)}\n"
        f"\t16-bits training: {global_config['fp16']}")
    rank_id = global_config['local_rank']
    set_seed(global_config['seed'], use_gpu)

    # Block non-main processes; the steps below run only in the main process
    if not is_main_process(rank_id):
        dist.barrier()
    post_process = build_post_process(config['PostProcess'], global_config)

    # Build the model
    arch_config = config.pop('Architecture')
    if hasattr(post_process, 'character'):
        char_num = len(getattr(post_process, 'character'))
        arch_config["Head"]['out_channels'] = char_num
    logger.info(f"\nModel Info:"
                f"\n{json.dumps(arch_config, indent=4)}")
    model = build_model(arch_config)
    state_dict = torch.load(global_config['pretrained_model'])
    model.load_state_dict(state_dict)

    # Load the training data
    if global_config['local_rank'] == 0:
        dist.barrier()
    logger.info(f"\nLoad train Data:"
                f"\n{json.dumps(config['Train'], indent=4)}")
    train_dataloader = build_dataloader(config, logger, 'Train')
    logger.info(f"\nLoad Eval Data:"
                f"\n{json.dumps(config['Eval'], indent=4)}")
    eval_dataloader = build_dataloader(config, logger, 'Eval')
    if global_config['local_rank'] == 0:
        dist.barrier()
    model.to(device)
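# The distributed setup above relies on set_seed and is_main_process helpers that
# are not shown. Minimal sketches of what such helpers typically do; these are
# illustrative assumptions (including the _sketch names), not the project's own
# definitions.
import random

import numpy as np
import torch


def set_seed_sketch(seed, use_gpu):
    # Seed the Python, NumPy and PyTorch RNGs for reproducibility.
    random.seed(seed)
    np.random.seed(seed)
    torch.manual_seed(seed)
    if use_gpu:
        torch.cuda.manual_seed_all(seed)


def is_main_process_sketch(rank_id):
    # Rank -1 (non-distributed) and rank 0 are treated as the main process.
    return rank_id in (-1, 0)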
def train(cfg, local_rank, distributed):
    num_classes = COCODataset(cfg.data.train[0], cfg.data.train[1]).num_classes
    model = EfficientDet(num_classes=num_classes, model_name=cfg.model.name)
    inp_size = model.config['inp_size']
    device = torch.device(cfg.device)
    model.to(device)

    optimizer = build_optimizer(model, **optimizer_kwargs(cfg))
    lr_scheduler = build_lr_scheduler(optimizer, **lr_scheduler_kwargs(cfg))

    use_mixed_precision = cfg.dtype == "float16"
    amp_opt_level = 'O1' if use_mixed_precision else 'O0'
    model, optimizer = amp.initialize(model, optimizer, opt_level=amp_opt_level)

    if distributed:
        model = torch.nn.parallel.DistributedDataParallel(
            model,
            device_ids=[local_rank],
            output_device=local_rank,
            # this should be removed if we update BatchNorm stats
            broadcast_buffers=False,
            find_unused_parameters=True)

    arguments = {}
    arguments["iteration"] = 0

    output_dir = cfg.output_dir
    save_to_disk = comm.get_rank() == 0
    checkpointer = Checkpointer(model, optimizer, lr_scheduler, output_dir, save_to_disk)
    extra_checkpoint_data = checkpointer.load(cfg.model.resume)
    arguments.update(extra_checkpoint_data)

    train_dataloader = build_dataloader(cfg,
                                        inp_size,
                                        is_train=True,
                                        distributed=distributed,
                                        start_iter=arguments["iteration"])

    test_period = cfg.test.test_period
    if test_period > 0:
        val_dataloader = build_dataloader(cfg, inp_size, is_train=False,
                                          distributed=distributed)
    else:
        val_dataloader = None

    checkpoint_period = cfg.solver.checkpoint_period
    log_period = cfg.solver.log_period

    do_train(cfg, model, train_dataloader, val_dataloader, optimizer, lr_scheduler,
             checkpointer, device, checkpoint_period, test_period, log_period, arguments)

    return model
def eval_run(run_dir, batch_size=128, epoch=None, verbose=True, tqdm_leave=True):
    cfg = load_config(run_dir)
    datasets_dir = config.get('DATASETS_DIR')
    trn_dl = build_dataloader(datasets_dir, cfg.ds, cfg.split, 'train', batch_size,
                              shuffle=False)
    tst_dl = build_dataloader(datasets_dir, cfg.ds, cfg.split, 'test', batch_size,
                              shuffle=False)

    if epoch is not None:
        print(f'Evaluating {cfg.run} at epoch {epoch}')
        model = load_model(run_dir, cfg, epoch)
        trn_loss, trn_acc = eval_subset(model, trn_dl)
        tst_loss, tst_acc = eval_subset(model, tst_dl)
        print(f'{cfg.run}'
              f' loss=({trn_loss:.2f},{tst_loss:.2f})'
              f' acc=({trn_acc:.2f},{tst_acc:.2f})')
        return

    trn_dir = join(run_dir, 'etrn')
    tst_dir = join(run_dir, 'etst')
    if isdir(trn_dir):
        shutil.rmtree(trn_dir)
    if isdir(tst_dir):
        shutil.rmtree(tst_dir)
    trn_writer = tf.summary.create_file_writer(trn_dir)
    tst_writer = tf.summary.create_file_writer(tst_dir)

    if verbose:
        print(f'Evaluating {cfg.run}')
    best_acc, best_epoch = 0, 0
    for epoch in trange(cfg.epochs, leave=tqdm_leave):
        model = load_model(run_dir, cfg, epoch)
        trn_loss, trn_acc = eval_subset(model, trn_dl)
        tst_loss, tst_acc = eval_subset(model, tst_dl)
        with trn_writer.as_default():
            tf.summary.scalar(f'loss/{cfg.ds}', trn_loss, epoch)
            tf.summary.scalar(f'acc/{cfg.ds}', trn_acc, epoch)
        with tst_writer.as_default():
            tf.summary.scalar(f'loss/{cfg.ds}', tst_loss, epoch)
            tf.summary.scalar(f'acc/{cfg.ds}', tst_acc, epoch)
        if tst_acc > best_acc:
            best_acc, best_epoch = tst_acc, epoch

    firsts = ['run', 'ds', 'split']
    columns = [k for k in sorted(cfg.keys()) if k not in firsts]
    columns = firsts + ['acc', 'epoch'] + columns
    data = dict(cfg)
    data['acc'] = best_acc
    data['epoch'] = best_epoch
    df = pd.DataFrame(data, columns=columns, index=[0])
    df.to_csv(f'{run_dir}/results.csv')
    if verbose:
        print(df.head())
def __init__(self, opt):
    """opt is an argparse.Namespace holding the run options."""
    self.opt = opt
    self.device = self.opt.device
    train_dataloader, test_dataloader, self.train_num, self.test_num = build_dataloader(
        self.device)
    self.dataloader = {'train': train_dataloader, 'test': test_dataloader}
    self.net = PredNet(img_shape=self.opt.shape,
                       num_masks=self.opt.num_masks,
                       is_robot_state_used=1,
                       iter_num=-1,
                       k=900,
                       device=self.device)
    self.net.to(self.device)
    print('Net has', sum(param.numel() for param in self.net.parameters()),
          'parameters...')

    self.mse_loss = torch.nn.MSELoss()
    self.w_state = 1e-4  # TODO problems

    if self.opt.pretrained_model_path:
        self.load_weight()

    self.optimizer = torch.optim.Adam(self.net.parameters(), self.opt.learning_rate)
def __init__(self, name, full_name, datasets_dir, run_dir, cfg):
    self.name = name
    self.writer = tf.summary.create_file_writer(join(run_dir, name))
    self.tasks = []
    batch_size = cfg.train_batch // len(cfg._dss)
    Task = namedtuple('Task', ('name', 'dl', 'loss', 'acc'))
    if name == 'trn':
        transform = cfg.dss_augment
        sampling = cfg.dss_sampling
        # batch_size = cfg.train_tbatch
        shuffle = True
    else:
        transform = False
        sampling = 'fixed'
        # batch_size = cfg.train_ebatch
        shuffle = False
    for ds in cfg._dss:
        dl = build_dataloader(datasets_dir=datasets_dir,
                              ds=ds.name,
                              split=ds.split,
                              subset=full_name,
                              transform=transform,
                              sampling=sampling,
                              cache=cfg.dss_cache,
                              batch_size=batch_size,
                              shuffle=shuffle,
                              num_workers=cfg.dss_num_workers)
        loss = tf.keras.metrics.SparseCategoricalCrossentropy()
        acc = tf.keras.metrics.SparseCategoricalAccuracy()
        self.tasks.append(Task(ds.name, dl, loss, acc))
    self.dls = [task.dl for task in self.tasks]
def train(config, experiment_name=None):
    num_classes = config.MODEL.NUM_CLASSES

    # dataloader for training
    train_period = 'train'
    train_loader = build_dataloader(cfg=config, period=train_period, loader_type='train')
    val_loader = build_dataloader(cfg=config, period=train_period, loader_type='val')

    # prepare model
    model = build_model(cfg=config)

    print('The loss type is', config.MODEL.LOSS_TYPE)
    loss_func = build_loss(config, num_classes)
    optimizer = build_optimizer(config, model)

    # Add for using self trained model
    if config.MODEL.PRETRAIN_CHOICE == 'self':
        # Parse the starting epoch from the checkpoint filename, e.g. 'model_120.pth' -> 120
        start_epoch = eval(
            config.MODEL.PRETRAIN_PATH.split('/')[-1].split('.')[0].split('_')[-1])
        print('Start epoch:', start_epoch)
        path_to_optimizer = config.MODEL.PRETRAIN_PATH.replace('model', 'optimizer')
        print('Path to the checkpoint of optimizer:', path_to_optimizer)
        model.load_state_dict(torch.load(config.MODEL.PRETRAIN_PATH))
        optimizer.load_state_dict(torch.load(path_to_optimizer))

    scheduler = WarmUpMultiStepLR(optimizer, config.SOLVER.STEPS, config.SOLVER.GAMMA,
                                  config.SOLVER.WARMUP_FACTOR, config.SOLVER.WARMUP_ITERS,
                                  config.SOLVER.WARMUP_METHOD)

    print('------------------ Start Training -------------------')
    do_train(config, model, train_loader, val_loader, optimizer, scheduler, loss_func,
             experiment_name)
    print('---------------- Training Completed ----------------')
def train(cfg):
    model_dir = join(config.get('RESULTS_DIR'), cfg.exp_name, cfg.run)
    print(f"Training {cfg.run}")
    cfg.save_params(model_dir)

    datasets_dir = config.get('DATASETS_DIR')
    trn_dl = build_dataloader(datasets_dir, cfg.ds, cfg.split, 'train', cfg.tbatch_size)
    etrn_dl = build_dataloader(datasets_dir, cfg.ds, cfg.split, 'train', cfg.ebatch_size)
    etst_dl = build_dataloader(datasets_dir, cfg.ds, cfg.split, 'test', cfg.ebatch_size)

    num_classes = 51 if cfg.ds == 'hmdb51' else 101
    ModelClass = models.get_model_class(cfg.model)
    model = ModelClass(cfg, num_classes)

    loss_fn = tf.keras.losses.SparseCategoricalCrossentropy()
    optimizer = tf.keras.optimizers.SGD(learning_rate=cfg.lr)

    trn_loss_epoch = tf.keras.metrics.SparseCategoricalCrossentropy()
    trn_acc_epoch = tf.keras.metrics.SparseCategoricalAccuracy()
    tst_loss_epoch = tf.keras.metrics.SparseCategoricalCrossentropy()
    tst_acc_epoch = tf.keras.metrics.SparseCategoricalAccuracy()

    trn_writer = tf.summary.create_file_writer(join(model_dir, 'trn'))
    tst_writer = tf.summary.create_file_writer(join(model_dir, 'tst'))

    trn_eval_step = (etrn_dl, trn_loss_epoch, trn_acc_epoch)
    tst_eval_step = (etst_dl, tst_loss_epoch, tst_acc_epoch)
    trn_eval_epoch = (trn_loss_epoch, trn_acc_epoch, trn_writer)
    tst_eval_epoch = (tst_loss_epoch, tst_acc_epoch, tst_writer)

    weights_dir = join(model_dir, 'weights')
    for epoch in trange(cfg.epochs):
        for x, y_true in trn_dl:
            train_step(x, y_true, model, loss_fn, optimizer)
        eval_step(model, trn_eval_step, tst_eval_step)
        eval_epoch(epoch, cfg.ds, trn_eval_epoch, tst_eval_epoch)
        model.save_weights(join(weights_dir, f'{epoch:03d}.ckpt'))
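# The loop above calls train_step(x, y_true, model, loss_fn, optimizer), which is
# not shown. A minimal sketch of such a step using tf.GradientTape, assuming model
# is a tf.keras.Model; this is an illustrative guess at the helper, not necessarily
# the project's exact implementation.
import tensorflow as tf


@tf.function
def train_step_sketch(x, y_true, model, loss_fn, optimizer):
    with tf.GradientTape() as tape:
        # Forward pass in training mode, then compute the scalar loss.
        y_pred = model(x, training=True)
        loss = loss_fn(y_true, y_pred)
    # Backpropagate and apply one optimizer update.
    grads = tape.gradient(loss, model.trainable_variables)
    optimizer.apply_gradients(zip(grads, model.trainable_variables))
    return loss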
def test(config, experiment_name=None):
    # dataloader for test
    test_period = 'test'
    test_loader = build_dataloader(cfg=config, period=test_period, loader_type='test')

    # prepare model
    model = build_model(cfg=config)
    model.load_param(config.TEST.WEIGHT)

    print('------------------ Start Test -------------------')
    do_test(config, model, test_loader, experiment_name)
    print('---------------- Inference Completed -----------------')
def main():
    global_config = config['Global']
    use_gpu = global_config['use_gpu']
    n_gpus = 1
    device = torch.device('cpu')
    if use_gpu:
        if torch.cuda.is_available():
            n_gpus = torch.cuda.device_count()
            device = torch.device('cuda')
        else:
            logger.warning("No usable GPU device was found; falling back to CPU")

    # Build the eval dataset
    config['Eval']['loader'].update({
        'batch_size':
        config['Eval']['loader']['batch_size_per_card'] * n_gpus
    })
    dataloader = build_dataloader(config, device, logger, 'Eval')
    batch_size = config['Eval']['loader']['batch_size']
    logger.info(f'Eval data: {len(dataloader)} batches, {batch_size} samples per batch')

    post_process_class = build_post_process(config['PostProcess'], global_config)
    if hasattr(post_process_class, 'character'):
        config['Architecture']["Head"]['out_channels'] = len(
            getattr(post_process_class, 'character'))
    model = build_model(config['Architecture'])

    # Load the pretrained model
    state_dict = torch.load(global_config['pretrained_model'],
                            map_location=torch.device('cpu'))
    model.load_state_dict(state_dict)
    model.to(device)

    eval_class = build_metric(config['Metric'])
    metric = train_utils.eval(model, dataloader, post_process_class, eval_class, device)
    logger.info('metric eval ***************')
    for k, v in metric.items():
        logger.info('{}:{}'.format(k, v))
def build_tasks_eval(datasets_dir, run_dir, batch_size, cfg):
    """Builds tasks evaluation object."""
    TasksEval = namedtuple('TasksEval', ('etrn', 'etst'))
    Subset = namedtuple('Subset', ('tasks', 'writer'))
    Task = namedtuple('Task', ('name', 'dl', 'loss', 'acc'))
    subsets = []
    for alias, name in zip(('etrn', 'etst'), ('train', 'test')):
        tasks = []
        for ds in cfg._dss:
            dl = build_dataloader(datasets_dir=datasets_dir,
                                  ds=ds.name,
                                  split=ds.split,
                                  subset=name,
                                  batch_size=batch_size,
                                  cache=True)
            loss = tf.keras.metrics.SparseCategoricalCrossentropy()
            acc = tf.keras.metrics.SparseCategoricalAccuracy()
            tasks.append(Task(ds.name, dl, loss, acc))
        writer = tf.summary.create_file_writer(join(run_dir, alias))
        subsets.append(Subset(tasks, writer))
    tasks_eval = TasksEval(*subsets)
    return tasks_eval
def __init__(self, opt):
    self.opt = opt
    self.device = self.opt.device
    train_dataloader, valid_dataloader = build_dataloader(opt)
    self.dataloader = {
        'train': train_dataloader,
        'valid': valid_dataloader
    }
    self.net = network(self.opt.channels, self.opt.height, self.opt.width, -1,
                       self.opt.schedsamp_k, self.opt.use_state, self.opt.num_masks,
                       self.opt.model == 'STP', self.opt.model == 'CDNA',
                       self.opt.model == 'DNA', self.opt.context_frames)
    self.net.to(self.device)

    self.mse_loss = nn.MSELoss()
    self.w_state = 1e-4

    if self.opt.pretrained_model:
        self.load_weight()

    self.optimizer = torch.optim.Adam(self.net.parameters(), self.opt.learning_rate)
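# Both trainer constructors above call self.load_weight() when a pretrained model
# path is given, but the method itself is not shown. A minimal sketch, assuming the
# checkpoint is a plain state_dict saved with torch.save; the name load_weight_sketch
# and the checkpoint format are assumptions, and the attribute holding the path is
# opt.pretrained_model here (opt.pretrained_model_path in the other trainer).
def load_weight_sketch(self):
    # Load the checkpoint onto the trainer's device and restore network weights.
    state_dict = torch.load(self.opt.pretrained_model, map_location=self.device)
    self.net.load_state_dict(state_dict)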
except:
    # Fallback: regenerate the QRS segments from the raw ECG records and cache them.
    signals, labels = get_ecg(PATH, length=LENGTH)
    segments = np.zeros((245990, 1001))
    k = 0
    for i, record in enumerate(signals):
        rp = qrs_detection(record, sample_rate=FS)
        seg = get_segments(record, rp, labels[i])
        if seg is not None:
            segments[k:k + seg.shape[0], :] = seg
            k += seg.shape[0]
    del signals, labels
    np.save('./data/segment.npy', segments)

X, y = segments[:, :-1], segments[:, -1][:, np.newaxis]
del segments
train, test = build_dataloader(X, y, resamp=RESAMP, batch_size=BATCH_SIZE)
del X, y

net = cnn_feed_lstm()
try:
    params = torch.load("../params/net_0.81.pkl")
    net.load_state_dict(params["model_state_dict"])
except:
    pass

loss, val_score = learn(net, train, test, lr=LR, epoch=EPOCH)
plot(loss, val_score)
def main(config):
    os.environ['CUDA_VISIBLE_DEVICES'] = config.GPU

    if not config.EVAL_MODE:
        sys.stdout = Logger(osp.join(config.OUTPUT, 'log_train.txt'))
    else:
        sys.stdout = Logger(osp.join(config.OUTPUT, 'log_test.txt'))
    print("==========\nConfig:{}\n==========".format(config))
    print("Currently using GPU {}".format(config.GPU))

    # Set random seed
    set_seed(config.SEED)

    # Build dataloader
    trainloader, queryloader, galleryloader, num_classes = build_dataloader(config)
    # Build model
    model, classifier = build_model(config, num_classes)
    # Build classification and pairwise loss
    criterion_cla, criterion_pair = build_losses(config)

    # Build optimizer
    parameters = list(model.parameters()) + list(classifier.parameters())
    if config.TRAIN.OPTIMIZER.NAME == 'adam':
        optimizer = optim.Adam(parameters,
                               lr=config.TRAIN.OPTIMIZER.LR,
                               weight_decay=config.TRAIN.OPTIMIZER.WEIGHT_DECAY)
    elif config.TRAIN.OPTIMIZER.NAME == 'adamw':
        optimizer = optim.AdamW(parameters,
                                lr=config.TRAIN.OPTIMIZER.LR,
                                weight_decay=config.TRAIN.OPTIMIZER.WEIGHT_DECAY)
    elif config.TRAIN.OPTIMIZER.NAME == 'sgd':
        optimizer = optim.SGD(parameters,
                              lr=config.TRAIN.OPTIMIZER.LR,
                              momentum=0.9,
                              weight_decay=config.TRAIN.OPTIMIZER.WEIGHT_DECAY,
                              nesterov=True)
    else:
        raise KeyError("Unknown optimizer: {}".format(config.TRAIN.OPTIMIZER.NAME))

    # Build lr_scheduler
    scheduler = lr_scheduler.MultiStepLR(optimizer,
                                         milestones=config.TRAIN.LR_SCHEDULER.STEPSIZE,
                                         gamma=config.TRAIN.LR_SCHEDULER.DECAY_RATE)

    start_epoch = config.TRAIN.START_EPOCH
    if config.MODEL.RESUME:
        print("Loading checkpoint from '{}'".format(config.MODEL.RESUME))
        checkpoint = torch.load(config.MODEL.RESUME)
        model.load_state_dict(checkpoint['state_dict'])
        start_epoch = checkpoint['epoch']

    model = nn.DataParallel(model).cuda()
    classifier = nn.DataParallel(classifier).cuda()

    if config.EVAL_MODE:
        print("Evaluate only")
        test(model, queryloader, galleryloader)
        return

    start_time = time.time()
    train_time = 0
    best_rank1 = -np.inf
    best_epoch = 0
    print("==> Start training")
    for epoch in range(start_epoch, config.TRAIN.MAX_EPOCH):
        start_train_time = time.time()
        train(epoch, model, classifier, criterion_cla, criterion_pair, optimizer,
              trainloader)
        train_time += round(time.time() - start_train_time)

        if (epoch+1) > config.TEST.START_EVAL and config.TEST.EVAL_STEP > 0 and \
           (epoch+1) % config.TEST.EVAL_STEP == 0 or (epoch+1) == config.TRAIN.MAX_EPOCH:
            print("==> Test")
            rank1 = test(model, queryloader, galleryloader)
            is_best = rank1 > best_rank1
            if is_best:
                best_rank1 = rank1
                best_epoch = epoch + 1

            state_dict = model.module.state_dict()
            save_checkpoint(
                {
                    'state_dict': state_dict,
                    'rank1': rank1,
                    'epoch': epoch,
                }, is_best,
                osp.join(config.OUTPUT, 'checkpoint_ep' + str(epoch + 1) + '.pth.tar'))

        scheduler.step()

    print("==> Best Rank-1 {:.1%}, achieved at epoch {}".format(best_rank1, best_epoch))

    elapsed = round(time.time() - start_time)
    elapsed = str(datetime.timedelta(seconds=elapsed))
    train_time = str(datetime.timedelta(seconds=train_time))
    print("Finished. Total elapsed time (h:m:s): {}. Training time (h:m:s): {}.".format(
        elapsed, train_time))
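# The training loop above persists checkpoints via save_checkpoint(state, is_best,
# fpath), which is not shown. A minimal sketch of that common helper, assuming it
# serializes the state dict and keeps a copy of the best-performing model; the
# _sketch name and 'best_model.pth.tar' filename are illustrative assumptions.
import os
import shutil

import torch


def save_checkpoint_sketch(state, is_best, fpath):
    # Ensure the output directory exists, then write the checkpoint.
    os.makedirs(os.path.dirname(fpath), exist_ok=True)
    torch.save(state, fpath)
    if is_best:
        # Keep a separate copy of the best checkpoint seen so far.
        shutil.copy(fpath, os.path.join(os.path.dirname(fpath), 'best_model.pth.tar'))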