def train_model(rank, world_size, args):
    if 1 < args.n_gpu:
        init_process_group(rank, world_size)
    master = (world_size == 0 or rank % world_size == 0)
    if master:
        wandb.init(project="transformer-evolution")

    vocab = load_vocab(args.vocab)

    config = cfg.Config.load(args.config)
    config.n_enc_vocab, config.n_dec_vocab = len(vocab), len(vocab)
    config.device = torch.device(f"cuda:{rank}" if torch.cuda.is_available() else "cpu")
    print(config)

    best_epoch, best_loss, best_score = 0, 0, 0
    model = transformer.QA(config)
    if os.path.isfile(args.save):
        best_epoch, best_loss, best_score = model.load(args.save)
        print(f"rank: {rank} load state dict from: {args.save}")
    if 1 < args.n_gpu:
        model.to(config.device)
        model = DistributedDataParallel(model, device_ids=[rank], find_unused_parameters=True)
    else:
        model.to(config.device)
    if master:
        wandb.watch(model)

    criterion = torch.nn.CrossEntropyLoss()

    train_loader, train_sampler = data.build_data_loader(vocab, "KorQuAD_v1.0_train.json", args, shuffle=True)
    # Evaluate on the KorQuAD dev split, not the training file.
    test_loader, _ = data.build_data_loader(vocab, "KorQuAD_v1.0_dev.json", args, shuffle=False)

    t_total = len(train_loader) * args.epoch
    no_decay = ['bias', 'LayerNorm.weight']
    optimizer_grouped_parameters = [
        {'params': [p for n, p in model.named_parameters() if not any(nd in n for nd in no_decay)], 'weight_decay': args.weight_decay},
        {'params': [p for n, p in model.named_parameters() if any(nd in n for nd in no_decay)], 'weight_decay': 0.0}
    ]
    optimizer = optim.AdamW(optimizer_grouped_parameters, lr=args.learning_rate, eps=args.adam_epsilon)
    scheduler = optim.get_linear_schedule_with_warmup(optimizer, num_warmup_steps=args.warmup_steps, num_training_steps=t_total)

    offset = best_epoch
    for step in trange(args.epoch, desc="Epoch"):
        if train_sampler:
            train_sampler.set_epoch(step)
        epoch = step + offset

        loss = train_epoch(config, rank, epoch, model, criterion, optimizer, scheduler, train_loader)
        score = eval_epoch(config, rank, model, test_loader)
        if master:
            wandb.log({"loss": loss, "accuracy": score})

        if master and best_score < score:
            best_epoch, best_loss, best_score = epoch, loss, score
            if isinstance(model, DistributedDataParallel):
                model.module.save(best_epoch, best_loss, best_score, args.save)
            else:
                model.save(best_epoch, best_loss, best_score, args.save)
            print(f">>>> rank: {rank} save model to {args.save}, epoch={best_epoch}, loss={best_loss:.3f}, score={best_score:.3f}")

    if 1 < args.n_gpu:
        destroy_process_group()
def train_model(rank, world_size, args):
    """ Run training """
    if 1 < args.n_gpu:
        init_process_group(rank, world_size)
    master = (world_size == 0 or rank % world_size == 0)

    vocab = load_vocab(args.vocab)

    config = cfg.Config.load(args.config)
    config.n_enc_vocab = len(vocab)
    # Check whether a GPU is available.
    config.device = torch.device(f"cuda:{rank}" if torch.cuda.is_available() else "cpu")
    print(config)

    best_epoch, best_loss = 0, 0

    # Create the BERTPretrain model.
    model = bert.BERTPretrain(config)
    # If a previously pretrained checkpoint exists, load it.
    if os.path.isfile(args.save):
        best_epoch, best_loss = model.bert.load(args.save)
        print(f"rank: {rank} load pretrain from: {args.save}, epoch={best_epoch}, loss={best_loss}")
        best_epoch += 1
    # Place BERTPretrain on the GPU or CPU.
    if 1 < args.n_gpu:
        model.to(config.device)
        model = DistributedDataParallel(model, device_ids=[rank], find_unused_parameters=True)
    else:
        model.to(config.device)

    # Declare the MLM loss (criterion_lm) and NSP loss (criterion_cls) functions.
    criterion_lm = torch.nn.CrossEntropyLoss(ignore_index=-1, reduction='mean')
    criterion_cls = torch.nn.CrossEntropyLoss()

    train_loader = data.build_pretrain_loader(vocab, args, epoch=best_epoch, shuffle=True)

    t_total = len(train_loader) * args.epoch
    no_decay = ['bias', 'LayerNorm.weight']
    optimizer_grouped_parameters = [
        {'params': [p for n, p in model.named_parameters() if not any(nd in n for nd in no_decay)], 'weight_decay': args.weight_decay},
        {'params': [p for n, p in model.named_parameters() if any(nd in n for nd in no_decay)], 'weight_decay': 0.0}
    ]
    # Declare the optimizer and the linear warmup scheduler.
    optimizer = optim.AdamW(optimizer_grouped_parameters, lr=args.learning_rate, eps=args.adam_epsilon)
    scheduler = optim.get_linear_schedule_with_warmup(optimizer, num_warmup_steps=args.warmup_steps, num_training_steps=t_total)

    offset = best_epoch
    losses = []
    for step in trange(args.epoch, desc="Epoch"):
        epoch = step + offset
        # Build a new train_loader for every epoch.
        # Skip step 0 because the loader was already built above.
        if 0 < step:
            del train_loader
            train_loader = data.build_pretrain_loader(vocab, args, epoch=epoch, shuffle=True)

        # Train for one epoch.
        loss = train_epoch(config, rank, epoch, model, criterion_lm, criterion_cls, optimizer, scheduler, train_loader)
        losses.append(loss)

        if master:
            best_epoch, best_loss = epoch, loss
            if isinstance(model, DistributedDataParallel):
                model.module.bert.save(best_epoch, best_loss, args.save)
            else:
                model.bert.save(best_epoch, best_loss, args.save)
            print(f">>>> rank: {rank} save model to {args.save}, epoch={best_epoch}, loss={best_loss:.3f}")

    print(f">>>> rank: {rank} losses: {losses}")

    if 1 < args.n_gpu:
        destroy_process_group()
def train_model(rank, world_size, args): """ 모델 학습 """ if 1 < args.n_gpu: init_process_group(rank, world_size) master = (world_size == 0 or rank % world_size == 0) if master and args.wandb: wandb.init(project=args.project) vocab = load_vocab(args.vocab) config = Config.load(args.config) config.n_enc_vocab = len(vocab) config.device = f"cuda:{rank}" if torch.cuda.is_available() else "cpu" print(config) best_epoch, best_loss = 0, 0 train_model = ALBERTPretrain(config) if os.path.isfile(args.pretrain_save): try: best_epoch, best_loss = train_model.albert.load(args.pretrain_save) print( f"load pretrain from: {os.path.basename(args.pretrain_save)}, epoch={best_epoch}, loss={best_loss:.4f}" ) except: print(f'load {os.path.basename(args.pretrain_save)} failed.') if 1 < args.n_gpu: train_model.to(config.device) # noinspection PyArgumentList train_model = DistributedDataParallel(train_model, device_ids=[rank], find_unused_parameters=True) else: train_model.to(config.device) if master and args.wandb: wandb.watch(train_model) criterion_lm = torch.nn.CrossEntropyLoss(ignore_index=-1, reduction='mean') criterion_cls = torch.nn.CrossEntropyLoss() train_loader: DataLoader = data.build_pretrain_loader(vocab, args, shuffle=True) t_total = len(train_loader) * args.epoch no_decay = ['bias', 'LayerNorm.weight'] optimizer_grouped_parameters = [{ 'params': [ p for n, p in train_model.named_parameters() if not any(nd in n for nd in no_decay) ], 'weight_decay': config.weight_decay }, { 'params': [ p for n, p in train_model.named_parameters() if any(nd in n for nd in no_decay) ], 'weight_decay': 0.0 }] optimizer = optim.AdamW(optimizer_grouped_parameters, lr=config.learning_rate, eps=config.adam_epsilon) scheduler = optim.get_linear_schedule_with_warmup( optimizer, num_warmup_steps=config.warmup_steps, num_training_steps=t_total) start_epoch = best_epoch + 1 losses = [] with trange(args.epoch, desc="Epoch", position=0) as pbar: pbar.set_postfix_str( f"best epoch: {best_epoch}, loss: {best_loss:.4f}") for step in pbar: epoch = step + start_epoch loss = train_epoch(config, rank, train_model, criterion_lm, criterion_cls, optimizer, scheduler, train_loader) losses.append(loss) if master and args.wandb: wandb.log({"loss": loss}) if master: best_epoch, best_loss = epoch, loss if isinstance(train_model, DistributedDataParallel): train_model.module.albert.save(best_epoch, best_loss, args.pretrain_save) else: train_model.albert.save(best_epoch, best_loss, args.pretrain_save) pbar.set_postfix_str( f"best epoch: {best_epoch}, loss: {best_loss:.4f}") if 1 < args.n_gpu: destroy_process_group()
def train_model(rank, world_size, args):
    if 1 < args.n_gpu:
        init_process_group(rank, world_size)
    master = (world_size == 0 or rank % world_size == 0)

    vocab = load_vocab(args.vocab)

    config = cfg.Config.load(args.config)
    config.n_enc_vocab, config.n_dec_vocab = len(vocab), len(vocab)
    config.device = torch.device(f"cuda:{rank}" if torch.cuda.is_available() else "cpu")
    print(config)

    best_epoch, best_loss = 0, 0
    model = albert.ALBERTPretrain(config)
    if os.path.isfile(args.save):
        model.albert.load(args.save)
        print(f"rank: {rank} load pretrain from: {args.save}")
    if 1 < args.n_gpu:
        model.to(config.device)
        model = DistributedDataParallel(model, device_ids=[rank], find_unused_parameters=True)
    else:
        model.to(config.device)

    criterion_lm = torch.nn.CrossEntropyLoss(ignore_index=-1, reduction='mean')
    criterion_cls = torch.nn.CrossEntropyLoss()

    train_loader, train_sampler = data.build_pretrain_loader(vocab, args, shuffle=True)

    t_total = len(train_loader) * args.epoch
    no_decay = ['bias', 'LayerNorm.weight']
    optimizer_grouped_parameters = [
        {'params': [p for n, p in model.named_parameters() if not any(nd in n for nd in no_decay)], 'weight_decay': config.weight_decay},
        {'params': [p for n, p in model.named_parameters() if any(nd in n for nd in no_decay)], 'weight_decay': 0.0}
    ]
    optimizer = optim.AdamW(optimizer_grouped_parameters, lr=config.learning_rate, eps=config.adam_epsilon)
    scheduler = optim.get_linear_schedule_with_warmup(optimizer, num_warmup_steps=config.warmup_steps, num_training_steps=t_total)

    offset = best_epoch
    for step in trange(args.epoch, desc="Epoch"):
        if train_sampler:
            train_sampler.set_epoch(step)
        epoch = step + offset

        loss = train_epoch(config, rank, epoch, model, criterion_lm, criterion_cls, optimizer, scheduler, train_loader)

        if master:
            best_epoch, best_loss = epoch, loss
            if isinstance(model, DistributedDataParallel):
                model.module.albert.save(best_epoch, best_loss, args.save)
            else:
                model.albert.save(best_epoch, best_loss, args.save)
            print(f">>>> rank: {rank} save model to {args.save}, epoch={best_epoch}, loss={best_loss:.3f}")

    if 1 < args.n_gpu:
        destroy_process_group()
def train_model(rank, world_size, args): """ 모델 학습 """ master = (world_size == 0 or rank % world_size == 0) if master and args.wandb: wandb.init(project=args.project, resume=args.name, tags=args.tags) if 1 < args.n_gpu: init_process_group(rank, world_size) vocab = load_vocab(args.vocab) config = Config.load(args.config) config.n_enc_vocab, config.n_dec_vocab = len(vocab), len(vocab) config.device = torch.device(f"cuda:{rank}" if torch.cuda.is_available() else "cpu") print(config) best_epoch, best_loss, best_score = 0, 0, 0 model: MovieClassification = transformer.MovieClassification(config) if args.resume and os.path.isfile(args.save): best_epoch, best_loss, best_score = model.load(args.save) print(f"rank: {rank}, last epoch: {best_epoch} load state dict from: {os.path.basename(args.save)}") model.to(config.device) if master and args.wandb: wandb.watch(model) if 1 < args.n_gpu: model = DistributedDataParallel(model, device_ids=[rank], find_unused_parameters=True) criterion = torch.nn.CrossEntropyLoss() train_loader, train_sampler = data.build_data_loader(rank, vocab, os.path.abspath(os.path.join(os.getcwd(), args.data_dir, "ratings_train.json")), args, shuffle=True) test_loader, test_sampler = data.build_data_loader(rank, vocab, os.path.abspath(os.path.join(os.getcwd(), args.data_dir, "ratings_test.json")), args, shuffle=False) t_total = len(train_loader) * args.epoch no_decay = ['bias', 'LayerNorm.weight'] optimizer_grouped_parameters = [ {'params': [p for n, p in model.named_parameters() if not any(nd in n for nd in no_decay)], 'weight_decay': args.weight_decay}, {'params': [p for n, p in model.named_parameters() if any(nd in n for nd in no_decay)], 'weight_decay': 0.0} ] optimizer = optimization.AdamW(optimizer_grouped_parameters, lr=args.learning_rate, eps=args.adam_epsilon) scheduler = optimization.get_linear_schedule_with_warmup(optimizer, num_warmup_steps=args.warmup_steps, num_training_steps=t_total, last_epoch=best_epoch) print(f'total_memory: {torch.cuda.get_device_properties(rank).total_memory / (1024 * 1024):.3f} MB') with tqdm(initial=best_epoch + 1, total=args.epoch, position=0) as pbar: for epoch in range(best_epoch + 1, args.epoch + 1): if train_sampler: train_sampler.set_epoch(epoch) train_loss = train_epoch(args, config, rank, epoch, model, criterion, optimizer, scheduler, train_loader) test_loss, test_accuracy = eval_epoch(config, rank, model, test_loader, test_sampler) if master and args.wandb: wandb.config.update(args) wandb.log(row={"train loss": train_loss, "accuracy": test_accuracy}, step=epoch) if master: if best_score < test_accuracy: best_epoch, best_loss, best_score = epoch, train_loss, test_accuracy pbar.set_description(f'Best (score={best_score:.3f}, epoch={best_epoch})') if isinstance(model, DistributedDataParallel): model.module.save(best_epoch, best_loss, best_score, args.save) else: model.save(best_epoch, best_loss, best_score, args.save) else: if best_epoch + 5 < epoch: # early stop break pbar.update() break print(f'total_memory: {torch.cuda.get_device_properties(rank).total_memory / (1024 * 1024):.3f} MB') if master and args.wandb: wandb.save(args.name) if 1 < args.n_gpu: destroy_process_group()