Example #1
def train_model(rank, world_size, args):
    if 1 < args.n_gpu:
        init_process_group(rank, world_size)
    master = (world_size == 0 or rank % world_size == 0)
    if master: wandb.init(project="transformer-evolution")

    vocab = load_vocab(args.vocab)

    config = cfg.Config.load(args.config)
    config.n_enc_vocab, config.n_dec_vocab = len(vocab), len(vocab)
    config.device = torch.device(f"cuda:{rank}" if torch.cuda.is_available() else "cpu")
    print(config)

    best_epoch, best_loss, best_score = 0, 0, 0
    model = transformer.QA(config)
    if os.path.isfile(args.save):
        best_epoch, best_loss, best_score = model.load(args.save)
        print(f"rank: {rank} load state dict from: {args.save}")
    if 1 < args.n_gpu:
        model.to(config.device)
        model = DistributedDataParallel(model, device_ids=[rank], find_unused_parameters=True)
    else:
        model.to(config.device)
    if master: wandb.watch(model)

    criterion = torch.nn.CrossEntropyLoss()

    train_loader, train_sampler = data.build_data_loader(vocab, "KorQuAD_v1.0_train.json", args, shuffle=True)
    test_loader, _ = data.build_data_loader(vocab, "KorQuAD_v1.0_dev.json", args, shuffle=False)  # evaluate on the held-out dev set

    t_total = len(train_loader) * args.epoch
    no_decay = ['bias', 'LayerNorm.weight']
    optimizer_grouped_parameters = [
        {'params': [p for n, p in model.named_parameters() if not any(nd in n for nd in no_decay)], 'weight_decay': args.weight_decay},
        {'params': [p for n, p in model.named_parameters() if any(nd in n for nd in no_decay)], 'weight_decay': 0.0}
    ]
    optimizer = optim.AdamW(optimizer_grouped_parameters, lr=args.learning_rate, eps=args.adam_epsilon)
    scheduler = optim.get_linear_schedule_with_warmup(optimizer, num_warmup_steps=args.warmup_steps, num_training_steps=t_total)

    offset = best_epoch
    for step in trange(args.epoch, desc="Epoch"):
        if train_sampler:
            train_sampler.set_epoch(step)
        epoch = step + offset

        loss = train_epoch(config, rank, epoch, model, criterion, optimizer, scheduler, train_loader)
        score = eval_epoch(config, rank, model, test_loader)
        if master: wandb.log({"loss": loss, "accuracy": score})

        if master and best_score < score:
            best_epoch, best_loss, best_score = epoch, loss, score
            if isinstance(model, DistributedDataParallel):
                model.module.save(best_epoch, best_loss, best_score, args.save)
            else:
                model.save(best_epoch, best_loss, best_score, args.save)
            print(f">>>> rank: {rank} save model to {args.save}, epoch={best_epoch}, loss={best_loss:.3f}, socre={best_score:.3f}")

    if 1 < args.n_gpu:
        destroy_process_group()
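
All five examples call init_process_group(rank, world_size) and destroy_process_group() without defining them. A minimal sketch of what such wrappers might look like, assuming an NCCL backend and environment-variable rendezvous (only the call signatures come from the examples; the bodies are assumptions):

import os
import torch.distributed as dist

def init_process_group(rank, world_size):
    # Assumed rendezvous settings; the originals are not shown.
    os.environ.setdefault("MASTER_ADDR", "127.0.0.1")
    os.environ.setdefault("MASTER_PORT", "29500")
    dist.init_process_group(backend="nccl", rank=rank, world_size=world_size)

def destroy_process_group():
    dist.destroy_process_group()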
Example #2
def train_model(rank, world_size, args):
    if 1 < args.n_gpu:
        init_process_group(rank, world_size)
    master = (world_size == 0 or rank % world_size == 0)

    vocab = load_vocab(args.vocab)

    config = cfg.Config.load(args.config)
    config.n_enc_vocab = len(vocab)
    # Check whether a GPU is available.
    config.device = torch.device(
        f"cuda:{rank}" if torch.cuda.is_available() else "cpu")
    print(config)

    best_epoch, best_loss = 0, 0
    """학습 실행"""
    # BERTPretrain을 생성합니다.
    model = bert.BERTPretrain(config)
    # If a previously trained pretrain checkpoint exists, load it.
    if os.path.isfile(args.save):
        best_epoch, best_loss = model.bert.load(args.save)
        print(
            f"rank: {rank} load pretrain from: {args.save}, epoch={best_epoch}, loss={best_loss}"
        )
        best_epoch += 1
    # Move BERTPretrain to the GPU or CPU.
    if 1 < args.n_gpu:
        model.to(config.device)
        model = DistributedDataParallel(model,
                                        device_ids=[rank],
                                        find_unused_parameters=True)
    else:
        model.to(config.device)

    # Declare the MLM loss (criterion_lm) and NSP loss (criterion_cls) functions.
    criterion_lm = torch.nn.CrossEntropyLoss(ignore_index=-1, reduction='mean')
    criterion_cls = torch.nn.CrossEntropyLoss()

    train_loader = data.build_pretrain_loader(vocab,
                                              args,
                                              epoch=best_epoch,
                                              shuffle=True)

    t_total = len(train_loader) * args.epoch
    no_decay = ['bias', 'LayerNorm.weight']
    optimizer_grouped_parameters = [
        {'params': [p for n, p in model.named_parameters()
                    if not any(nd in n for nd in no_decay)],
         'weight_decay': args.weight_decay},
        {'params': [p for n, p in model.named_parameters()
                    if any(nd in n for nd in no_decay)],
         'weight_decay': 0.0}
    ]
    # Declare the optimizer.
    optimizer = optim.AdamW(optimizer_grouped_parameters,
                            lr=args.learning_rate,
                            eps=args.adam_epsilon)
    scheduler = optim.get_linear_schedule_with_warmup(
        optimizer,
        num_warmup_steps=args.warmup_steps,
        num_training_steps=t_total)

    offset = best_epoch
    losses = []
    for step in trange(args.epoch, desc="Epoch"):
        epoch = step + offset
        # Create a new train_loader for each epoch.
        # Skip step 0 because the loader was already created above.
        if 0 < step:
            del train_loader
            train_loader = data.build_pretrain_loader(vocab,
                                                      args,
                                                      epoch=epoch,
                                                      shuffle=True)

        # Train for one epoch.
        loss = train_epoch(config, rank, epoch, model, criterion_lm,
                           criterion_cls, optimizer, scheduler, train_loader)
        losses.append(loss)

        if master:
            best_epoch, best_loss = epoch, loss
            if isinstance(model, DistributedDataParallel):
                model.module.bert.save(best_epoch, best_loss, args.save)
            else:
                model.bert.save(best_epoch, best_loss, args.save)
            print(
                f">>>> rank: {rank} save model to {args.save}, epoch={best_epoch}, loss={best_loss:.3f}"
            )

    print(f">>>> rank: {rank} losses: {losses}")
    if 1 < args.n_gpu:
        destroy_process_group()
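
Each train_model(rank, world_size, args) is written to be run once per GPU. A sketch of a launcher, assuming torch.multiprocessing.spawn (the launcher is not shown in the examples; parse_args is a hypothetical argument parser):

import torch.multiprocessing as mp

if __name__ == "__main__":
    args = parse_args()  # hypothetical; supplies n_gpu, vocab, config, ...
    if 1 < args.n_gpu:
        # spawn passes the process index (rank) as the first argument.
        mp.spawn(train_model, args=(args.n_gpu, args), nprocs=args.n_gpu)
    else:
        # world_size == 0 marks the single-process case, matching the
        # master check at the top of train_model.
        train_model(0, 0, args)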
Example #3
def train_model(rank, world_size, args):
    """ 모델 학습 """
    if 1 < args.n_gpu:
        init_process_group(rank, world_size)
    master = (world_size == 0 or rank % world_size == 0)
    if master and args.wandb:
        wandb.init(project=args.project)

    vocab = load_vocab(args.vocab)

    config = Config.load(args.config)
    config.n_enc_vocab = len(vocab)
    config.device = f"cuda:{rank}" if torch.cuda.is_available() else "cpu"
    print(config)

    best_epoch, best_loss = 0, 0
    model = ALBERTPretrain(config)  # avoid shadowing the enclosing train_model function
    if os.path.isfile(args.pretrain_save):
        try:
            best_epoch, best_loss = model.albert.load(args.pretrain_save)
            print(
                f"load pretrain from: {os.path.basename(args.pretrain_save)}, epoch={best_epoch}, loss={best_loss:.4f}"
            )
        except Exception:
            print(f'load {os.path.basename(args.pretrain_save)} failed.')

    if 1 < args.n_gpu:
        model.to(config.device)
        model = DistributedDataParallel(model,
                                        device_ids=[rank],
                                        find_unused_parameters=True)
    else:
        model.to(config.device)

    if master and args.wandb:
        wandb.watch(model)

    criterion_lm = torch.nn.CrossEntropyLoss(ignore_index=-1, reduction='mean')
    criterion_cls = torch.nn.CrossEntropyLoss()

    train_loader: DataLoader = data.build_pretrain_loader(vocab,
                                                          args,
                                                          shuffle=True)

    t_total = len(train_loader) * args.epoch
    no_decay = ['bias', 'LayerNorm.weight']
    optimizer_grouped_parameters = [
        {'params': [p for n, p in model.named_parameters()
                    if not any(nd in n for nd in no_decay)],
         'weight_decay': config.weight_decay},
        {'params': [p for n, p in model.named_parameters()
                    if any(nd in n for nd in no_decay)],
         'weight_decay': 0.0}
    ]
    optimizer = optim.AdamW(optimizer_grouped_parameters,
                            lr=config.learning_rate,
                            eps=config.adam_epsilon)
    scheduler = optim.get_linear_schedule_with_warmup(
        optimizer,
        num_warmup_steps=config.warmup_steps,
        num_training_steps=t_total)

    start_epoch = best_epoch + 1
    losses = []
    with trange(args.epoch, desc="Epoch", position=0) as pbar:
        pbar.set_postfix_str(
            f"best epoch: {best_epoch}, loss: {best_loss:.4f}")
        for step in pbar:
            epoch = step + start_epoch

            loss = train_epoch(config, rank, model, criterion_lm,
                               criterion_cls, optimizer, scheduler,
                               train_loader)
            losses.append(loss)
            if master and args.wandb:
                wandb.log({"loss": loss})

            if master:
                best_epoch, best_loss = epoch, loss
                if isinstance(model, DistributedDataParallel):
                    model.module.albert.save(best_epoch, best_loss,
                                             args.pretrain_save)
                else:
                    model.albert.save(best_epoch, best_loss,
                                      args.pretrain_save)

                pbar.set_postfix_str(
                    f"best epoch: {best_epoch}, loss: {best_loss:.4f}")

    if 1 < args.n_gpu:
        destroy_process_group()
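
Every example repeats the same two-group weight-decay split over no_decay = ['bias', 'LayerNorm.weight']. A small helper capturing the pattern (a refactoring sketch, not part of the original code):

def group_parameters(model, weight_decay, no_decay=('bias', 'LayerNorm.weight')):
    # Parameters whose names contain a no_decay substring get weight_decay=0.0.
    decay, skip = [], []
    for name, param in model.named_parameters():
        (skip if any(nd in name for nd in no_decay) else decay).append(param)
    return [{'params': decay, 'weight_decay': weight_decay},
            {'params': skip, 'weight_decay': 0.0}]

# usage:
# optimizer = optim.AdamW(group_parameters(model, config.weight_decay),
#                         lr=config.learning_rate, eps=config.adam_epsilon)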
Example #4
def train_model(rank, world_size, args):
    if 1 < args.n_gpu:
        init_process_group(rank, world_size)
    master = (world_size == 0 or rank % world_size == 0)

    vocab = load_vocab(args.vocab)

    config = cfg.Config.load(args.config)
    config.n_enc_vocab, config.n_dec_vocab = len(vocab), len(vocab)
    config.device = torch.device(
        f"cuda:{rank}" if torch.cuda.is_available() else "cpu")
    print(config)

    best_epoch, best_loss = 0, 0
    model = albert.ALBERTPretrain(config)
    if os.path.isfile(args.save):
        best_epoch, best_loss = model.albert.load(args.save)
        print(f"rank: {rank} load pretrain from: {args.save}, epoch={best_epoch}, loss={best_loss}")
    if 1 < args.n_gpu:
        model.to(config.device)
        model = DistributedDataParallel(model,
                                        device_ids=[rank],
                                        find_unused_parameters=True)
    else:
        model.to(config.device)

    criterion_lm = torch.nn.CrossEntropyLoss(ignore_index=-1, reduction='mean')
    criterion_cls = torch.nn.CrossEntropyLoss()

    train_loader, train_sampler = data.build_pretrain_loader(vocab,
                                                             args,
                                                             shuffle=True)

    t_total = len(train_loader) * args.epoch
    no_decay = ['bias', 'LayerNorm.weight']
    optimizer_grouped_parameters = [
        {'params': [p for n, p in model.named_parameters()
                    if not any(nd in n for nd in no_decay)],
         'weight_decay': config.weight_decay},
        {'params': [p for n, p in model.named_parameters()
                    if any(nd in n for nd in no_decay)],
         'weight_decay': 0.0}
    ]
    optimizer = optim.AdamW(optimizer_grouped_parameters,
                            lr=config.learning_rate,
                            eps=config.adam_epsilon)
    scheduler = optim.get_linear_schedule_with_warmup(
        optimizer,
        num_warmup_steps=config.warmup_steps,
        num_training_steps=t_total)

    offset = best_epoch
    for step in trange(args.epoch, desc="Epoch"):
        if train_sampler:
            train_sampler.set_epoch(step)
        epoch = step + offset

        loss = train_epoch(config, rank, epoch, model, criterion_lm,
                           criterion_cls, optimizer, scheduler, train_loader)

        if master:
            best_epoch, best_loss = epoch, loss
            if isinstance(model, DistributedDataParallel):
                model.module.albert.save(best_epoch, best_loss, args.save)
            else:
                model.albert.save(best_epoch, best_loss, args.save)
            print(
                f">>>> rank: {rank} save model to {args.save}, epoch={best_epoch}, loss={best_loss:.3f}"
            )

    if 1 < args.n_gpu:
        destroy_process_group()
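
Example #4's build_pretrain_loader returns a (loader, sampler) pair so the loop can call sampler.set_epoch each epoch. A sketch of what such a builder might look like, assuming a hypothetical PretrainDataset and an args.batch_size field:

from torch.utils.data import DataLoader
from torch.utils.data.distributed import DistributedSampler

def build_pretrain_loader(vocab, args, shuffle=True):
    dataset = PretrainDataset(vocab, args)  # hypothetical dataset class
    if 1 < args.n_gpu:
        # DistributedSampler shards the data across ranks; per-epoch
        # reshuffling requires sampler.set_epoch(epoch) in the training loop.
        sampler = DistributedSampler(dataset, shuffle=shuffle)
        loader = DataLoader(dataset, batch_size=args.batch_size, sampler=sampler)
    else:
        sampler = None
        loader = DataLoader(dataset, batch_size=args.batch_size, shuffle=shuffle)
    return loader, sampler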
Example #5
def train_model(rank, world_size, args):
    """ 모델 학습 """
    master = (world_size == 0 or rank % world_size == 0)
    if master and args.wandb:
        wandb.init(project=args.project, resume=args.name, tags=args.tags)

    if 1 < args.n_gpu:
        init_process_group(rank, world_size)

    vocab = load_vocab(args.vocab)

    config = Config.load(args.config)
    config.n_enc_vocab, config.n_dec_vocab = len(vocab), len(vocab)
    config.device = torch.device(f"cuda:{rank}" if torch.cuda.is_available() else "cpu")
    print(config)

    best_epoch, best_loss, best_score = 0, 0, 0
    model: MovieClassification = transformer.MovieClassification(config)
    if args.resume and os.path.isfile(args.save):
        best_epoch, best_loss, best_score = model.load(args.save)
        print(f"rank: {rank}, last epoch: {best_epoch} load state dict from: {os.path.basename(args.save)}")
    model.to(config.device)

    if master and args.wandb:
        wandb.watch(model)

    if 1 < args.n_gpu:
        model = DistributedDataParallel(model, device_ids=[rank], find_unused_parameters=True)

    criterion = torch.nn.CrossEntropyLoss()

    train_loader, train_sampler = data.build_data_loader(rank, vocab, os.path.abspath(os.path.join(os.getcwd(), args.data_dir, "ratings_train.json")), args, shuffle=True)
    test_loader, test_sampler = data.build_data_loader(rank, vocab, os.path.abspath(os.path.join(os.getcwd(), args.data_dir, "ratings_test.json")), args, shuffle=False)

    t_total = len(train_loader) * args.epoch
    no_decay = ['bias', 'LayerNorm.weight']
    optimizer_grouped_parameters = [
        {'params': [p for n, p in model.named_parameters() if not any(nd in n for nd in no_decay)], 'weight_decay': args.weight_decay},
        {'params': [p for n, p in model.named_parameters() if any(nd in n for nd in no_decay)], 'weight_decay': 0.0}
    ]
    optimizer = optimization.AdamW(optimizer_grouped_parameters, lr=args.learning_rate, eps=args.adam_epsilon)
    scheduler = optimization.get_linear_schedule_with_warmup(optimizer, num_warmup_steps=args.warmup_steps, num_training_steps=t_total)  # last_epoch stays at the default -1: a non-default value requires 'initial_lr' in every param group

    print(f'total_memory: {torch.cuda.get_device_properties(rank).total_memory / (1024 * 1024):.3f} MB')
    with tqdm(initial=best_epoch + 1, total=args.epoch, position=0) as pbar:
        for epoch in range(best_epoch + 1, args.epoch + 1):
            if train_sampler:
                train_sampler.set_epoch(epoch)

            train_loss = train_epoch(args, config, rank, epoch, model, criterion, optimizer, scheduler, train_loader)
            test_loss, test_accuracy = eval_epoch(config, rank, model, test_loader, test_sampler)
            if master and args.wandb:
                wandb.config.update(args)
                wandb.log({"train loss": train_loss, "accuracy": test_accuracy}, step=epoch)

            if master:
                if best_score < test_accuracy:
                    best_epoch, best_loss, best_score = epoch, train_loss, test_accuracy
                    pbar.set_description(f'Best (score={best_score:.3f}, epoch={best_epoch})')
                    if isinstance(model, DistributedDataParallel):
                        model.module.save(best_epoch, best_loss, best_score, args.save)
                    else:
                        model.save(best_epoch, best_loss, best_score, args.save)
                elif best_epoch + 5 < epoch:  # early stop
                    break

            pbar.update()
        print(f'total_memory: {torch.cuda.get_device_properties(rank).total_memory / (1024 * 1024):.3f} MB')

    if master and args.wandb:
        wandb.save(args.name)
    if 1 < args.n_gpu:
        destroy_process_group()
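
All five examples rely on model.save(...) and model.load(path) methods that persist training progress alongside the weights. A minimal sketch of that contract as used in Examples #1 and #5 (the pretrain variants drop score; the checkpoint keys are assumptions):

import torch

class CheckpointMixin:
    # Mixed into an nn.Module subclass; sketches the save/load pair used above.
    def save(self, epoch, loss, score, path):
        torch.save({"epoch": epoch, "loss": loss, "score": score,
                    "state_dict": self.state_dict()}, path)

    def load(self, path):
        ckpt = torch.load(path, map_location="cpu")
        self.load_state_dict(ckpt["state_dict"])
        return ckpt["epoch"], ckpt["loss"], ckpt["score"]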