Example #1
from tqdm import tqdm


def prep_data(trees, X_vocab=None, y_vocab=None):
    """Encode each tree's oracle (state, decision) pairs into integer feature
    lists X and label ids y, building vocabularies when none are supplied."""
    update_vocab = False
    if X_vocab is None:
        X_vocab, y_vocab = Vocab(), Vocab()
        update_vocab = True
    X, y = [], []
    for tree in tqdm(trees):
        if len(tree.tokens) < 2: continue
        #TODO accumulate features without iterating over all states
        try:
            for state, decision in tree.iter_oracle_states():
                feats = state.extract_features()
                if update_vocab:
                    X_vocab.add_words(feats)
                    y_vocab.add_word(decision)
                X.append([X_vocab.encode(f) for f in feats])
                y.append(y_vocab.encode(decision))
        except Exception:
            # skip trees whose oracle transitions cannot be reconstructed
            continue
    return X, y, X_vocab, y_vocab
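prep_data assumes a small Vocab helper exposing add_word, add_words and encode. The following is a minimal sketch of such a class under that assumption (a dict-backed id mapping with an <unk> fallback); it is an illustration, not the original project's implementation.

class Vocab:
    """Minimal word<->id mapping assumed by prep_data (sketch, not the original)."""

    def __init__(self, unk="<unk>"):
        self.word2id = {unk: 0}  # reserve id 0 for unknown words
        self.unk_id = 0

    def add_word(self, word):
        # assign the next free id to an unseen word
        if word not in self.word2id:
            self.word2id[word] = len(self.word2id)

    def add_words(self, words):
        for word in words:
            self.add_word(word)

    def encode(self, word):
        # out-of-vocabulary words fall back to the <unk> id
        return self.word2id.get(word, self.unk_id)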
Example #2
import torch
import torch.backends.cudnn as cudnn
import pandas as pd
from time import strftime, localtime
from torch.utils.data import DataLoader
from torch.optim import AdamW  # the original may import AdamW from transformers instead
from transformers import BertConfig, BertForMaskedLM

# Vocab, BuildDataSet, collate_fn and reduce_mean are project-local helpers
# that are not part of this snippet.


def pre_trained(config):
    vocab = Vocab(config)
    vocab.add_words()
    vocab.build_bert_vocab()
    train = vocab.get_pre_trained_examples()
    print("train nums:{}".format(len(train)))

    # 3) Use a DistributedSampler so each process sees a different shard of the data

    train_dataset = BuildDataSet(train)

    train_sampler = torch.utils.data.distributed.DistributedSampler(
        train_dataset)
    train_load = DataLoader(dataset=train_dataset,
                            batch_size=config.batch_size,
                            shuffle=False,
                            collate_fn=collate_fn,
                            sampler=train_sampler)

    # load the source BERT weights

    model_config = BertConfig.from_pretrained(
        pretrained_model_name_or_path="../user_data/bert_source/{}_config.json"
        .format(config.model_name))
    # resize the vocabulary to match the task-specific vocab file
    model_config.vocab_size = len(
        pd.read_csv('../user_data/vocab', names=["score"]))
    model = BertForMaskedLM(config=model_config)

    # (disabled) optionally resume from an existing checkpoint:
    #     if os.path.isfile('../user_data/save_bert/bert_checkpoint.pth.tar'):
    #         exist_checkpoint = torch.load('../user_data/save_bert/{}_checkpoint.pth.tar'.format(config.model_name),map_location=torch.device('cpu'))
    #         exit_status,exit_epoch = exist_checkpoint["status"],exist_checkpoint["epoch"]
    #         model = BertForMaskedLM(config=model_config)
    #         model.load_state_dict(exit_status)
    #         del exit_status
    #         print("*********load chechpoin file********")
    #     else:
    #         model = BertForMaskedLM(config=model_config)
    # #         status = torch.load('../user_data/bert_source/{}/pytorch_model.bin'.format(config.model_name),map_location=torch.device('cpu'))
    # #         del_ls=['bert.embeddings.word_embeddings.weight','cls.predictions.bias','cls.predictions.decoder.weight','cls.predictions.decoder.bias']
    # #         for col in del_ls:
    # #             if col in status:
    # #                 del status[col]
    # #         model.load_state_dict(status,strict=False)
    #         exit_epoch = 0
    #         print("*********load {}_bert source file********".format(config.model_name))
    #         del status

    for param in model.parameters():
        param.requires_grad = True

    # 4) Move the model to its GPU before wrapping it
    model = model.to(config.device)

    no_decay = ["bias", "LayerNorm.weight"]

    optimizer_grouped_parameters = [
        {
            "params": [
                p for n, p in model.named_parameters()
                if not any(nd in n for nd in no_decay)
            ],
            "weight_decay":
            config.weight_decay,
        },
        {
            "params": [
                p for n, p in model.named_parameters()
                if any(nd in n for nd in no_decay)
            ],
            "weight_decay":
            0.0
        },
    ]
    optimizer = AdamW(optimizer_grouped_parameters, lr=config.learning_rate)
    cudnn.benchmark = True

    if torch.cuda.device_count() > 1:
        print("Let's use", torch.cuda.device_count(), "GPUs!")
        # 5) Wrap the model with DistributedDataParallel
        model = torch.nn.parallel.DistributedDataParallel(
            model, device_ids=[config.local_rank])

    model.train()
    for epoch in range(config.num_train_epochs):
        train_sampler.set_epoch(epoch)
        torch.cuda.empty_cache()

        for batch, (input_ids, token_type_ids, attention_mask,
                    label) in enumerate(train_load):
            input_ids = input_ids.cuda(config.local_rank, non_blocking=True)
            attention_mask = attention_mask.cuda(config.local_rank,
                                                 non_blocking=True)
            token_type_ids = token_type_ids.cuda(config.local_rank,
                                                 non_blocking=True)
            label = label.cuda(config.local_rank, non_blocking=True)

            outputs = model(input_ids=input_ids,
                            attention_mask=attention_mask,
                            token_type_ids=token_type_ids,
                            labels=label)

            loss = outputs.loss

            # synchronize all processes, then compute the loss averaged across them
            torch.distributed.barrier()
            if torch.cuda.device_count() > 1:
                reduced_loss = reduce_mean(loss, config.nprocs)
            else:
                reduced_loss = loss

            model.zero_grad()
            loss.backward()
            optimizer.step()

        if config.local_rank in [0, -1]:
            now = strftime("%Y-%m-%d %H:%M:%S", localtime())
            print("time:{},epoch:{}/{},mlm_reduce_loss:{}".format(
                now, epoch + 1, config.num_train_epochs, reduced_loss.item()))
            if torch.cuda.device_count() > 1:
                checkpoint = {
                    "status": model.module.state_dict(),
                    "epoch": epoch + 1
                }
            else:
                checkpoint = {"status": model.state_dict(), "epoch": epoch + 1}
            torch.save(
                checkpoint,
                '../user_data/save_bert/{}_checkpoint.pth.tar'.format(
                    config.model_name))
            del checkpoint
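Both pre-training examples call a reduce_mean helper that is not shown. Below is a minimal sketch of what such a helper typically does, assuming the default process group is already initialized and nprocs is the world size; the original implementation may differ.

import torch.distributed as dist


def reduce_mean(tensor, nprocs):
    # detach and clone: the reduced value is only used for logging
    rt = tensor.detach().clone()
    # sum the scalar loss across all ranks, then divide by the number of
    # processes to obtain the mean
    dist.all_reduce(rt, op=dist.ReduceOp.SUM)
    return rt / nprocs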
Example #3
def pre_trained(config):
    vocab = Vocab(config)
    vocab.add_words()
    vocab.build_bert_vocab()
    train = vocab.get_pre_trained_examples()

    # 3) Use a DistributedSampler so each process sees a different shard of the data

    train_dataset = BuildDataSet(train)

    train_sampler = torch.utils.data.distributed.DistributedSampler(
        train_dataset)
    train_load = DataLoader(dataset=train_dataset,
                            batch_size=config.batch_size,
                            shuffle=False,
                            collate_fn=collate_fn,
                            sampler=train_sampler)

    # load the source BERT weights
    model_config = BertConfig.from_pretrained(
        pretrained_model_name_or_path="bert_source/bert_config.json")
    model = BertForMaskedLM.from_pretrained(
        pretrained_model_name_or_path="bert_source", config=model_config)

    # model_config = BertConfig()
    # model = BertForMaskedLM(config=model_config)

    # 4) Move the model to its GPU before wrapping it
    model = model.to(config.device)

    no_decay = ["bias", "LayerNorm.weight"]

    optimizer_grouped_parameters = [
        {
            "params": [
                p for n, p in model.named_parameters()
                if not any(nd in n for nd in no_decay)
            ],
            "weight_decay":
            config.weight_decay,
        },
        {
            "params": [
                p for n, p in model.named_parameters()
                if any(nd in n for nd in no_decay)
            ],
            "weight_decay":
            0.0
        },
    ]
    optimizer = AdamW(optimizer_grouped_parameters, lr=config.learning_rate)
    cudnn.benchmark = True

    if torch.cuda.device_count() > 1:
        print("Let's use", torch.cuda.device_count(), "GPUs!")
        # 5) Wrap the model with DistributedDataParallel
        model = torch.nn.parallel.DistributedDataParallel(
            model, device_ids=[config.local_rank])

    model.train()
    for epoch in range(config.num_train_epochs):
        train_sampler.set_epoch(epoch)

        for batch, (input_ids, token_type_ids, attention_mask,
                    label) in enumerate(train_load):
            input_ids = input_ids.cuda(config.local_rank, non_blocking=True)
            attention_mask = attention_mask.cuda(config.local_rank,
                                                 non_blocking=True)
            token_type_ids = token_type_ids.cuda(config.local_rank,
                                                 non_blocking=True)
            label = label.cuda(config.local_rank, non_blocking=True)

            outputs = model(input_ids=input_ids,
                            attention_mask=attention_mask,
                            token_type_ids=token_type_ids,
                            labels=label)

            loss = outputs.loss

            # synchronize all processes, then compute the loss averaged across them
            torch.distributed.barrier()
            reduced_loss = reduce_mean(loss, config.nprocs)

            model.zero_grad()
            loss.backward()
            optimizer.step()

        if config.local_rank in [0, -1]:
            now = strftime("%Y-%m-%d %H:%M:%S", localtime())
            print("time:{},epoch:{}/{},mlm_reduce_loss:{},loss:{}".format(
                now, epoch + 1, config.num_train_epochs, reduced_loss.item(),
                loss.item()))
            # .module assumes the model was wrapped in DistributedDataParallel above
            torch.save(model.module.state_dict(),
                       'save_bert' + os.sep + 'checkpoint.pth.tar')
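Both pre_trained variants expect torch.distributed to be initialized and config.local_rank, config.nprocs and config.device to be set per process before the function is called. The following is a minimal launch sketch under that assumption; the Config class, script name and field names are hypothetical, not the original project's.

# pretrain_launch.py (hypothetical entry point)
import argparse

import torch
import torch.distributed as dist


def main():
    parser = argparse.ArgumentParser()
    # torch.distributed.launch passes --local_rank to each spawned process
    parser.add_argument("--local_rank", type=int, default=0)
    args = parser.parse_args()

    torch.cuda.set_device(args.local_rank)
    dist.init_process_group(backend="nccl")

    config = Config()                      # hypothetical project config object
    config.local_rank = args.local_rank
    config.nprocs = dist.get_world_size()
    config.device = torch.device("cuda", args.local_rank)
    pre_trained(config)


if __name__ == "__main__":
    main()

# example launch on a 2-GPU machine:
#   python -m torch.distributed.launch --nproc_per_node=2 pretrain_launch.py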