def main(args):
    """Fine-tune BERT on NSMC: build dataloaders and callbacks, then train and test."""
    data_prep = Preprocessor(args.model_type, args.max_len)
    train_loader, val_loader, test_loader = get_dataloader(args, data_prep)
    finetuner = BertModel(args, train_loader, val_loader, test_loader)

    # TensorBoard logging under <log_dir>/nsmc-bert/version_1.
    tb_logger = TensorBoardLogger(save_dir=args.log_dir,
                                  version=1,
                                  name="nsmc-bert")

    # Stop when validation accuracy has not improved for 5 epochs.
    stopper = EarlyStopping(monitor='val_acc',
                            min_delta=0.00,
                            patience=5,
                            verbose=False,
                            mode='max')

    # Keep the 3 checkpoints with the best validation accuracy.
    ckpt_cb = ModelCheckpoint(filepath=args.checkpoint_path,
                              verbose=True,
                              monitor='val_acc',
                              mode='max',
                              save_top_k=3,
                              prefix='')

    trainer = pl.Trainer(gpus=1,
                         checkpoint_callback=ckpt_cb,
                         early_stop_callback=stopper,
                         logger=tb_logger)

    trainer.fit(finetuner)

    trainer.test()
Exemplo n.º 2
0
def get_model(tokenizer, args):
    """Build the model."""

    print('building BERT model ...')
    model = BertModel(tokenizer, args)
    n_params = sum(p.nelement() for p in model.parameters())
    print(' > number of parameters: {}'.format(n_params), flush=True)

    # GPU allocation.
    model.cuda(torch.cuda.current_device())

    # Fp16 conversion.
    if args.fp16:
        print("fp16 mode")
        model = FP16_Module(model)
        if args.fp32_embedding:
            # Keep all embedding tables in fp32.
            embeddings = model.module.model.bert.embeddings
            embeddings.word_embeddings.float()
            embeddings.position_embeddings.float()
            embeddings.token_type_embeddings.float()
        if args.fp32_tokentypes:
            model.module.model.bert.embeddings.token_type_embeddings.float()
        if args.fp32_layernorm:
            # Keep every LayerNorm in fp32 for numerical stability.
            for name, sub_module in model.named_modules():
                if 'LayerNorm' in name:
                    sub_module.float()
    # Wrap model for distributed training.
    if args.world_size > 1:
        model = DDP(model)

    return model
Exemplo n.º 3
0
def main():
    """CLI entry point: restore a generation model from a checkpoint and run it.

    Command-line options select the device, checkpoint path, input data,
    beam width (``--beam``) and maximum output length (``--outlens``).
    """
    parser = ArgumentParser()
    parser.add_argument("--model-config",
                        type=str,
                        default="openai-gpt",
                        help="Path, url or short name of the model")
    parser.add_argument("--device",
                        type=str,
                        default="cuda" if torch.cuda.is_available() else "cpu",
                        help="Device (cuda or cpu)")
    parser.add_argument("--outlens", type=int, default=30)
    parser.add_argument("--beam", type=int, default=1)
    parser.add_argument("--checkpoints", type=str)
    parser.add_argument("--data", type=str, default="file")

    args = parser.parse_args()
    args.load_model = True

    model = BertModel(None, args)
    state_dict = convert_model(torch.load(args.checkpoints)['sd'])
    model.load_state_dict(state_dict)
    model.to(args.device)
    # Inference mode (disables dropout) — consistent with model_init().
    model.eval()
    tokenizer = BertWordPieceTokenizer("bert-base-chinese",
                                       cache_dir="temp_cache_dir")
    # Fix: honor the --beam and --outlens CLI options, which were parsed
    # but previously ignored in favor of hard-coded beam_size=6/outlens=30.
    generate(model,
             tokenizer,
             args.device,
             args.data,
             sample=True,
             top_k=5,
             beam_size=args.beam,
             outlens=args.outlens)
Exemplo n.º 4
0
def get_model(args):
    """Build the model."""

    print_rank_0('building BERT model ...')
    model = BertModel(args)

    if mpu.get_data_parallel_rank() == 0:
        n_params = sum(p.nelement() for p in model.parameters())
        print(' > number of parameters on model parallel rank {}: {}'.format(
            mpu.get_model_parallel_rank(), n_params), flush=True)

    # GPU allocation.
    model.cuda(torch.cuda.current_device())

    # Fp16 conversion.
    if args.fp16:
        model = FP16_Module(model)
        if args.fp32_embedding:
            # Keep all embedding tables in fp32.
            embeddings = model.module.model.bert.embeddings
            embeddings.word_embeddings.float()
            embeddings.position_embeddings.float()
            embeddings.token_type_embeddings.float()
        if args.fp32_tokentypes:
            model.module.model.bert.embeddings.token_type_embeddings.float()
        if args.fp32_layernorm:
            for name, sub_module in model.named_modules():
                if 'LayerNorm' in name:
                    sub_module.float()

    # Wrap model for distributed training.
    if USE_TORCH_DDP:
        device_index = torch.cuda.current_device()
        model = DDP(model,
                    device_ids=[device_index],
                    output_device=device_index,
                    process_group=mpu.get_data_parallel_group())
    else:
        model = DDP(model)

    return model
Exemplo n.º 5
0
def get_model(args):
    """Build the model."""

    print_rank_0('building BERT model ...')
    model = BertModel(args)

    if mpu.get_data_parallel_rank() == 0:
        n_params = sum(p.nelement() for p in model.parameters())
        print(' > number of parameters on model parallel rank {}: {}'.format(
            mpu.get_model_parallel_rank(), n_params), flush=True)

    # GPU allocation.
    model.cuda(torch.cuda.current_device())

    # Fp16 conversion.
    if args.fp16:
        model = FP16_Module(model)
        if args.fp32_embedding:
            # Keep all embedding tables in fp32; the non-'BERT' ds_type
            # uses separate token/paragraph/sentence position tables.
            embeddings = model.module.model.bert.embeddings
            embeddings.word_embeddings.float()
            if args.ds_type == 'BERT':
                embeddings.position_embeddings.float()
            else:
                embeddings.token_position_embeddings.float()
                embeddings.para_position_embeddings.float()
                embeddings.sent_position_embeddings.float()
            embeddings.token_type_embeddings.float()
        if args.fp32_tokentypes:
            model.module.model.bert.embeddings.token_type_embeddings.float()
        if args.fp32_layernorm:
            for name, sub_module in model.named_modules():
                if 'LayerNorm' in name:
                    sub_module.float()

    # Wrap model for distributed training.
    if args.DDP_impl == 'torch':
        device_index = torch.cuda.current_device()
        args.DDP_type = torch.nn.parallel.distributed.DistributedDataParallel
        model = args.DDP_type(model,
                              device_ids=[device_index],
                              output_device=device_index,
                              process_group=mpu.get_data_parallel_group())
    elif args.DDP_impl == 'local':
        args.DDP_type = LocalDDP
        model = args.DDP_type(model)
    else:
        print_rank_0('Unknown DDP implementation specified: {}. '
                     'Exiting.'.format(args.DDP_impl))
        exit()

    return model
Exemplo n.º 6
0
def model_init(app):
    """Parse CLI flags, restore the GPT checkpoint, and return a client
    object carrying the model, tokenizer and beam-search helper."""
    ArgsSet = type('ArgsSet', (object,), {})
    holder = ArgsSet()

    parser = ArgumentParser()
    parser.add_argument("--model-config", type=str, default="openai-gpt",
                        help="Path, url or short name of the model")
    parser.add_argument("--device", type=str,
                        default="cuda" if torch.cuda.is_available() else "cpu",
                        help="Device (cuda or cpu)")
    parser.add_argument("--outlens", type=int, default=30)
    parser.add_argument("--beam", type=int, default=1)
    parser.add_argument("--gpt-checkpoints", type=str)
    parser.add_argument("--port", type=int, default=8866)
    args = parser.parse_args()

    # Flags read by the BertModel constructor.
    args.load_model = True
    args.fp32_embedding = False
    args.fp32_layernorm = False
    args.fp32_tokentypes = False
    args.layernorm_epsilon = 1e-12

    gpt = BertModel(None, args)
    gpt.load_state_dict(convert_model(torch.load(args.gpt_checkpoints)['sd']))
    gpt.to(args.device)
    gpt.eval()
    tokenizer = BertWordPieceTokenizer("bert-base-chinese",
                                       cache_dir="temp_cache_dir")
    print(" Load model from {}".format(args.gpt_checkpoints))

    holder.tokenizer = tokenizer
    holder.gpt = gpt
    holder.gpt_beam = SequenceGenerator(gpt, tokenizer,
                                        beam_size=args.beam,
                                        max_lens=args.outlens)
    holder.device = args.device
    holder.port = args.port
    holder.generator = sample_sequence

    return holder
Exemplo n.º 7
0
def main(train_file,
         dev_file,
         target_dir,
         epochs=10,
         batch_size=32,
         lr=2e-05,
         patience=3,
         max_grad_norm=10.0,
         checkpoint=None):
    """Fine-tune a BERT sentence-pair classifier with early stopping.

    Args:
        train_file: path of the training data file.
        dev_file: path of the validation data file.
        target_dir: directory where the best checkpoint ("best.pth.tar") is saved.
        epochs: maximum number of training epochs.
        batch_size: mini-batch size for both loaders.
        lr: AdamW learning rate.
        patience: epochs without val-accuracy improvement before stopping.
        max_grad_norm: gradient clipping threshold forwarded to train().
        checkpoint: optional path of a checkpoint to resume training from.
    """
    bert_tokenizer = BertTokenizer.from_pretrained('bert-base-chinese',
                                                   do_lower_case=True)
    # NOTE(review): assumes a CUDA device is available — no CPU fallback.
    device = torch.device("cuda")
    print(20 * "=", " Preparing for training ", 20 * "=")
    # Directory where the model checkpoints are saved.
    if not os.path.exists(target_dir):
        os.makedirs(target_dir)
    # -------------------- Data loading ------------------- #
    print("\t* Loading training data...")
    train_data = DataPrecessForSentence(bert_tokenizer, train_file)
    train_loader = DataLoader(train_data, shuffle=True, batch_size=batch_size)
    print("\t* Loading validation data...")
    dev_data = DataPrecessForSentence(bert_tokenizer, dev_file)
    dev_loader = DataLoader(dev_data, shuffle=True, batch_size=batch_size)
    # -------------------- Model definition ------------------- #
    print("\t* Building model...")
    model = BertModel().to(device)
    # -------------------- Preparation for training  ------------------- #
    # Parameters to optimize; biases and LayerNorm weights get no weight decay.
    param_optimizer = list(model.named_parameters())
    no_decay = ['bias', 'LayerNorm.bias', 'LayerNorm.weight']
    optimizer_grouped_parameters = [{
        'params':
        [p for n, p in param_optimizer if not any(nd in n for nd in no_decay)],
        'weight_decay':
        0.01
    }, {
        'params':
        [p for n, p in param_optimizer if any(nd in n for nd in no_decay)],
        'weight_decay':
        0.0
    }]
    optimizer = AdamW(optimizer_grouped_parameters, lr=lr)
    #optimizer = torch.optim.Adam(optimizer_grouped_parameters, lr=lr)
    # Reduce LR when validation accuracy plateaus (mode="max").
    scheduler = torch.optim.lr_scheduler.ReduceLROnPlateau(optimizer,
                                                           mode="max",
                                                           factor=0.85,
                                                           patience=0)
    best_score = 0.0
    start_epoch = 1
    # Data for loss curves plot
    epochs_count = []
    train_losses = []
    valid_losses = []
    # Continuing training from a checkpoint if one was given as argument
    if checkpoint:
        checkpoint = torch.load(checkpoint)
        start_epoch = checkpoint["epoch"] + 1
        best_score = checkpoint["best_score"]
        print("\t* Training will continue on existing model from epoch {}...".
              format(start_epoch))
        model.load_state_dict(checkpoint["model"])
        optimizer.load_state_dict(checkpoint["optimizer"])
        epochs_count = checkpoint["epochs_count"]
        train_losses = checkpoint["train_losses"]
        valid_losses = checkpoint["valid_losses"]
    # Compute loss and accuracy before starting (or resuming) training.
    _, valid_loss, valid_accuracy, auc = validate(model, dev_loader)
    print(
        "\t* Validation loss before training: {:.4f}, accuracy: {:.4f}%, auc: {:.4f}"
        .format(valid_loss, (valid_accuracy * 100), auc))
    # -------------------- Training epochs ------------------- #
    print("\n", 20 * "=", "Training Bert model on device: {}".format(device),
          20 * "=")
    patience_counter = 0
    for epoch in range(start_epoch, epochs + 1):
        epochs_count.append(epoch)
        print("* Training epoch {}:".format(epoch))
        epoch_time, epoch_loss, epoch_accuracy = train(model, train_loader,
                                                       optimizer, epoch,
                                                       max_grad_norm)
        train_losses.append(epoch_loss)
        print("-> Training time: {:.4f}s, loss = {:.4f}, accuracy: {:.4f}%".
              format(epoch_time, epoch_loss, (epoch_accuracy * 100)))
        print("* Validation for epoch {}:".format(epoch))
        epoch_time, epoch_loss, epoch_accuracy, epoch_auc = validate(
            model, dev_loader)
        valid_losses.append(epoch_loss)
        print(
            "-> Valid. time: {:.4f}s, loss: {:.4f}, accuracy: {:.4f}%, auc: {:.4f}\n"
            .format(epoch_time, epoch_loss, (epoch_accuracy * 100), epoch_auc))
        # Update the optimizer's learning rate with the scheduler.
        scheduler.step(epoch_accuracy)
        # Early stopping on validation accuracy.
        if epoch_accuracy < best_score:
            patience_counter += 1
        else:
            best_score = epoch_accuracy
            patience_counter = 0
            # Bug fix: the optimizer state is now saved as well — the resume
            # path above reads checkpoint["optimizer"], which previously
            # raised KeyError when resuming from "best.pth.tar".
            torch.save(
                {
                    "epoch": epoch,
                    "model": model.state_dict(),
                    "optimizer": optimizer.state_dict(),
                    "best_score": best_score,
                    "epochs_count": epochs_count,
                    "train_losses": train_losses,
                    "valid_losses": valid_losses
                }, os.path.join(target_dir, "best.pth.tar"))
        if patience_counter >= patience:
            print("-> Early stopping: patience limit reached, stopping...")
            break
Exemplo n.º 8
0
def train(args):
    """Train a text-matching model selected by ``args.model_type``.

    Two families are supported:
      * BERT-family types (names containing "bert"): bert4keras models fed
        by data generators, fine-tuned with a small Adam learning rate.
      * NN types (names containing "NN"): word-embedding Siamese CNN/RNN
        models trained on tf.data datasets built from CSV files.
    Training progress and final test metrics are written through ``logger``.
    """
    if "bert" in args.model_type:
        set_gelu("tanh")  # switch to the tanh approximation of gelu

        # Step1: Load Data
        # NOTE(review): if model_type contains "bert" but neither "siamese"
        # nor "albert", data_generator (and model below) stay None and the
        # following calls raise TypeError — confirm intended.
        data_generator = None
        if "siamese" in args.model_type:
            data_generator = SiameseDataGenerator
        elif "albert" in args.model_type:
            data_generator = BertDataGenerator

        train_ds = data_generator(data_path=args.train_data_path,
                                  batch_size=args.batch_size,
                                  dict_path=args.bert_dict_path,
                                  maxlen=args.query_len)
        dev_ds = data_generator(data_path=args.dev_data_path,
                                batch_size=args.batch_size,
                                maxlen=args.query_len,
                                dict_path=args.bert_dict_path)
        test_ds = data_generator(data_path=args.test_data_path,
                                 batch_size=args.batch_size,
                                 maxlen=args.query_len,
                                 dict_path=args.bert_dict_path)

        # Step2: Load Model
        model = None
        if "siamese" in args.model_type:
            model = SiameseBertModel(config_path=args.bert_config_path,
                                     checkpoint_path=args.bert_checkpoint_path,
                                     dense_units=args.dense_units)
        elif "albert" in args.model_type:
            model = BertModel(config_path=args.bert_config_path,
                              checkpoint_path=args.bert_checkpoint_path)

        # Capture the wrapper's class name before replacing it with the
        # underlying Keras model.
        model_name = model.__class__.__name__
        model = model.get_model()

        from bert4keras.optimizers import Adam
        model.compile(
            loss='sparse_categorical_crossentropy',
            optimizer=Adam(2e-5),  # use a sufficiently small learning rate
            # optimizer=PiecewiseLinearLearningRate(Adam(5e-5), {10000: 1, 30000: 0.1}),
            metrics=['accuracy'],
        )

        # The evaluator callback tracks dev accuracy and saves the best
        # weights under ./checkpoints/ (reloaded below for testing).
        evaluator = Evaluator(dev_ds=dev_ds,
                              model_name=model_name,
                              is_bert_model=True,
                              test_ds=test_ds)
        logger.info("***** Running training *****")
        logger.info("  Model Class Name = %s", model_name)
        logger.info("  Num Epochs = %d", args.epoch)
        model.fit_generator(train_ds.forfit(),
                            steps_per_epoch=len(train_ds),
                            epochs=args.epoch,
                            callbacks=[evaluator],
                            verbose=2)

        # Restore the best weights saved by the evaluator before testing.
        model.load_weights('./checkpoints/best_{}.weight'.format(model_name))
        logger.info("***** Test Reslt *****")
        logger.info("  Model = %s", model_name)
        logger.info("  Batch Size = %d", args.batch_size)
        logger.info("  Final Test Acc:%05f",
                    cal_acc(data=test_ds, model=model, is_bert_model=True))

    elif "NN" in args.model_type:
        # Step 1 : Load Data
        train_data = pd.read_csv(args.train_data_path)
        dev_data = pd.read_csv(args.dev_data_path)
        test_data = pd.read_csv(args.test_data_path)

        category_count = len(train_data["category"].value_counts())
        category_encoder = category_OneHotEncoder(data_df=train_data)

        loader = LoadData(w2v_path=args.w2v_path, query_len=args.query_len)
        word2idx = loader.word2idx
        emd_matrix = loader.emb_matrix
        # The (Chinese) note below explains the ordering of shuffle() and
        # batch(): shuffle the whole dataset first and then batch, so
        # randomness is at example level; batching first would only
        # shuffle the order of batches, leaving each batch's contents in
        # their original order.
        """
        注意:
        shuffle的顺序很重要:一般建议是先执行shuffle方法,接着采用batch方法。
        这样是为了保证在整体数据打乱之后再取出batch_size大小的数据。
        如果先采取batch方法再采用shuffle方法,那么此时就只是对batch进行shuffle,
        而batch里面的数据顺序依旧是有序的,那么随机程度会减弱。
        """
        train_ds = loader.dataset(encoder=category_encoder, data_df=train_data)
        train_ds = train_ds.shuffle(buffer_size=len(train_data)).batch(
            batch_size=args.batch_size).repeat()

        dev_ds = loader.dataset(encoder=category_encoder, data_df=dev_data)
        dev_ds = dev_ds.batch(batch_size=args.batch_size)
        test_ds = loader.dataset(encoder=category_encoder, data_df=test_data)
        test_ds = test_ds.batch(batch_size=args.batch_size)

        # Step2: Load Model
        model = None
        if "siamese_CNN" in args.model_type:
            model = SiameseCnnModel(emb_matrix=emd_matrix,
                                    word2idx=word2idx,
                                    filters_nums=args.filters_nums,
                                    kernel_sizes=args.kernel_sizes,
                                    dense_units=args.dense_units,
                                    label_count=args.label_count,
                                    category_count=category_count,
                                    query_len=args.query_len,
                                    shared=args.feature_shared,
                                    add_feature=args.add_features)
        elif "siamese_RNN" in args.model_type:
            model = SiameseRnnModel(emb_matrix=emd_matrix,
                                    word2idx=word2idx,
                                    hidden_units=args.hidden_units,
                                    dense_units=args.dense_units,
                                    label_count=args.label_count,
                                    category_count=category_count,
                                    query_len=args.query_len,
                                    mask_zero=args.mask_zero,
                                    bidirection=args.bi_direction,
                                    shared=args.feature_shared,
                                    add_feature=args.add_features)
        model_name = model.__class__.__name__
        model = model.get_model()

        logger.info("***** Running training *****")
        logger.info("  Model Class Name = %s", model_name)
        logger.info("  Num examples = %d", len(train_data))
        logger.info("  Num Epochs = %d", args.epoch)

        model.compile(optimizer='adam',
                      loss="binary_crossentropy",
                      metrics=["acc"])
        early_stopping = EarlyStopping(monitor="val_acc",
                                       patience=3,
                                       mode="max")
        evaluator = Evaluator(dev_ds=dev_ds,
                              model_name=model_name,
                              is_bert_model=False,
                              dev_label=dev_data['label'])

        # Step3: Train Model
        history = model.fit(train_ds,
                            callbacks=[early_stopping, evaluator],
                            epochs=args.epoch,
                            steps_per_epoch=len(train_data) // args.batch_size,
                            validation_data=dev_ds,
                            validation_steps=len(dev_data) // args.batch_size)

        # Step4 : Save model and trainLogs
        logger.info("***** Training Logs *****")

        for epoch in history.epoch:
            logger.info("Epoch %d", epoch)
            logger.info("train_loss:%f train_acc:%f val_loss:%f val_acc:%f",
                        history.history.get("loss")[epoch],
                        history.history.get("acc")[epoch],
                        history.history.get("val_loss")[epoch],
                        history.history.get("val_acc")[epoch])
        #
        # time_stamp = datetime.datetime.now().strftime('%m-%d_%H-%M-%S')
        # path = './checkpoints/{}_{}.h5'.format(model_name, time_stamp)
        # model.save(path)

        # Reload the best model saved by the evaluator and score the test set.
        model = load_model('./checkpoints/best_{}.h5'.format(model_name))
        y_pred = model.predict(test_ds)
        y_true = test_data["label"].values.reshape((-1, 1))

        # Binarize sigmoid outputs at 0.5.
        # NOTE(review): predictions exactly equal to 0.5 are left unchanged
        # by the two masks below — confirm intended.
        y_pred[y_pred > 0.5] = 1
        y_pred[y_pred < 0.5] = 0

        acc = accuracy_score(y_true, y_pred)
        precision = precision_score(y_true, y_pred)
        recall = recall_score(y_true, y_pred)
        f1 = f1_score(y_true, y_pred)

        logger.info("***** Pramaters *****")
        logger.info("  ModelName = %s", args.model_type)
        logger.info("  Add Features = %s", args.add_features)
        logger.info("  Embedding dims = %d", len(emd_matrix[0]))
        logger.info("  BatchSize = %d", args.batch_size)

        if "CNN" in args.model_type:
            logger.info("  kernel_sizes = %s", args.kernel_sizes)
            logger.info("  filters_nums = %s", args.filters_nums)
        elif "RNN" in args.model_type:
            logger.info("  hidden_units = %s", args.hidden_units)
            logger.info("  bi_direction = %s", args.bi_direction)

        logger.info("  dense_units = %s", args.dense_units)
        logger.info("  feature_shared = %s", args.feature_shared)
        logger.info("***** Testing Results *****")
        logger.info("  Acc = %f", acc)
        logger.info("  Precision = %f", precision)
        logger.info("  Recall = %f", recall)
        logger.info("  F1-score = %f", f1)
Exemplo n.º 9
0
Arquivo: main.py Projeto: jeanm/text
    # We use only ~1% data to fine-tune the model.
    train, dev = SQuAD1()
    raw_train = list(train)[:1024]
    raw_dev = list(dev)[:128]
    convert_to_arrow(raw_train, "train_arrow")
    convert_to_arrow(raw_dev, "dev_arrow")

    base_url = 'https://pytorch.s3.amazonaws.com/models/text/torchtext_bert_example/'
    vocab_path = download_from_url(base_url + 'bert_vocab.txt')
    data_module = QuestionAnswerDataModule(train_arrow_path='train_arrow',
                                           dev_arrow_path='dev_arrow',
                                           vocab_filepath=vocab_path,
                                           batch_size=BATCH_SIZE)

    # Load pretrained model and generate task
    # default parameters from the pretrained model
    vocab_size, emsize, nhead, nhid, nlayers, dropout = 99230, 768, 12, 3072, 12, 0.2
    pretrained_bert = BertModel(vocab_size, emsize, nhead, nhid, nlayers,
                                dropout)
    pretrained_model_path = download_from_url(base_url + 'ns_bert.pt')
    pretrained_bert.load_state_dict(
        torch.load(pretrained_model_path, map_location='cpu'))
    qa_model = QuestionAnswerModel(pretrained_bert)

    task = QuestionAnswerTask(qa_model, LR)
    trainer = Trainer(gpus=0,
                      max_epochs=EPOCH,
                      progress_bar_refresh_rate=40,
                      fast_dev_run=True)
    trainer.fit(task, data_module)
Exemplo n.º 10
0
from model import mlm_loss, nsp_accuracy, mlm_accuracy

# Deliberately tiny BERT configuration (1 layer, 1 attention head) —
# suitable for smoke-testing the pretraining graph, not for real training.
config = BertConfig(vocab_size=300,
                    hidden_size=128,
                    num_hidden_layers=1,
                    num_attention_heads=1,
                    intermediate_size=512,
                    hidden_act="gelu",
                    hidden_dropout_prob=0.1,
                    attention_probs_dropout_prob=0.1,
                    max_position_embeddings=512,
                    type_vocab_size=2,
                    initializer_range=0.02)


# Pretraining model with both the NSP and MLM heads enabled.
bert_model = BertModel(config, with_nsp=True, with_mlm=True, is_pretrain=True)



# if with_mlm=True, inputs will be (input_ids, input_mask, token_type_ids, masked_lm_positions)
input_ids = tf.keras.Input(shape=(512,), dtype=tf.int32)
input_mask = tf.keras.Input(shape=(512,), dtype=tf.int32)
token_type_ids = tf.keras.Input(shape=(512,), dtype=tf.int32)
# masked_lm_positions = [batch_size, MAX_PREDICTIONS_PER_SEQ]
masked_lm_positions = tf.keras.Input(shape=(2,), dtype=tf.int32)

inputs = (input_ids, input_mask, token_type_ids, masked_lm_positions)
# Build the model once by calling it on symbolic Keras inputs.
bert_model(inputs)

# bert_model = tf.keras.Model(inputs, outputs)
Exemplo n.º 11
0
def model_init(app):
    """Load all generation/scoring checkpoints and assemble a client object.

    Parses CLI flags, restores six model variants (fuse, gpt, bert,
    multi-task, qa-style, special-cls, split-sentence) from their
    checkpoints, and attaches each model plus a SequenceGenerator beam
    helper to a dynamically created ``ArgsSet`` instance, which is returned.

    NOTE(review): the ``app`` parameter is never used inside this function.
    """
    ArgsSet = type('ArgsSet', (object, ), {})
    client = ArgsSet()
    parser = ArgumentParser()
    parser.add_argument("--model-config",
                        type=str,
                        default="openai-gpt",
                        help="Path, url or short name of the model")
    parser.add_argument("--device",
                        type=str,
                        default="cuda" if torch.cuda.is_available() else "cpu",
                        help="Device (cuda or cpu)")
    parser.add_argument("--outlens", type=int, default=30)
    parser.add_argument("--beam", type=int, default=1)
    parser.add_argument("--fuse-checkpoints", type=str)
    parser.add_argument("--gpt-checkpoints", type=str)
    parser.add_argument("--qa-style-checkpoints", type=str)
    parser.add_argument("--multi-task", type=str)
    parser.add_argument("--split-sentence-with-task-embedding-checkpoints",
                        type=str)
    parser.add_argument("--special-cls-checkpoints", type=str)

    parser.add_argument("--port", type=int, default=8866)

    args = parser.parse_args()
    # Flags read by the BertModel constructor.
    args.load_model = True
    args.fp32_embedding = False
    args.fp32_layernorm = False
    args.fp32_tokentypes = False
    args.layernorm_epsilon = 1e-12

    fuse_model = BertModel(None, args)
    state_dict = convert_model(torch.load(args.fuse_checkpoints)['sd'])
    fuse_model.load_state_dict(state_dict)
    fuse_model.to(args.device)
    fuse_model.eval()
    print("| Load model from {}".format(args.fuse_checkpoints))

    gpt = BertModel(None, args)
    state_dict = convert_model(torch.load(args.gpt_checkpoints)['sd'])
    gpt.load_state_dict(state_dict)
    gpt.to(args.device)
    gpt.eval()
    tokenizer = BertWordPieceTokenizer("bert-base-chinese",
                                       cache_dir="temp_cache_dir")
    print(" Load model from {}".format(args.gpt_checkpoints))

    # Load bert checkpoints
    # NOTE(review): load_model is switched off here and stays False for
    # every model constructed below — confirm intended.
    args.load_model = False
    args.fp32_embedding = False
    args.fp32_layernorm = False
    args.fp32_tokentypes = False
    args.layernorm_epsilon = 1e-12
    # Plain BERT scorer: no checkpoint is restored for this one.
    bert = BertModel(None, args)
    bert.to(args.device)
    bert.eval()

    client.tokenizer = tokenizer
    client.fuse_model = fuse_model
    client.fuse_beam = SequenceGenerator(fuse_model,
                                         tokenizer,
                                         beam_size=args.beam,
                                         max_lens=args.outlens)
    client.gpt = gpt
    client.gpt_beam = SequenceGenerator(gpt,
                                        tokenizer,
                                        beam_size=args.beam,
                                        max_lens=args.outlens)
    client.bert = bert
    client.device = args.device
    client.port = args.port
    client.generator = sample_sequence

    # multi task model

    multi_task = BertModel(None, args)
    state_dict = convert_model(torch.load(args.multi_task)['sd'])
    print("| Load model from {}".format(args.multi_task))
    multi_task.load_state_dict(state_dict)
    multi_task.to(args.device)
    multi_task.eval()
    client.multi_task_model = multi_task
    client.multi_task_beam = SequenceGenerator(multi_task,
                                               tokenizer,
                                               beam_size=args.beam,
                                               max_lens=args.outlens)

    # qa style model
    qa_style = BertModel(None, args)
    state_dict = convert_model(torch.load(args.qa_style_checkpoints)['sd'])
    qa_style.load_state_dict(state_dict)
    qa_style.to(args.device)
    qa_style.eval()
    print(" Load model from {}".format(args.qa_style_checkpoints))
    client.qa_task_model = qa_style

    # special cls tokens
    special_cls_model = BertModel(None, args)
    special_cls_model.eval()  # NOTE(review): redundant — eval() is called again below
    state_dict = convert_model(torch.load(args.special_cls_checkpoints)['sd'])
    special_cls_model.load_state_dict(state_dict)
    special_cls_model.to(args.device)
    special_cls_model.eval()
    print(" Load model from {}".format(args.special_cls_checkpoints))
    client.special_cls_model = special_cls_model
    client.special_beam = SequenceGenerator(special_cls_model,
                                            tokenizer,
                                            beam_size=args.beam,
                                            max_lens=args.outlens)

    # split sentence model with task embedding
    split_sentence_model = BertModel(None, args)
    split_sentence_model.eval()  # NOTE(review): redundant — eval() is called again below
    state_dict = convert_model(
        torch.load(args.split_sentence_with_task_embedding_checkpoints)['sd'])
    split_sentence_model.load_state_dict(state_dict)
    split_sentence_model.to(args.device)
    split_sentence_model.eval()
    print(" Load model from {}".format(
        args.split_sentence_with_task_embedding_checkpoints))
    client.split_sentence_model = split_sentence_model
    client.split_sentence_beam = SequenceGenerator(split_sentence_model,
                                                   tokenizer,
                                                   beam_size=args.beam,
                                                   max_lens=args.outlens)

    return client
Exemplo n.º 12
0
 @contact: [email protected]
"""
import re
import torch
import sentencepiece as spm

from model import BertModel

# Load the SentencePiece tokenizer and derive the vocabulary size from it.
sp = spm.SentencePieceProcessor()
sp.load('resource/sentencepiece.unigram.35000.model')
vocab_size = sp.get_piece_size()

# Model hyper-parameters; these must match the checkpoint filename below.
n_embedding = 512
n_layer = 8

# Build the model, switch to inference mode, and restore CPU-mapped weights.
model = BertModel(vocab_size, n_embedding, n_layer)
model.eval()
model.load_state_dict(torch.load('resource/model.{}.{}.th'.format(n_embedding, n_layer),
                                 map_location='cpu'))

# you should enable cuda if it is available
# model.cuda()

# if you are using a GPU that has tensor cores (nvidia volta, Turing architecture), you can enable half precision
# inference and training, we recommend to use the nvidia official apex to make everything as clean as possible from
# apex import amp [model] = amp.initialize([model], opt_level="O2")
# Run inference on whichever device the embedding weights live on.
device = model.embedding.weight.data.device


def clean_text(txt):
    txt = txt.lower()
Exemplo n.º 13
0
                result[0], result[1], result[2], result[3], result[4], result[5]
            
            summary_writer.add_summary(summary, global_step)
            _info('loss : {}\t lr : {}'.format(loss_bs, learning_rate), head='Step: {}'.format(global_step))

            if global_step % 100 == 0:
                train_model.saver.save(
                    train_sess,
                    os.path.join(PROJECT_PATH, config.ckpt_path),
                    global_step=global_step)

def infer(config):
    # build graph
    infer_graph = tf.Graph()
    with infer_graph.as_default():
        infer_model = BertModel(config=config, is_training=False)
    
    # create session
    sess_conf = tf.ConfigProto(intra_op_parallelism_threads=8, inter_op_parallelism_threads=8)
    sess_conf.gpu_options.allow_growth = True
    infer_sess = tf.Session(config=sess_conf, graph=infer_graph)

    # restore model from the latest checkpoint
    with infer_graph.as_default():
        loaded_model, global_step = _mh.create_or_load(
            infer_model, os.path.join(PROJECT_PATH, config.ckpt_path), infer_sess)

    # the following is just for test
    """
    import numpy as np
    input_ids = np.array([[10, 128, 10, 0, 120], [20, 3, 0, 0, 30]], dtype=np.int32)
Exemplo n.º 14
0
def main(seed):
    """Fine-tune a graph-BERT classifier on the FDAMDD task.

    Builds a PredictModel, optionally warm-starts its encoder from
    pre-trained BERT weights, trains with early stopping on validation AUC,
    then evaluates the best checkpoint on the held-out test set.

    Args:
        seed: random seed for numpy/tensorflow; also embedded in output names.

    Returns:
        float: test-set ROC-AUC of the best (by validation AUC) checkpoint.
    """
    # tasks = ['Ames', 'BBB', 'FDAMDD', 'H_HT', 'Pgp_inh', 'Pgp_sub']
    task = 'FDAMDD'
    print(task)

    # Architecture presets; 'path' points at the pre-trained weight directory.
    small = {'name': 'Small', 'num_layers': 3, 'num_heads': 2, 'd_model': 128, 'path': 'small_weights', 'addH': True}
    medium = {'name': 'Medium', 'num_layers': 6, 'num_heads': 8, 'd_model': 256, 'path': 'medium_weights', 'addH': True}
    large = {'name': 'Large', 'num_layers': 12, 'num_heads': 12, 'd_model': 512, 'path': 'large_weights', 'addH': True}

    arch = medium
    pretraining = True
    pretraining_str = 'pretraining' if pretraining else ''

    trained_epoch = 10  # which pre-training epoch's checkpoint to load

    num_layers = arch['num_layers']
    num_heads = arch['num_heads']
    d_model = arch['d_model']
    addH = arch['addH']

    dff = d_model * 2
    vocab_size = 17

    np.random.seed(seed=seed)
    tf.random.set_seed(seed=seed)
    # BUGFIX: honour the preset's addH flag instead of hard-coding True
    # (identical behaviour for `medium`, whose addH is True).
    train_dataset, test_dataset, val_dataset = Graph_Classification_Dataset(
        'data/clf/{}.csv'.format(task), smiles_field='SMILES',
        label_field='Label', addH=addH).get_data()

    # One real batch to build the model variables before loading weights.
    x, adjoin_matrix, y = next(iter(train_dataset.take(1)))
    seq = tf.cast(tf.math.equal(x, 0), tf.float32)
    mask = seq[:, tf.newaxis, tf.newaxis, :]  # mask padded positions (token id 0)
    model = PredictModel(num_layers=num_layers, d_model=d_model, dff=dff,
                         num_heads=num_heads, vocab_size=vocab_size,
                         dense_dropout=0.5)

    if pretraining:
        # Round-trip the pre-trained weights through a throwaway BertModel so
        # that only the shared encoder is transferred into PredictModel.
        temp = BertModel(num_layers=num_layers, d_model=d_model, dff=dff,
                         num_heads=num_heads, vocab_size=vocab_size)
        pred = temp(x, mask=mask, training=True, adjoin_matrix=adjoin_matrix)
        temp.load_weights(arch['path'] + '/bert_weights{}_{}.h5'.format(arch['name'], trained_epoch))
        temp.encoder.save_weights(arch['path'] + '/bert_weights_encoder{}_{}.h5'.format(arch['name'], trained_epoch))
        del temp

        pred = model(x, mask=mask, training=True, adjoin_matrix=adjoin_matrix)
        model.encoder.load_weights(arch['path'] + '/bert_weights_encoder{}_{}.h5'.format(arch['name'], trained_epoch))
        print('load_weights')  # BUGFIX: typo 'load_wieghts' in the original

    optimizer = tf.keras.optimizers.Adam(learning_rate=5e-5)

    auc = -10  # best validation AUC so far
    stopping_monitor = 0  # epochs since the last improvement
    for epoch in range(100):
        accuracy_object = tf.keras.metrics.BinaryAccuracy()
        loss_object = tf.keras.losses.BinaryCrossentropy(from_logits=True)
        for x, adjoin_matrix, y in train_dataset:
            with tf.GradientTape() as tape:
                seq = tf.cast(tf.math.equal(x, 0), tf.float32)
                mask = seq[:, tf.newaxis, tf.newaxis, :]
                preds = model(x, mask=mask, training=True, adjoin_matrix=adjoin_matrix)
                loss = loss_object(y, preds)
                grads = tape.gradient(loss, model.trainable_variables)
                optimizer.apply_gradients(zip(grads, model.trainable_variables))
                accuracy_object.update_state(y, preds)
        print('epoch: ', epoch, 'loss: {:.4f}'.format(loss.numpy().item()),
              'accuracy: {:.4f}'.format(accuracy_object.result().numpy().item()))

        # Validation pass: collect logits, sigmoid them, score AUC/accuracy.
        y_true = []
        y_preds = []

        for x, adjoin_matrix, y in val_dataset:
            seq = tf.cast(tf.math.equal(x, 0), tf.float32)
            mask = seq[:, tf.newaxis, tf.newaxis, :]
            preds = model(x, mask=mask, adjoin_matrix=adjoin_matrix, training=False)
            y_true.append(y.numpy())
            y_preds.append(preds.numpy())
        y_true = np.concatenate(y_true, axis=0).reshape(-1)
        y_preds = np.concatenate(y_preds, axis=0).reshape(-1)
        y_preds = tf.sigmoid(y_preds).numpy()  # model emits logits
        auc_new = roc_auc_score(y_true, y_preds)

        val_accuracy = keras.metrics.binary_accuracy(y_true.reshape(-1), y_preds.reshape(-1)).numpy()
        print('val auc:{:.4f}'.format(auc_new), 'val accuracy:{:.4f}'.format(val_accuracy))

        if auc_new > auc:
            auc = auc_new
            stopping_monitor = 0
            # BUGFIX: the original passed trained_epoch twice, so the trailing
            # pretraining_str argument was silently ignored by str.format.
            np.save('{}/{}{}{}{}{}'.format(arch['path'], task, seed, arch['name'], trained_epoch, pretraining_str),
                    [y_true, y_preds])
            model.save_weights('classification_weights/{}_{}.h5'.format(task, seed))
            print('save model weights')
        else:
            stopping_monitor += 1
        print('best val auc: {:.4f}'.format(auc))
        if stopping_monitor > 0:
            print('stopping_monitor:', stopping_monitor)
        if stopping_monitor > 20:  # early-stopping patience
            break

    # Reload the best checkpoint and evaluate on the held-out test set.
    y_true = []
    y_preds = []
    model.load_weights('classification_weights/{}_{}.h5'.format(task, seed))
    for x, adjoin_matrix, y in test_dataset:
        seq = tf.cast(tf.math.equal(x, 0), tf.float32)
        mask = seq[:, tf.newaxis, tf.newaxis, :]
        preds = model(x, mask=mask, adjoin_matrix=adjoin_matrix, training=False)
        y_true.append(y.numpy())
        y_preds.append(preds.numpy())
    y_true = np.concatenate(y_true, axis=0).reshape(-1)
    y_preds = np.concatenate(y_preds, axis=0).reshape(-1)
    y_preds = tf.sigmoid(y_preds).numpy()
    test_auc = roc_auc_score(y_true, y_preds)
    test_accuracy = keras.metrics.binary_accuracy(y_true.reshape(-1), y_preds.reshape(-1)).numpy()
    print('test auc:{:.4f}'.format(test_auc), 'test accuracy:{:.4f}'.format(test_accuracy))

    return test_auc
def main(seed):
    """Fine-tune a graph-BERT regressor on the PPB task.

    Builds a PredictModel, optionally warm-starts its encoder from
    pre-trained BERT weights, trains with MSE and early stopping on
    validation R^2, then evaluates the best checkpoint on the test set.

    Args:
        seed: random seed for tensorflow; also embedded in output file names.

    Returns:
        float: best validation R^2 (the test R^2 is printed, not returned).
    """
    # tasks = ['caco2', 'logD', 'logS', 'PPB', 'tox']
    keras.backend.clear_session()
    os.environ['CUDA_VISIBLE_DEVICES'] = "0"

    # Architecture presets ('path' = pre-trained weight directory).
    small = {'name': 'Small', 'num_layers': 3, 'num_heads': 4, 'd_model': 128, 'path': 'small_weights', 'addH': True}
    medium = {'name': 'Medium', 'num_layers': 6, 'num_heads': 8, 'd_model': 256, 'path': 'medium_weights', 'addH': True}
    medium2 = {'name': 'Medium', 'num_layers': 6, 'num_heads': 8, 'd_model': 256, 'path': 'medium_weights2',
               'addH': True}
    large = {'name': 'Large', 'num_layers': 12, 'num_heads': 12, 'd_model': 576, 'path': 'large_weights', 'addH': True}
    medium_without_H = {'name': 'Medium', 'num_layers': 6, 'num_heads': 8, 'd_model': 256, 'path': 'weights_without_H', 'addH': False}
    medium_without_pretrain = {'name': 'Medium', 'num_layers': 6, 'num_heads': 8, 'd_model': 256, 'path': 'medium_without_pretraining_weights', 'addH': True}

    arch = medium
    pretraining = True
    pretraining_str = 'pretraining' if pretraining else ''

    trained_epoch = 10  # which pre-training epoch's checkpoint to load
    task = 'PPB'
    print(task)

    num_layers = arch['num_layers']
    num_heads = arch['num_heads']
    d_model = arch['d_model']
    addH = arch['addH']

    dff = d_model * 2
    vocab_size = 17

    tf.random.set_seed(seed=seed)
    # BUGFIX: the original called .get_data() on the dataset AND again on its
    # result (a tuple), which would raise AttributeError, and also contained a
    # bare `value_range = ` line (SyntaxError).  Keep the dataset object so
    # value_range() remains available.
    graph_dataset = Graph_Regression_Dataset('data/reg/{}.txt'.format(task), smiles_field='SMILES',
                                             label_field='Label', addH=addH)
    train_dataset, test_dataset, val_dataset = graph_dataset.get_data()

    # Rescales normalised-label MSE back to the original units (assumes the
    # dataset exposes value_range() — confirm against the dataset class).
    value_range = graph_dataset.value_range()

    # One real batch to build the model variables before loading weights.
    x, adjoin_matrix, y = next(iter(train_dataset.take(1)))
    seq = tf.cast(tf.math.equal(x, 0), tf.float32)
    mask = seq[:, tf.newaxis, tf.newaxis, :]  # mask padded positions (token id 0)
    model = PredictModel(num_layers=num_layers, d_model=d_model, dff=dff, num_heads=num_heads, vocab_size=vocab_size,
                         dense_dropout=0.15)
    if pretraining:
        # Round-trip the pre-trained weights through a throwaway BertModel so
        # that only the shared encoder is transferred into PredictModel.
        temp = BertModel(num_layers=num_layers, d_model=d_model, dff=dff, num_heads=num_heads, vocab_size=vocab_size)
        pred = temp(x, mask=mask, training=True, adjoin_matrix=adjoin_matrix)
        temp.load_weights(arch['path'] + '/bert_weights{}_{}.h5'.format(arch['name'], trained_epoch))
        temp.encoder.save_weights(arch['path'] + '/bert_weights_encoder{}_{}.h5'.format(arch['name'], trained_epoch))
        del temp

        pred = model(x, mask=mask, training=True, adjoin_matrix=adjoin_matrix)

        model.encoder.load_weights(arch['path'] + '/bert_weights_encoder{}_{}.h5'.format(arch['name'], trained_epoch))
        print('load_weights')  # BUGFIX: typo 'load_wieghts' in the original

    class CustomSchedule(tf.keras.optimizers.schedules.LearningRateSchedule):
        """Linear warm-up (first 10% of steps) then linear decay, peak 1e-4."""

        def __init__(self, d_model, total_steps=4000):
            super(CustomSchedule, self).__init__()

            self.d_model = tf.cast(d_model, tf.float32)
            self.total_step = total_steps
            self.warmup_steps = total_steps * 0.10

        def __call__(self, step):
            arg1 = step / self.warmup_steps
            arg2 = 1 - (step - self.warmup_steps) / (self.total_step - self.warmup_steps)

            return 10e-5 * tf.math.minimum(arg1, arg2)

    steps_per_epoch = len(train_dataset)
    # NOTE(review): the schedule is built but never passed to the optimizer —
    # training actually runs at a constant 1e-4.  Preserved as-is.
    learning_rate = CustomSchedule(128, 100 * steps_per_epoch)
    optimizer = tf.keras.optimizers.Adam(learning_rate=10e-5)

    r2 = -10  # best validation R^2 so far
    stopping_monitor = 0  # epochs since the last improvement
    for epoch in range(100):
        mse_object = tf.keras.metrics.MeanSquaredError()
        for x, adjoin_matrix, y in train_dataset:
            with tf.GradientTape() as tape:
                seq = tf.cast(tf.math.equal(x, 0), tf.float32)
                mask = seq[:, tf.newaxis, tf.newaxis, :]
                preds = model(x, mask=mask, training=True, adjoin_matrix=adjoin_matrix)
                loss = tf.reduce_mean(tf.square(y - preds))
                grads = tape.gradient(loss, model.trainable_variables)
                optimizer.apply_gradients(zip(grads, model.trainable_variables))
                mse_object.update_state(y, preds)
        print('epoch: ', epoch, 'loss: {:.4f}'.format(loss.numpy().item()),
              'mse: {:.4f}'.format(mse_object.result().numpy().item() * (value_range**2)))

        # Validation pass.
        y_true = []
        y_preds = []
        for x, adjoin_matrix, y in val_dataset:
            seq = tf.cast(tf.math.equal(x, 0), tf.float32)
            mask = seq[:, tf.newaxis, tf.newaxis, :]
            preds = model(x, mask=mask, adjoin_matrix=adjoin_matrix, training=False)
            y_true.append(y.numpy())
            y_preds.append(preds.numpy())
        y_true = np.concatenate(y_true, axis=0).reshape(-1)
        y_preds = np.concatenate(y_preds, axis=0).reshape(-1)
        r2_new = r2_score(y_true, y_preds)

        val_mse = keras.metrics.MSE(y_true, y_preds).numpy() * (value_range**2)
        print('val r2: {:.4f}'.format(r2_new), 'val mse:{:.4f}'.format(val_mse))
        if r2_new > r2:
            r2 = r2_new
            stopping_monitor = 0
            # BUGFIX: the original passed trained_epoch twice, so the trailing
            # pretraining_str argument was silently ignored by str.format.
            np.save('{}/{}{}{}{}{}'.format(arch['path'], task, seed, arch['name'], trained_epoch, pretraining_str),
                    [y_true, y_preds])
            # NOTE(review): the checkpoint name omits the seed, so runs with
            # different seeds overwrite each other — preserved as-is.
            model.save_weights('regression_weights/{}.h5'.format(task))
        else:
            stopping_monitor += 1
        print('best r2: {:.4f}'.format(r2))
        if stopping_monitor > 0:
            print('stopping_monitor:', stopping_monitor)
        if stopping_monitor > 20:  # early-stopping patience
            break

    # Reload the best checkpoint and evaluate on the held-out test set.
    y_true = []
    y_preds = []
    model.load_weights('regression_weights/{}.h5'.format(task))
    for x, adjoin_matrix, y in test_dataset:
        seq = tf.cast(tf.math.equal(x, 0), tf.float32)
        mask = seq[:, tf.newaxis, tf.newaxis, :]
        preds = model(x, mask=mask, adjoin_matrix=adjoin_matrix, training=False)
        y_true.append(y.numpy())
        y_preds.append(preds.numpy())
    y_true = np.concatenate(y_true, axis=0).reshape(-1)
    y_preds = np.concatenate(y_preds, axis=0).reshape(-1)

    test_r2 = r2_score(y_true, y_preds)
    test_mse = keras.metrics.MSE(y_true.reshape(-1), y_preds.reshape(-1)).numpy() * (value_range**2)
    print('test r2:{:.4f}'.format(test_r2), 'test mse:{:.4f}'.format(test_mse))

    return r2
Exemplo n.º 16
0
    except:
        train_dataset, dev_dataset = SQuAD1()
        old_vocab = train_dataset.vocab
        vocab = torchtext.legacy.vocab.Vocab(
            counter=old_vocab.freqs, specials=['<unk>', '<pad>', '<MASK>'])
        with open(args.save_vocab, 'wb') as f:
            torch.save(vocab, f)
    pad_id = vocab.stoi['<pad>']
    sep_id = vocab.stoi['<sep>']
    cls_id = vocab.stoi['<cls>']
    train_dataset, dev_dataset = SQuAD1(vocab=vocab)
    train_dataset = process_raw_data(train_dataset)
    dev_dataset = process_raw_data(dev_dataset)
    device = torch.device("cuda" if torch.cuda.is_available() else "cpu")
    embed_layer = BertEmbedding(len(vocab), args.emsize)
    pretrained_bert = BertModel(len(vocab), args.emsize, args.nhead, args.nhid,
                                args.nlayers, embed_layer, args.dropout)
    pretrained_bert.load_state_dict(torch.load(args.bert_model))
    model = QuestionAnswerTask(pretrained_bert).to(device)
    criterion = nn.CrossEntropyLoss()
    optimizer = torch.optim.SGD(model.parameters(), lr=args.lr)
    scheduler = torch.optim.lr_scheduler.StepLR(optimizer, 1.0, gamma=0.1)
    best_f1 = None
    train_loss_log, val_loss_log = [], []

    for epoch in range(1, args.epochs + 1):
        epoch_start_time = time.time()
        train()
        val_loss, val_exact, val_f1 = evaluate(dev_dataset, vocab)
        val_loss_log.append(val_loss)
        print('-' * 89)
        print('| end of epoch {:3d} | time: {:5.2f}s | valid loss {:5.2f} | '
Exemplo n.º 17
0
_CLF_TASKS = ['Ames', 'BBB', 'FDAMDD', 'H_HT', 'Pgp_inh', 'Pgp_sub']


def _masked_forward(model, x, adjoin_matrix, training):
    """Run `model` on one batch, masking padded positions (token id 0)."""
    seq = tf.cast(tf.math.equal(x, 0), tf.float32)
    mask = seq[:, tf.newaxis, tf.newaxis, :]
    return model(x, mask=mask, training=training, adjoin_matrix=adjoin_matrix)


def _eval_task(model, dataset, task_idx):
    """Evaluate one task head on `dataset`; returns (roc_auc, binary_accuracy)."""
    y_true = []
    y_preds = []
    for x, adjoin_matrix, y in dataset:
        preds = _masked_forward(model, x, adjoin_matrix, training=False)
        y_true.append(y.numpy())
        y_preds.append(preds[:, task_idx].numpy())
    y_true = np.concatenate(y_true, axis=0).reshape(-1)
    y_preds = np.concatenate(y_preds, axis=0).reshape(-1)
    y_preds = tf.sigmoid(y_preds).numpy()  # heads emit logits
    auc = roc_auc_score(y_true, y_preds)
    accuracy = keras.metrics.binary_accuracy(y_true, y_preds).numpy()
    return auc, accuracy


def main(seed):
    """Jointly fine-tune one shared graph-BERT encoder on six ADMET
    classification tasks (one model output column per task).

    Every epoch iterates the Ames training set; for each Ames batch one batch
    is drawn from each of the other five tasks and the six weighted BCE losses
    are summed into a single gradient step.  Per-task test metrics are printed
    every epoch.

    Args:
        seed: random seed for numpy/tensorflow.

    Returns:
        int: always 0 — NOTE(review): the original never updated `auc`; that
        behaviour is preserved, so use the printed per-task AUCs instead.
    """
    task = 'Ames'
    print(task)

    # Architecture presets ('path' = pre-trained weight directory).  Unused
    # variants from the original (medium2, without_H, balanced, ...) removed.
    small = {'name': 'Small', 'num_layers': 3, 'num_heads': 4, 'd_model': 128,
             'path': 'small_weights', 'addH': True}
    medium = {'name': 'Medium', 'num_layers': 6, 'num_heads': 8, 'd_model': 256,
              'path': 'medium_weights', 'addH': True}
    large = {'name': 'Large', 'num_layers': 12, 'num_heads': 12, 'd_model': 516,
             'path': 'large_weights', 'addH': True}

    arch = medium
    pretraining = True
    pretraining_str = 'pretraining' if pretraining else ''  # kept for parity with sibling scripts

    trained_epoch = 6  # which pre-training epoch's checkpoint to load

    num_layers = arch['num_layers']
    num_heads = arch['num_heads']
    d_model = arch['d_model']
    addH = arch['addH']

    dff = d_model * 2
    vocab_size = 17

    np.random.seed(seed=seed)
    tf.random.set_seed(seed=seed)

    # One (train, test, val) split per task.  NOTE(review): the original used
    # Windows-style backslash paths ('data\clf\...'); byte-identical paths are
    # preserved here via the escaped backslashes.
    splits = [
        Graph_Classification_Dataset('data\\clf\\{}.txt'.format(name),
                                     smiles_field='SMILES',
                                     label_field='Label',
                                     addH=addH).get_data()
        for name in _CLF_TASKS
    ]
    train_sets = [s[0] for s in splits]
    test_sets = [s[1] for s in splits]

    # One real batch to build the model variables before loading weights.
    x, adjoin_matrix, y = next(iter(train_sets[0].take(1)))
    model = PredictModel(num_layers=num_layers,
                         d_model=d_model,
                         dff=dff,
                         num_heads=num_heads,
                         vocab_size=vocab_size,
                         dense_dropout=0.2)

    if pretraining:
        # Round-trip the pre-trained weights through a throwaway BertModel so
        # that only the shared encoder is transferred into PredictModel.
        temp = BertModel(num_layers=num_layers,
                         d_model=d_model,
                         dff=dff,
                         num_heads=num_heads,
                         vocab_size=vocab_size)
        _masked_forward(temp, x, adjoin_matrix, training=True)
        temp.load_weights(
            arch['path'] +
            '/bert_weights{}_{}.h5'.format(arch['name'], trained_epoch))
        temp.encoder.save_weights(arch['path'] +
                                  '/bert_weights_encoder{}_{}.h5'.format(
                                      arch['name'], trained_epoch))
        del temp

        _masked_forward(model, x, adjoin_matrix, training=True)
        model.encoder.load_weights(arch['path'] +
                                   '/bert_weights_encoder{}_{}.h5'.format(
                                       arch['name'], trained_epoch))
        print('load_weights')  # BUGFIX: typo 'load_wieghts' in the original

    # NOTE(review): the original built an unused CustomSchedule warm-up here;
    # the optimizer actually runs at a constant 1e-4, so it was removed.
    optimizer = tf.keras.optimizers.Adam(learning_rate=10e-5)

    auc = 0
    stopping_monitor = 0  # never incremented; kept for parity with the original

    # Ames (task 0) loss is up-weighted x10, as in the original.
    loss_weights = [10, 1, 1, 1, 1, 1]

    for epoch in range(100):
        loss_object = tf.keras.losses.BinaryCrossentropy(from_logits=True)
        for batch0 in train_sets[0]:
            # One batch from each of the other tasks per Ames batch.
            # NOTE(review): next(iter(ds)) restarts the input pipeline every
            # step — with a shuffling tf.data pipeline this draws a fresh
            # batch; behaviour preserved from the original.
            batches = [batch0] + [next(iter(ds)) for ds in train_sets[1:]]

            with tf.GradientTape() as tape:
                loss = 0
                for i, (x, adjoin_matrix, y) in enumerate(batches):
                    preds = _masked_forward(model, x, adjoin_matrix,
                                            training=True)
                    # Head `i` (output column i) is supervised by task i only.
                    loss = loss + loss_object(y, preds[:, i]) * loss_weights[i]
                grads = tape.gradient(loss, model.trainable_variables)
                optimizer.apply_gradients(zip(grads,
                                              model.trainable_variables))
        print('epoch: ', epoch, 'loss: {:.4f}'.format(loss.numpy().item()))

        # Per-task test metrics.  (Log label normalised to 'test auc:'; the
        # original mixed 'test auc :' and 'test auc:'.)
        for i, test_ds in enumerate(test_sets):
            auc_new, test_accuracy = _eval_task(model, test_ds, i)
            print('test auc:{:.4f}'.format(auc_new),
                  'test accuracy:{:.4f}'.format(test_accuracy))

    return auc
Exemplo n.º 18
0
# Architecture presets: layer count / attention heads / hidden size, plus the
# directory holding the matching pre-trained weights.
large = {'name': 'Large', 'num_layers': 12, 'num_heads': 12, 'd_model': 576, 'path': 'large_weights','addH':True}
medium_balanced = {'name':'Medium','num_layers': 6, 'num_heads': 8, 'd_model': 256,'path':'weights_balanced','addH':True}
medium_without_H = {'name':'Medium','num_layers': 6, 'num_heads': 8, 'd_model': 256,'path':'weights_without_H','addH':False}

# NOTE(review): `medium2` is not defined in this excerpt — presumably declared
# earlier in the file; confirm before running.
arch = medium2           ## small 3 4 128   medium: 6 6  256     large:  12 8 516
num_layers = arch['num_layers']
num_heads =  arch['num_heads']
d_model =  arch['d_model']
addH = arch['addH']


dff = d_model*2
# presumably the atom-type vocabulary size incl. special tokens — TODO confirm
vocab_size =18
dropout_rate = 0.1

# Bare BERT encoder for the multi-task graph pre-training objective.
model = BertModel(num_layers=num_layers,d_model=d_model,dff=dff,num_heads=num_heads,vocab_size=vocab_size)

train_dataset, test_dataset = Multi_Task_Graph_Bert_Dataset(path='data/chem.txt',smiles_field='CAN_SMILES',addH=addH).get_data()

# Tensor signature for the training step: (token ids, adjacency matrix, label
# ids, per-position weights) — presumably used with
# @tf.function(input_signature=train_step_signature); verify against caller.
train_step_signature = [
    tf.TensorSpec(shape=(None, None), dtype=tf.int64),
    tf.TensorSpec(shape=(None, None,None), dtype=tf.float32),
    tf.TensorSpec(shape=(None, None), dtype=tf.int64),
    tf.TensorSpec(shape=(None, None), dtype=tf.float32),
]

# Running metrics and the masked-token loss (logits in, sparse labels).
train_loss = tf.keras.metrics.Mean(name='train_loss')
train_accuracy = tf.keras.metrics.SparseCategoricalAccuracy(name='train_accuracy')
test_accuracy = tf.keras.metrics.SparseCategoricalAccuracy(name='test_accuracy')
loss_function = tf.keras.losses.SparseCategoricalCrossentropy(from_logits=True)
Exemplo n.º 19
0
    def model_fn(features, labels, mode, params):
        """tf.estimator model_fn for masked-LM BERT training.

        The (features, labels, mode, params) signature is required by
        tf.estimator; `labels` and `params` are unused here.  Builds the
        model, computes the masked-LM loss, and returns an EstimatorSpec for
        the PREDICT / TRAIN / EVAL mode.

        NOTE(review): relies on `bert_config`, `init_checkpoint`,
        `num_train_steps`, `_info`, `optimization`, and
        `get_assignment_map_from_checkpoint` from the enclosing scope (not
        visible in this excerpt).
        """
        # obtain the data
        _info('*** Features ***')
        for name in sorted(features.keys()):
            tf.logging.info("  name = %s, shape = %s" %
                            (name, features[name].shape))

        input_ids = features['input_ids']  # [batch_size, seq_length]
        input_mask = features['input_mask']  # [batch_size, seq_length]

        # if mode != tf.estimator.ModeKeys.PREDICT:
        #     # segment_idx = features['segment_dis']
        #     masked_lm_positions = features['masked_lm_positions']   # [batch_size, seq_length], specify the answer
        #     masked_lm_ids = features['masked_lm_ids']               # [batch_size, answer_seq_length], specify the answer labels
        #     masked_lm_weights = features['masked_lm_weights']        # [batch_size, seq_length], [1, 1, 0], 0 refers to the mask
        #     # next_sentence_labels = features['next_sentence_labels']
        # else:
        # Masked-LM supervision tensors (read in every mode).
        masked_lm_positions = features['masked_lm_positions']
        masked_lm_ids = features['masked_lm_ids']
        masked_lm_weights = features['masked_lm_weights']

        # Log which training objective this run uses (config-driven).
        if bert_config.train_type == 'seq2seq':
            _info('Training seq2seq task.')
        elif bert_config.train_type == 'lm':
            _info('Training language model task.')

        # build model
        is_training = (mode == tf.estimator.ModeKeys.TRAIN)
        model = BertModel(config=bert_config,
                          is_training=is_training,
                          input_ids=input_ids,
                          input_mask=input_mask)

        # compute loss over the masked positions only
        loss, per_loss, log_probs, logits = get_masked_lm_output(
            bert_config, model.get_sequence_output(), model.embedding_table,
            model.projection_table, masked_lm_positions, masked_lm_ids,
            masked_lm_weights, mode)

        if mode == tf.estimator.ModeKeys.PREDICT:
            # Flatten the argmax over the vocabulary into a 1-D prediction
            # tensor for the Estimator's predict() output.
            masked_lm_predictions = tf.reshape(
                tf.argmax(log_probs, axis=-1, output_type=tf.int32), [-1])
            output_spec = tf.estimator.EstimatorSpec(
                mode, predictions=masked_lm_predictions)
        else:
            if mode == tf.estimator.ModeKeys.TRAIN:
                # restore from the checkpoint,
                # tf.estimator automatically restore from the model typically,
                # maybe here is for restore some pre-trained parameters
                tvars = tf.trainable_variables()
                initialized_variable_names = {}
                if init_checkpoint:
                    (assignment_map, initialized_variable_names
                     ) = get_assignment_map_from_checkpoint(
                         tvars, init_checkpoint)
                    tf.train.init_from_checkpoint(init_checkpoint,
                                                  assignment_map)

                # Log every trainable variable, flagging the ones restored
                # from the pre-trained checkpoint.
                _info('*** Trainable Variables ***')
                for var in tvars:
                    init_string = ''
                    if var.name in initialized_variable_names:
                        init_string = ', *INIT_FROM_CKPT*'
                    _info('name = {}, shape={}{}'.format(
                        var.name, var.shape, init_string))

                train_op = optimization.create_optimizer(
                    loss, bert_config.learning_rate, num_train_steps,
                    bert_config.lr_limit)

                # learning_rate = tf.train.polynomial_decay(bert_config.learning_rate,
                #                                         tf.train.get_or_create_global_step(),
                #                                         num_train_steps,
                #                                         end_learning_rate=0.0,
                #                                         power=1.0,
                #                                         cycle=False)
                # optimizer = tf.train.GradientDescentOptimizer(learning_rate=learning_rate)
                # gradients = tf.gradients(loss, tvars, colocate_gradients_with_ops=True)
                # clipped_gradients, _ = tf.clip_by_global_norm(gradients, 5.0)
                # train_op = optimizer.apply_gradients(zip(clipped_gradients, tvars), global_step=tf.train.get_global_step())
                output_spec = tf.estimator.EstimatorSpec(mode,
                                                         loss=loss,
                                                         train_op=train_op)
            elif mode == tf.estimator.ModeKeys.EVAL:
                # NOTE(review): is_real_example is computed and passed below
                # but metric_fn never uses it.
                is_real_example = tf.ones(tf.shape(masked_lm_ids),
                                          dtype=tf.float32)

                def metric_fn(loss, label_ids, logits, is_real_example):
                    """Build eval metric ops (token accuracy and mean loss).

                    Args:
                        loss: tf.float32.
                        label_ids: [b, s].
                        logits: [b, s, v].
                    """
                    # [b * s, v]
                    logits = tf.reshape(logits, [-1, logits.shape[-1]])
                    # [b * s, 1]
                    predictions = tf.argmax(logits,
                                            axis=-1,
                                            output_type=tf.int32)
                    # [b * s]
                    label_ids = tf.reshape(label_ids, [-1])
                    accuracy = tf.metrics.accuracy(labels=label_ids,
                                                   predictions=predictions)
                    loss = tf.metrics.mean(values=loss)
                    return {'eval_accuracy': accuracy, 'eval_loss': loss}

                eval_metrics = metric_fn(loss, masked_lm_ids, logits,
                                         is_real_example)
                output_spec = tf.estimator.EstimatorSpec(
                    mode=mode, loss=loss, eval_metric_ops=eval_metrics)

        return output_spec
Exemplo n.º 20
0
def run_main(args, rank=None):
    """Fine-tune a next-sentence-prediction head on a pretrained BERT.

    Trains for ``args.epochs`` epochs, checkpoints the best model by
    validation loss, then reloads that checkpoint and reports test loss.

    Args:
        args: parsed CLI namespace (seed, parallel mode, dataset name,
            checkpoint/save paths, model and optimizer hyperparameters).
        rank: this process's rank when ``args.parallel == 'DDP'``;
            ``None`` for single-process runs.

    Raises:
        ValueError: if ``args.dataset`` is not a supported dataset name.
    """
    # Set the random seed manually for reproducibility.
    torch.manual_seed(args.seed)
    if args.parallel == 'DDP':
        # Partition the visible GPUs evenly across the DDP world; this rank
        # owns a contiguous slice of device ids.
        n = torch.cuda.device_count() // args.world_size
        device = list(range(rank * n, (rank + 1) * n))
    else:
        device = torch.device("cuda" if torch.cuda.is_available() else "cpu")
    vocab = torch.load(args.save_vocab)
    cls_id = vocab.stoi['<cls>']
    pad_id = vocab.stoi['<pad>']
    sep_id = vocab.stoi['<sep>']

    if args.dataset == 'WikiText103':
        from torchtext.experimental.datasets import WikiText103
        train_dataset, valid_dataset, test_dataset = WikiText103(vocab=vocab)
    elif args.dataset == 'BookCorpus':
        from data import BookCorpus
        train_dataset, valid_dataset, test_dataset = BookCorpus(vocab, min_sentence_len=60)
    else:
        # Fail fast: falling through would raise a confusing NameError on
        # train_dataset further down.
        raise ValueError("unsupported dataset: {}".format(args.dataset))

    if rank is not None:
        # Give each DDP worker a disjoint, contiguous shard of the training data.
        chunk_len = len(train_dataset.data) // args.world_size
        train_dataset.data = train_dataset.data[(rank * chunk_len):((rank + 1) * chunk_len)]

    if args.checkpoint != 'None':
        # Resume from a full serialized model.
        model = torch.load(args.checkpoint)
    else:
        # Build the task head on top of pretrained BERT weights.
        pretrained_bert = BertModel(len(vocab), args.emsize, args.nhead, args.nhid, args.nlayers, args.dropout)
        pretrained_bert.load_state_dict(torch.load(args.bert_model))
        model = NextSentenceTask(pretrained_bert)

    if args.parallel == 'DDP':
        model = model.to(device[0])
        model = DDP(model, device_ids=device)
    else:
        model = model.to(device)
    criterion = nn.CrossEntropyLoss()
    optimizer = torch.optim.SGD(model.parameters(), lr=args.lr)
    scheduler = torch.optim.lr_scheduler.StepLR(optimizer, 1.0, gamma=0.1)
    best_val_loss = None
    train_loss_log, val_loss_log = [], []

    for epoch in range(1, args.epochs + 1):
        epoch_start_time = time.time()
        train(process_raw_data(train_dataset, args), model, train_loss_log, device, optimizer,
              criterion, epoch, scheduler, cls_id, sep_id, pad_id, args, rank)
        val_loss = evaluate(process_raw_data(valid_dataset, args), model, device, criterion,
                            cls_id, sep_id, pad_id, args)
        val_loss_log.append(val_loss)

        if (rank is None) or (rank == 0):
            print('-' * 89)
            print('| end of epoch {:3d} | time: {:5.2f}s '
                  '| valid loss {:8.5f} | '.format(epoch,
                                                   (time.time() - epoch_start_time),
                                                   val_loss))
            print('-' * 89)
        # `is None` rather than truthiness: a validation loss of exactly 0.0
        # must still be recorded as the best so far.
        if best_val_loss is None or val_loss < best_val_loss:
            if rank is None:
                # Single process: persist the whole model object.
                with open(args.save, 'wb') as f:
                    torch.save(model, f)
            elif rank == 0:
                # DDP: only rank 0 writes; a state_dict lets every rank
                # re-load with its own device mapping below.
                with open(args.save, 'wb') as f:
                    torch.save(model.state_dict(), f)
            best_val_loss = val_loss
        else:
            # No improvement this epoch: decay the learning rate.
            scheduler.step()
    if args.parallel == 'DDP':
        # Remap rank-0's saved cuda device ids onto this rank's devices
        # before loading the best checkpoint.
        rank0_devices = [x - rank * len(device) for x in device]
        device_pairs = zip(rank0_devices, device)
        map_location = {'cuda:%d' % x: 'cuda:%d' % y for x, y in device_pairs}
        model.load_state_dict(torch.load(args.save, map_location=map_location))
        test_loss = evaluate(process_raw_data(test_dataset, args), model, device, criterion,
                             cls_id, sep_id, pad_id, args)
        if rank == 0:
            # model.module unwraps the DDP wrapper before final export.
            wrap_up(train_loss_log, val_loss_log, test_loss, args, model.module, 'ns_loss.txt', 'ns_model.pt')
    else:
        with open(args.save, 'rb') as f:
            model = torch.load(f)

        test_loss = evaluate(process_raw_data(test_dataset, args), model, device, criterion,
                             cls_id, sep_id, pad_id, args)
        wrap_up(train_loss_log, val_loss_log, test_loss, args, model, 'ns_loss.txt', 'ns_model.pt')