def main(args):
    preprocessor = Preprocessor(args.model_type, args.max_len)
    train_dataloader, val_dataloader, test_dataloader = get_dataloader(args, preprocessor)
    bert_finetuner = BertModel(args, train_dataloader, val_dataloader, test_dataloader)

    logger = TensorBoardLogger(save_dir=args.log_dir, version=1, name="nsmc-bert")
    early_stop_callback = EarlyStopping(monitor='val_acc',
                                        min_delta=0.00,
                                        patience=5,
                                        verbose=False,
                                        mode='max')
    checkpoint_callback = ModelCheckpoint(filepath=args.checkpoint_path,
                                          verbose=True,
                                          monitor='val_acc',
                                          mode='max',
                                          save_top_k=3,
                                          prefix='')

    trainer = pl.Trainer(
        gpus=1,
        # distributed_backend='ddp',
        checkpoint_callback=checkpoint_callback,
        early_stop_callback=early_stop_callback,
        logger=logger)
    trainer.fit(bert_finetuner)
    trainer.test()
def main():
    parser = ArgumentParser()
    parser.add_argument("--model-config", type=str, default="openai-gpt",
                        help="Path, url or short name of the model")
    parser.add_argument("--device", type=str,
                        default="cuda" if torch.cuda.is_available() else "cpu",
                        help="Device (cuda or cpu)")
    parser.add_argument("--outlens", type=int, default=30)
    parser.add_argument("--beam", type=int, default=1)
    parser.add_argument("--checkpoints", type=str)
    parser.add_argument("--data", type=str, default="file")
    args = parser.parse_args()
    args.load_model = True

    model = BertModel(None, args)
    state_dict = convert_model(torch.load(args.checkpoints)['sd'])
    model.load_state_dict(state_dict)
    model.to(args.device)

    tokenizer = BertWordPieceTokenizer("bert-base-chinese", cache_dir="temp_cache_dir")

    generate(model, tokenizer, args.device, args.data,
             sample=True, top_k=5, beam_size=6, outlens=30)
def get_model(tokenizer, args):
    """Build the model."""
    print('building BERT model ...')
    model = BertModel(tokenizer, args)
    print(' > number of parameters: {}'.format(
        sum([p.nelement() for p in model.parameters()])), flush=True)

    # GPU allocation.
    model.cuda(torch.cuda.current_device())

    # Fp16 conversion.
    if args.fp16:
        print("fp16 mode")
        model = FP16_Module(model)
        if args.fp32_embedding:
            model.module.model.bert.embeddings.word_embeddings.float()
            model.module.model.bert.embeddings.position_embeddings.float()
            model.module.model.bert.embeddings.token_type_embeddings.float()
        if args.fp32_tokentypes:
            model.module.model.bert.embeddings.token_type_embeddings.float()
        if args.fp32_layernorm:
            for name, _module in model.named_modules():
                if 'LayerNorm' in name:
                    _module.float()

    # Wrap model for distributed training.
    if args.world_size > 1:
        model = DDP(model)

    return model
def get_model(args):
    """Build the model."""
    print_rank_0('building BERT model ...')
    model = BertModel(args)

    if mpu.get_data_parallel_rank() == 0:
        print(' > number of parameters on model parallel rank {}: {}'.format(
            mpu.get_model_parallel_rank(),
            sum([p.nelement() for p in model.parameters()])), flush=True)

    # GPU allocation.
    model.cuda(torch.cuda.current_device())

    # Fp16 conversion.
    if args.fp16:
        model = FP16_Module(model)
        if args.fp32_embedding:
            model.module.model.bert.embeddings.word_embeddings.float()
            if args.ds_type == 'BERT':
                model.module.model.bert.embeddings.position_embeddings.float()
            else:
                model.module.model.bert.embeddings.token_position_embeddings.float()
                model.module.model.bert.embeddings.para_position_embeddings.float()
                model.module.model.bert.embeddings.sent_position_embeddings.float()
            model.module.model.bert.embeddings.token_type_embeddings.float()
        if args.fp32_tokentypes:
            model.module.model.bert.embeddings.token_type_embeddings.float()
        if args.fp32_layernorm:
            for name, _module in model.named_modules():
                if 'LayerNorm' in name:
                    _module.float()

    # Wrap model for distributed training.
    if args.DDP_impl == 'torch':
        i = torch.cuda.current_device()
        args.DDP_type = torch.nn.parallel.distributed.DistributedDataParallel
        model = args.DDP_type(model,
                              device_ids=[i],
                              output_device=i,
                              process_group=mpu.get_data_parallel_group())
    elif args.DDP_impl == 'local':
        args.DDP_type = LocalDDP
        model = args.DDP_type(model)
    else:
        print_rank_0('Unknown DDP implementation specified: {}. '
                     'Exiting.'.format(args.DDP_impl))
        exit()

    return model
def get_model(args):
    """Build the model."""
    print_rank_0('building BERT model ...')
    model = BertModel(args)

    if mpu.get_data_parallel_rank() == 0:
        print(' > number of parameters on model parallel rank {}: {}'.format(
            mpu.get_model_parallel_rank(),
            sum([p.nelement() for p in model.parameters()])), flush=True)

    # GPU allocation.
    model.cuda(torch.cuda.current_device())

    # Fp16 conversion.
    if args.fp16:
        model = FP16_Module(model)
        if args.fp32_embedding:
            model.module.model.bert.embeddings.word_embeddings.float()
            model.module.model.bert.embeddings.position_embeddings.float()
            model.module.model.bert.embeddings.token_type_embeddings.float()
        if args.fp32_tokentypes:
            model.module.model.bert.embeddings.token_type_embeddings.float()
        if args.fp32_layernorm:
            for name, _module in model.named_modules():
                if 'LayerNorm' in name:
                    _module.float()

    # Wrap model for distributed training.
    if USE_TORCH_DDP:
        i = torch.cuda.current_device()
        model = DDP(model, device_ids=[i], output_device=i,
                    process_group=mpu.get_data_parallel_group())
    else:
        model = DDP(model)

    return model
def model_init(app):
    ArgsSet = type('ArgsSet', (object,), {})
    client = ArgsSet()
    parser = ArgumentParser()
    parser.add_argument("--model-config", type=str, default="openai-gpt",
                        help="Path, url or short name of the model")
    parser.add_argument("--device", type=str,
                        default="cuda" if torch.cuda.is_available() else "cpu",
                        help="Device (cuda or cpu)")
    parser.add_argument("--outlens", type=int, default=30)
    parser.add_argument("--beam", type=int, default=1)
    parser.add_argument("--gpt-checkpoints", type=str)
    parser.add_argument("--port", type=int, default=8866)
    args = parser.parse_args()
    args.load_model = True
    args.fp32_embedding = False
    args.fp32_layernorm = False
    args.fp32_tokentypes = False
    args.layernorm_epsilon = 1e-12

    gpt = BertModel(None, args)
    state_dict = convert_model(torch.load(args.gpt_checkpoints)['sd'])
    gpt.load_state_dict(state_dict)
    gpt.to(args.device)
    gpt.eval()
    tokenizer = BertWordPieceTokenizer("bert-base-chinese", cache_dir="temp_cache_dir")
    print(" Load model from {}".format(args.gpt_checkpoints))

    client.tokenizer = tokenizer
    client.gpt = gpt
    client.gpt_beam = SequenceGenerator(gpt, tokenizer,
                                        beam_size=args.beam,
                                        max_lens=args.outlens)
    client.device = args.device
    client.port = args.port
    client.generator = sample_sequence

    return client
def main(train_file, dev_file, target_dir,
         epochs=10, batch_size=32, lr=2e-05, patience=3,
         max_grad_norm=10.0, checkpoint=None):
    bert_tokenizer = BertTokenizer.from_pretrained('bert-base-chinese', do_lower_case=True)
    device = torch.device("cuda")
    print(20 * "=", " Preparing for training ", 20 * "=")
    # Directory where model checkpoints are saved.
    if not os.path.exists(target_dir):
        os.makedirs(target_dir)

    # -------------------- Data loading ------------------- #
    print("\t* Loading training data...")
    train_data = DataPrecessForSentence(bert_tokenizer, train_file)
    train_loader = DataLoader(train_data, shuffle=True, batch_size=batch_size)
    print("\t* Loading validation data...")
    dev_data = DataPrecessForSentence(bert_tokenizer, dev_file)
    dev_loader = DataLoader(dev_data, shuffle=True, batch_size=batch_size)

    # -------------------- Model definition ------------------- #
    print("\t* Building model...")
    model = BertModel().to(device)

    # -------------------- Preparation for training ------------------- #
    # Parameters to be optimized.
    param_optimizer = list(model.named_parameters())
    no_decay = ['bias', 'LayerNorm.bias', 'LayerNorm.weight']
    optimizer_grouped_parameters = [{
        'params': [p for n, p in param_optimizer if not any(nd in n for nd in no_decay)],
        'weight_decay': 0.01
    }, {
        'params': [p for n, p in param_optimizer if any(nd in n for nd in no_decay)],
        'weight_decay': 0.0
    }]
    optimizer = AdamW(optimizer_grouped_parameters, lr=lr)
    # optimizer = torch.optim.Adam(optimizer_grouped_parameters, lr=lr)
    scheduler = torch.optim.lr_scheduler.ReduceLROnPlateau(optimizer, mode="max",
                                                           factor=0.85, patience=0)

    best_score = 0.0
    start_epoch = 1
    # Data for loss curves plot
    epochs_count = []
    train_losses = []
    valid_losses = []

    # Continuing training from a checkpoint if one was given as argument
    if checkpoint:
        checkpoint = torch.load(checkpoint)
        start_epoch = checkpoint["epoch"] + 1
        best_score = checkpoint["best_score"]
        print("\t* Training will continue on existing model from epoch {}...".format(start_epoch))
        model.load_state_dict(checkpoint["model"])
        optimizer.load_state_dict(checkpoint["optimizer"])
        epochs_count = checkpoint["epochs_count"]
        train_losses = checkpoint["train_losses"]
        valid_losses = checkpoint["valid_losses"]

    # Compute loss and accuracy before starting (or resuming) training.
    _, valid_loss, valid_accuracy, auc = validate(model, dev_loader)
    print("\t* Validation loss before training: {:.4f}, accuracy: {:.4f}%, auc: {:.4f}"
          .format(valid_loss, (valid_accuracy * 100), auc))

    # -------------------- Training epochs ------------------- #
    print("\n", 20 * "=", "Training Bert model on device: {}".format(device), 20 * "=")
    patience_counter = 0
    for epoch in range(start_epoch, epochs + 1):
        epochs_count.append(epoch)
        print("* Training epoch {}:".format(epoch))
        epoch_time, epoch_loss, epoch_accuracy = train(model, train_loader,
                                                       optimizer, epoch, max_grad_norm)
        train_losses.append(epoch_loss)
        print("-> Training time: {:.4f}s, loss = {:.4f}, accuracy: {:.4f}%"
              .format(epoch_time, epoch_loss, (epoch_accuracy * 100)))
        print("* Validation for epoch {}:".format(epoch))
        epoch_time, epoch_loss, epoch_accuracy, epoch_auc = validate(model, dev_loader)
        valid_losses.append(epoch_loss)
        print("-> Valid. time: {:.4f}s, loss: {:.4f}, accuracy: {:.4f}%, auc: {:.4f}\n"
              .format(epoch_time, epoch_loss, (epoch_accuracy * 100), epoch_auc))
        # Update the optimizer's learning rate with the scheduler.
        scheduler.step(epoch_accuracy)
        # Early stopping on validation accuracy.
        if epoch_accuracy < best_score:
            patience_counter += 1
        else:
            best_score = epoch_accuracy
            patience_counter = 0
            torch.save(
                {
                    "epoch": epoch,
                    "model": model.state_dict(),
                    "best_score": best_score,
                    "epochs_count": epochs_count,
                    "train_losses": train_losses,
                    "valid_losses": valid_losses
                },
                os.path.join(target_dir, "best.pth.tar"))
        if patience_counter >= patience:
            print("-> Early stopping: patience limit reached, stopping...")
            break
def train(args):
    if "bert" in args.model_type:
        set_gelu("tanh")  # switch the gelu version

        # Step1: Load Data
        data_generator = None
        if "siamese" in args.model_type:
            data_generator = SiameseDataGenerator
        elif "albert" in args.model_type:
            data_generator = BertDataGenerator
        train_ds = data_generator(data_path=args.train_data_path,
                                  batch_size=args.batch_size,
                                  dict_path=args.bert_dict_path,
                                  maxlen=args.query_len)
        dev_ds = data_generator(data_path=args.dev_data_path,
                                batch_size=args.batch_size,
                                maxlen=args.query_len,
                                dict_path=args.bert_dict_path)
        test_ds = data_generator(data_path=args.test_data_path,
                                 batch_size=args.batch_size,
                                 maxlen=args.query_len,
                                 dict_path=args.bert_dict_path)

        # Step2: Load Model
        model = None
        if "siamese" in args.model_type:
            model = SiameseBertModel(config_path=args.bert_config_path,
                                     checkpoint_path=args.bert_checkpoint_path,
                                     dense_units=args.dense_units)
        elif "albert" in args.model_type:
            model = BertModel(config_path=args.bert_config_path,
                              checkpoint_path=args.bert_checkpoint_path)
        model_name = model.__class__.__name__
        model = model.get_model()

        from bert4keras.optimizers import Adam
        model.compile(
            loss='sparse_categorical_crossentropy',
            optimizer=Adam(2e-5),  # use a sufficiently small learning rate
            # optimizer=PiecewiseLinearLearningRate(Adam(5e-5), {10000: 1, 30000: 0.1}),
            metrics=['accuracy'],
        )
        evaluator = Evaluator(dev_ds=dev_ds,
                              model_name=model_name,
                              is_bert_model=True,
                              test_ds=test_ds)

        logger.info("***** Running training *****")
        logger.info(" Model Class Name = %s", model_name)
        logger.info(" Num Epochs = %d", args.epoch)
        model.fit_generator(train_ds.forfit(),
                            steps_per_epoch=len(train_ds),
                            epochs=args.epoch,
                            callbacks=[evaluator],
                            verbose=2)
        model.load_weights('./checkpoints/best_{}.weight'.format(model_name))

        logger.info("***** Test Result *****")
        logger.info(" Model = %s", model_name)
        logger.info(" Batch Size = %d", args.batch_size)
        logger.info(" Final Test Acc:%05f",
                    cal_acc(data=test_ds, model=model, is_bert_model=True))

    elif "NN" in args.model_type:
        # Step1: Load Data
        train_data = pd.read_csv(args.train_data_path)
        dev_data = pd.read_csv(args.dev_data_path)
        test_data = pd.read_csv(args.test_data_path)
        category_count = len(train_data["category"].value_counts())
        category_encoder = category_OneHotEncoder(data_df=train_data)
        loader = LoadData(w2v_path=args.w2v_path, query_len=args.query_len)
        word2idx = loader.word2idx
        emd_matrix = loader.emb_matrix
        """
        Note: the order of shuffle matters. The general recommendation is to call
        shuffle first and then batch, so that the whole dataset is shuffled before
        batches of batch_size are drawn. If batch is applied before shuffle, only
        the batches themselves are shuffled while the samples inside each batch
        keep their original order, which weakens the randomization.
        """
        train_ds = loader.dataset(encoder=category_encoder, data_df=train_data)
        train_ds = train_ds.shuffle(buffer_size=len(train_data)).batch(
            batch_size=args.batch_size).repeat()
        dev_ds = loader.dataset(encoder=category_encoder, data_df=dev_data)
        dev_ds = dev_ds.batch(batch_size=args.batch_size)
        test_ds = loader.dataset(encoder=category_encoder, data_df=test_data)
        test_ds = test_ds.batch(batch_size=args.batch_size)

        # Step2: Load Model
        model = None
        if "siamese_CNN" in args.model_type:
            model = SiameseCnnModel(emb_matrix=emd_matrix,
                                    word2idx=word2idx,
                                    filters_nums=args.filters_nums,
                                    kernel_sizes=args.kernel_sizes,
                                    dense_units=args.dense_units,
                                    label_count=args.label_count,
                                    category_count=category_count,
                                    query_len=args.query_len,
                                    shared=args.feature_shared,
                                    add_feature=args.add_features)
        elif "siamese_RNN" in args.model_type:
            model = SiameseRnnModel(emb_matrix=emd_matrix,
                                    word2idx=word2idx,
                                    hidden_units=args.hidden_units,
                                    dense_units=args.dense_units,
                                    label_count=args.label_count,
                                    category_count=category_count,
                                    query_len=args.query_len,
                                    mask_zero=args.mask_zero,
                                    bidirection=args.bi_direction,
                                    shared=args.feature_shared,
                                    add_feature=args.add_features)
        model_name = model.__class__.__name__
        model = model.get_model()

        logger.info("***** Running training *****")
        logger.info(" Model Class Name = %s", model_name)
        logger.info(" Num examples = %d", len(train_data))
        logger.info(" Num Epochs = %d", args.epoch)
        model.compile(optimizer='adam',
                      loss="binary_crossentropy",
                      metrics=["acc"])
        early_stopping = EarlyStopping(monitor="val_acc", patience=3, mode="max")
        evaluator = Evaluator(dev_ds=dev_ds,
                              model_name=model_name,
                              is_bert_model=False,
                              dev_label=dev_data['label'])

        # Step3: Train Model
        history = model.fit(train_ds,
                            callbacks=[early_stopping, evaluator],
                            epochs=args.epoch,
                            steps_per_epoch=len(train_data) // args.batch_size,
                            validation_data=dev_ds,
                            validation_steps=len(dev_data) // args.batch_size)

        # Step4: Save model and training logs
        logger.info("***** Training Logs *****")
        for epoch in history.epoch:
            logger.info("Epoch %d", epoch)
            logger.info("train_loss:%f train_acc:%f val_loss:%f val_acc:%f",
                        history.history.get("loss")[epoch],
                        history.history.get("acc")[epoch],
                        history.history.get("val_loss")[epoch],
                        history.history.get("val_acc")[epoch])
        # time_stamp = datetime.datetime.now().strftime('%m-%d_%H-%M-%S')
        # path = './checkpoints/{}_{}.h5'.format(model_name, time_stamp)
        # model.save(path)

        model = load_model('./checkpoints/best_{}.h5'.format(model_name))
        y_pred = model.predict(test_ds)
        y_true = test_data["label"].values.reshape((-1, 1))
        y_pred[y_pred >= 0.5] = 1
        y_pred[y_pred < 0.5] = 0
        acc = accuracy_score(y_true, y_pred)
        precision = precision_score(y_true, y_pred)
        recall = recall_score(y_true, y_pred)
        f1 = f1_score(y_true, y_pred)

        logger.info("***** Parameters *****")
        logger.info(" ModelName = %s", args.model_type)
        logger.info(" Add Features = %s", args.add_features)
        logger.info(" Embedding dims = %d", len(emd_matrix[0]))
        logger.info(" BatchSize = %d", args.batch_size)
        if "CNN" in args.model_type:
            logger.info(" kernel_sizes = %s", args.kernel_sizes)
            logger.info(" filters_nums = %s", args.filters_nums)
        elif "RNN" in args.model_type:
            logger.info(" hidden_units = %s", args.hidden_units)
            logger.info(" bi_direction = %s", args.bi_direction)
        logger.info(" dense_units = %s", args.dense_units)
        logger.info(" feature_shared = %s", args.feature_shared)

        logger.info("***** Testing Results *****")
        logger.info(" Acc = %f", acc)
        logger.info(" Precision = %f", precision)
        logger.info(" Recall = %f", recall)
        logger.info(" F1-score = %f", f1)
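# --- Illustration (not part of the original script) ---
# A minimal, self-contained tf.data sketch of the shuffle-before-batch point made
# in the note above; the toy dataset here is made up purely for the demo.
import tensorflow as tf

demo = tf.data.Dataset.range(8)
# Recommended: shuffle over the whole dataset first, then batch.
shuffled_then_batched = demo.shuffle(buffer_size=8).batch(4)
# Weaker randomization: batching first only reorders whole batches,
# the order of samples inside each batch stays fixed.
batched_then_shuffled = demo.batch(4).shuffle(buffer_size=2)

for batch in shuffled_then_batched:
    print(batch.numpy())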
# We use only ~1% data to fine-tune the model.
train, dev = SQuAD1()
raw_train = list(train)[:1024]
raw_dev = list(dev)[:128]
convert_to_arrow(raw_train, "train_arrow")
convert_to_arrow(raw_dev, "dev_arrow")

base_url = 'https://pytorch.s3.amazonaws.com/models/text/torchtext_bert_example/'
vocab_path = download_from_url(base_url + 'bert_vocab.txt')
data_module = QuestionAnswerDataModule(train_arrow_path='train_arrow',
                                       dev_arrow_path='dev_arrow',
                                       vocab_filepath=vocab_path,
                                       batch_size=BATCH_SIZE)

# Load pretrained model and generate task
# default parameters from the pretrained model
vocab_size, emsize, nhead, nhid, nlayers, dropout = 99230, 768, 12, 3072, 12, 0.2
pretrained_bert = BertModel(vocab_size, emsize, nhead, nhid, nlayers, dropout)
pretrained_model_path = download_from_url(base_url + 'ns_bert.pt')
pretrained_bert.load_state_dict(
    torch.load(pretrained_model_path, map_location='cpu'))
qa_model = QuestionAnswerModel(pretrained_bert)
task = QuestionAnswerTask(qa_model, LR)

trainer = Trainer(gpus=0, max_epochs=EPOCH,
                  progress_bar_refresh_rate=40,
                  fast_dev_run=True)
trainer.fit(task, data_module)
from model import mlm_loss, nsp_accuracy, mlm_accuracy

config = BertConfig(vocab_size=300,
                    hidden_size=128,
                    num_hidden_layers=1,
                    num_attention_heads=1,
                    intermediate_size=512,
                    hidden_act="gelu",
                    hidden_dropout_prob=0.1,
                    attention_probs_dropout_prob=0.1,
                    max_position_embeddings=512,
                    type_vocab_size=2,
                    initializer_range=0.02)
bert_model = BertModel(config, with_nsp=True, with_mlm=True, is_pretrain=True)

# if with_mlm=True, inputs will be (input_ids, input_mask, token_type_ids, masked_lm_positions)
input_ids = tf.keras.Input(shape=(512,), dtype=tf.int32)
input_mask = tf.keras.Input(shape=(512,), dtype=tf.int32)
token_type_ids = tf.keras.Input(shape=(512,), dtype=tf.int32)
# masked_lm_positions = [batch_size, MAX_PREDICTIONS_PER_SEQ]
masked_lm_positions = tf.keras.Input(shape=(2,), dtype=tf.int32)
inputs = (input_ids, input_mask, token_type_ids, masked_lm_positions)
bert_model(inputs)
# bert_model = tf.keras.Model(inputs, outputs)
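# --- Illustration (not part of the original snippet) ---
# A minimal sketch of the commented-out last line above: wrapping the symbolic call
# into a tf.keras.Model. It assumes bert_model(inputs) returns the pretraining
# outputs (e.g. the NSP/MLM heads); the exact return value depends on this repo's
# BertModel implementation.
outputs = bert_model(inputs)
keras_model = tf.keras.Model(inputs=inputs, outputs=outputs)
keras_model.summary()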
def model_init(app):
    ArgsSet = type('ArgsSet', (object,), {})
    client = ArgsSet()
    parser = ArgumentParser()
    parser.add_argument("--model-config", type=str, default="openai-gpt",
                        help="Path, url or short name of the model")
    parser.add_argument("--device", type=str,
                        default="cuda" if torch.cuda.is_available() else "cpu",
                        help="Device (cuda or cpu)")
    parser.add_argument("--outlens", type=int, default=30)
    parser.add_argument("--beam", type=int, default=1)
    parser.add_argument("--fuse-checkpoints", type=str)
    parser.add_argument("--gpt-checkpoints", type=str)
    parser.add_argument("--qa-style-checkpoints", type=str)
    parser.add_argument("--multi-task", type=str)
    parser.add_argument("--split-sentence-with-task-embedding-checkpoints", type=str)
    parser.add_argument("--special-cls-checkpoints", type=str)
    parser.add_argument("--port", type=int, default=8866)
    args = parser.parse_args()
    args.load_model = True
    args.fp32_embedding = False
    args.fp32_layernorm = False
    args.fp32_tokentypes = False
    args.layernorm_epsilon = 1e-12

    fuse_model = BertModel(None, args)
    state_dict = convert_model(torch.load(args.fuse_checkpoints)['sd'])
    fuse_model.load_state_dict(state_dict)
    fuse_model.to(args.device)
    fuse_model.eval()
    print("| Load model from {}".format(args.fuse_checkpoints))

    gpt = BertModel(None, args)
    state_dict = convert_model(torch.load(args.gpt_checkpoints)['sd'])
    gpt.load_state_dict(state_dict)
    gpt.to(args.device)
    gpt.eval()
    tokenizer = BertWordPieceTokenizer("bert-base-chinese", cache_dir="temp_cache_dir")
    print(" Load model from {}".format(args.gpt_checkpoints))

    # Load bert checkpoints
    args.load_model = False
    args.fp32_embedding = False
    args.fp32_layernorm = False
    args.fp32_tokentypes = False
    args.layernorm_epsilon = 1e-12
    bert = BertModel(None, args)
    bert.to(args.device)
    bert.eval()

    client.tokenizer = tokenizer
    client.fuse_model = fuse_model
    client.fuse_beam = SequenceGenerator(fuse_model, tokenizer,
                                         beam_size=args.beam,
                                         max_lens=args.outlens)
    client.gpt = gpt
    client.gpt_beam = SequenceGenerator(gpt, tokenizer,
                                        beam_size=args.beam,
                                        max_lens=args.outlens)
    client.bert = bert
    client.device = args.device
    client.port = args.port
    client.generator = sample_sequence

    # multi task model
    multi_task = BertModel(None, args)
    state_dict = convert_model(torch.load(args.multi_task)['sd'])
    print("| Load model from {}".format(args.multi_task))
    multi_task.load_state_dict(state_dict)
    multi_task.to(args.device)
    multi_task.eval()
    client.multi_task_model = multi_task
    client.multi_task_beam = SequenceGenerator(multi_task, tokenizer,
                                               beam_size=args.beam,
                                               max_lens=args.outlens)

    # qa style model
    qa_style = BertModel(None, args)
    state_dict = convert_model(torch.load(args.qa_style_checkpoints)['sd'])
    qa_style.load_state_dict(state_dict)
    qa_style.to(args.device)
    qa_style.eval()
    print(" Load model from {}".format(args.qa_style_checkpoints))
    client.qa_task_model = qa_style

    # special cls tokens
    special_cls_model = BertModel(None, args)
    special_cls_model.eval()
    state_dict = convert_model(torch.load(args.special_cls_checkpoints)['sd'])
    special_cls_model.load_state_dict(state_dict)
    special_cls_model.to(args.device)
    special_cls_model.eval()
    print(" Load model from {}".format(args.special_cls_checkpoints))
    client.special_cls_model = special_cls_model
    client.special_beam = SequenceGenerator(special_cls_model, tokenizer,
                                            beam_size=args.beam,
                                            max_lens=args.outlens)

    # split sentence model with task embedding
    split_sentence_model = BertModel(None, args)
    split_sentence_model.eval()
    state_dict = convert_model(
        torch.load(args.split_sentence_with_task_embedding_checkpoints)['sd'])
    split_sentence_model.load_state_dict(state_dict)
    split_sentence_model.to(args.device)
    split_sentence_model.eval()
    print(" Load model from {}".format(
        args.split_sentence_with_task_embedding_checkpoints))
    client.split_sentence_model = split_sentence_model
    client.split_sentence_beam = SequenceGenerator(split_sentence_model, tokenizer,
                                                   beam_size=args.beam,
                                                   max_lens=args.outlens)

    return client
@contact: [email protected]
"""
import re

import torch
import sentencepiece as spm

from model import BertModel

sp = spm.SentencePieceProcessor()
sp.load('resource/sentencepiece.unigram.35000.model')
vocab_size = sp.get_piece_size()
n_embedding = 512
n_layer = 8

model = BertModel(vocab_size, n_embedding, n_layer)
model.eval()
model.load_state_dict(torch.load('resource/model.{}.{}.th'.format(n_embedding, n_layer),
                                 map_location='cpu'))

# you should enable cuda if it is available
# model.cuda()
# if you are using a GPU that has tensor cores (nvidia volta, Turing architecture), you can enable half precision
# inference and training; we recommend the official nvidia apex to keep everything as clean as possible
from apex import amp
[model] = amp.initialize([model], opt_level="O2")

device = model.embedding.weight.data.device


def clean_text(txt):
    txt = txt.lower()
            result[0], result[1], result[2], result[3], result[4], result[5]
        summary_writer.add_summary(summary, global_step)
        _info('loss : {}\t lr : {}'.format(loss_bs, learning_rate),
              head='Step: {}'.format(global_step))
        if global_step % 100 == 0:
            train_model.saver.save(
                train_sess,
                os.path.join(PROJECT_PATH, config.ckpt_path),
                global_step=global_step)


def infer(config):
    # build graph
    infer_graph = tf.Graph()
    with infer_graph.as_default():
        infer_model = BertModel(config=config, is_training=False)

    # create session
    sess_conf = tf.ConfigProto(intra_op_parallelism_threads=8,
                               inter_op_parallelism_threads=8)
    sess_conf.gpu_options.allow_growth = True
    infer_sess = tf.Session(config=sess_conf, graph=infer_graph)

    # restore model from the latest checkpoint
    with infer_graph.as_default():
        loaded_model, global_step = _mh.create_or_load(
            infer_model, os.path.join(PROJECT_PATH, config.ckpt_path), infer_sess)

    # the following is just for test
    """
    import numpy as np
    input_ids = np.array([[10, 128, 10, 0, 120], [20, 3, 0, 0, 30]], dtype=np.int32)
def main(seed):
    # tasks = ['Ames', 'BBB', 'FDAMDD', 'H_HT', 'Pgp_inh', 'Pgp_sub']
    # os.environ['CUDA_VISIBLE_DEVICES'] = "1"
    # tasks = ['BBB', 'FDAMDD', 'Pgp_sub']
    task = 'FDAMDD'
    print(task)

    small = {'name': 'Small', 'num_layers': 3, 'num_heads': 2, 'd_model': 128,
             'path': 'small_weights', 'addH': True}
    medium = {'name': 'Medium', 'num_layers': 6, 'num_heads': 8, 'd_model': 256,
              'path': 'medium_weights', 'addH': True}
    large = {'name': 'Large', 'num_layers': 12, 'num_heads': 12, 'd_model': 512,
             'path': 'large_weights', 'addH': True}

    arch = medium  ## small 3 4 128   medium: 6 6 256   large: 12 8 516
    pretraining = True
    pretraining_str = 'pretraining' if pretraining else ''
    trained_epoch = 10

    num_layers = arch['num_layers']
    num_heads = arch['num_heads']
    d_model = arch['d_model']
    addH = arch['addH']
    dff = d_model * 2
    vocab_size = 17
    dropout_rate = 0.1

    seed = seed
    np.random.seed(seed=seed)
    tf.random.set_seed(seed=seed)

    train_dataset, test_dataset, val_dataset = Graph_Classification_Dataset(
        'data/clf/{}.csv'.format(task), smiles_field='SMILES',
        label_field='Label', addH=True).get_data()

    x, adjoin_matrix, y = next(iter(train_dataset.take(1)))
    seq = tf.cast(tf.math.equal(x, 0), tf.float32)
    mask = seq[:, tf.newaxis, tf.newaxis, :]

    model = PredictModel(num_layers=num_layers, d_model=d_model, dff=dff,
                         num_heads=num_heads, vocab_size=vocab_size,
                         dense_dropout=0.5)

    if pretraining:
        temp = BertModel(num_layers=num_layers, d_model=d_model, dff=dff,
                         num_heads=num_heads, vocab_size=vocab_size)
        pred = temp(x, mask=mask, training=True, adjoin_matrix=adjoin_matrix)
        temp.load_weights(arch['path'] + '/bert_weights{}_{}.h5'.format(arch['name'], trained_epoch))
        temp.encoder.save_weights(arch['path'] + '/bert_weights_encoder{}_{}.h5'.format(arch['name'], trained_epoch))
        del temp

        pred = model(x, mask=mask, training=True, adjoin_matrix=adjoin_matrix)
        model.encoder.load_weights(arch['path'] + '/bert_weights_encoder{}_{}.h5'.format(arch['name'], trained_epoch))
        print('load_weights')

    optimizer = tf.keras.optimizers.Adam(learning_rate=5e-5)

    auc = -10
    stopping_monitor = 0
    for epoch in range(100):
        accuracy_object = tf.keras.metrics.BinaryAccuracy()
        loss_object = tf.keras.losses.BinaryCrossentropy(from_logits=True)
        for x, adjoin_matrix, y in train_dataset:
            with tf.GradientTape() as tape:
                seq = tf.cast(tf.math.equal(x, 0), tf.float32)
                mask = seq[:, tf.newaxis, tf.newaxis, :]
                preds = model(x, mask=mask, training=True, adjoin_matrix=adjoin_matrix)
                loss = loss_object(y, preds)
                grads = tape.gradient(loss, model.trainable_variables)
                optimizer.apply_gradients(zip(grads, model.trainable_variables))
                accuracy_object.update_state(y, preds)
        print('epoch: ', epoch,
              'loss: {:.4f}'.format(loss.numpy().item()),
              'accuracy: {:.4f}'.format(accuracy_object.result().numpy().item()))

        y_true = []
        y_preds = []
        for x, adjoin_matrix, y in val_dataset:
            seq = tf.cast(tf.math.equal(x, 0), tf.float32)
            mask = seq[:, tf.newaxis, tf.newaxis, :]
            preds = model(x, mask=mask, adjoin_matrix=adjoin_matrix, training=False)
            y_true.append(y.numpy())
            y_preds.append(preds.numpy())
        y_true = np.concatenate(y_true, axis=0).reshape(-1)
        y_preds = np.concatenate(y_preds, axis=0).reshape(-1)
        y_preds = tf.sigmoid(y_preds).numpy()
        auc_new = roc_auc_score(y_true, y_preds)
        val_accuracy = keras.metrics.binary_accuracy(y_true.reshape(-1), y_preds.reshape(-1)).numpy()
        print('val auc:{:.4f}'.format(auc_new), 'val accuracy:{:.4f}'.format(val_accuracy))

        if auc_new > auc:
            auc = auc_new
            stopping_monitor = 0
            np.save('{}/{}{}{}{}{}'.format(arch['path'], task, seed, arch['name'],
                                           trained_epoch, pretraining_str),
                    [y_true, y_preds])
            model.save_weights('classification_weights/{}_{}.h5'.format(task, seed))
            print('save model weights')
        else:
            stopping_monitor += 1
        print('best val auc: {:.4f}'.format(auc))
        if stopping_monitor > 0:
            print('stopping_monitor:', stopping_monitor)
        if stopping_monitor > 20:
            break

    y_true = []
    y_preds = []
    model.load_weights('classification_weights/{}_{}.h5'.format(task, seed))
    for x, adjoin_matrix, y in test_dataset:
        seq = tf.cast(tf.math.equal(x, 0), tf.float32)
        mask = seq[:, tf.newaxis, tf.newaxis, :]
        preds = model(x, mask=mask, adjoin_matrix=adjoin_matrix, training=False)
        y_true.append(y.numpy())
        y_preds.append(preds.numpy())
    y_true = np.concatenate(y_true, axis=0).reshape(-1)
    y_preds = np.concatenate(y_preds, axis=0).reshape(-1)
    y_preds = tf.sigmoid(y_preds).numpy()
    test_auc = roc_auc_score(y_true, y_preds)
    test_accuracy = keras.metrics.binary_accuracy(y_true.reshape(-1), y_preds.reshape(-1)).numpy()
    print('test auc:{:.4f}'.format(test_auc), 'test accuracy:{:.4f}'.format(test_accuracy))

    return test_auc
def main(seed):
    # tasks = ['caco2', 'logD', 'logS', 'PPB', 'tox']
    # os.environ['CUDA_VISIBLE_DEVICES'] = "1"
    keras.backend.clear_session()
    os.environ['CUDA_VISIBLE_DEVICES'] = "0"

    small = {'name': 'Small', 'num_layers': 3, 'num_heads': 4, 'd_model': 128,
             'path': 'small_weights', 'addH': True}
    medium = {'name': 'Medium', 'num_layers': 6, 'num_heads': 8, 'd_model': 256,
              'path': 'medium_weights', 'addH': True}
    medium2 = {'name': 'Medium', 'num_layers': 6, 'num_heads': 8, 'd_model': 256,
               'path': 'medium_weights2', 'addH': True}
    large = {'name': 'Large', 'num_layers': 12, 'num_heads': 12, 'd_model': 576,
             'path': 'large_weights', 'addH': True}
    medium_without_H = {'name': 'Medium', 'num_layers': 6, 'num_heads': 8, 'd_model': 256,
                        'path': 'weights_without_H', 'addH': False}
    medium_without_pretrain = {'name': 'Medium', 'num_layers': 6, 'num_heads': 8, 'd_model': 256,
                               'path': 'medium_without_pretraining_weights', 'addH': True}

    arch = medium  ## small 3 4 128   medium: 6 6 256   large: 12 8 516
    pretraining = True
    pretraining_str = 'pretraining' if pretraining else ''
    trained_epoch = 10
    task = 'PPB'
    print(task)
    seed = seed

    num_layers = arch['num_layers']
    num_heads = arch['num_heads']
    d_model = arch['d_model']
    addH = arch['addH']
    dff = d_model * 2
    vocab_size = 17
    dropout_rate = 0.1

    tf.random.set_seed(seed=seed)
    graph_dataset = Graph_Regression_Dataset('data/reg/{}.txt'.format(task),
                                             smiles_field='SMILES',
                                             label_field='Label', addH=addH)
    train_dataset, test_dataset, val_dataset = graph_dataset.get_data()
    value_range = graph_dataset.value_range()

    x, adjoin_matrix, y = next(iter(train_dataset.take(1)))
    seq = tf.cast(tf.math.equal(x, 0), tf.float32)
    mask = seq[:, tf.newaxis, tf.newaxis, :]

    model = PredictModel(num_layers=num_layers, d_model=d_model, dff=dff,
                         num_heads=num_heads, vocab_size=vocab_size,
                         dense_dropout=0.15)

    if pretraining:
        temp = BertModel(num_layers=num_layers, d_model=d_model, dff=dff,
                         num_heads=num_heads, vocab_size=vocab_size)
        pred = temp(x, mask=mask, training=True, adjoin_matrix=adjoin_matrix)
        temp.load_weights(arch['path'] + '/bert_weights{}_{}.h5'.format(arch['name'], trained_epoch))
        temp.encoder.save_weights(arch['path'] + '/bert_weights_encoder{}_{}.h5'.format(arch['name'], trained_epoch))
        del temp

        pred = model(x, mask=mask, training=True, adjoin_matrix=adjoin_matrix)
        model.encoder.load_weights(arch['path'] + '/bert_weights_encoder{}_{}.h5'.format(arch['name'], trained_epoch))
        print('load_weights')

    class CustomSchedule(tf.keras.optimizers.schedules.LearningRateSchedule):
        def __init__(self, d_model, total_steps=4000):
            super(CustomSchedule, self).__init__()
            self.d_model = d_model
            self.d_model = tf.cast(self.d_model, tf.float32)
            self.total_step = total_steps
            self.warmup_steps = total_steps * 0.10

        def __call__(self, step):
            arg1 = step / self.warmup_steps
            arg2 = 1 - (step - self.warmup_steps) / (self.total_step - self.warmup_steps)
            return 10e-5 * tf.math.minimum(arg1, arg2)

    steps_per_epoch = len(train_dataset)
    learning_rate = CustomSchedule(128, 100 * steps_per_epoch)
    optimizer = tf.keras.optimizers.Adam(learning_rate=10e-5)

    r2 = -10
    stopping_monitor = 0
    for epoch in range(100):
        mse_object = tf.keras.metrics.MeanSquaredError()
        for x, adjoin_matrix, y in train_dataset:
            with tf.GradientTape() as tape:
                seq = tf.cast(tf.math.equal(x, 0), tf.float32)
                mask = seq[:, tf.newaxis, tf.newaxis, :]
                preds = model(x, mask=mask, training=True, adjoin_matrix=adjoin_matrix)
                loss = tf.reduce_mean(tf.square(y - preds))
                grads = tape.gradient(loss, model.trainable_variables)
                optimizer.apply_gradients(zip(grads, model.trainable_variables))
                mse_object.update_state(y, preds)
        print('epoch: ', epoch,
              'loss: {:.4f}'.format(loss.numpy().item()),
              'mse: {:.4f}'.format(mse_object.result().numpy().item() * (value_range**2)))

        y_true = []
        y_preds = []
        for x, adjoin_matrix, y in val_dataset:
            seq = tf.cast(tf.math.equal(x, 0), tf.float32)
            mask = seq[:, tf.newaxis, tf.newaxis, :]
            preds = model(x, mask=mask, adjoin_matrix=adjoin_matrix, training=False)
            y_true.append(y.numpy())
            y_preds.append(preds.numpy())
        y_true = np.concatenate(y_true, axis=0).reshape(-1)
        y_preds = np.concatenate(y_preds, axis=0).reshape(-1)
        r2_new = r2_score(y_true, y_preds)
        val_mse = keras.metrics.MSE(y_true, y_preds).numpy() * (value_range**2)
        print('val r2: {:.4f}'.format(r2_new), 'val mse:{:.4f}'.format(val_mse))

        if r2_new > r2:
            r2 = r2_new
            stopping_monitor = 0
            np.save('{}/{}{}{}{}{}'.format(arch['path'], task, seed, arch['name'],
                                           trained_epoch, pretraining_str),
                    [y_true, y_preds])
            model.save_weights('regression_weights/{}.h5'.format(task))
        else:
            stopping_monitor += 1
        print('best r2: {:.4f}'.format(r2))
        if stopping_monitor > 0:
            print('stopping_monitor:', stopping_monitor)
        if stopping_monitor > 20:
            break

    y_true = []
    y_preds = []
    model.load_weights('regression_weights/{}.h5'.format(task))
    for x, adjoin_matrix, y in test_dataset:
        seq = tf.cast(tf.math.equal(x, 0), tf.float32)
        mask = seq[:, tf.newaxis, tf.newaxis, :]
        preds = model(x, mask=mask, adjoin_matrix=adjoin_matrix, training=False)
        y_true.append(y.numpy())
        y_preds.append(preds.numpy())
    y_true = np.concatenate(y_true, axis=0).reshape(-1)
    y_preds = np.concatenate(y_preds, axis=0).reshape(-1)
    test_r2 = r2_score(y_true, y_preds)
    test_mse = keras.metrics.MSE(y_true.reshape(-1), y_preds.reshape(-1)).numpy() * (value_range**2)
    print('test r2:{:.4f}'.format(test_r2), 'test mse:{:.4f}'.format(test_mse))

    return r2
    except:
        train_dataset, dev_dataset = SQuAD1()
        old_vocab = train_dataset.vocab
        vocab = torchtext.legacy.vocab.Vocab(counter=old_vocab.freqs,
                                             specials=['<unk>', '<pad>', '<MASK>'])
        with open(args.save_vocab, 'wb') as f:
            torch.save(vocab, f)

    pad_id = vocab.stoi['<pad>']
    sep_id = vocab.stoi['<sep>']
    cls_id = vocab.stoi['<cls>']
    train_dataset, dev_dataset = SQuAD1(vocab=vocab)
    train_dataset = process_raw_data(train_dataset)
    dev_dataset = process_raw_data(dev_dataset)
    device = torch.device("cuda" if torch.cuda.is_available() else "cpu")

    embed_layer = BertEmbedding(len(vocab), args.emsize)
    pretrained_bert = BertModel(len(vocab), args.emsize, args.nhead, args.nhid,
                                args.nlayers, embed_layer, args.dropout)
    pretrained_bert.load_state_dict(torch.load(args.bert_model))
    model = QuestionAnswerTask(pretrained_bert).to(device)

    criterion = nn.CrossEntropyLoss()
    optimizer = torch.optim.SGD(model.parameters(), lr=args.lr)
    scheduler = torch.optim.lr_scheduler.StepLR(optimizer, 1.0, gamma=0.1)
    best_f1 = None
    train_loss_log, val_loss_log = [], []

    for epoch in range(1, args.epochs + 1):
        epoch_start_time = time.time()
        train()
        val_loss, val_exact, val_f1 = evaluate(dev_dataset, vocab)
        val_loss_log.append(val_loss)
        print('-' * 89)
        print('| end of epoch {:3d} | time: {:5.2f}s | valid loss {:5.2f} | '
def main(seed):
    # tasks = ['Ames', 'BBB', 'FDAMDD', 'H_HT', 'Pgp_inh', 'Pgp_sub']
    # os.environ['CUDA_VISIBLE_DEVICES'] = "1"
    # tasks = ['H_HT', 'Pgp_inh', 'Pgp_sub']
    task = 'Ames'
    print(task)

    medium2 = {'name': 'Medium', 'num_layers': 6, 'num_heads': 8, 'd_model': 256,
               'path': 'medium_weights2', 'addH': True}
    small = {'name': 'Small', 'num_layers': 3, 'num_heads': 4, 'd_model': 128,
             'path': 'small_weights', 'addH': True}
    medium = {'name': 'Medium', 'num_layers': 6, 'num_heads': 8, 'd_model': 256,
              'path': 'medium_weights', 'addH': True}
    large = {'name': 'Large', 'num_layers': 12, 'num_heads': 12, 'd_model': 516,
             'path': 'large_weights', 'addH': True}
    medium_without_H = {'name': 'Medium', 'num_layers': 6, 'num_heads': 8, 'd_model': 256,
                        'path': 'weights_without_H', 'addH': False}
    medium_balanced = {'name': 'Medium', 'num_layers': 6, 'num_heads': 8, 'd_model': 256,
                       'path': 'weights_balanced', 'addH': True}
    medium_without_pretrain = {'name': 'Medium', 'num_layers': 6, 'num_heads': 8, 'd_model': 256,
                               'path': 'medium_without_pretraining_weights', 'addH': True}

    arch = medium  ## small 3 4 128   medium: 6 6 256   large: 12 8 516
    pretraining = True
    pretraining_str = 'pretraining' if pretraining else ''
    trained_epoch = 6

    num_layers = arch['num_layers']
    num_heads = arch['num_heads']
    d_model = arch['d_model']
    addH = arch['addH']
    dff = d_model * 2
    vocab_size = 17
    dropout_rate = 0.1

    seed = seed
    np.random.seed(seed=seed)
    tf.random.set_seed(seed=seed)

    train_dataset1, test_dataset1, val_dataset1 = Graph_Classification_Dataset(
        'data/clf/Ames.txt', smiles_field='SMILES', label_field='Label', addH=addH).get_data()
    train_dataset2, test_dataset2, val_dataset2 = Graph_Classification_Dataset(
        'data/clf/BBB.txt', smiles_field='SMILES', label_field='Label', addH=addH).get_data()
    train_dataset3, test_dataset3, val_dataset3 = Graph_Classification_Dataset(
        'data/clf/FDAMDD.txt', smiles_field='SMILES', label_field='Label', addH=addH).get_data()
    train_dataset4, test_dataset4, val_dataset4 = Graph_Classification_Dataset(
        'data/clf/H_HT.txt', smiles_field='SMILES', label_field='Label', addH=addH).get_data()
    train_dataset5, test_dataset5, val_dataset5 = Graph_Classification_Dataset(
        'data/clf/Pgp_inh.txt', smiles_field='SMILES', label_field='Label', addH=addH).get_data()
    train_dataset6, test_dataset6, val_dataset6 = Graph_Classification_Dataset(
        'data/clf/Pgp_sub.txt', smiles_field='SMILES', label_field='Label', addH=addH).get_data()

    x, adjoin_matrix, y = next(iter(train_dataset1.take(1)))
    seq = tf.cast(tf.math.equal(x, 0), tf.float32)
    mask = seq[:, tf.newaxis, tf.newaxis, :]

    model = PredictModel(num_layers=num_layers, d_model=d_model, dff=dff,
                         num_heads=num_heads, vocab_size=vocab_size,
                         dense_dropout=0.2)

    if pretraining:
        temp = BertModel(num_layers=num_layers, d_model=d_model, dff=dff,
                         num_heads=num_heads, vocab_size=vocab_size)
        pred = temp(x, mask=mask, training=True, adjoin_matrix=adjoin_matrix)
        temp.load_weights(arch['path'] + '/bert_weights{}_{}.h5'.format(arch['name'], trained_epoch))
        temp.encoder.save_weights(arch['path'] + '/bert_weights_encoder{}_{}.h5'.format(arch['name'], trained_epoch))
        del temp

        pred = model(x, mask=mask, training=True, adjoin_matrix=adjoin_matrix)
        model.encoder.load_weights(arch['path'] + '/bert_weights_encoder{}_{}.h5'.format(arch['name'], trained_epoch))
        print('load_weights')

    class CustomSchedule(tf.keras.optimizers.schedules.LearningRateSchedule):
        def __init__(self, d_model, total_steps=4000):
            super(CustomSchedule, self).__init__()
            self.d_model = d_model
            self.d_model = tf.cast(self.d_model, tf.float32)
            self.total_step = total_steps
            self.warmup_steps = total_steps * 0.06

        def __call__(self, step):
            arg1 = step / self.warmup_steps
            arg2 = 1 - (step - self.warmup_steps) / (self.total_step - self.warmup_steps)
            return 5e-5 * tf.math.minimum(arg1, arg2)

    steps_per_epoch = len(train_dataset1)
    learning_rate = CustomSchedule(128, 100 * steps_per_epoch)
    optimizer = tf.keras.optimizers.Adam(learning_rate=10e-5)

    auc = 0
    stopping_monitor = 0
    for epoch in range(100):
        loss_object = tf.keras.losses.BinaryCrossentropy(from_logits=True)
        for x1, adjoin_matrix1, y1 in train_dataset1:
            x2, adjoin_matrix2, y2 = next(iter(train_dataset2))
            x3, adjoin_matrix3, y3 = next(iter(train_dataset3))
            x4, adjoin_matrix4, y4 = next(iter(train_dataset4))
            x5, adjoin_matrix5, y5 = next(iter(train_dataset5))
            x6, adjoin_matrix6, y6 = next(iter(train_dataset6))
            with tf.GradientTape() as tape:
                seq1 = tf.cast(tf.math.equal(x1, 0), tf.float32)
                mask1 = seq1[:, tf.newaxis, tf.newaxis, :]
                preds1 = model(x1, mask=mask1, training=True, adjoin_matrix=adjoin_matrix1)
                # s1 = model.s[0]
                # s2 = model.s[1]
                # s3 = model.s[2]
                # s4 = model.s[3]
                # s5 = model.s[4]
                # s6 = model.s[5]
                loss1 = loss_object(y1, preds1[:, 0]) * 10

                seq2 = tf.cast(tf.math.equal(x2, 0), tf.float32)
                mask2 = seq2[:, tf.newaxis, tf.newaxis, :]
                preds2 = model(x2, mask=mask2, training=True, adjoin_matrix=adjoin_matrix2)
                loss2 = loss_object(y2, preds2[:, 1])

                seq3 = tf.cast(tf.math.equal(x3, 0), tf.float32)
                mask3 = seq3[:, tf.newaxis, tf.newaxis, :]
                preds3 = model(x3, mask=mask3, training=True, adjoin_matrix=adjoin_matrix3)
                loss3 = loss_object(y3, preds3[:, 2])

                seq4 = tf.cast(tf.math.equal(x4, 0), tf.float32)
                mask4 = seq4[:, tf.newaxis, tf.newaxis, :]
                preds4 = model(x4, mask=mask4, training=True, adjoin_matrix=adjoin_matrix4)
                loss4 = loss_object(y4, preds4[:, 3])

                seq5 = tf.cast(tf.math.equal(x5, 0), tf.float32)
                mask5 = seq5[:, tf.newaxis, tf.newaxis, :]
                preds5 = model(x5, mask=mask5, training=True, adjoin_matrix=adjoin_matrix5)
                loss5 = loss_object(y5, preds5[:, 4])

                seq6 = tf.cast(tf.math.equal(x6, 0), tf.float32)
                mask6 = seq6[:, tf.newaxis, tf.newaxis, :]
                preds6 = model(x6, mask=mask6, training=True, adjoin_matrix=adjoin_matrix6)
                loss6 = loss_object(y6, preds6[:, 5])

                loss = loss1 + loss2 + loss3 + loss4 + loss5 + loss6
                grads = tape.gradient(loss, model.trainable_variables)
                optimizer.apply_gradients(zip(grads, model.trainable_variables))
        print('epoch: ', epoch, 'loss: {:.4f}'.format(loss.numpy().item()))

        y_true = []
        y_preds = []
        for x, adjoin_matrix, y in test_dataset1:
            seq = tf.cast(tf.math.equal(x, 0), tf.float32)
            mask = seq[:, tf.newaxis, tf.newaxis, :]
            preds = model(x, mask=mask, adjoin_matrix=adjoin_matrix, training=False)
            y_true.append(y.numpy())
            y_preds.append(preds[:, 0].numpy())
        y_true = np.concatenate(y_true, axis=0).reshape(-1)
        y_preds = np.concatenate(y_preds, axis=0).reshape(-1)
        y_preds = tf.sigmoid(y_preds).numpy()
        auc_new = roc_auc_score(y_true, y_preds)
        test_accuracy = keras.metrics.binary_accuracy(y_true, y_preds).numpy()
        print('test auc :{:.4f}'.format(auc_new), 'test accuracy:{:.4f}'.format(test_accuracy))

        y_true = []
        y_preds = []
        for x, adjoin_matrix, y in test_dataset2:
            seq = tf.cast(tf.math.equal(x, 0), tf.float32)
            mask = seq[:, tf.newaxis, tf.newaxis, :]
            preds = model(x, mask=mask, adjoin_matrix=adjoin_matrix, training=False)
            y_true.append(y.numpy())
            y_preds.append(preds[:, 1].numpy())
        y_true = np.concatenate(y_true, axis=0).reshape(-1)
        y_preds = np.concatenate(y_preds, axis=0).reshape(-1)
        y_preds = tf.sigmoid(y_preds).numpy()
        auc_new = roc_auc_score(y_true, y_preds)
        test_accuracy = keras.metrics.binary_accuracy(y_true, y_preds).numpy()
        print('test auc:{:.4f}'.format(auc_new), 'test accuracy:{:.4f}'.format(test_accuracy))

        y_true = []
        y_preds = []
        for x, adjoin_matrix, y in test_dataset3:
            seq = tf.cast(tf.math.equal(x, 0), tf.float32)
            mask = seq[:, tf.newaxis, tf.newaxis, :]
            preds = model(x, mask=mask, adjoin_matrix=adjoin_matrix, training=False)
            y_true.append(y.numpy())
            y_preds.append(preds[:, 2].numpy())
        y_true = np.concatenate(y_true, axis=0).reshape(-1)
        y_preds = np.concatenate(y_preds, axis=0).reshape(-1)
        y_preds = tf.sigmoid(y_preds).numpy()
        auc_new = roc_auc_score(y_true, y_preds)
        test_accuracy = keras.metrics.binary_accuracy(y_true, y_preds).numpy()
        print('test auc :{:.4f}'.format(auc_new), 'test accuracy:{:.4f}'.format(test_accuracy))

        y_true = []
        y_preds = []
        for x, adjoin_matrix, y in test_dataset4:
            seq = tf.cast(tf.math.equal(x, 0), tf.float32)
            mask = seq[:, tf.newaxis, tf.newaxis, :]
            preds = model(x, mask=mask, adjoin_matrix=adjoin_matrix, training=False)
            y_true.append(y.numpy())
            y_preds.append(preds[:, 3].numpy())
        y_true = np.concatenate(y_true, axis=0).reshape(-1)
        y_preds = np.concatenate(y_preds, axis=0).reshape(-1)
        y_preds = tf.sigmoid(y_preds).numpy()
        auc_new = roc_auc_score(y_true, y_preds)
        test_accuracy = keras.metrics.binary_accuracy(y_true, y_preds).numpy()
        print('test auc :{:.4f}'.format(auc_new), 'test accuracy:{:.4f}'.format(test_accuracy))

        y_true = []
        y_preds = []
        for x, adjoin_matrix, y in test_dataset5:
            seq = tf.cast(tf.math.equal(x, 0), tf.float32)
            mask = seq[:, tf.newaxis, tf.newaxis, :]
            preds = model(x, mask=mask, adjoin_matrix=adjoin_matrix, training=False)
            y_true.append(y.numpy())
            y_preds.append(preds[:, 4].numpy())
        y_true = np.concatenate(y_true, axis=0).reshape(-1)
        y_preds = np.concatenate(y_preds, axis=0).reshape(-1)
        y_preds = tf.sigmoid(y_preds).numpy()
        auc_new = roc_auc_score(y_true, y_preds)
        test_accuracy = keras.metrics.binary_accuracy(y_true, y_preds).numpy()
        print('test auc :{:.4f}'.format(auc_new), 'test accuracy:{:.4f}'.format(test_accuracy))

        y_true = []
        y_preds = []
        for x, adjoin_matrix, y in test_dataset6:
            seq = tf.cast(tf.math.equal(x, 0), tf.float32)
            mask = seq[:, tf.newaxis, tf.newaxis, :]
            preds = model(x, mask=mask, adjoin_matrix=adjoin_matrix, training=False)
            y_true.append(y.numpy())
            y_preds.append(preds[:, 5].numpy())
        y_true = np.concatenate(y_true, axis=0).reshape(-1)
        y_preds = np.concatenate(y_preds, axis=0).reshape(-1)
        y_preds = tf.sigmoid(y_preds).numpy()
        auc_new = roc_auc_score(y_true, y_preds)
        test_accuracy = keras.metrics.binary_accuracy(y_true, y_preds).numpy()
        print('test auc :{:.4f}'.format(auc_new), 'test accuracy:{:.4f}'.format(test_accuracy))

    return auc
large = {'name': 'Large', 'num_layers': 12, 'num_heads': 12, 'd_model': 576,
         'path': 'large_weights', 'addH': True}
medium_balanced = {'name': 'Medium', 'num_layers': 6, 'num_heads': 8, 'd_model': 256,
                   'path': 'weights_balanced', 'addH': True}
medium_without_H = {'name': 'Medium', 'num_layers': 6, 'num_heads': 8, 'd_model': 256,
                    'path': 'weights_without_H', 'addH': False}

arch = medium2  ## small 3 4 128   medium: 6 6 256   large: 12 8 516

num_layers = arch['num_layers']
num_heads = arch['num_heads']
d_model = arch['d_model']
addH = arch['addH']
dff = d_model * 2
vocab_size = 18
dropout_rate = 0.1

model = BertModel(num_layers=num_layers, d_model=d_model, dff=dff,
                  num_heads=num_heads, vocab_size=vocab_size)

train_dataset, test_dataset = Multi_Task_Graph_Bert_Dataset(
    path='data/chem.txt', smiles_field='CAN_SMILES', addH=addH).get_data()

train_step_signature = [
    tf.TensorSpec(shape=(None, None), dtype=tf.int64),
    tf.TensorSpec(shape=(None, None, None), dtype=tf.float32),
    tf.TensorSpec(shape=(None, None), dtype=tf.int64),
    tf.TensorSpec(shape=(None, None), dtype=tf.float32),
]

train_loss = tf.keras.metrics.Mean(name='train_loss')
train_accuracy = tf.keras.metrics.SparseCategoricalAccuracy(name='train_accuracy')
test_accuracy = tf.keras.metrics.SparseCategoricalAccuracy(name='test_accuracy')
loss_function = tf.keras.losses.SparseCategoricalCrossentropy(from_logits=True)
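# --- Illustration (not part of the original snippet) ---
# A hypothetical sketch of how the train_step_signature above is typically used:
# a tf.function-decorated training step whose arguments match the four TensorSpecs
# (token ids, adjacency matrix, masked labels, per-token weights). The BertModel
# call signature and the loss weighting are assumptions, mirroring how the model
# is called in the other snippets; the optimizer below is also assumed, since the
# original excerpt does not define one.
optimizer = tf.keras.optimizers.Adam(1e-4)  # assumed; not shown above


@tf.function(input_signature=train_step_signature)
def train_step(x, adjoin_matrix, y, char_weight):
    # Padding mask: positions equal to 0 are treated as padding.
    seq = tf.cast(tf.math.equal(x, 0), tf.float32)
    mask = seq[:, tf.newaxis, tf.newaxis, :]
    with tf.GradientTape() as tape:
        predictions = model(x, adjoin_matrix=adjoin_matrix, mask=mask, training=True)
        # Weight the per-token loss so only masked positions contribute.
        loss = loss_function(y, predictions, sample_weight=char_weight)
    gradients = tape.gradient(loss, model.trainable_variables)
    optimizer.apply_gradients(zip(gradients, model.trainable_variables))
    train_loss.update_state(loss)
    train_accuracy.update_state(y, predictions, sample_weight=char_weight)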
def model_fn(features, labels, mode, params):
    """this is prototype syntax, all parameters are necessary."""
    # obtain the data
    _info('*** Features ***')
    for name in sorted(features.keys()):
        tf.logging.info(" name = %s, shape = %s" % (name, features[name].shape))

    input_ids = features['input_ids']    # [batch_size, seq_length]
    input_mask = features['input_mask']  # [batch_size, seq_length]

    # if mode != tf.estimator.ModeKeys.PREDICT:
    #     # segment_idx = features['segment_dis']
    #     masked_lm_positions = features['masked_lm_positions']  # [batch_size, seq_length], specify the answer
    #     masked_lm_ids = features['masked_lm_ids']               # [batch_size, answer_seq_length], specify the answer labels
    #     masked_lm_weights = features['masked_lm_weights']       # [batch_size, seq_length], [1, 1, 0], 0 refers to the mask
    #     # next_sentence_labels = features['next_sentence_labels']
    # else:
    masked_lm_positions = features['masked_lm_positions']
    masked_lm_ids = features['masked_lm_ids']
    masked_lm_weights = features['masked_lm_weights']

    if bert_config.train_type == 'seq2seq':
        _info('Training seq2seq task.')
    elif bert_config.train_type == 'lm':
        _info('Training language model task.')

    # build model
    is_training = (mode == tf.estimator.ModeKeys.TRAIN)
    model = BertModel(config=bert_config,
                      is_training=is_training,
                      input_ids=input_ids,
                      input_mask=input_mask)

    # compute loss
    loss, per_loss, log_probs, logits = get_masked_lm_output(
        bert_config,
        model.get_sequence_output(),
        model.embedding_table,
        model.projection_table,
        masked_lm_positions,
        masked_lm_ids,
        masked_lm_weights,
        mode)

    if mode == tf.estimator.ModeKeys.PREDICT:
        masked_lm_predictions = tf.reshape(
            tf.argmax(log_probs, axis=-1, output_type=tf.int32), [-1])
        output_spec = tf.estimator.EstimatorSpec(mode, predictions=masked_lm_predictions)
    else:
        if mode == tf.estimator.ModeKeys.TRAIN:
            # Restore from the checkpoint. tf.estimator typically restores from the
            # model dir automatically; this is presumably here to restore some
            # pre-trained parameters.
            tvars = tf.trainable_variables()
            initialized_variable_names = {}
            if init_checkpoint:
                (assignment_map, initialized_variable_names
                 ) = get_assignment_map_from_checkpoint(tvars, init_checkpoint)
                tf.train.init_from_checkpoint(init_checkpoint, assignment_map)

            _info('*** Trainable Variables ***')
            for var in tvars:
                init_string = ''
                if var.name in initialized_variable_names:
                    init_string = ', *INIT_FROM_CKPT*'
                _info('name = {}, shape={}{}'.format(var.name, var.shape, init_string))

            train_op = optimization.create_optimizer(
                loss, bert_config.learning_rate, num_train_steps, bert_config.lr_limit)

            # learning_rate = tf.train.polynomial_decay(bert_config.learning_rate,
            #                                           tf.train.get_or_create_global_step(),
            #                                           num_train_steps,
            #                                           end_learning_rate=0.0,
            #                                           power=1.0,
            #                                           cycle=False)
            # optimizer = tf.train.GradientDescentOptimizer(learning_rate=learning_rate)
            # gradients = tf.gradients(loss, tvars, colocate_gradients_with_ops=True)
            # clipped_gradients, _ = tf.clip_by_global_norm(gradients, 5.0)
            # train_op = optimizer.apply_gradients(zip(clipped_gradients, tvars),
            #                                      global_step=tf.train.get_global_step())

            output_spec = tf.estimator.EstimatorSpec(mode, loss=loss, train_op=train_op)
        elif mode == tf.estimator.ModeKeys.EVAL:
            is_real_example = tf.ones(tf.shape(masked_lm_ids), dtype=tf.float32)

            def metric_fn(loss, label_ids, logits, is_real_example):
                """
                Args:
                    loss: tf.float32.
                    label_ids: [b, s].
                    logits: [b, s, v].
                """
                # [b * s, v]
                logits = tf.reshape(logits, [-1, logits.shape[-1]])
                # [b * s, 1]
                predictions = tf.argmax(logits, axis=-1, output_type=tf.int32)
                # [b * s]
                label_ids = tf.reshape(label_ids, [-1])
                accuracy = tf.metrics.accuracy(labels=label_ids, predictions=predictions)
                loss = tf.metrics.mean(values=loss)
                return {'eval_accuracy': accuracy, 'eval_loss': loss}

            eval_metrics = metric_fn(loss, masked_lm_ids, logits, is_real_example)
            output_spec = tf.estimator.EstimatorSpec(mode=mode,
                                                     loss=loss,
                                                     eval_metric_ops=eval_metrics)

    return output_spec
def run_main(args, rank=None):
    # Set the random seed manually for reproducibility.
    torch.manual_seed(args.seed)
    if args.parallel == 'DDP':
        n = torch.cuda.device_count() // args.world_size
        device = list(range(rank * n, (rank + 1) * n))
    else:
        device = torch.device("cuda" if torch.cuda.is_available() else "cpu")

    vocab = torch.load(args.save_vocab)
    cls_id = vocab.stoi['<cls>']
    pad_id = vocab.stoi['<pad>']
    sep_id = vocab.stoi['<sep>']

    if args.dataset == 'WikiText103':
        from torchtext.experimental.datasets import WikiText103
        train_dataset, valid_dataset, test_dataset = WikiText103(vocab=vocab)
    elif args.dataset == 'BookCorpus':
        from data import BookCorpus
        train_dataset, valid_dataset, test_dataset = BookCorpus(vocab, min_sentence_len=60)

    if rank is not None:
        chunk_len = len(train_dataset.data) // args.world_size
        train_dataset.data = train_dataset.data[(rank * chunk_len):((rank + 1) * chunk_len)]

    if args.checkpoint != 'None':
        model = torch.load(args.checkpoint)
    else:
        pretrained_bert = BertModel(len(vocab), args.emsize, args.nhead, args.nhid,
                                    args.nlayers, args.dropout)
        pretrained_bert.load_state_dict(torch.load(args.bert_model))
        model = NextSentenceTask(pretrained_bert)

    if args.parallel == 'DDP':
        model = model.to(device[0])
        model = DDP(model, device_ids=device)
    else:
        model = model.to(device)

    criterion = nn.CrossEntropyLoss()
    optimizer = torch.optim.SGD(model.parameters(), lr=args.lr)
    scheduler = torch.optim.lr_scheduler.StepLR(optimizer, 1.0, gamma=0.1)
    best_val_loss = None
    train_loss_log, val_loss_log = [], []

    for epoch in range(1, args.epochs + 1):
        epoch_start_time = time.time()
        train(process_raw_data(train_dataset, args), model, train_loss_log, device,
              optimizer, criterion, epoch, scheduler, cls_id, sep_id, pad_id, args, rank)
        val_loss = evaluate(process_raw_data(valid_dataset, args), model, device,
                            criterion, cls_id, sep_id, pad_id, args)
        val_loss_log.append(val_loss)

        if (rank is None) or (rank == 0):
            print('-' * 89)
            print('| end of epoch {:3d} | time: {:5.2f}s '
                  '| valid loss {:8.5f} | '.format(epoch,
                                                   (time.time() - epoch_start_time),
                                                   val_loss))
            print('-' * 89)

        if not best_val_loss or val_loss < best_val_loss:
            if rank is None:
                with open(args.save, 'wb') as f:
                    torch.save(model, f)
            elif rank == 0:
                with open(args.save, 'wb') as f:
                    torch.save(model.state_dict(), f)
            best_val_loss = val_loss
        else:
            scheduler.step()

    if args.parallel == 'DDP':
        rank0_devices = [x - rank * len(device) for x in device]
        device_pairs = zip(rank0_devices, device)
        map_location = {'cuda:%d' % x: 'cuda:%d' % y for x, y in device_pairs}
        model.load_state_dict(torch.load(args.save, map_location=map_location))
        test_loss = evaluate(process_raw_data(test_dataset, args), model, device,
                             criterion, cls_id, sep_id, pad_id, args)
        if rank == 0:
            wrap_up(train_loss_log, val_loss_log, test_loss, args, model.module,
                    'ns_loss.txt', 'ns_model.pt')
    else:
        with open(args.save, 'rb') as f:
            model = torch.load(f)
        test_loss = evaluate(process_raw_data(test_dataset, args), model, device,
                             criterion, cls_id, sep_id, pad_id, args)
        wrap_up(train_loss_log, val_loss_log, test_loss, args, model,
                'ns_loss.txt', 'ns_model.pt')