def preprocess() -> argparse.Namespace:
    """Run the training preamble.

    Parses the CLI arguments, seeds every RNG, prepares the log/output
    directory, initialises fitlog and the TensorBoard-style visual logger,
    and selects the compute device.

    :return: the fully populated argument namespace
    """
    print('preprocessing starts...\n')

    # ====== parse arguments ====== #
    args = parse_args()

    # ====== set random seed ====== #
    seed = args.seed
    random.seed(seed)
    np.random.seed(seed)
    torch.manual_seed(seed)
    torch.cuda.manual_seed_all(seed)

    # ====== save path ====== #
    timestamp = datetime.datetime.now().strftime('%Y-%m-%d-%H-%M-%S')
    args.save_path = os.path.join('./logs/', 'my_log-' + timestamp)
    # Debug runs write nothing to disk, so skip directory creation for them.
    if not (args.debug or os.path.exists(args.save_path)):
        os.makedirs(args.save_path)

    # ====== fitlog init ====== #
    fitlog.commit(__file__)
    fitlog.debug(args.debug)
    fitlog.add_hyper(args)

    # ====== tb VisualLogger init ====== #
    if args.debug:
        args.visual_logger = None
    else:
        args.visual_logger = VisualLogger(args.save_path)

    # ====== cuda enable ====== #
    os.environ['CUDA_VISIBLE_DEVICES'] = str(args.gpuid)
    use_gpu = args.cuda and torch.cuda.is_available()
    args.device = torch.device('cuda' if use_gpu else 'cpu')

    # ====== others ====== #
    os.environ['TOKENIZERS_PARALLELISM'] = 'false'
    torch.set_num_threads(6)
    print(args, end='\n\n')
    return args
def after_parse_t2g(C, need_logger=False):
    """Post-process the parsed text-to-graph config ``C``.

    Sets up the logger and fitlog, optionally auto-fills hyper-parameters,
    discovers the available GPUs, seeds all RNGs (when ``C.t2g_seed > 0``)
    and pins the first GPU as the device.

    :param C: parsed configuration object; mutated in place
    :param need_logger: when True, also return the constructed logger
    :return: ``C`` or ``(C, logger)``
    """
    #----- make logger -----
    logger = Logger(C.log_file)
    # Route log() to either the timestamped printer or a no-op sink.
    logger.log = logger.nolog if C.no_log else logger.log_print_w_time
    C.tmp_file_name = random_tmp_name()

    #----- other stuff -----
    if C.auto_hyperparam:
        auto_hyperparam(C)
        logger.log("Hyper parameters autoset.")
    if C.no_fitlog:
        fitlog.debug()
    fitlog.set_log_dir("logs")
    fitlog.add_hyper(C)

    separator = "------------------------------------------------------"
    logger.log(separator)
    logger.log(pformat(C.__dict__))
    logger.log(separator)

    C.gpus = list(range(tc.cuda.device_count()))

    #----- initialize -----
    seed = C.t2g_seed
    if seed > 0:
        random.seed(seed)
        tc.manual_seed(seed)
        np.random.seed(seed)
        tc.cuda.manual_seed_all(seed)
        # Deterministic cuDNN trades speed for reproducibility.
        tc.backends.cudnn.deterministic = True
        tc.backends.cudnn.benchmark = False
        logger.log("Seed set. %d" % (seed,))

    tc.cuda.set_device(C.gpus[0])
    C.device = C.gpus[0]

    return (C, logger) if need_logger else C
def auto_hyperparam(C):
    """Apply the preset hyper-parameter bundle for ``C.dataset`` to ``C``.

    Mutates ``C`` in place; a config whose ``dataset`` matches no preset is
    left untouched.  The semeval preset additionally silences fitlog.
    """
    if C.dataset == "ace_2005":
        preset = {
            "ensemble": 1,
            "no_rel_name": "NO_RELATION",
            "gnn": True,
            "matrix_trans": True,
            "train_text_1": "./data/ace_2005/ace_05_processed/ace-05-splits/json-pm13/bn+nw.json",
            "valid_text": "./data/ace_2005/ace_05_processed/ace-05-splits/json-pm13/bc_dev.json",
            "test_text": "./data/ace_2005/ace_05_processed/ace-05-splits/json-pm13/bc_test.json",
            "dataset": "ace_2005",
            "gene_in_data": True,
            "valid_metric": "macro",
            "scheduler": "cosine",
            "no_valid": True,
            "loss": "loss_1",
            "t2g_batch_size": 8,
            "t2g_lr": 5e-5,
            "no_rel_weight": 0.25,
            "epoch_numb": 30,
            "warmup_prop": 0.02,
            "model_save": "model_ace.pkl",
        }
        for field, value in preset.items():
            setattr(C, field, value)
    elif C.dataset == "semeval_2018_task7":
        preset = {
            "ensemble": 5,
            "epoch_numb": 30,
            "no_rel_name": "NONE",
            "matrix_trans": True,
            "gnn": True,
            "valid_text": "./data/semeval_2018_task7/2.test.text.xml",
            "valid_rels": "./data/semeval_2018_task7/keys.test.2.txt",
            "loss": "loss_2",
            "no_valid": True,
            "warmup_prop": 0.1,
            "scheduler": "cosine",
            "t2g_batch_size": 8,
            "t2g_lr": 1e-4,
            "model_save": "model_semeval.pkl",
            "no_fitlog": True,
        }
        for field, value in preset.items():
            setattr(C, field, value)
        # Matches no_fitlog above: disable fitlog output for this dataset.
        fitlog.debug()
# --- fitlog setup (must run before anything that uses fitlog) ---
import fitlog

# Hard-coded switch: fitlog is currently disabled for this script.
use_fitlog = False
if not use_fitlog:
    fitlog.debug()  # presumably makes subsequent fitlog calls no-ops — see fitlog docs
fitlog.set_log_dir('logs')
# Seed is registered as a hyper-parameter and applied to all RNGs *before*
# the dataset module below is imported, so data loading is reproducible.
load_dataset_seed = 100
fitlog.add_hyper(load_dataset_seed, 'load_dataset_seed')
fitlog.set_rng_seed(load_dataset_seed)

import sys

sys.path.append('../')  # make sibling project packages importable
import argparse

# fastNLP training framework pieces (trainer, loss wrapper, metrics, callbacks).
from fastNLP.core import Trainer
from fastNLP.core import Callback
from fastNLP import LossInForward
from fastNLP.core.metrics import SpanFPreRecMetric, AccuracyMetric
from fastNLP.core.callback import WarmupCallback, GradientClipCallback, EarlyStopCallback, FitlogCallback
from fastNLP import LRScheduler
from fastNLP import logger
import torch
import torch.optim as optim
import torch.nn as nn
from torch.optim.lr_scheduler import LambdaLR
import collections
# NOTE(review): star import from the project's data-loading module; the names
# it provides are not visible here.
from load_data import *
def train():
    """Distributed pre-training entry point for the CoLAKE model.

    Parses CLI args, initialises the NCCL process group, builds the on-the-fly
    graph datasets from sharded wiki files, constructs CoLAKE (optionally from
    a saved checkpoint) and either runs a test pass (``--do_test``) or launches
    distributed training via ``DistTrainer``.
    """
    args = parse_args()
    if args.debug:
        # Debug mode: silence fitlog and never write model checkpoints.
        fitlog.debug()
        args.save_model = False
    # ================= define =================
    tokenizer = RobertaTokenizer.from_pretrained('roberta-base')
    word_mask_index = tokenizer.mask_token_id
    word_vocab_size = len(tokenizer)

    # Only the master process (rank 0) writes fitlog records.
    if get_local_rank() == 0:
        fitlog.set_log_dir(args.log_dir)
        fitlog.commit(__file__, fit_msg=args.name)
        fitlog.add_hyper_in_file(__file__)
        fitlog.add_hyper(args)

    # ================= load data =================
    dist.init_process_group('nccl')
    init_logger_dist()

    n_proc = dist.get_world_size()
    # Per-GPU micro-batch: global batch split over gradient-accumulation
    # steps and over all processes.
    bsz = args.batch_size // args.grad_accumulation // n_proc
    args.local_rank = get_local_rank()
    args.save_dir = os.path.join(args.save_dir,
                                 args.name) if args.save_model else None
    if args.save_dir is not None and os.path.exists(args.save_dir):
        raise RuntimeError('save_dir has already existed.')
    logger.info('save directory: {}'.format(
        'None' if args.save_dir is None else args.save_dir))
    devices = list(range(torch.cuda.device_count()))
    NUM_WORKERS = 4  # DataLoader workers per process

    ent_vocab, rel_vocab = load_ent_rel_vocabs()
    logger.info('# entities: {}'.format(len(ent_vocab)))
    logger.info('# relations: {}'.format(len(rel_vocab)))
    ent_freq = get_ent_freq()
    assert len(ent_vocab) == len(ent_freq), '{} {}'.format(
        len(ent_vocab), len(ent_freq))

    #####
    # NOTE(review): for every sub-directory this drops the file with the
    # largest numeric name — presumably the last (possibly incomplete)
    # shard; confirm the intent.  `dir` shadows the builtin of that name.
    root = args.data_dir
    dirs = os.listdir(root)
    drop_files = []
    for dir in dirs:
        path = os.path.join(root, dir)
        max_idx = 0
        for file_name in os.listdir(path):
            if 'large' in file_name:
                continue
            max_idx = int(file_name) if int(file_name) > max_idx else max_idx
        drop_files.append(os.path.join(path, str(max_idx)))
    #####

    # Collect every usable shard, skipping 'large' files and the dropped tails.
    file_list = []
    for path, _, filenames in os.walk(args.data_dir):
        for filename in filenames:
            file = os.path.join(path, filename)
            if 'large' in file or file in drop_files:
                continue
            file_list.append(file)
    logger.info('used {} files in {}.'.format(len(file_list), args.data_dir))

    # data_prop > 1 is an absolute file count; otherwise it is a fraction.
    if args.data_prop > 1:
        used_files = file_list[:int(args.data_prop)]
    else:
        used_files = file_list[:round(args.data_prop * len(file_list))]

    data = GraphOTFDataSet(used_files, n_proc, args.local_rank,
                           word_mask_index, word_vocab_size, args.n_negs,
                           ent_vocab, rel_vocab, ent_freq)
    # Dev set reuses the first shard only.
    dev_data = GraphDataSet(used_files[0], word_mask_index, word_vocab_size,
                            args.n_negs, ent_vocab, rel_vocab, ent_freq)

    sampler = OTFDistributedSampler(used_files, n_proc, get_local_rank())
    train_data_iter = TorchLoaderIter(dataset=data,
                                      batch_size=bsz,
                                      sampler=sampler,
                                      num_workers=NUM_WORKERS,
                                      collate_fn=data.collate_fn)
    dev_data_iter = TorchLoaderIter(dataset=dev_data,
                                    batch_size=bsz,
                                    sampler=RandomSampler(),
                                    num_workers=NUM_WORKERS,
                                    collate_fn=dev_data.collate_fn)
    if args.test_data is not None:
        test_data = FewRelDevDataSet(path=args.test_data,
                                     label_vocab=rel_vocab,
                                     ent_vocab=ent_vocab)
        test_data_iter = TorchLoaderIter(dataset=test_data,
                                         batch_size=32,
                                         sampler=RandomSampler(),
                                         num_workers=NUM_WORKERS,
                                         collate_fn=test_data.collate_fn)

    if args.local_rank == 0:
        print('full wiki files: {}'.format(len(file_list)))
        print('used wiki files: {}'.format(len(used_files)))
        print('# of trained samples: {}'.format(len(data) * n_proc))
        print('# of trained entities: {}'.format(len(ent_vocab)))
        print('# of trained relations: {}'.format(len(rel_vocab)))

    # ================= prepare model =================
    logger.info('model init')
    if args.rel_emb is not None:  # load pretrained relation embeddings
        rel_emb = np.load(args.rel_emb)
        # add_embs = np.random.randn(3, rel_emb.shape[1])  # add <pad>, <mask>, <unk>
        # rel_emb = np.r_[add_embs, rel_emb]
        rel_emb = torch.from_numpy(rel_emb).float()
        assert rel_emb.shape[0] == len(rel_vocab), '{} {}'.format(
            rel_emb.shape[0], len(rel_vocab))
        # assert rel_emb.shape[1] == args.rel_dim
        logger.info('loaded pretrained relation embeddings. dim: {}'.format(
            rel_emb.shape[1]))
    else:
        rel_emb = None

    if args.model_name is not None:
        # Continue pre-training from a saved CoLAKE checkpoint.
        logger.info('further pre-train.')
        config = RobertaConfig.from_pretrained('roberta-base',
                                               type_vocab_size=3)
        model = CoLAKE(config=config,
                       num_ent=len(ent_vocab),
                       num_rel=len(rel_vocab),
                       ent_dim=args.ent_dim,
                       rel_dim=args.rel_dim,
                       ent_lr=args.ent_lr,
                       ip_config=args.ip_config,
                       rel_emb=None,
                       emb_name=args.emb_name)
        states_dict = torch.load(args.model_name)
        model.load_state_dict(states_dict, strict=True)
    else:
        # Fresh start from RoBERTa weights; per-rank cache dir avoids clashes.
        model = CoLAKE.from_pretrained(
            'roberta-base',
            num_ent=len(ent_vocab),
            num_rel=len(rel_vocab),
            ent_lr=args.ent_lr,
            ip_config=args.ip_config,
            rel_emb=rel_emb,
            emb_name=args.emb_name,
            cache_dir=PYTORCH_PRETRAINED_BERT_CACHE /
            'dist_{}'.format(args.local_rank))
        model.extend_type_embedding(token_type=3)
    # if args.local_rank == 0:
    #     for name, param in model.named_parameters():
    #         if param.requires_grad is True:
    #             print('{}: {}'.format(name, param.shape))

    # ================= train model =================
    # lr=1e-4 for peak value, lr=5e-5 for initial value
    logger.info('trainer init')
    # Standard transformer recipe: no weight decay on biases / layer norms.
    no_decay = [
        'bias', 'LayerNorm.bias', 'LayerNorm.weight', 'layer_norm.bias',
        'layer_norm.weight'
    ]
    param_optimizer = list(model.named_parameters())
    optimizer_grouped_parameters = [{
        'params':
        [p for n, p in param_optimizer if not any(nd in n for nd in no_decay)],
        'weight_decay':
        0.01
    }, {
        'params':
        [p for n, p in param_optimizer if any(nd in n for nd in no_decay)],
        'weight_decay':
        0.0
    }]

    # Masked-LM accuracy over the three node types: words, entities, relations.
    word_acc = WordMLMAccuracy(pred='word_pred',
                               target='masked_lm_labels',
                               seq_len='word_seq_len')
    ent_acc = EntityMLMAccuracy(pred='entity_pred',
                                target='ent_masked_lm_labels',
                                seq_len='ent_seq_len')
    rel_acc = RelationMLMAccuracy(pred='relation_pred',
                                  target='rel_masked_lm_labels',
                                  seq_len='rel_seq_len')
    metrics = [word_acc, ent_acc, rel_acc]

    if args.test_data is not None:
        test_metric = [rel_acc]
        tester = Tester(data=test_data_iter,
                        model=model,
                        metrics=test_metric,
                        device=list(range(torch.cuda.device_count())))
        # tester.test()
    else:
        tester = None

    optimizer = optim.AdamW(optimizer_grouped_parameters,
                            lr=args.lr,
                            betas=(0.9, args.beta),
                            eps=1e-6)
    # warmup_callback = WarmupCallback(warmup=args.warm_up, schedule='linear')

    fitlog_callback = MyFitlogCallback(tester=tester,
                                       log_loss_every=100,
                                       verbose=1)
    gradient_clip_callback = GradientClipCallback(clip_value=1,
                                                  clip_type='norm')
    emb_callback = EmbUpdateCallback(model.ent_embeddings)
    all_callbacks = [gradient_clip_callback, emb_callback]
    # Checkpointing runs only on the master process, and only when saving.
    if args.save_dir is None:
        master_callbacks = [fitlog_callback]
    else:
        save_callback = SaveModelCallback(args.save_dir,
                                          model.ent_embeddings,
                                          only_params=True)
        master_callbacks = [fitlog_callback, save_callback]

    if args.do_test:
        # Evaluation-only path: load the saved model and run the tester once.
        states_dict = torch.load(os.path.join(args.save_dir,
                                              args.model_name)).state_dict()
        model.load_state_dict(states_dict)
        data_iter = TorchLoaderIter(dataset=data,
                                    batch_size=args.batch_size,
                                    sampler=RandomSampler(),
                                    num_workers=NUM_WORKERS,
                                    collate_fn=data.collate_fn)
        tester = Tester(data=data_iter,
                        model=model,
                        metrics=metrics,
                        device=devices)
        tester.test()
    else:
        trainer = DistTrainer(train_data=train_data_iter,
                              dev_data=dev_data_iter,
                              model=model,
                              optimizer=optimizer,
                              loss=LossInForward(),
                              batch_size_per_gpu=bsz,
                              update_every=args.grad_accumulation,
                              n_epochs=args.epoch,
                              metrics=metrics,
                              callbacks_master=master_callbacks,
                              callbacks_all=all_callbacks,
                              validate_every=5000,
                              use_tqdm=True,
                              fp16='O1' if args.fp16 else '')
        trainer.train(load_best_model=False)
def main():
    """Fine-tune CoLAKE for relation extraction on the FewRel-style dataset.

    Loads the pre-trained CoLAKE weights and entity embeddings, grafts the
    relevant relation embeddings onto the classification head, then trains
    with fastNLP's ``Trainer`` and evaluates with a macro metric.
    """
    args = parse_args()
    if args.debug:
        fitlog.debug()
    fitlog.set_log_dir(args.log_dir)
    fitlog.commit(__file__)
    fitlog.add_hyper_in_file(__file__)
    fitlog.add_hyper(args)
    if args.gpu != 'all':
        os.environ['CUDA_VISIBLE_DEVICES'] = args.gpu

    train_set, dev_set, test_set, temp_ent_vocab = load_fewrel_graph_data(
        data_dir=args.data_dir)

    print('data directory: {}'.format(args.data_dir))
    print('# of train samples: {}'.format(len(train_set)))
    print('# of dev samples: {}'.format(len(dev_set)))
    print('# of test samples: {}'.format(len(test_set)))

    ent_vocab, rel_vocab = load_ent_rel_vocabs(path='../')

    # load entity embeddings
    # Map each entity in this dataset's vocab to its row in the full
    # pre-trained entity embedding matrix, then gather those rows.
    ent_index = []
    for k, v in temp_ent_vocab.items():
        ent_index.append(ent_vocab[k])
    ent_index = torch.tensor(ent_index)
    ent_emb = np.load(os.path.join(args.model_path, 'entities.npy'))
    ent_embedding = nn.Embedding.from_pretrained(torch.from_numpy(ent_emb))
    ent_emb = ent_embedding(ent_index.view(1, -1)).squeeze().detach()

    # load CoLAKE parameters
    config = RobertaConfig.from_pretrained('roberta-base', type_vocab_size=3)
    model = CoLAKEForRE(config,
                        num_types=len(train_set.label_vocab),
                        ent_emb=ent_emb)
    states_dict = torch.load(os.path.join(args.model_path, 'model.bin'))
    # strict=False: the RE head is new, so some parameters stay random.
    model.load_state_dict(states_dict, strict=False)
    # NOTE(review): "initializecd" is a typo in the runtime output string,
    # kept as-is to avoid changing program output.
    print('parameters below are randomly initializecd:')
    for name, param in model.named_parameters():
        if name not in states_dict:
            print(name)

    # tie relation classification head
    # Initialise the classifier weights from the pre-trained relation
    # embeddings of the labels used by this dataset.
    rel_index = []
    for k, v in train_set.label_vocab.items():
        rel_index.append(rel_vocab[k])
    rel_index = torch.LongTensor(rel_index)
    rel_embeddings = nn.Embedding.from_pretrained(
        states_dict['rel_embeddings.weight'])
    rel_index = rel_index.cuda()
    rel_cls_weight = rel_embeddings(rel_index.view(1, -1)).squeeze()
    model.tie_rel_weights(rel_cls_weight)

    # Copy the pre-trained relation LM head's projection + layer norm.
    model.rel_head.dense.weight.data = states_dict['rel_lm_head.dense.weight']
    model.rel_head.dense.bias.data = states_dict['rel_lm_head.dense.bias']
    model.rel_head.layer_norm.weight.data = states_dict[
        'rel_lm_head.layer_norm.weight']
    model.rel_head.layer_norm.bias.data = states_dict[
        'rel_lm_head.layer_norm.bias']

    # +4 extra tokens beyond the RoBERTa vocab — presumably entity-marker
    # special tokens; confirm against the data pipeline.
    model.resize_token_embeddings(
        len(RobertaTokenizer.from_pretrained('roberta-base')) + 4)
    print('parameters of CoLAKE has been loaded.')

    # fine-tune
    # No weight decay on biases, layer norms and embeddings.
    no_decay = ['bias', 'LayerNorm.bias', 'LayerNorm.weight', 'embedding']
    param_optimizer = list(model.named_parameters())
    optimizer_grouped_parameters = [{
        'params':
        [p for n, p in param_optimizer if not any(nd in n for nd in no_decay)],
        'weight_decay':
        args.weight_decay
    }, {
        'params':
        [p for n, p in param_optimizer if any(nd in n for nd in no_decay)],
        'weight_decay':
        0.0
    }]
    optimizer = optim.AdamW(optimizer_grouped_parameters,
                            lr=args.lr,
                            betas=(0.9, args.beta),
                            eps=1e-6)

    metrics = [MacroMetric(pred='pred', target='target')]

    test_data_iter = TorchLoaderIter(dataset=test_set,
                                     batch_size=args.batch_size,
                                     sampler=RandomSampler(),
                                     num_workers=4,
                                     collate_fn=test_set.collate_fn)
    devices = list(range(torch.cuda.device_count()))
    tester = Tester(data=test_data_iter,
                    model=model,
                    metrics=metrics,
                    device=devices)
    # tester.test()

    fitlog_callback = FitlogCallback(tester=tester,
                                     log_loss_every=100,
                                     verbose=1)
    gradient_clip_callback = GradientClipCallback(clip_value=1,
                                                  clip_type='norm')
    warmup_callback = WarmupCallback(warmup=args.warm_up, schedule='linear')

    # Micro-batch size after gradient accumulation.
    bsz = args.batch_size // args.grad_accumulation
    train_data_iter = TorchLoaderIter(dataset=train_set,
                                      batch_size=bsz,
                                      sampler=RandomSampler(),
                                      num_workers=4,
                                      collate_fn=train_set.collate_fn)
    dev_data_iter = TorchLoaderIter(dataset=dev_set,
                                    batch_size=bsz,
                                    sampler=RandomSampler(),
                                    num_workers=4,
                                    collate_fn=dev_set.collate_fn)

    trainer = Trainer(
        train_data=train_data_iter,
        dev_data=dev_data_iter,
        model=model,
        optimizer=optimizer,
        loss=LossInForward(),
        batch_size=bsz,
        update_every=args.grad_accumulation,
        n_epochs=args.epoch,
        metrics=metrics,
        callbacks=[fitlog_callback, gradient_clip_callback, warmup_callback],
        device=devices,
        use_tqdm=True)
    trainer.train(load_best_model=False)
def main():
    """Intent-identification data exploration.

    Loads the intent-identification split, prints the label frequency table,
    and reports the average SciBERT-tokenised length of the texts.  Earlier
    model-training code is retained below inside triple-quoted blocks.
    """
    args = parse_args()
    batch_size = 12
    if args.debug:
        fitlog.debug()
    if args.gpu != 'all':
        os.environ['CUDA_VISIBLE_DEVICES'] = args.gpu
    #load entity embeddings
    #TODO: perform the initialisation with SPECTER (translated from Japanese)
    #train_set, test_set, ent_vocab = load_AASC_graph_data(args.data_dir,args.frequency,args.WINDOW_SIZE,args.MAX_LEN,args.pretrained_model)
    #num_ent = len(ent_vocab)
    # load parameters
    # NOTE(review): dead model-setup code kept for reference.
    """
    if args.pretrained_model == "scibert":
        model = PTBCN.from_pretrained('../pretrainedmodel/scibert_scivocab_uncased',num_ent=len(ent_vocab),MAX_LEN=args.MAX_LEN)
    else:
        model = PTBCN.from_pretrained('bert-base-uncased',num_ent=len(ent_vocab),MAX_LEN=args.MAX_LEN)
    model.change_type_embeddings()
    print('parameters of SciBERT has been loaded.')
    # fine-tune
    no_decay = ['bias', 'LayerNorm.bias', 'LayerNorm.weight', 'layer_norm.bias', 'layer_norm.weight']
    param_optimizer = list(model.named_parameters())
    optimizer_grouped_parameters = [
        {'params': [p for n, p in param_optimizer if not any(nd in n for nd in no_decay)], 'weight_decay': args.weight_decay},
        {'params': [p for n, p in param_optimizer if any(nd in n for nd in no_decay)], 'weight_decay': 0.0}
    ]
    optimizer = optim.AdamW(optimizer_grouped_parameters, lr=args.lr, betas=(0.9, args.beta), eps=1e-6)
    optimizer = torch.optim.AdamW(model.parameters(), lr=args.lr)
    metrics = [MacroMetric(pred='pred', target='target')]
    devices = list(range(torch.cuda.device_count()))
    if torch.cuda.is_available():
        print("GPU OK")
    else:
        print("GPU NO")
    gradient_clip_callback = GradientClipCallback(clip_value=1, clip_type='norm')
    warmup_callback = WarmupCallback(warmup=args.warm_up, schedule='linear')
    """
    bsz = args.batch_size // args.grad_accumulation
    # Model name derives from the data directory (trailing slash stripped).
    if args.data_dir[-1] == "/":
        data_dir_modelname = os.path.basename(args.data_dir[:-1])
    else:
        data_dir_modelname = os.path.basename(args.data_dir)
    X_train, y_train, X_test, y_test = load_data_intent_identification()
    # ydict: label -> frequency over train + test combined.
    ydict = {}
    for i in y_train + y_test:
        if i not in ydict:
            ydict[i] = 1
        else:
            ydict[i] += 1
    print(ydict)
    i = 0
    l = 0
    tokenizer = BertTokenizer.from_pretrained(
        '../../pretrainedmodel/scibert_scivocab_uncased', do_lower_case=True)
    # Average tokenised text length across all samples.
    for x in X_train + X_test:
        l += len(tokenizer.tokenize(x["text"]))
        i += 1
    print(l / i)
    """
    l = [i for i in range(len(X))]
    random.shuffle(l)
    for epoch in range(5):
        if epoch == 0:
            X_test = [X[i] for i in l[:len(l)//5]]
            y_test = [y[i] for i in l[:len(l)//5]]
            X_train = [X[i] for i in l[len(l)//5:]]
            y_train = [y[i] for i in l[len(l)//5:]]
        elif epoch == 4:
            X_test = [X[i] for i in l[len(l)*epoch//5:]]
            y_test = [y[i] for i in l[len(l)*epoch//5:]]
            X_train = [X[i] for i in l[:len(l)*epoch//5]]
            y_train = [y[i] for i in l[:len(l)*epoch//5]]
        else:
            X_test = [X[i] for i in l[len(l)*epoch//5:len(l)*(epoch+1)//5]]
            y_test = [y[i] for i in l[len(l)*epoch//5:len(l)*(epoch+1)//5]]
            X_train = [X[i] for i in l[:len(l)*epoch//5]+l[len(l)*(epoch+1)//5:]]
            y_train = [y[i] for i in l[:len(l)*epoch//5]+l[len(l)*(epoch+1)//5:]]
            #X_train, y_train = oversampling(X_train, y_train)
    """
    # NOTE(review): the ''' below opens a triple-quoted block that is not
    # closed within this view — confirm it is terminated later in the file.
    '''
def main():
    """Fine-tune SciBERT for citation intent classification.

    Builds fixed-length (512) token-id tensors plus block attention masks for
    train/test splits, then sets up ``BertForSequenceClassification`` with a
    linear warmup schedule.  Several earlier experiment variants are retained
    below inside triple-quoted blocks.

    NOTE(review): this whole definition may itself sit inside a triple-quoted
    block opened earlier in the file, and it ends with an unterminated
    triple-quote — verify the file's quoting before relying on it.
    """
    args = parse_args()
    batch_size = 12
    if args.debug:
        fitlog.debug()
    if args.gpu != 'all':
        os.environ['CUDA_VISIBLE_DEVICES'] = args.gpu
    #load entity embeddings
    #TODO: perform the initialisation with SPECTER (translated from Japanese)
    #train_set, test_set, ent_vocab = load_AASC_graph_data(args.data_dir,args.frequency,args.WINDOW_SIZE,args.MAX_LEN,args.pretrained_model)
    #num_ent = len(ent_vocab)
    # load parameters
    # NOTE(review): dead model-setup code kept for reference.
    """
    if args.pretrained_model == "scibert":
        model = PTBCN.from_pretrained('../pretrainedmodel/scibert_scivocab_uncased',num_ent=len(ent_vocab),MAX_LEN=args.MAX_LEN)
    else:
        model = PTBCN.from_pretrained('bert-base-uncased',num_ent=len(ent_vocab),MAX_LEN=args.MAX_LEN)
    model.change_type_embeddings()
    print('parameters of SciBERT has been loaded.')
    # fine-tune
    no_decay = ['bias', 'LayerNorm.bias', 'LayerNorm.weight', 'layer_norm.bias', 'layer_norm.weight']
    param_optimizer = list(model.named_parameters())
    optimizer_grouped_parameters = [
        {'params': [p for n, p in param_optimizer if not any(nd in n for nd in no_decay)], 'weight_decay': args.weight_decay},
        {'params': [p for n, p in param_optimizer if any(nd in n for nd in no_decay)], 'weight_decay': 0.0}
    ]
    optimizer = optim.AdamW(optimizer_grouped_parameters, lr=args.lr, betas=(0.9, args.beta), eps=1e-6)
    optimizer = torch.optim.AdamW(model.parameters(), lr=args.lr)
    metrics = [MacroMetric(pred='pred', target='target')]
    devices = list(range(torch.cuda.device_count()))
    if torch.cuda.is_available():
        print("GPU OK")
    else:
        print("GPU NO")
    gradient_clip_callback = GradientClipCallback(clip_value=1, clip_type='norm')
    warmup_callback = WarmupCallback(warmup=args.warm_up, schedule='linear')
    """
    bsz = args.batch_size // args.grad_accumulation
    # Model name derives from the data directory (trailing slash stripped).
    if args.data_dir[-1] == "/":
        data_dir_modelname = os.path.basename(args.data_dir[:-1])
    else:
        data_dir_modelname = os.path.basename(args.data_dir)
    X_train, y_train, X_test, y_test, intentdict = load_data_intent_identification(
    )
    # ydict: label -> frequency over train + test combined.
    ydict = {}
    for i in y_train + y_test:
        if i not in ydict:
            ydict[i] = 1
        else:
            ydict[i] += 1
    print(ydict)
    """
    l = [i for i in range(len(X))]
    random.shuffle(l)
    for epoch in range(5):
        if epoch == 0:
            X_test = [X[i] for i in l[:len(l)//5]]
            y_test = [y[i] for i in l[:len(l)//5]]
            X_train = [X[i] for i in l[len(l)//5:]]
            y_train = [y[i] for i in l[len(l)//5:]]
        elif epoch == 4:
            X_test = [X[i] for i in l[len(l)*epoch//5:]]
            y_test = [y[i] for i in l[len(l)*epoch//5:]]
            X_train = [X[i] for i in l[:len(l)*epoch//5]]
            y_train = [y[i] for i in l[:len(l)*epoch//5]]
        else:
            X_test = [X[i] for i in l[len(l)*epoch//5:len(l)*(epoch+1)//5]]
            y_test = [y[i] for i in l[len(l)*epoch//5:len(l)*(epoch+1)//5]]
            X_train = [X[i] for i in l[:len(l)*epoch//5]+l[len(l)*(epoch+1)//5:]]
            y_train = [y[i] for i in l[:len(l)*epoch//5]+l[len(l)*(epoch+1)//5:]]
            #X_train, y_train = oversampling(X_train, y_train)
    """
    print(collections.Counter(y_train))
    print(intentdict)
    tokenizer = BertTokenizer.from_pretrained(
        '../../pretrainedmodel/scibert_scivocab_uncased', do_lower_case=True)
    epochs = 50
    input_ids = []
    attention_masks = []
    # Labels are assumed to be dense integers starting at 0 — TODO confirm.
    num_label = max(y_train + y_test) + 1
    for x, y1 in zip(X_train, y_train):
        text = x["text"]
        """
        left_citation_tokenized = tokenizer.convert_tokens_to_ids(tokenizer.tokenize(x["left_citated_text"]))
        right_citation_tokenized = tokenizer.convert_tokens_to_ids(tokenizer.tokenize(x["right_citated_text"]))
        """
        text_tokenized = tokenizer.convert_tokens_to_ids(
            tokenizer.tokenize(text))
        # Truncate to 512 tokens and right-pad with zeros.
        xlen = len(text_tokenized[:512])
        #xlen = len(left_citation_tokenized[-256:])+len(right_citation_tokenized[:256])
        word_pad = 512 - xlen
        tokenized_ids = text_tokenized[:512] + [0] * (512 - xlen)
        #tokenized_ids = left_citation_tokenized[-256:] + right_citation_tokenized[:256] + [0]*(512-xlen)
        # Build a 512x512 mask: ones over the real-token block, zeros over
        # the padded columns.
        adj = torch.ones(xlen, xlen, dtype=torch.int)
        adj = torch.cat(
            (adj, torch.ones(word_pad, adj.shape[1], dtype=torch.int)),
            dim=0)
        adj = torch.cat((adj, torch.zeros(512, word_pad, dtype=torch.int)),
                        dim=1)
        # Add the encoded sentence to the list.
        input_ids.append(tokenized_ids)
        # And its attention mask (simply differentiates padding from non-padding).
        attention_masks.append(adj)
    print("load train done")
    # Convert the lists into tensors.
    input_ids = torch.tensor(input_ids)
    attention_masks = torch.stack(attention_masks, dim=0)
    labels = torch.tensor(y_train)
    train_dataset = TensorDataset(input_ids, attention_masks, labels)
    input_ids = []
    attention_masks = []
    # Same encoding for the test split.
    for x, y1 in zip(X_test, y_test):
        text = x["text"]
        text_tokenized = tokenizer.convert_tokens_to_ids(
            tokenizer.tokenize(text))
        xlen = len(text_tokenized[:512])
        #xlen = len(left_citation_tokenized[-256:])+len(right_citation_tokenized[:256])
        word_pad = 512 - xlen
        tokenized_ids = text_tokenized[:512] + [0] * (512 - xlen)
        """
        left_citation_tokenized = tokenizer.convert_tokens_to_ids(tokenizer.tokenize(x["left_citated_text"]))
        right_citation_tokenized = tokenizer.convert_tokens_to_ids(tokenizer.tokenize(x["right_citated_text"]))
        xlen = len(left_citation_tokenized[-256:])+len(right_citation_tokenized[:256])
        word_pad = 512-xlen
        tokenized_ids = left_citation_tokenized[-256:] + right_citation_tokenized[:256] + [0]*(512-xlen)
        """
        adj = torch.ones(xlen, xlen, dtype=torch.int)
        adj = torch.cat(
            (adj, torch.ones(word_pad, adj.shape[1], dtype=torch.int)),
            dim=0)
        adj = torch.cat((adj, torch.zeros(512, word_pad, dtype=torch.int)),
                        dim=1)
        # Add the encoded sentence to the list.
        input_ids.append(tokenized_ids)
        # And its attention mask (simply differentiates padding from non-padding).
        attention_masks.append(adj)
    print("load test done")
    # Convert the lists into tensors.
    input_ids = torch.tensor(input_ids)
    attention_masks = torch.stack(attention_masks, dim=0)
    labels = torch.tensor(y_test)
    test_dataset = TensorDataset(input_ids, attention_masks, labels)
    train_dataloader = DataLoader(
        train_dataset,  # The training samples.
        sampler=RandomSampler(train_dataset),  # Select batches randomly
        batch_size=batch_size  # Trains with this batch size.
    )
    test_dataloader = DataLoader(
        test_dataset,  # The training samples.
        sampler=None,  # Select batches randomly
        batch_size=1  # Trains with this batch size.
    )
    total_steps = len(train_dataloader) * epochs
    model = BertForSequenceClassification.from_pretrained(
        "../../pretrainedmodel/scibert_scivocab_uncased",  # Use the 12-layer BERT model, with an uncased vocab.
        num_labels=
        num_label,  # The number of output labels--2 for binary classification.
        # You can increase this for multi-class tasks.
        output_attentions=False,  # Whether the model returns attentions weights.
        output_hidden_states=
        False,  # Whether the model returns all hidden-states.
    )
    optimizer = AdamW(
        model.parameters(),
        lr=5e-6,  # args.learning_rate - default is 5e-5, our notebook had 2e-5
        eps=1e-8  # args.adam_epsilon - default is 1e-8.
    )
    scheduler = get_linear_schedule_with_warmup(
        optimizer,
        num_warmup_steps=0,  # Default value in run_glue.py
        num_training_steps=total_steps)
    # NOTE(review): dead inference/export code kept for reference.
    """
    model.cuda()
    model.load_state_dict(torch.load("../../model/scibert_intentclassification.bin"))
    model.cuda()
    X_train,X_test = load_data_AASC()
    fw = open("train.txt","w")
    pred = []
    with torch.no_grad():
        for i,x in enumerate(X_train):
            if i%2500 == 0:
                print(i)
                print("len")
                print(len(pred))
            left_citated_text = x["left_citated_text"]
            right_citated_text = x["right_citated_text"]
            target_id = x["target_id"]
            source_id = x["source_id"]
            left_tokenized = tokenizer.convert_tokens_to_ids(tokenizer.tokenize(left_citated_text))[-50:]
            right_tokenized = tokenizer.convert_tokens_to_ids(tokenizer.tokenize(right_citated_text))[:50]
            xlen = len(left_tokenized+right_tokenized)
            input_id = torch.tensor(left_tokenized+right_tokenized+[0]*(512-xlen)).unsqueeze(0).cuda()
            word_pad = 512-xlen
            adj = torch.ones(xlen, xlen, dtype=torch.int)
            adj = torch.cat((adj,torch.ones(word_pad,adj.shape[1],dtype=torch.int)),dim=0)
            adj = torch.cat((adj,torch.zeros(512,word_pad,dtype=torch.int)),dim=1)
            adj = adj.unsqueeze(0).cuda()
            label = torch.tensor([1]).unsqueeze(0).cuda()
            outputs = model(input_ids=input_id, attention_mask=adj, labels=label)
            logits = outputs["logits"]
            logits = logits.detach().cpu().numpy()
            pred += list(np.argmax(logits, axis=1))
            fw.write(target_id+"\t"+str(pred[-1])+"\t"+source_id+"\n")
    fw = open("test.txt","w")
    pred = []
    with torch.no_grad():
        for i,x in enumerate(X_test):
            if i%2500 == 0:
                print(i)
                print("len")
                print(len(pred))
            left_citated_text = x["left_citated_text"]
            right_citated_text = x["right_citated_text"]
            target_id = x["target_id"]
            source_id = x["source_id"]
            left_tokenized = tokenizer.convert_tokens_to_ids(tokenizer.tokenize(left_citated_text))[-50:]
            right_tokenized = tokenizer.convert_tokens_to_ids(tokenizer.tokenize(right_citated_text))[:50]
            xlen = len(left_tokenized+right_tokenized)
            input_id = torch.tensor(left_tokenized+right_tokenized+[0]*(512-xlen)).unsqueeze(0).cuda()
            word_pad = 512-xlen
            adj = torch.ones(xlen, xlen, dtype=torch.int)
            adj = torch.cat((adj,torch.ones(word_pad,adj.shape[1],dtype=torch.int)),dim=0)
            adj = torch.cat((adj,torch.zeros(512,word_pad,dtype=torch.int)),dim=1)
            adj = adj.unsqueeze(0).cuda()
            label = torch.tensor([1]).unsqueeze(0).cuda()
            outputs = model(input_ids=input_id, attention_mask=adj, labels=label)
            logits = outputs["logits"]
            logits = logits.detach().cpu().numpy()
            pred += list(np.argmax(logits, axis=1))
            fw.write(target_id+"\t"+str(pred[-1])+"\t"+source_id+"\n")
    """
    # NOTE(review): the """ below opens a triple-quoted block that is not
    # closed within this view — confirm it is terminated later in the file.
    """