Example #1
def preprocess() -> argparse.Namespace:
    """
    Preprocessing before training: parse arguments, seed the RNGs, prepare the
    save path, and initialize fitlog, the visual logger, and the device.
    :return: parsed config args
    """
    print('preprocessing starts...\n')
    # ====== parse arguments ====== #
    args = parse_args()
    # ====== set random seed ====== #
    random.seed(args.seed)
    np.random.seed(args.seed)
    torch.manual_seed(args.seed)
    torch.cuda.manual_seed_all(args.seed)
    # ====== save path ====== #
    now_time = datetime.datetime.now().strftime('%Y-%m-%d-%H-%M-%S')
    args.save_path = os.path.join('./logs/', 'my_log-' + now_time)
    if not os.path.exists(args.save_path) and not args.debug:
        os.makedirs(args.save_path)
    # ====== fitlog init ====== #
    fitlog.commit(__file__)
    fitlog.debug(args.debug)
    fitlog.add_hyper(args)
    # ====== tb VisualLogger init ====== #
    args.visual_logger = VisualLogger(
        args.save_path) if not args.debug else None
    # ====== cuda enable ====== #
    os.environ['CUDA_VISIBLE_DEVICES'] = str(args.gpuid)
    args.device = torch.device(
        'cuda' if args.cuda and torch.cuda.is_available() else 'cpu')
    # ====== others ====== #
    os.environ['TOKENIZERS_PARALLELISM'] = 'false'
    torch.set_num_threads(6)
    print(args, end='\n\n')
    return args
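# The snippet above relies on a project-specific parse_args(); below is a minimal
# sketch of what it might look like. The flag names are illustrative assumptions
# and only cover the attributes preprocess() actually reads.
import argparse


def parse_args() -> argparse.Namespace:
    parser = argparse.ArgumentParser()
    parser.add_argument('--seed', type=int, default=42)
    parser.add_argument('--gpuid', type=int, default=0)
    parser.add_argument('--cuda', action='store_true')
    parser.add_argument('--debug', action='store_true')
    return parser.parse_args()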
def after_parse_t2g(C, need_logger=False):

	#----- make logger -----

	logger = Logger(C.log_file)
	logger.log = logger.log_print_w_time
	if C.no_log:
		logger.log = logger.nolog

	C.tmp_file_name = random_tmp_name()

	#----- other stuff -----

	if C.auto_hyperparam:
		auto_hyperparam(C)
		logger.log("Hyper parameters autoset.")

	if C.no_fitlog:
		fitlog.debug()

	fitlog.set_log_dir("logs")
	fitlog.add_hyper(C)

	logger.log ("------------------------------------------------------")
	logger.log (pformat(C.__dict__))
	logger.log ("------------------------------------------------------")

	C.gpus = list(range(tc.cuda.device_count()))


	#----- initialize -----

	if C.t2g_seed > 0:
		random.seed(C.t2g_seed)
		tc.manual_seed(C.t2g_seed)
		np.random.seed(C.t2g_seed)
		tc.cuda.manual_seed_all(C.t2g_seed)
		tc.backends.cudnn.deterministic = True
		tc.backends.cudnn.benchmark = False

		logger.log ("Seed set. %d" % (C.t2g_seed))

	tc.cuda.set_device(C.gpus[0])
	C.device = C.gpus[0]

	if need_logger:
		return C , logger

	return C
def auto_hyperparam(C):
	if C.dataset == "ace_2005":
		C.ensemble 		= 1 
		C.no_rel_name	= "NO_RELATION" 
		C.gnn  			= True
		C.matrix_trans  = True
		C.train_text_1	= "./data/ace_2005/ace_05_processed/ace-05-splits/json-pm13/bn+nw.json"
		C.valid_text	= "./data/ace_2005/ace_05_processed/ace-05-splits/json-pm13/bc_dev.json"
		C.test_text		= "./data/ace_2005/ace_05_processed/ace-05-splits/json-pm13/bc_test.json"
		C.dataset		= "ace_2005"
		C.gene_in_data 	= True
		C.valid_metric	= "macro"
		C.scheduler		= "cosine"
		C.no_valid 		= True
		C.loss 			= "loss_1"
		C.t2g_batch_size= 8 
		C.t2g_lr 		= 5e-5 
		C.no_rel_weight = 0.25 
		C.epoch_numb 	= 30  
		C.warmup_prop 	= 0.02
		C.model_save 	= "model_ace.pkl"
	elif C.dataset == "semeval_2018_task7":
		C.ensemble 		= 5 
		C.epoch_numb	= 30 
		C.no_rel_name 	= "NONE" 
		C.matrix_trans  = True
		C.gnn 			= True
		C.valid_text 	= "./data/semeval_2018_task7/2.test.text.xml"
		C.valid_rels 	= "./data/semeval_2018_task7/keys.test.2.txt"
		C.loss 			= "loss_2"
		C.no_valid 		= True
		C.warmup_prop 	= 0.1 
		C.scheduler 	= "cosine"
		C.t2g_batch_size= 8 
		C.t2g_lr 		= 1e-4
		C.model_save 	= "model_semeval.pkl"
	C.no_fitlog = True
	fitlog.debug()
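# A hedged usage sketch for after_parse_t2g(): it expects a config object C that
# exposes at least the attributes touched above, plus the original project's
# module-level imports (Logger, random_tmp_name, fitlog, pformat, and tc,
# presumably torch). The values below are placeholders; the call is left
# commented out because it writes fitlog state and requires a visible CUDA device.
import argparse

example_C = argparse.Namespace(
    log_file="t2g.log", no_log=False, auto_hyperparam=True, no_fitlog=True,
    t2g_seed=2333, dataset="semeval_2018_task7")
# example_C, example_logger = after_parse_t2g(example_C, need_logger=True)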
import fitlog

use_fitlog = False
if not use_fitlog:
    fitlog.debug()
fitlog.set_log_dir('logs')
load_dataset_seed = 100
fitlog.add_hyper(load_dataset_seed, 'load_dataset_seed')
fitlog.set_rng_seed(load_dataset_seed)

import sys

sys.path.append('../')

import argparse
from fastNLP.core import Trainer
from fastNLP.core import Callback
from fastNLP import LossInForward
from fastNLP.core.metrics import SpanFPreRecMetric, AccuracyMetric
from fastNLP.core.callback import WarmupCallback, GradientClipCallback, EarlyStopCallback, FitlogCallback
from fastNLP import LRScheduler
from fastNLP import logger

import torch
import torch.optim as optim
import torch.nn as nn
from torch.optim.lr_scheduler import LambdaLR

import collections

from load_data import *
Example #5
def train():
    args = parse_args()
    if args.debug:
        fitlog.debug()
        args.save_model = False
    # ================= define =================
    tokenizer = RobertaTokenizer.from_pretrained('roberta-base')
    word_mask_index = tokenizer.mask_token_id
    word_vocab_size = len(tokenizer)

    if get_local_rank() == 0:
        fitlog.set_log_dir(args.log_dir)
        fitlog.commit(__file__, fit_msg=args.name)
        fitlog.add_hyper_in_file(__file__)
        fitlog.add_hyper(args)

    # ================= load data =================
    dist.init_process_group('nccl')
    init_logger_dist()

    n_proc = dist.get_world_size()
    bsz = args.batch_size // args.grad_accumulation // n_proc
    args.local_rank = get_local_rank()
    args.save_dir = os.path.join(args.save_dir,
                                 args.name) if args.save_model else None
    if args.save_dir is not None and os.path.exists(args.save_dir):
        raise RuntimeError('save_dir already exists.')
    logger.info('save directory: {}'.format(
        'None' if args.save_dir is None else args.save_dir))
    devices = list(range(torch.cuda.device_count()))
    NUM_WORKERS = 4

    ent_vocab, rel_vocab = load_ent_rel_vocabs()
    logger.info('# entities: {}'.format(len(ent_vocab)))
    logger.info('# relations: {}'.format(len(rel_vocab)))
    ent_freq = get_ent_freq()
    assert len(ent_vocab) == len(ent_freq), '{} {}'.format(
        len(ent_vocab), len(ent_freq))

    #####
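    # For each subdirectory of args.data_dir, record the file with the largest
    # numeric name; those files are excluded from the training file list below.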
    root = args.data_dir
    dirs = os.listdir(root)
    drop_files = []
    for dir in dirs:
        path = os.path.join(root, dir)
        max_idx = 0
        for file_name in os.listdir(path):
            if 'large' in file_name:
                continue
            max_idx = int(file_name) if int(file_name) > max_idx else max_idx
        drop_files.append(os.path.join(path, str(max_idx)))
    #####

    file_list = []
    for path, _, filenames in os.walk(args.data_dir):
        for filename in filenames:
            file = os.path.join(path, filename)
            if 'large' in file or file in drop_files:
                continue
            file_list.append(file)
    logger.info('used {} files in {}.'.format(len(file_list), args.data_dir))
    if args.data_prop > 1:
        used_files = file_list[:int(args.data_prop)]
    else:
        used_files = file_list[:round(args.data_prop * len(file_list))]

    data = GraphOTFDataSet(used_files, n_proc, args.local_rank,
                           word_mask_index, word_vocab_size, args.n_negs,
                           ent_vocab, rel_vocab, ent_freq)
    dev_data = GraphDataSet(used_files[0], word_mask_index, word_vocab_size,
                            args.n_negs, ent_vocab, rel_vocab, ent_freq)

    sampler = OTFDistributedSampler(used_files, n_proc, get_local_rank())
    train_data_iter = TorchLoaderIter(dataset=data,
                                      batch_size=bsz,
                                      sampler=sampler,
                                      num_workers=NUM_WORKERS,
                                      collate_fn=data.collate_fn)
    dev_data_iter = TorchLoaderIter(dataset=dev_data,
                                    batch_size=bsz,
                                    sampler=RandomSampler(),
                                    num_workers=NUM_WORKERS,
                                    collate_fn=dev_data.collate_fn)
    if args.test_data is not None:
        test_data = FewRelDevDataSet(path=args.test_data,
                                     label_vocab=rel_vocab,
                                     ent_vocab=ent_vocab)
        test_data_iter = TorchLoaderIter(dataset=test_data,
                                         batch_size=32,
                                         sampler=RandomSampler(),
                                         num_workers=NUM_WORKERS,
                                         collate_fn=test_data.collate_fn)

    if args.local_rank == 0:
        print('full wiki files: {}'.format(len(file_list)))
        print('used wiki files: {}'.format(len(used_files)))
        print('# of trained samples: {}'.format(len(data) * n_proc))
        print('# of trained entities: {}'.format(len(ent_vocab)))
        print('# of trained relations: {}'.format(len(rel_vocab)))

    # ================= prepare model =================
    logger.info('model init')
    if args.rel_emb is not None:  # load pretrained relation embeddings
        rel_emb = np.load(args.rel_emb)
        # add_embs = np.random.randn(3, rel_emb.shape[1])  # add <pad>, <mask>, <unk>
        # rel_emb = np.r_[add_embs, rel_emb]
        rel_emb = torch.from_numpy(rel_emb).float()
        assert rel_emb.shape[0] == len(rel_vocab), '{} {}'.format(
            rel_emb.shape[0], len(rel_vocab))
        # assert rel_emb.shape[1] == args.rel_dim
        logger.info('loaded pretrained relation embeddings. dim: {}'.format(
            rel_emb.shape[1]))
    else:
        rel_emb = None
    if args.model_name is not None:
        logger.info('further pre-train.')
        config = RobertaConfig.from_pretrained('roberta-base',
                                               type_vocab_size=3)
        model = CoLAKE(config=config,
                       num_ent=len(ent_vocab),
                       num_rel=len(rel_vocab),
                       ent_dim=args.ent_dim,
                       rel_dim=args.rel_dim,
                       ent_lr=args.ent_lr,
                       ip_config=args.ip_config,
                       rel_emb=None,
                       emb_name=args.emb_name)
        states_dict = torch.load(args.model_name)
        model.load_state_dict(states_dict, strict=True)
    else:
        model = CoLAKE.from_pretrained(
            'roberta-base',
            num_ent=len(ent_vocab),
            num_rel=len(rel_vocab),
            ent_lr=args.ent_lr,
            ip_config=args.ip_config,
            rel_emb=rel_emb,
            emb_name=args.emb_name,
            cache_dir=PYTORCH_PRETRAINED_BERT_CACHE /
            'dist_{}'.format(args.local_rank))
        model.extend_type_embedding(token_type=3)
    # if args.local_rank == 0:
    #     for name, param in model.named_parameters():
    #         if param.requires_grad is True:
    #             print('{}: {}'.format(name, param.shape))

    # ================= train model =================
    # lr=1e-4 for peak value, lr=5e-5 for initial value
    logger.info('trainer init')
    no_decay = [
        'bias', 'LayerNorm.bias', 'LayerNorm.weight', 'layer_norm.bias',
        'layer_norm.weight'
    ]
    param_optimizer = list(model.named_parameters())
    optimizer_grouped_parameters = [
        {
            'params': [p for n, p in param_optimizer
                       if not any(nd in n for nd in no_decay)],
            'weight_decay': 0.01
        },
        {
            'params': [p for n, p in param_optimizer
                       if any(nd in n for nd in no_decay)],
            'weight_decay': 0.0
        },
    ]
    word_acc = WordMLMAccuracy(pred='word_pred',
                               target='masked_lm_labels',
                               seq_len='word_seq_len')
    ent_acc = EntityMLMAccuracy(pred='entity_pred',
                                target='ent_masked_lm_labels',
                                seq_len='ent_seq_len')
    rel_acc = RelationMLMAccuracy(pred='relation_pred',
                                  target='rel_masked_lm_labels',
                                  seq_len='rel_seq_len')
    metrics = [word_acc, ent_acc, rel_acc]

    if args.test_data is not None:
        test_metric = [rel_acc]
        tester = Tester(data=test_data_iter,
                        model=model,
                        metrics=test_metric,
                        device=list(range(torch.cuda.device_count())))
        # tester.test()
    else:
        tester = None

    optimizer = optim.AdamW(optimizer_grouped_parameters,
                            lr=args.lr,
                            betas=(0.9, args.beta),
                            eps=1e-6)
    # warmup_callback = WarmupCallback(warmup=args.warm_up, schedule='linear')
    fitlog_callback = MyFitlogCallback(tester=tester,
                                       log_loss_every=100,
                                       verbose=1)
    gradient_clip_callback = GradientClipCallback(clip_value=1,
                                                  clip_type='norm')
    emb_callback = EmbUpdateCallback(model.ent_embeddings)
    all_callbacks = [gradient_clip_callback, emb_callback]
    if args.save_dir is None:
        master_callbacks = [fitlog_callback]
    else:
        save_callback = SaveModelCallback(args.save_dir,
                                          model.ent_embeddings,
                                          only_params=True)
        master_callbacks = [fitlog_callback, save_callback]

    if args.do_test:
        states_dict = torch.load(os.path.join(args.save_dir,
                                              args.model_name)).state_dict()
        model.load_state_dict(states_dict)
        data_iter = TorchLoaderIter(dataset=data,
                                    batch_size=args.batch_size,
                                    sampler=RandomSampler(),
                                    num_workers=NUM_WORKERS,
                                    collate_fn=data.collate_fn)
        tester = Tester(data=data_iter,
                        model=model,
                        metrics=metrics,
                        device=devices)
        tester.test()
    else:
        trainer = DistTrainer(train_data=train_data_iter,
                              dev_data=dev_data_iter,
                              model=model,
                              optimizer=optimizer,
                              loss=LossInForward(),
                              batch_size_per_gpu=bsz,
                              update_every=args.grad_accumulation,
                              n_epochs=args.epoch,
                              metrics=metrics,
                              callbacks_master=master_callbacks,
                              callbacks_all=all_callbacks,
                              validate_every=5000,
                              use_tqdm=True,
                              fp16='O1' if args.fp16 else '')
        trainer.train(load_best_model=False)
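# A small arithmetic sketch (with assumed values) of how the per-GPU batch size
# computed in train() relates to the effective global batch size:
batch_size = 2048        # hypothetical --batch_size
grad_accumulation = 16   # hypothetical --grad_accumulation
n_proc = 4               # hypothetical number of distributed processes (GPUs)
bsz = batch_size // grad_accumulation // n_proc  # per-GPU batch per step = 32
effective = bsz * grad_accumulation * n_proc     # restored after accumulation
assert effective == batch_size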
Example #6
def main():
    args = parse_args()

    if args.debug:
        fitlog.debug()

    fitlog.set_log_dir(args.log_dir)
    fitlog.commit(__file__)
    fitlog.add_hyper_in_file(__file__)
    fitlog.add_hyper(args)
    if args.gpu != 'all':
        os.environ['CUDA_VISIBLE_DEVICES'] = args.gpu

    train_set, dev_set, test_set, temp_ent_vocab = load_fewrel_graph_data(
        data_dir=args.data_dir)

    print('data directory: {}'.format(args.data_dir))
    print('# of train samples: {}'.format(len(train_set)))
    print('# of dev samples: {}'.format(len(dev_set)))
    print('# of test samples: {}'.format(len(test_set)))

    ent_vocab, rel_vocab = load_ent_rel_vocabs(path='../')

    # load entity embeddings
    ent_index = []
    for k, v in temp_ent_vocab.items():
        ent_index.append(ent_vocab[k])
    ent_index = torch.tensor(ent_index)
    ent_emb = np.load(os.path.join(args.model_path, 'entities.npy'))
    ent_embedding = nn.Embedding.from_pretrained(torch.from_numpy(ent_emb))
    ent_emb = ent_embedding(ent_index.view(1, -1)).squeeze().detach()

    # load CoLAKE parameters
    config = RobertaConfig.from_pretrained('roberta-base', type_vocab_size=3)
    model = CoLAKEForRE(config,
                        num_types=len(train_set.label_vocab),
                        ent_emb=ent_emb)
    states_dict = torch.load(os.path.join(args.model_path, 'model.bin'))
    model.load_state_dict(states_dict, strict=False)
    print('parameters below are randomly initialized:')
    for name, param in model.named_parameters():
        if name not in states_dict:
            print(name)

    # tie relation classification head
    rel_index = []
    for k, v in train_set.label_vocab.items():
        rel_index.append(rel_vocab[k])
    rel_index = torch.LongTensor(rel_index)
    rel_embeddings = nn.Embedding.from_pretrained(
        states_dict['rel_embeddings.weight'])
    rel_index = rel_index.cuda()
    rel_cls_weight = rel_embeddings(rel_index.view(1, -1)).squeeze()
    model.tie_rel_weights(rel_cls_weight)

    model.rel_head.dense.weight.data = states_dict['rel_lm_head.dense.weight']
    model.rel_head.dense.bias.data = states_dict['rel_lm_head.dense.bias']
    model.rel_head.layer_norm.weight.data = states_dict[
        'rel_lm_head.layer_norm.weight']
    model.rel_head.layer_norm.bias.data = states_dict[
        'rel_lm_head.layer_norm.bias']

    model.resize_token_embeddings(
        len(RobertaTokenizer.from_pretrained('roberta-base')) + 4)
    print('parameters of CoLAKE have been loaded.')

    # fine-tune
    no_decay = ['bias', 'LayerNorm.bias', 'LayerNorm.weight', 'embedding']
    param_optimizer = list(model.named_parameters())
    optimizer_grouped_parameters = [
        {
            'params': [p for n, p in param_optimizer
                       if not any(nd in n for nd in no_decay)],
            'weight_decay': args.weight_decay
        },
        {
            'params': [p for n, p in param_optimizer
                       if any(nd in n for nd in no_decay)],
            'weight_decay': 0.0
        },
    ]
    optimizer = optim.AdamW(optimizer_grouped_parameters,
                            lr=args.lr,
                            betas=(0.9, args.beta),
                            eps=1e-6)

    metrics = [MacroMetric(pred='pred', target='target')]

    test_data_iter = TorchLoaderIter(dataset=test_set,
                                     batch_size=args.batch_size,
                                     sampler=RandomSampler(),
                                     num_workers=4,
                                     collate_fn=test_set.collate_fn)
    devices = list(range(torch.cuda.device_count()))
    tester = Tester(data=test_data_iter,
                    model=model,
                    metrics=metrics,
                    device=devices)
    # tester.test()

    fitlog_callback = FitlogCallback(tester=tester,
                                     log_loss_every=100,
                                     verbose=1)
    gradient_clip_callback = GradientClipCallback(clip_value=1,
                                                  clip_type='norm')
    warmup_callback = WarmupCallback(warmup=args.warm_up, schedule='linear')

    bsz = args.batch_size // args.grad_accumulation

    train_data_iter = TorchLoaderIter(dataset=train_set,
                                      batch_size=bsz,
                                      sampler=RandomSampler(),
                                      num_workers=4,
                                      collate_fn=train_set.collate_fn)
    dev_data_iter = TorchLoaderIter(dataset=dev_set,
                                    batch_size=bsz,
                                    sampler=RandomSampler(),
                                    num_workers=4,
                                    collate_fn=dev_set.collate_fn)

    trainer = Trainer(
        train_data=train_data_iter,
        dev_data=dev_data_iter,
        model=model,
        optimizer=optimizer,
        loss=LossInForward(),
        batch_size=bsz,
        update_every=args.grad_accumulation,
        n_epochs=args.epoch,
        metrics=metrics,
        callbacks=[fitlog_callback, gradient_clip_callback, warmup_callback],
        device=devices,
        use_tqdm=True)

    trainer.train(load_best_model=False)
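# A small illustration (with made-up sizes) of the weight-tying step in main()
# above: rows of a pretrained relation embedding matrix are selected by label id
# and reused as the relation classification head weight.
import torch
import torch.nn as nn

pretrained = torch.randn(50, 16)          # stands in for states_dict['rel_embeddings.weight']
rel_embeddings = nn.Embedding.from_pretrained(pretrained)
rel_index = torch.LongTensor([3, 7, 11])  # hypothetical label ids used in fine-tuning
rel_cls_weight = rel_embeddings(rel_index.view(1, -1)).squeeze()  # shape (3, 16)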
Example #7
def main():
    args = parse_args()
    batch_size = 12

    if args.debug:
        fitlog.debug()
    if args.gpu != 'all':
        os.environ['CUDA_VISIBLE_DEVICES'] = args.gpu

    # load entity embeddings
    # TODO: initialize with SPECTER
    #train_set, test_set, ent_vocab = load_AASC_graph_data(args.data_dir,args.frequency,args.WINDOW_SIZE,args.MAX_LEN,args.pretrained_model)
    #num_ent = len(ent_vocab)

    # load parameters
    """
    if args.pretrained_model == "scibert":
        model = PTBCN.from_pretrained('../pretrainedmodel/scibert_scivocab_uncased',num_ent=len(ent_vocab),MAX_LEN=args.MAX_LEN)
    else:
        model = PTBCN.from_pretrained('bert-base-uncased',num_ent=len(ent_vocab),MAX_LEN=args.MAX_LEN)
    model.change_type_embeddings()
    print('parameters of SciBERT has been loaded.')

    # fine-tune
    no_decay = ['bias', 'LayerNorm.bias', 'LayerNorm.weight', 'layer_norm.bias', 'layer_norm.weight']
    param_optimizer = list(model.named_parameters())
    optimizer_grouped_parameters = [
        {'params': [p for n, p in param_optimizer if not any(nd in n for nd in no_decay)],
         'weight_decay': args.weight_decay},
        {'params': [p for n, p in param_optimizer if any(nd in n for nd in no_decay)], 'weight_decay': 0.0}
    ]
    optimizer = optim.AdamW(optimizer_grouped_parameters, lr=args.lr, betas=(0.9, args.beta), eps=1e-6)
    optimizer = torch.optim.AdamW(model.parameters(), lr=args.lr)

    metrics = [MacroMetric(pred='pred', target='target')]
    devices = list(range(torch.cuda.device_count()))
    if torch.cuda.is_available():
        print("GPU OK")
    else:
        print("GPU NO")

    gradient_clip_callback = GradientClipCallback(clip_value=1, clip_type='norm')
    warmup_callback = WarmupCallback(warmup=args.warm_up, schedule='linear')
    """
    bsz = args.batch_size // args.grad_accumulation
    if args.data_dir[-1] == "/":
        data_dir_modelname = os.path.basename(args.data_dir[:-1])
    else:
        data_dir_modelname = os.path.basename(args.data_dir)
    X_train, y_train, X_test, y_test = load_data_intent_identification()
    ydict = {}
    for i in y_train + y_test:
        if i not in ydict:
            ydict[i] = 1
        else:
            ydict[i] += 1
    print(ydict)
    i = 0
    l = 0
    tokenizer = BertTokenizer.from_pretrained(
        '../../pretrainedmodel/scibert_scivocab_uncased', do_lower_case=True)
    for x in X_train + X_test:
        l += len(tokenizer.tokenize(x["text"]))
        i += 1
    print(l / i)
    """
    l = [i for i in range(len(X))]
    random.shuffle(l)
    for epoch in range(5):
        if epoch == 0:
            X_test = [X[i] for i in l[:len(l)//5]]
            y_test = [y[i] for i in l[:len(l)//5]]
            X_train = [X[i] for i in l[len(l)//5:]]
            y_train = [y[i] for i in l[len(l)//5:]]
        elif epoch == 4:
            X_test = [X[i] for i in l[len(l)*epoch//5:]]
            y_test = [y[i] for i in l[len(l)*epoch//5:]]
            X_train = [X[i] for i in l[:len(l)*epoch//5]]
            y_train = [y[i] for i in l[:len(l)*epoch//5]]
        else:
            X_test = [X[i] for i in l[len(l)*epoch//5:len(l)*(epoch+1)//5]]
            y_test = [y[i] for i in l[len(l)*epoch//5:len(l)*(epoch+1)//5]]
            X_train = [X[i] for i in l[:len(l)*epoch//5]+l[len(l)*(epoch+1)//5:]]
            y_train = [y[i] for i in l[:len(l)*epoch//5]+l[len(l)*(epoch+1)//5:]]
        #X_train, y_train = oversampling(X_train, y_train)
    """
Example #8
def main():
    args = parse_args()
    batch_size = 12

    if args.debug:
        fitlog.debug()
    if args.gpu != 'all':
        os.environ['CUDA_VISIBLE_DEVICES'] = args.gpu

    # load entity embeddings
    # TODO: initialize with SPECTER
    #train_set, test_set, ent_vocab = load_AASC_graph_data(args.data_dir,args.frequency,args.WINDOW_SIZE,args.MAX_LEN,args.pretrained_model)
    #num_ent = len(ent_vocab)

    # load parameters
    """
    if args.pretrained_model == "scibert":
        model = PTBCN.from_pretrained('../pretrainedmodel/scibert_scivocab_uncased',num_ent=len(ent_vocab),MAX_LEN=args.MAX_LEN)
    else:
        model = PTBCN.from_pretrained('bert-base-uncased',num_ent=len(ent_vocab),MAX_LEN=args.MAX_LEN)
    model.change_type_embeddings()
    print('parameters of SciBERT has been loaded.')

    # fine-tune
    no_decay = ['bias', 'LayerNorm.bias', 'LayerNorm.weight', 'layer_norm.bias', 'layer_norm.weight']
    param_optimizer = list(model.named_parameters())
    optimizer_grouped_parameters = [
        {'params': [p for n, p in param_optimizer if not any(nd in n for nd in no_decay)],
         'weight_decay': args.weight_decay},
        {'params': [p for n, p in param_optimizer if any(nd in n for nd in no_decay)], 'weight_decay': 0.0}
    ]
    optimizer = optim.AdamW(optimizer_grouped_parameters, lr=args.lr, betas=(0.9, args.beta), eps=1e-6)
    optimizer = torch.optim.AdamW(model.parameters(), lr=args.lr)

    metrics = [MacroMetric(pred='pred', target='target')]
    devices = list(range(torch.cuda.device_count()))
    if torch.cuda.is_available():
        print("GPU OK")
    else:
        print("GPU NO")

    gradient_clip_callback = GradientClipCallback(clip_value=1, clip_type='norm')
    warmup_callback = WarmupCallback(warmup=args.warm_up, schedule='linear')
    """
    bsz = args.batch_size // args.grad_accumulation
    if args.data_dir[-1] == "/":
        data_dir_modelname = os.path.basename(args.data_dir[:-1])
    else:
        data_dir_modelname = os.path.basename(args.data_dir)
    X_train, y_train, X_test, y_test, intentdict = load_data_intent_identification()
    ydict = {}
    for i in y_train + y_test:
        if i not in ydict:
            ydict[i] = 1
        else:
            ydict[i] += 1
    print(ydict)
    """
    l = [i for i in range(len(X))]
    random.shuffle(l)
    for epoch in range(5):
        if epoch == 0:
            X_test = [X[i] for i in l[:len(l)//5]]
            y_test = [y[i] for i in l[:len(l)//5]]
            X_train = [X[i] for i in l[len(l)//5:]]
            y_train = [y[i] for i in l[len(l)//5:]]
        elif epoch == 4:
            X_test = [X[i] for i in l[len(l)*epoch//5:]]
            y_test = [y[i] for i in l[len(l)*epoch//5:]]
            X_train = [X[i] for i in l[:len(l)*epoch//5]]
            y_train = [y[i] for i in l[:len(l)*epoch//5]]
        else:
            X_test = [X[i] for i in l[len(l)*epoch//5:len(l)*(epoch+1)//5]]
            y_test = [y[i] for i in l[len(l)*epoch//5:len(l)*(epoch+1)//5]]
            X_train = [X[i] for i in l[:len(l)*epoch//5]+l[len(l)*(epoch+1)//5:]]
            y_train = [y[i] for i in l[:len(l)*epoch//5]+l[len(l)*(epoch+1)//5:]]
        #X_train, y_train = oversampling(X_train, y_train)
    """
    print(collections.Counter(y_train))
    print(intentdict)
    tokenizer = BertTokenizer.from_pretrained(
        '../../pretrainedmodel/scibert_scivocab_uncased', do_lower_case=True)
    epochs = 50
    input_ids = []
    attention_masks = []
    num_label = max(y_train + y_test) + 1
    for x, y1 in zip(X_train, y_train):
        text = x["text"]
        """
        left_citation_tokenized = tokenizer.convert_tokens_to_ids(tokenizer.tokenize(x["left_citated_text"]))
        right_citation_tokenized = tokenizer.convert_tokens_to_ids(tokenizer.tokenize(x["right_citated_text"]))
        """
        text_tokenized = tokenizer.convert_tokens_to_ids(
            tokenizer.tokenize(text))
        xlen = len(text_tokenized[:512])
        #xlen = len(left_citation_tokenized[-256:])+len(right_citation_tokenized[:256])
        word_pad = 512 - xlen
        tokenized_ids = text_tokenized[:512] + [0] * (512 - xlen)
        #tokenized_ids = left_citation_tokenized[-256:] + right_citation_tokenized[:256] + [0]*(512-xlen)
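        # Build a (512, 512) attention mask whose columns for real tokens are 1
        # and whose padding columns are 0 (every row is identical): start from an
        # all-ones (xlen, xlen) block, pad the rows with ones, then the columns with zeros.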
        adj = torch.ones(xlen, xlen, dtype=torch.int)
        adj = torch.cat(
            (adj, torch.ones(word_pad, adj.shape[1], dtype=torch.int)), dim=0)
        adj = torch.cat((adj, torch.zeros(512, word_pad, dtype=torch.int)),
                        dim=1)
        # Add the encoded sentence to the list.
        input_ids.append(tokenized_ids)
        # And its attention mask (simply differentiates padding from non-padding).
        attention_masks.append(adj)
    print("load train done")
    # Convert the lists into tensors.
    input_ids = torch.tensor(input_ids)
    attention_masks = torch.stack(attention_masks, dim=0)
    labels = torch.tensor(y_train)
    train_dataset = TensorDataset(input_ids, attention_masks, labels)
    input_ids = []
    attention_masks = []
    for x, y1 in zip(X_test, y_test):
        text = x["text"]
        text_tokenized = tokenizer.convert_tokens_to_ids(
            tokenizer.tokenize(text))
        xlen = len(text_tokenized[:512])
        #xlen = len(left_citation_tokenized[-256:])+len(right_citation_tokenized[:256])
        word_pad = 512 - xlen
        tokenized_ids = text_tokenized[:512] + [0] * (512 - xlen)
        """
        left_citation_tokenized = tokenizer.convert_tokens_to_ids(tokenizer.tokenize(x["left_citated_text"]))
        right_citation_tokenized = tokenizer.convert_tokens_to_ids(tokenizer.tokenize(x["right_citated_text"]))
        xlen = len(left_citation_tokenized[-256:])+len(right_citation_tokenized[:256])
        word_pad = 512-xlen
        tokenized_ids = left_citation_tokenized[-256:] + right_citation_tokenized[:256] + [0]*(512-xlen)
        """
        adj = torch.ones(xlen, xlen, dtype=torch.int)
        adj = torch.cat(
            (adj, torch.ones(word_pad, adj.shape[1], dtype=torch.int)), dim=0)
        adj = torch.cat((adj, torch.zeros(512, word_pad, dtype=torch.int)),
                        dim=1)
        # Add the encoded sentence to the list.
        input_ids.append(tokenized_ids)
        # And its attention mask (simply differentiates padding from non-padding).
        attention_masks.append(adj)
    print("load test done")
    # Convert the lists into tensors.
    input_ids = torch.tensor(input_ids)
    attention_masks = torch.stack(attention_masks, dim=0)
    labels = torch.tensor(y_test)
    test_dataset = TensorDataset(input_ids, attention_masks, labels)
    train_dataloader = DataLoader(
        train_dataset,  # The training samples.
        sampler=RandomSampler(train_dataset),  # Select batches randomly
        batch_size=batch_size  # Trains with this batch size.
    )

    test_dataloader = DataLoader(
        test_dataset,  # The test samples.
        sampler=None,  # Iterate sequentially (no sampler).
        batch_size=1  # Evaluate with batch size 1.
    )
    total_steps = len(train_dataloader) * epochs
    model = BertForSequenceClassification.from_pretrained(
        "../../pretrainedmodel/scibert_scivocab_uncased",  # 12-layer SciBERT with an uncased vocab.
        num_labels=num_label,  # Number of output labels, derived from the data above.
        output_attentions=False,  # Whether the model returns attention weights.
        output_hidden_states=False,  # Whether the model returns all hidden states.
    )
    optimizer = AdamW(
        model.parameters(),
        lr=5e-6,  # fine-tuning learning rate
        eps=1e-8  # Adam epsilon
    )
    scheduler = get_linear_schedule_with_warmup(
        optimizer,
        num_warmup_steps=0,  # Default value in run_glue.py
        num_training_steps=total_steps)
    """
    model.cuda()
    model.load_state_dict(torch.load("../../model/scibert_intentclassification.bin"))
    model.cuda()
    X_train,X_test = load_data_AASC()
    fw = open("train.txt","w")
    pred = []
    with torch.no_grad():
        for i,x in enumerate(X_train):
            if i%2500 == 0:
                print(i)
                print("len")
                print(len(pred))
            left_citated_text = x["left_citated_text"]
            right_citated_text = x["right_citated_text"]
            target_id = x["target_id"]
            source_id = x["source_id"]
            left_tokenized = tokenizer.convert_tokens_to_ids(tokenizer.tokenize(left_citated_text))[-50:]
            right_tokenized = tokenizer.convert_tokens_to_ids(tokenizer.tokenize(right_citated_text))[:50]
            xlen = len(left_tokenized+right_tokenized)
            input_id = torch.tensor(left_tokenized+right_tokenized+[0]*(512-xlen)).unsqueeze(0).cuda()
            word_pad = 512-xlen
            adj = torch.ones(xlen, xlen, dtype=torch.int)
            adj = torch.cat((adj,torch.ones(word_pad,adj.shape[1],dtype=torch.int)),dim=0)
            adj = torch.cat((adj,torch.zeros(512,word_pad,dtype=torch.int)),dim=1)
            adj = adj.unsqueeze(0).cuda()
            label = torch.tensor([1]).unsqueeze(0).cuda()
            outputs = model(input_ids=input_id,
                             attention_mask=adj,
                             labels=label)
            logits = outputs["logits"]
            logits = logits.detach().cpu().numpy()
            pred += list(np.argmax(logits, axis=1))
            fw.write(target_id+"\t"+str(pred[-1])+"\t"+source_id+"\n")
    fw = open("test.txt","w")
    pred = []
    with torch.no_grad():
        for i,x in enumerate(X_test):
            if i%2500 == 0:
                print(i)
                print("len")
                print(len(pred))
            left_citated_text = x["left_citated_text"]
            right_citated_text = x["right_citated_text"]
            target_id = x["target_id"]
            source_id = x["source_id"]
            left_tokenized = tokenizer.convert_tokens_to_ids(tokenizer.tokenize(left_citated_text))[-50:]
            right_tokenized = tokenizer.convert_tokens_to_ids(tokenizer.tokenize(right_citated_text))[:50]
            xlen = len(left_tokenized+right_tokenized)
            input_id = torch.tensor(left_tokenized+right_tokenized+[0]*(512-xlen)).unsqueeze(0).cuda()
            word_pad = 512-xlen
            adj = torch.ones(xlen, xlen, dtype=torch.int)
            adj = torch.cat((adj,torch.ones(word_pad,adj.shape[1],dtype=torch.int)),dim=0)
            adj = torch.cat((adj,torch.zeros(512,word_pad,dtype=torch.int)),dim=1)
            adj = adj.unsqueeze(0).cuda()
            label = torch.tensor([1]).unsqueeze(0).cuda()
            outputs = model(input_ids=input_id,
                             attention_mask=adj,
                             labels=label)
            logits = outputs["logits"]
            logits = logits.detach().cpu().numpy()
            pred += list(np.argmax(logits, axis=1))
            fw.write(target_id+"\t"+str(pred[-1])+"\t"+source_id+"\n")
    """
    """