Example #1
# imports inferred from usage in this snippet
import pandas as pd
import torch
from sklearn.model_selection import train_test_split, KFold
from transformers import DistilBertTokenizerFast


def get_compiled_dataset(path: str, type: str = "split", test_size: float = 0.1,
                         allowed_labelers: list = ['Vasily', 'Aydar'],
                         allowed_labels: list = ['Access control', 'Confidentiality', 'Availability', 'Integrity',
                                                 'Operational', 'Accountability']):
    """

    :param path: path to dataset file
    :param type (optional): either 'split' or '10-fold'; 'split' splits the data into a single train/test pair, '10-fold' splits it into 10 train/test folds.
    :param test_size (optional): how big test dataset is, used only if type is 'split'
    :param allowed_labelers (optional): samples from which labelers are allowed
    :param allowed_labels (optional): samples with which labels are allowed
    :return: This function returns three objects:
        - train – train dataset, or a list of 10 train datasets if ``type`` is '10-fold'
        - test – test dataset, or a list of 10 test datasets if ``type`` is '10-fold'
        - encode_dict – dictionary which maps label names to label ids
    """

    def read_csv_dataset(path):
        df = pd.read_csv(path, sep=',')
        df = df[['Requirement', 'Context (Keywords)', 'Name of Doc', 'Label', 'Comments.1', 'Labeled by.1']]
        df.columns = ['text', 'context', 'doc', 'label', 'comments', 'labeler']

        # filter labels
        df = df[(df['label'].isin(allowed_labels))]

        # filter labelers
        df = df[(df['labeler'].isin(allowed_labelers))]

        encode_dict = {}

        def encode_cat(x):
            if x not in encode_dict.keys():
                encode_dict[x] = len(encode_dict)
            return encode_dict[x]

        encoded_labels = [encode_cat(label) for label in df['label'].values]
        df = df.assign(encoded_label=pd.Series(encoded_labels, index=df.index))

        return df, encode_dict

    df, encode_dict = read_csv_dataset(path)

    device = torch.device('cuda') if torch.cuda.is_available() else torch.device('cpu')

    class OwnDataset(torch.utils.data.Dataset):
        def __init__(self, df, encoder):
            self.df = df
            self.encoder = encoder
            self.encodings = encoder(list(self.df['text'].values), truncation=True, padding=True)
            self.labels = df['encoded_label'].values

        def __getitem__(self, idx):
            item = {key: torch.tensor(val[idx]).to(device) for key, val in self.encodings.items()}
            item['labels'] = torch.tensor(self.labels[idx]).to(device)
            return item

        def __len__(self):
            return len(self.labels)

    tokenizer = DistilBertTokenizerFast.from_pretrained('distilbert-base-uncased')

    if type == "split":
        train, test = train_test_split(df, test_size=test_size, random_state=42)
        train_dataset = OwnDataset(train, tokenizer)
        test_dataset = OwnDataset(test, tokenizer)
    elif type == "10-fold":
        kfold = KFold(n_splits=10, shuffle=True, random_state=42)
        train_dataset, test_dataset = [], []
        for train_index, val_index in kfold.split(df):
            train_df = df.iloc[train_index]
            val_df = df.iloc[val_index]
            train_dataset.append(OwnDataset(train_df, tokenizer))
            test_dataset.append(OwnDataset(val_df, tokenizer))
    else:
        raise ValueError("type must be either 'split' or '10-fold'")

    return train_dataset, test_dataset, encode_dict
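
# Usage sketch (hedged): "requirements.csv" is a hypothetical path; with the default
# type="split" the function returns one train/test pair, with type="10-fold" it
# returns lists of 10 folds.
if __name__ == "__main__":
    train_ds, test_ds, label2id = get_compiled_dataset("requirements.csv", type="split", test_size=0.1)
    print(len(train_ds), len(test_ds), label2id)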
Example #2
from transformers import DistilBertTokenizerFast, DistilBertForMaskedLM

# Load the tokenizer
#tokenizer = DistilBertTokenizerFast.from_pretrained("distilabena-base-v2-akuapem-twi-cased", max_len=512) # the one we trained ourselves (akuapem)
tokenizer = DistilBertTokenizerFast.from_pretrained("distilabena-base-v2-akuapem-twi-cased", max_len=512, do_lower_case=True) # the one we trained ourselves (asante, lowercase everything)
#tokenizer = DistilBertTokenizerFast.from_pretrained("distilbert-base-multilingual-cased") # you could also use pre-trained DistilmBERT tokenizer
#tokenizer = DistilBertTokenizerFast.from_pretrained("distilbert-base-multilingual-cased", do_lower_case=True) # for asante, lowercase pretrained tokenizer
#tokenizer.save_vocabulary("distilabena-base-akuapem-twi-cased") # when using pretrained tokenizer, be sure to save it locally
tokenizer.save_vocabulary("distilabena-base-v2-asante-twi-uncased") # saving pretrained tokenizer locally in case of asante 

# Load DistilBERT multilingual base checkpoint
#model = DistilBertForMaskedLM.from_pretrained("distilbert-base-multilingual-cased") # pretrained DistilmBERT weights
model = DistilBertForMaskedLM.from_pretrained("distilabena-base-v2-akuapem-twi-cased") # in the case of Asante Twi, start with Akuapem model weights
print("Number of parameters in the model:")
print(model.num_parameters())

# Create dataset object for JW300 dataset (Akuapem) or Asante Twi Bible 
from transformers import LineByLineTextDataset
dataset = LineByLineTextDataset(
    tokenizer=tokenizer,
#    file_path="../../data/jw300.en-tw.tw", # stage 1 - akuapem
    file_path="../../data/asante_twi_bible.txt", # stage 2 - asante
    block_size=128,
)

# Create "data collator" from dataset and tokenizer - with 15% chance of masking
from transformers import DataCollatorForLanguageModeling
data_collator = DataCollatorForLanguageModeling(tokenizer=tokenizer, mlm=True, mlm_probability=0.15)

# Define training arguments
from transformers import Trainer, TrainingArguments
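
# The snippet ends here; below is a hedged continuation sketch. The output directory
# and hyperparameter values are illustrative assumptions, not taken from the original.
training_args = TrainingArguments(
    output_dir="distilabena-base-v2-asante-twi-uncased",
    overwrite_output_dir=True,
    num_train_epochs=3,
    per_device_train_batch_size=16,
    save_steps=10_000,
    save_total_limit=1,
)

trainer = Trainer(
    model=model,
    args=training_args,
    data_collator=data_collator,
    train_dataset=dataset,
)

trainer.train()
trainer.save_model("distilabena-base-v2-asante-twi-uncased")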
Example #3
# imports inferred from usage in this snippet
import pandas as pd
import torch
from sklearn.model_selection import train_test_split
from transformers import DistilBertTokenizerFast

aita_data = pd.read_csv('aita_clean.csv')
aita_data_trimmed = aita_data[['body','is_asshole']].copy()

print("Dataframe size before dropping empty rows is: "+str(aita_data_trimmed.size))
aita_data_trimmed = aita_data_trimmed[aita_data_trimmed['body'].astype(str).map(len) > 50]
print("Dataframe size after dropping empty rows is: " +str(aita_data_trimmed.size))

aita_trimmed_texts = list(aita_data_trimmed['body'])
aita_trimmed_labels = list(aita_data_trimmed['is_asshole'])

train_texts, val_texts, train_labels, val_labels = train_test_split(aita_trimmed_texts, aita_trimmed_labels, test_size=.2)

#print(aita_data_train['body'].astype(str).apply(lambda x:len(x)).max())

print("Generating tokens...")
tokenizer = DistilBertTokenizerFast.from_pretrained('distilbert-base-uncased')

train_encodings = tokenizer(train_texts, truncation=True, padding=True)
val_encodings = tokenizer(val_texts, truncation=True, padding=True)
print("Tokens generated. Constructing dataset...")
class AITADataset(torch.utils.data.Dataset):
    def __init__(self, encodings, labels):
        self.encodings = encodings
        self.labels = labels

    def __getitem__(self, idx):
        item = {key: torch.tensor(val[idx]) for key, val in self.encodings.items()}
        item['labels'] = torch.tensor(self.labels[idx])
        return item

    def __len__(self):
        return len(self.labels)
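
# Continuation sketch (hedged): wrap the encodings in the dataset class and fine-tune
# a binary classifier with the Trainer API. The output directory and hyperparameters
# below are illustrative assumptions, not part of the original snippet.
from transformers import DistilBertForSequenceClassification, Trainer, TrainingArguments

train_dataset = AITADataset(train_encodings, train_labels)
val_dataset = AITADataset(val_encodings, val_labels)

model = DistilBertForSequenceClassification.from_pretrained('distilbert-base-uncased', num_labels=2)

training_args = TrainingArguments(output_dir='./aita_model',
                                  num_train_epochs=1,
                                  per_device_train_batch_size=8)
trainer = Trainer(model=model,
                  args=training_args,
                  train_dataset=train_dataset,
                  eval_dataset=val_dataset)
trainer.train()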
Example #4
def main():
    # define parser and arguments
    args = get_train_test_args()

    util.set_seed(args.seed)
    model = DistilBertForQuestionAnswering.from_pretrained(
        "distilbert-base-uncased")
    tokenizer = DistilBertTokenizerFast.from_pretrained(
        'distilbert-base-uncased')

    if args.do_train:
        if not os.path.exists(args.save_dir):
            os.makedirs(args.save_dir)
        args.save_dir = util.get_save_dir(args.save_dir, args.run_name)
        log = util.get_logger(args.save_dir, 'log_train')
        log.info(f'Args: {json.dumps(vars(args), indent=4, sort_keys=True)}')
        log.info("Preparing Training Data...")
        args.device = torch.device(
            'cuda') if torch.cuda.is_available() else torch.device('cpu')
        trainer = Trainer(args, log)
        train_dataset, _ = get_dataset(args, args.train_datasets,
                                       args.train_dir, tokenizer, 'train')
        log.info("Preparing Validation Data...")
        val_dataset, val_dict = get_dataset(args, args.train_datasets,
                                            args.val_dir, tokenizer, 'val')
        train_loader = DataLoader(train_dataset,
                                  batch_size=args.batch_size,
                                  sampler=RandomSampler(train_dataset))
        val_loader = DataLoader(val_dataset,
                                batch_size=args.batch_size,
                                sampler=SequentialSampler(val_dataset))
        if (args.checkpoint != 'none'):
            checkpoint_path = os.path.join(args.checkpoint, 'checkpoint')
            model = DistilBertForQuestionAnswering.from_pretrained(
                checkpoint_path)
        best_scores = trainer.train(model, train_loader, val_loader, val_dict)
    if args.do_eval:
        args.device = torch.device(
            'cuda') if torch.cuda.is_available() else torch.device('cpu')
        split_name = 'test' if 'test' in args.eval_dir else 'validation'
        log = util.get_logger(args.save_dir, f'log_{split_name}')
        trainer = Trainer(args, log)
        checkpoint_path = os.path.join(args.save_dir, 'checkpoint')
        model = DistilBertForQuestionAnswering.from_pretrained(checkpoint_path)
        model.to(args.device)
        eval_dataset, eval_dict = get_dataset(args, args.eval_datasets,
                                              args.eval_dir, tokenizer,
                                              split_name)
        eval_loader = DataLoader(eval_dataset,
                                 batch_size=args.batch_size,
                                 sampler=SequentialSampler(eval_dataset))
        eval_preds, eval_scores = trainer.evaluate(model,
                                                   eval_loader,
                                                   eval_dict,
                                                   return_preds=True,
                                                   split=split_name)
        results_str = ', '.join(f'{k}: {v:05.2f}'
                                for k, v in eval_scores.items())
        log.info(f'Eval {results_str}')
        # Write submission file
        sub_path = os.path.join(args.save_dir,
                                split_name + '_' + args.sub_file)
        log.info(f'Writing submission file to {sub_path}...')
        with open(sub_path, 'w', newline='', encoding='utf-8') as csv_fh:
            csv_writer = csv.writer(csv_fh, delimiter=',')
            csv_writer.writerow(['Id', 'Predicted'])
            for uuid in sorted(eval_preds):
                csv_writer.writerow([uuid, eval_preds[uuid]])
def init_process(local_rank, backend, hparams, logger):
    os.environ['MASTER_ADDR'] = '127.0.0.1'
    os.environ['MASTER_PORT'] = '29500'
    dist.init_process_group(backend,
                            rank=local_rank,
                            world_size=hparams.num_gpus)
    torch.cuda.set_device(local_rank)
    torch.backends.cudnn.benchmark = True

    if local_rank != 0:
        logger.setLevel(logging.WARNING)

    if local_rank == 0:
        writer = SummaryWriter()

        if not os.path.exists("save"):
            os.mkdir("save")
        save_path = "save/model_{}.pt".format(
            re.sub("\s+", "_", time.asctime()))

    random.seed(hparams.seed)
    reader = Reader(hparams)
    start = time.time()
    logger.info("Loading data...")
    reader.load_data("train")
    end = time.time()
    logger.info("Loaded. {} secs".format(end - start))

    tokenizer = DistilBertTokenizerFast.from_pretrained(
        "distilbert-base-uncased")

    lr = hparams.lr

    model = Dial(hparams).cuda()
    optimizer = Adam(model.parameters(), lr)
    model = torch.nn.parallel.DistributedDataParallel(
        model,
        device_ids=[local_rank],
        output_device=local_rank,
        find_unused_parameters=True)

    # load saved model, optimizer
    if hparams.save_path is not None:
        load(model, optimizer, hparams.save_path)
        dist.barrier()

    train.max_iter = len(list(reader.make_batch(reader.train)))
    validate.max_iter = len(list(reader.make_batch(reader.dev)))
    train.warmup_steps = train.max_iter * hparams.max_epochs * hparams.warmup_steps

    train.global_step = 0
    max_joint_acc = 0
    early_stop_count = hparams.early_stop_count

    for epoch in range(hparams.max_epochs):
        logger.info("Train...")
        start = time.time()
        if local_rank == 0:
            train(model, reader, optimizer, writer, hparams, tokenizer,
                  local_rank)
        else:
            train(model, reader, optimizer, None, hparams, tokenizer,
                  local_rank)
        end = time.time()
        logger.info("epoch: {}, {:.4f} secs".format(epoch + 1, end - start))

        logger.info("Validate...")
        loss, joint_acc, slot_acc = validate(model, reader, hparams, tokenizer,
                                             local_rank)
        logger.info(
            "loss: {:.4f}, joint accuracy: {:.4f}, slot accuracy: {:.4f}".
            format(loss, joint_acc, slot_acc))
        if local_rank == 0:
            writer.add_scalar("Val/loss", loss, epoch + 1)
            writer.add_scalar("Val/joint_acc", joint_acc, epoch + 1)
            writer.add_scalar("Val/slot_acc", slot_acc, epoch + 1)

        if joint_acc > max_joint_acc:  # save model
            if local_rank == 0:
                save(model, optimizer, save_path)
                logger.info("Saved to {}.".format(os.path.abspath(save_path)))
            dist.barrier()  # synchronize
            max_joint_acc = joint_acc
            early_stop_count = hparams.early_stop_count
        else:  # early stopping
            if early_stop_count == 0:
                logger.info("Early stopped.")
                break
            elif early_stop_count == 2:
                lr = lr / 2
                logger.info("learning rate schedule: {}".format(lr))
                for param in optimizer.param_groups:
                    param["lr"] = lr
            early_stop_count -= 1
            logger.info("early stop count: {}".format(early_stop_count))
    logger.info("Training finished.")
def tag():
    parser = ArgumentParser()
    parser.add_argument('text', type=str, help='Text to tag')
    parser.add_argument('--name',
                        type=str,
                        help='Name of the model to use',
                        default='best')

    args = parser.parse_args()

    # ########## P R E P A R E   D A T A ##########
    tokenizer = DistilBertTokenizerFast.from_pretrained(
        'distilbert-base-uncased')
    # Pad to 512. All sentences in the dataset have a lower number of tokens.
    tokenized = tokenizer(args.text,
                          padding='max_length',
                          max_length=512,
                          return_attention_mask=True,
                          return_special_tokens_mask=True,
                          return_offsets_mapping=True,
                          return_token_type_ids=False)

    token_ids = torch.tensor(tokenized['input_ids'],
                             dtype=torch.long).unsqueeze(0)
    offsets = torch.tensor(tokenized['offset_mapping'],
                           dtype=torch.long).unsqueeze(0)
    att_masks = torch.tensor(tokenized['attention_mask'],
                             dtype=torch.long).unsqueeze(0)
    special_masks = torch.tensor(tokenized['special_tokens_mask'],
                                 dtype=torch.long).unsqueeze(0)
    special_masks = special_masks.logical_not()

    # ########## P R E P A R E   M O D E L (S) ##########
    weights_path = os.path.join('weights', args.name)

    checkpoint_names = []
    for file in os.listdir(weights_path):
        if file.endswith('.ckpt'):
            checkpoint_names.append(os.path.join(weights_path, file))

    # ########## O B T A I N   P R E D I C T I O N S ##########
    predicted_spans = []
    for i, checkpoint in enumerate(checkpoint_names):
        model = MultiDepthDistilBertModel.load_from_checkpoint(
            checkpoint_path=checkpoint)

        logits = model(token_ids, att_masks)
        preds = torch.argmax(logits, -1)
        predicted_spans.append(preds2spans(preds, special_masks, offsets))

    if len(checkpoint_names) == 1:
        predicted_spans = predicted_spans[0]
    else:
        predicted_spans = compute_ensemble_predictions(predicted_spans)

    # ########## G E N E R A T E   O U T P U T ##########
    predicted_spans = predicted_spans[0]
    text = ''
    inside = False
    for i, char in enumerate(args.text):
        if not inside and i in predicted_spans:
            text += '['
            inside = True
        elif inside and i not in predicted_spans:
            text += ']'
            inside = False
        text += char
    if inside:
        text += ']'

    print(f"Input text  --> {args.text}")
    print(f"Tagged text --> {text}")
def main():
    parser = argparse.ArgumentParser()
    parser.add_argument("--exp", type=int, default=2)
    parser.add_argument("--save", type=str, default="./model1_best_base.pt")
    args = parser.parse_args()

    # Data and Tokenization
    device = "cuda" if torch.cuda.is_available() else "cpu"
    tokenizer = DistilBertTokenizerFast.from_pretrained(
        "distilbert-base-uncased")

    batch_size = 4
    train_dataset = TorchDataset(
        file_name="./data/diverse.triplets.train.tsv",
        queries_path="./data/diverse.queries.all.tsv",
        passages_path="./data/diverse.passages.all.tsv",
    )
    train_dataloader = DataLoader(train_dataset,
                                  batch_size=batch_size,
                                  shuffle=True)

    dev_dataset = TorchDataset(
        file_name="./data/diverse.triplets.dev.tsv",
        queries_path="./data/diverse.queries.all.tsv",
        passages_path="./data/diverse.passages.all.tsv",
    )
    dev_dataloader = DataLoader(dev_dataset,
                                batch_size=batch_size,
                                shuffle=False)

    # Model Training and Evaluation
    NUM_EPOCHS = 1
    LEARNING_RATE = 0.00003

    # load model
    model = DistilBertForSequenceClassification.from_pretrained(
        "distilbert-base-uncased")
    if args.exp == 3:
        # model.load_state_dict(torch.load(model_path+"model1_best.pt"))
        model_frozen = copy.deepcopy(model)
        for param in model_frozen.distilbert.parameters():
            param.requires_grad = False
        model = model_frozen

    model.to(device)
    model.train()
    if args.exp < 3:
        optimizer = torch.optim.Adam(model.parameters(), lr=LEARNING_RATE)
    elif args.exp == 3:
        optimizer = torch.optim.Adam(model.distilbert.parameters(),
                                     lr=LEARNING_RATE)

    def evaluate(inputs, model, tokenizer, labels):
        encodings = tokenizer(
            inputs,
            return_tensors="pt",
            truncation=True,
            padding=True,
            max_length=512,
        )
        ids, masks = encodings["input_ids"], encodings["attention_mask"]
        outputs = model(ids.to(device),
                        masks.to(device),
                        labels=labels.to(device))

        return outputs

    dataloader = train_dataloader
    N = len(dataloader)
    lowest_loss = float("inf")
    start = time.time()
    learning_curve_y = []
    learning_curve_x = []

    for epoch in range(NUM_EPOCHS):
        epoch_loss = 0
        for i, (queries, pos_docs, neg_docs) in enumerate(dataloader):
            if args.exp != 1:
                optimizer.zero_grad()  # set gradient to zero

                queries = list(queries) * 2  # 2*B
                docs = list(pos_docs) + list(neg_docs)

                labels = torch.cat(
                    [torch.ones(len(pos_docs)),
                     torch.zeros(len(neg_docs))]).long().to(device)  # 2*batch,

                outputs = evaluate(
                    inputs=list(zip(queries, docs)),
                    model=model,
                    tokenizer=tokenizer,
                    labels=labels,
                )

                loss = outputs.loss
                loss.backward()
                optimizer.step()

                epoch_loss += float(loss)

                if i % 10 == 0:
                    elapsed_time = time.time() - start
                    remaining_time = elapsed_time * (1 / (i + 1) * N - 1)
                    print(
                        f"{i}: remaining time: {remaining_time:.1f} | est. epoch loss: {epoch_loss / (i + 1):.4f}"
                    )

            if i % 10 == 0:
                with torch.no_grad():
                    correct = total = 0
                    val_start = time.time()
                    for dq, dp, dn in dev_dataloader:
                        queries = list(dq) * 2  # 2*B
                        docs = list(dp) + list(dn)
                        labels = torch.cat(
                            [torch.ones(len(dp)),
                             torch.zeros(len(dn))]).long().to(device)
                        outputs = evaluate(inputs=list(zip(queries, docs)),
                                           model=model,
                                           tokenizer=tokenizer,
                                           labels=labels)

                        predicted_classes = outputs.logits.argmax(dim=-1)
                        correct += (labels == predicted_classes).sum()

                        total += len(labels)
                        if time.time() - val_start > 15:
                            break
                    print(
                        f"{i}: est. validation accuracy: {correct / total:.4f}"
                    )
                    learning_curve_y.append(correct / total)
                    learning_curve_x.append(i * batch_size)  # epoch normally

            if (epoch_loss / (i + 1)) < lowest_loss:
                if args.exp == 1:
                    torch.save(model.state_dict(), "model1_best_pretrain.pt")
                elif args.exp == 2:
                    torch.save(model.state_dict(), "model1_best_base.pt")
                elif args.exp == 3:
                    torch.save(model.state_dict(), "model1_best_freeze.pt")
                lowest_loss = epoch_loss / (i + 1)

        print(f"loss for epoch {epoch} is {epoch_loss}")
        generate_data_for_plot(learning_curve_y, learning_curve_x)
    def run(self):

        torch.cuda.empty_cache()

        # load raw data and preprocess it
        self.generate_data()

        # generate save model directory
        self.generate_model_directory()

        if self.tokenizer_type != '':
            # generate corpus by Okt konlpy
            # self.generate_custom_morphs(self.list_memo)

            # generate tokenizer model
            self.generate_custom_vocab()

        tokenizer = None
        if self.tokenizer_type == '':
            # base tokenizer
            tokenizer = DistilBertTokenizer.from_pretrained("distilbert-base-uncased",
                                                            lowercase=True,
                                                            strip_accents=False,
                                                            local_files_only=False)
        else:
            # word piece tokenizer
            tokenizer = DistilBertTokenizerFast.from_pretrained(self.vocab_root_dir + self.vocab_dir,
                                                                strip_accents=False,
                                                                lowercase=True)

        self.setPrint('Load Custom Vocab size : {}'.format(tokenizer.vocab_size))
        # tokenizer Loading check
        # tokenized_input_for_pytorch = tokenizer_for_load("i am very happy now", return_tensors="pt")
        # encoded_text = tokenizer("전화 통화가 정상적으로 안됨", return_tensors="pt")
        # self.setPrint("Tokens Text List: {}".format(
        #     [tokenizer.convert_ids_to_tokens(s) for s in encoded_text['input_ids'].tolist()[0]]))
        # self.setPrint("Tokens IDX  List: {}".format(encoded_text['input_ids'].tolist()[0]))
        # self.setPrint("Tokens Mask List: {}".format(encoded_text['attention_mask'].tolist()[0]))

        # transformed train data
        encoded_data_train = tokenizer.batch_encode_plus(
            self.Train_Data_X,
            add_special_tokens=True,
            return_attention_mask=True,
            # padding='longest',
            padding=True,
            max_length=256,
            return_tensors='pt',
            truncation=True
        )
        # transformed validation data
        encoded_data_val = tokenizer.batch_encode_plus(
            self.Test_Data_X,
            add_special_tokens=True,
            return_attention_mask=True,
            # padding='longest',
            padding=True,
            max_length=256,
            return_tensors='pt',
            truncation=True
        )

        input_ids_train = encoded_data_train['input_ids']
        attention_masks_train = encoded_data_train['attention_mask']
        labels_train = torch.tensor(self.Train_Data_Y)

        input_ids_test = encoded_data_val['input_ids']
        attention_masks_test = encoded_data_val['attention_mask']
        labels_test = torch.tensor(self.Test_Data_Y)

        dataset_train = TensorDataset(input_ids_train, attention_masks_train, labels_train)
        dataset_test = TensorDataset(input_ids_test, attention_masks_test, labels_test)

        # local_files_only = True
        self.model = DistilBertForSequenceClassification.from_pretrained("distilbert-base-uncased",
                                                                         num_labels=len(self.label_index),
                                                                         output_attentions=False,
                                                                         output_hidden_states=False,
                                                                         local_files_only=False).to(self.device)

        # dataLoader
        dataloader_train = DataLoader(dataset_train,
                                      sampler=RandomSampler(dataset_train),
                                      batch_size=self.batch_size,
                                      drop_last=True)

        dataloader_test = DataLoader(dataset_test,
                                     sampler=RandomSampler(dataset_test),
                                     batch_size=self.batch_size)

        optimizer = AdamW(self.model.parameters(), lr=self.learning_rate, eps=1e-8)
        scheduler = get_linear_schedule_with_warmup(optimizer=optimizer,
                                                    num_warmup_steps=0,
                                                    num_training_steps=len(dataloader_train) * self.epoch)

        # scheduler = get_cosine_with_hard_restarts_schedule_with_warmup(optimizer=optimizer,
        #                                                                num_warmup_steps=0,
        #                                                                num_training_steps=len(dataloader_train) * self.epoch)
        # for loss f1 graph
        total_train_loss = np.array([0.0000] * self.epoch)
        total_val_loss = np.array([0.0000] * self.epoch)
        total_score = np.array([0.0000] * self.epoch)

        # Training start
        for epoch in range(1, self.epoch + 1):
            self.setPrint('Start of Epoch {}'.format(epoch))
            self.model.train()
            loss_train_total = 0

            for idx, batch in enumerate(dataloader_train):
                self.model.zero_grad()
                batch = tuple(b.to(self.device) for b in batch)
                inputs = {'input_ids': batch[0],
                          'attention_mask': batch[1],
                          'labels': batch[2],
                          }
                outputs = self.model(**inputs)
                loss = outputs[0]
                loss_train_total += loss.item()
                loss.backward()
                torch.nn.utils.clip_grad_norm_(self.model.parameters(), 1.0)
                optimizer.step()
                scheduler.step()

                if idx % 100 == 0:
                    self.setPrint('[{}]Epoch {}/{} training_loss : {:.4f}'.format(epoch, idx, len(dataloader_train),
                                                                                  loss.item() / len(batch)))
                # gpu memory reset
                batch = None
                torch.cuda.empty_cache()

            # model save
            torch.save(self.model.state_dict(),
                       self.model_root_dir + self.model_dir + 'BERT_dict_epoch_{}.model'.format(epoch))
            self.setPrint('Save fine_tuned_BERT_epoch_{}.model'.format(epoch))
            self.setPrint('\nEnd of Epoch {}'.format(epoch))

            loss_train_avg = loss_train_total / len(dataloader_train)
            self.setPrint('[{}] Epoch Training loss: {:.4f}'.format(epoch, loss_train_avg))
            total_train_loss[epoch - 1] = round(loss_train_avg, 4)

            val_loss, predictions, true_vals = self.evaluate(dataloader_test)
            val_f1 = self.f1_score_func(predictions, true_vals)

            total_score[epoch - 1] = round(val_f1, 4)
            total_val_loss[epoch - 1] = round(val_loss, 4)

            self.setPrint('[{}] Validation loss: {:.4f}'.format(epoch, val_loss))
            self.setPrint('[{}] F1 Score : {:.4f}'.format(epoch, val_f1))

        # generate graph
        self.generate_graph(total_train_loss, total_val_loss, total_score)