def main():
    NUM_TRAIN_DATA = 150000
    NUM_TEST_DATA = 5000
    MODEL_DIR = './electra_chinese_base'
    MAX_LEN = 512
    BATCH_SIZE = 8 * 2  # per-GPU batch size * number of GPUs
    LR = 1e-5
    NUM_LABELS = 33
    EPOCHS = 4

    # read data
    content, target = read_data('../../corpus/ettoday_2017.json')

    # train dataloader
    examples = DataProcessor().get_train_examples(
        content[:NUM_TRAIN_DATA], target[:NUM_TRAIN_DATA])
    train_dataset = convert_examples_to_features(
        examples,
        max_length=MAX_LEN,
        tokenizer=ElectraTokenizerFast.from_pretrained(MODEL_DIR))
    train_loader = DataLoader(
        train_dataset, shuffle=True, batch_size=BATCH_SIZE)

    # test dataloader
    examples = DataProcessor().get_test_examples(
        content[NUM_TRAIN_DATA:NUM_TRAIN_DATA + NUM_TEST_DATA],
        target[NUM_TRAIN_DATA:NUM_TRAIN_DATA + NUM_TEST_DATA])
    test_dataset = convert_examples_to_features(
        examples,
        max_length=MAX_LEN,
        tokenizer=ElectraTokenizerFast.from_pretrained(MODEL_DIR))
    test_loader = DataLoader(
        test_dataset, shuffle=False, batch_size=BATCH_SIZE)

    # start training, with evaluate() as the per-epoch callback
    train(train_loader, MODEL_DIR, num_labels=NUM_LABELS, lr=LR,
          epochs=EPOCHS, eval_callback=evaluate, test_loader=test_loader)
def init_electra():
    electra_max_len = 512
    electra_path = "electra_base_turkish_cased_discriminator/"
    electra_model_name = "dbmdz-electra-base-turkish-cased-discriminator_seqlen512_bacth64_epochs15/"
    electra_tokenizer = ElectraTokenizerFast.from_pretrained(
        electra_path, do_lower_case=False)
    electra_model_class = Model(electra_max_len, electra_path,
                                electra_model_name, electra_tokenizer,
                                "electra")
    print("2. ELECTRA LOADED")
    return electra_model_class
def getTokenizer(model_name):
    if 'roberta' in model_name:
        return RobertaTokenizerFast.from_pretrained(model_name, add_prefix_space=False)
    elif model_name.startswith('bert'):
        return BertTokenizerFast.from_pretrained(model_name, add_prefix_space=False)
    elif 'bart' in model_name:
        # BART reuses the RoBERTa tokenizer, see
        # https://github.com/huggingface/transformers/blob/68e19f1c228c92d5d800533f558faff24b57127a/src/transformers/tokenization_bart.py#L27
        return RobertaTokenizerFast.from_pretrained(
            'roberta-large', add_prefix_space=False)
    elif 'electra' in model_name:
        return ElectraTokenizerFast.from_pretrained(model_name, add_prefix_space=False)
    else:
        return AutoTokenizer.from_pretrained(model_name, add_prefix_space=False)
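# Usage sketch (added for illustration; not part of the original snippet).
# Any Hugging Face hub id works; these two are hypothetical choices that
# exercise the 'electra' and 'roberta' branches above.
electra_tok = getTokenizer('google/electra-base-discriminator')
roberta_tok = getTokenizer('roberta-base')
print(type(electra_tok).__name__)  # -> ElectraTokenizerFast
print(type(roberta_tok).__name__)  # -> RobertaTokenizerFast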
import pandas as pd


def read_data(file):
    df = pd.read_json(file)
    df = shuffle(df)
    content = (df['title'] + ' ' + df['content']).to_list()
    target = df['category'].to_list()
    return content, target


if __name__ == '__main__':
    NUM_TEST_DATA = 50016
    MODEL_DIR = './electra_chinese_base'
    MAX_LEN = 512
    BATCH_SIZE = 16 * 2  # per-GPU batch size (16) * number of GPUs (2)
    NUM_LABELS = 33

    os.environ["CUDA_VISIBLE_DEVICES"] = "0,1"
    content, target = read_data('../../corpus/ettoday_2017.json')
    examples = DataProcessor().get_test_examples(content[:NUM_TEST_DATA],
                                                 target[:NUM_TEST_DATA])
    test_dataset = convert_examples_to_features(
        examples,
        max_length=MAX_LEN,
        tokenizer=ElectraTokenizerFast.from_pretrained(MODEL_DIR))
    test_loader = DataLoader(test_dataset, shuffle=False,
                             batch_size=BATCH_SIZE)
    evaluate(test_loader, MODEL_DIR, 'step_18749.ckpt', NUM_LABELS)
    c.max_length = 128
elif c.size == "base":
    c.lr = 1e-4
    c.layer_lr_decay = 0.8
    c.max_length = 512
elif c.size == "large":
    c.lr = 5e-5
    c.layer_lr_decay = 0.9
    c.max_length = 512
else:
    raise ValueError(f"Invalid size {c.size}")
if c.pretrained_checkpoint is None:
    c.max_length = 512  # all public models are "++", which use max_length 512

# huggingface/transformers
hf_tokenizer = ElectraTokenizerFast.from_pretrained(
    f"google/electra-{c.size}-discriminator")
electra_config = ElectraConfig.from_pretrained(
    f"google/electra-{c.size}-discriminator")

# wsc
if c.wsc_trick:
    from _utils.wsc_trick import *  # importing the spaCy model takes time

# logging
# this lightweight logging callback only logs the last score, to avoid
# exceeding the API access limit
if c.logger == "neptune":
    import neptune
    from fastai.callback.neptune import NeptuneCallback

    class LightNeptuneCallback(NeptuneCallback):
        def after_batch(self):
from data_prepocessing import add_token_positions, add_end_idx, read_squad
from transformers import (AutoTokenizer, ElectraForQuestionAnswering,
                          ElectraConfig, AdamW, ElectraTokenizerFast)
from torch.utils.data import DataLoader
from types import SimpleNamespace
import argparse
import torch
import itertools
import os
import json
import subprocess
import matplotlib.pyplot as plt
from tqdm import tqdm
import time

tokenizer = ElectraTokenizerFast.from_pretrained('deepset/electra-base-squad2')
#device = torch.device('cuda') if torch.cuda.is_available() else torch.device('cpu')
device = torch.device('cpu')

EPOCHS = 3
BATCH_SIZE = 16
RESULTS_FOLDER = 'results'
LOSS_FILE = 'losses.json'
IMAGE_FOLDER = 'img'

# Build the argument parser
parser = argparse.ArgumentParser(
    description='Use me if you want to load and preprocess the custom SQuAD data')

# Configuration parameters
parser.add_argument('-lr',
def __init__(self) -> None:
    self.lists = {}

    # M-BERT
    from transformers import BertTokenizerFast, BertForMaskedLM
    self.bert_multilingual_tokenizer = BertTokenizerFast.from_pretrained(
        'bert-base-multilingual-cased')
    self.bert_multilingual_model = BertForMaskedLM.from_pretrained(
        'bert-base-multilingual-cased').eval()
    self.lists["M-BERT"] = {
        "Tokenizer": self.bert_multilingual_tokenizer,
        "Model": self.bert_multilingual_model
    }
    print("====================================")
    print("[BERT] Google Multilingual BERT loaded")
    print("====================================")

    # KR-BERT
    from transformers import BertTokenizerFast, BertForMaskedLM
    self.krbert_tokenizer = BertTokenizerFast.from_pretrained(
        'snunlp/KR-Medium')
    self.krbert_model = BertForMaskedLM.from_pretrained(
        'snunlp/KR-Medium').eval()
    self.lists["KR-Medium"] = {
        "Tokenizer": self.krbert_tokenizer,
        "Model": self.krbert_model
    }
    print("====================================")
    print("[BERT] KR-BERT loaded")
    print("====================================")

    # BERT-kor-base
    from transformers import BertTokenizerFast, BertForMaskedLM
    self.bert_kor_tokenizer = BertTokenizerFast.from_pretrained(
        'kykim/bert-kor-base')
    self.bert_kor_model = BertForMaskedLM.from_pretrained(
        'kykim/bert-kor-base').eval()
    self.lists["bert-kor-base"] = {
        "Tokenizer": self.bert_kor_tokenizer,
        "Model": self.bert_kor_model
    }
    print("====================================")
    print("[BERT] BERT-kor-base loaded")
    print("====================================")

    # ALBERT (this checkpoint uses a BERT WordPiece tokenizer)
    from transformers import AlbertForMaskedLM
    self.albert_tokenizer = BertTokenizerFast.from_pretrained(
        'kykim/albert-kor-base')
    self.albert_model = AlbertForMaskedLM.from_pretrained(
        'kykim/albert-kor-base').eval()
    self.lists["albert-kor-base"] = {
        "Tokenizer": self.albert_tokenizer,
        "Model": self.albert_model
    }
    print("====================================")
    print("[BERT] ALBERT-kor-base loaded")
    print("====================================")

    # XLM-RoBERTa
    from transformers import XLMRobertaTokenizerFast, XLMRobertaForMaskedLM
    self.xlmroberta_tokenizer = XLMRobertaTokenizerFast.from_pretrained(
        'xlm-roberta-base')
    self.xlmroberta_model = XLMRobertaForMaskedLM.from_pretrained(
        'xlm-roberta-base').eval()
    self.lists["xlm-roberta-base"] = {
        "Tokenizer": self.xlmroberta_tokenizer,
        "Model": self.xlmroberta_model
    }
    print("====================================")
    print("[BERT] XLM-RoBERTa loaded")
    print("====================================")

    # bertshared-kor-base (seq2seq)
    from transformers import BertTokenizerFast, EncoderDecoderModel
    self.tokenizer_bertshared = BertTokenizerFast.from_pretrained(
        "kykim/bertshared-kor-base")
    self.bertshared_model = EncoderDecoderModel.from_pretrained(
        "kykim/bertshared-kor-base")
    self.lists["bertshared-kor-base"] = {
        "Tokenizer": self.tokenizer_bertshared,
        "Model": self.bertshared_model
    }
    print("====================================")
    print("[Seq2seq + BERT] bertshared-kor-base loaded")
    print("====================================")

    # gpt3-kor-small_based_on_gpt2
    from transformers import BertTokenizerFast, GPT2LMHeadModel
    self.tokenizer_gpt3 = BertTokenizerFast.from_pretrained(
        "kykim/gpt3-kor-small_based_on_gpt2")
    self.model_gpt3 = GPT2LMHeadModel.from_pretrained(
        "kykim/gpt3-kor-small_based_on_gpt2")
    self.lists["gpt3-kor-small_based_on_gpt2"] = {
        "Tokenizer": self.tokenizer_gpt3,
        "Model": self.model_gpt3
    }
    print("====================================")
    print("[GPT3] gpt3-small-based-on-gpt2 loaded")
    print("====================================")

    # electra-base-kor
    from transformers import ElectraTokenizerFast, ElectraModel
    self.tokenizer_electra = ElectraTokenizerFast.from_pretrained(
        "kykim/electra-kor-base")
    self.electra_model = ElectraModel.from_pretrained(
        "kykim/electra-kor-base")
    self.lists["electra-kor-base"] = {
        "Tokenizer": self.tokenizer_electra,
        "Model": self.electra_model
    }
    print("====================================")
    print("[ELECTRA] electra-kor-base loaded")
    print("====================================")

    # KoELECTRA fine-tuned on KorQuAD (extractive QA)
    from transformers import ElectraTokenizerFast, ElectraForQuestionAnswering
    self.electra_tokenizer_QA = ElectraTokenizerFast.from_pretrained(
        "monologg/koelectra-base-v3-finetuned-korquad")
    self.electra_model_QA = ElectraForQuestionAnswering.from_pretrained(
        "monologg/koelectra-base-v3-finetuned-korquad")
    self.lists["electra-kor-QA"] = {
        "Tokenizer": self.electra_tokenizer_QA,
        "Model": self.electra_model_QA
    }
    print("====================================")
    print("[ELECTRA] koelectra-base-v3-finetuned-korquad loaded")
    print("====================================")
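# Usage sketch (added for illustration; not part of the original snippet).
# Assuming `registry` is an instance of the class above, the pair stored
# under "electra-kor-QA" supports a standard extractive-QA forward pass;
# `question` and `context` are arbitrary strings.
import torch

def answer(registry, question: str, context: str) -> str:
    tok = registry.lists["electra-kor-QA"]["Tokenizer"]
    model = registry.lists["electra-kor-QA"]["Model"]
    inputs = tok(question, context, return_tensors="pt", truncation=True)
    with torch.no_grad():
        out = model(**inputs)
    start = out.start_logits.argmax(dim=-1).item()
    end = out.end_logits.argmax(dim=-1).item()
    return tok.decode(inputs["input_ids"][0][start:end + 1])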
# Settings for the different sizes
i = ['small', 'base', 'large'].index(c.size)
c.mask_prob = [0.15, 0.15, 0.25][i]
c.lr = [5e-4, 2e-4, 2e-4][i]
c.bs = [128, 256, 2048][i]
c.steps = [10**6, 766*1000, 400*1000][i]
c.max_length = [128, 512, 512][i]
generator_size_divisor = [4, 3, 4][i]

disc_config = ElectraConfig.from_pretrained(f'google/electra-{c.size}-discriminator')
gen_config = ElectraConfig.from_pretrained(f'google/electra-{c.size}-generator')
# note that the public electra-small model is actually small++, which doesn't
# scale down the generator size
gen_config.hidden_size = int(disc_config.hidden_size / generator_size_divisor)
gen_config.num_attention_heads = int(disc_config.num_attention_heads / generator_size_divisor)
gen_config.intermediate_size = int(disc_config.intermediate_size / generator_size_divisor)
hf_tokenizer = ElectraTokenizerFast.from_pretrained(f"google/electra-{c.size}-generator")

# Paths to data
Path('./datasets').mkdir(exist_ok=True)
Path('./checkpoints/pretrain').mkdir(exist_ok=True, parents=True)
if c.size in ['small', 'base']:
    wiki_cache_dir = Path("./datasets/wikipedia/20200501.en/1.0.0")
    book_cache_dir = Path("./datasets/bookcorpus/plain_text/1.0.0")
    wbdl_cache_dir = Path("./datasets/wikibook_dl")
    wbdl_cache_dir.mkdir(exist_ok=True)

# Print info
print(f"process id: {os.getpid()}")
print(c)
print(hparam_update)
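# Sanity check (added for illustration; not part of the original snippet).
# For c.size == "base" the divisor 3 shrinks the generator to 1/3 of the
# discriminator: 768 -> 256 hidden, 12 -> 4 heads, 3072 -> 1024 intermediate.
assert gen_config.hidden_size == disc_config.hidden_size // generator_size_divisor
assert gen_config.num_attention_heads == disc_config.num_attention_heads // generator_size_divisor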
def main():
    parser = HfArgumentParser(
        (ModelArguments, DataTrainingArguments, TrainingArguments,
         LoggingArguments, PathArguments)
    )
    (
        model_args,
        data_args,
        train_args,
        log_args,
        path_args,
        remaining_strings,
    ) = parser.parse_args_into_dataclasses(return_remaining_strings=True)
    # SageMaker may have some extra strings. TODO: Test this on SM.
    assert len(remaining_strings) == 0, f"The args {remaining_strings} could not be parsed."

    hvd.init()
    gpus = tf.config.list_physical_devices("GPU")
    for gpu in gpus:
        tf.config.experimental.set_memory_growth(gpu, True)
    if gpus:
        tf.config.experimental.set_visible_devices(gpus[hvd.local_rank()], "GPU")
    if train_args.eager == "true":
        tf.config.experimental_run_functions_eagerly(True)

    # ELECTRA reuses the BERT WordPiece vocabulary
    tokenizer = ElectraTokenizerFast.from_pretrained("bert-base-uncased")

    gen_config = ElectraConfig.from_pretrained(f"google/electra-{model_args.model_size}-generator")
    dis_config = ElectraConfig.from_pretrained(
        f"google/electra-{model_args.model_size}-discriminator"
    )
    gen = TFElectraForMaskedLM(config=gen_config)
    dis = TFElectraForPreTraining(config=dis_config)
    optimizer = get_adamw_optimizer(train_args)

    # Tie the embedding weights of generator and discriminator
    if model_args.electra_tie_weights == "true":
        gen.electra.embeddings = dis.electra.embeddings

    loaded_optimizer_weights = None
    if model_args.load_from == "checkpoint":
        checkpoint_path = os.path.join(path_args.filesystem_prefix, model_args.checkpoint_path)
        dis_ckpt, gen_ckpt, optimizer_ckpt = get_checkpoint_paths_from_prefix(checkpoint_path)
        if hvd.rank() == 0:
            dis.load_weights(dis_ckpt)
            gen.load_weights(gen_ckpt)
            loaded_optimizer_weights = np.load(optimizer_ckpt, allow_pickle=True)

    start_time = time.perf_counter()

    if hvd.rank() == 0:
        # Logging should only happen on a single process
        # https://stackoverflow.com/questions/9321741/printing-to-screen-and-writing-to-a-file-at-the-same-time
        level = logging.INFO
        format = "%(asctime)-15s %(name)-12s: %(levelname)-8s %(message)s"
        handlers = [
            TqdmLoggingHandler(),
        ]
        summary_writer = None  # Only create a writer if we make it through a successful step
        logging.basicConfig(level=level, format=format, handlers=handlers)
        wandb_run_name = None

        current_time = datetime.datetime.now().strftime("%Y%m%d-%H%M%S")
        if log_args.run_name is None:
            metadata = (
                f"electra-{hvd.size()}gpus"
                f"-{train_args.per_gpu_batch_size * hvd.size() * train_args.gradient_accumulation_steps}globalbatch"
                f"-{train_args.total_steps}steps"
            )
            run_name = (
                f"{current_time}-{metadata}-{train_args.name if train_args.name else 'unnamed'}"
            )
        else:
            run_name = log_args.run_name

    logger.info(f"Training with dataset at {path_args.train_dir}")
    logger.info(f"Validating with dataset at {path_args.val_dir}")

    train_glob = os.path.join(path_args.filesystem_prefix, path_args.train_dir, "*.tfrecord*")
    validation_glob = os.path.join(path_args.filesystem_prefix, path_args.val_dir, "*.tfrecord*")

    train_filenames = glob.glob(train_glob)
    validation_filenames = glob.glob(validation_glob)
    logger.info(
        f"Number of train files {len(train_filenames)}, number of validation files {len(validation_filenames)}"
    )

    tf_train_dataset = get_dataset_from_tfrecords(
        model_type=model_args.model_type,
        filenames=train_filenames,
        per_gpu_batch_size=train_args.per_gpu_batch_size,
        max_seq_length=data_args.max_seq_length,
    )
    tf_train_dataset = tf_train_dataset.prefetch(buffer_size=8)

    if hvd.rank() == 0:
        tf_val_dataset = get_dataset_from_tfrecords(
            model_type=model_args.model_type,
            filenames=validation_filenames,
            per_gpu_batch_size=train_args.per_gpu_batch_size,
            max_seq_length=data_args.max_seq_length,
        )
        tf_val_dataset = tf_val_dataset.prefetch(buffer_size=8)

    wandb_run_name = None

    step = 1
    for batch in tf_train_dataset:
        learning_rate = optimizer.learning_rate(step=tf.constant(step, dtype=tf.float32))
        ids = batch["input_ids"]
        attention_mask = batch["attention_mask"]

        train_result = train_step(
            optimizer=optimizer,
            gen=gen,
            dis=dis,
            ids=ids,
            attention_mask=attention_mask,
            mask_token_id=tokenizer.mask_token_id,
        )

        if step == 1:
            # Horovod broadcast of initial weights (and a restored optimizer state, if any)
            if hvd.rank() == 0 and loaded_optimizer_weights is not None:
                optimizer.set_weights(loaded_optimizer_weights)
            hvd.broadcast_variables(gen.variables, root_rank=0)
            hvd.broadcast_variables(dis.variables, root_rank=0)
            hvd.broadcast_variables(optimizer.variables(), root_rank=0)
            step = optimizer.get_weights()[0]

        is_final_step = step >= train_args.total_steps
        if hvd.rank() == 0:
            do_log = step % log_args.log_frequency == 0
            do_checkpoint = (step > 1) and (
                (step % log_args.checkpoint_frequency == 0) or is_final_step
            )
            do_validation = step % log_args.validation_frequency == 0

            if do_log:
                elapsed_time = time.perf_counter() - start_time  # Off for first log
                it_s = log_args.log_frequency / elapsed_time
                start_time = time.perf_counter()
                description = (
                    f"Step {step} -- gen_loss: {train_result.gen_loss:.3f}, "
                    f"dis_loss: {train_result.dis_loss:.3f}, gen_acc: {train_result.gen_acc:.3f}, "
                    f"dis_acc: {train_result.dis_acc:.3f}, it/s: {it_s:.3f}\n"
                )
                logger.info(description)

            if do_validation:
                for batch in tf_val_dataset.take(1):
                    val_ids = batch["input_ids"]
                    val_attention_mask = batch["attention_mask"]
                    val_result = val_step(
                        gen=gen,
                        dis=dis,
                        ids=val_ids,
                        attention_mask=val_attention_mask,
                        mask_token_id=tokenizer.mask_token_id,
                    )
                    log_example(
                        tokenizer,
                        val_ids,
                        val_result.masked_ids,
                        val_result.corruption_mask,
                        val_result.gen_ids,
                        val_result.dis_preds,
                    )
                description = (
                    f"VALIDATION, Step {step} -- val_gen_loss: {val_result.gen_loss:.3f}, "
                    f"val_dis_loss: {val_result.dis_loss:.3f}, val_gen_acc: {val_result.gen_acc:.3f}, "
                    f"val_dis_acc: {val_result.dis_acc:.3f}\n"
                )
                logger.info(description)

            train_metrics = {
                "learning_rate": learning_rate,
                "train/loss": train_result.loss,
                "train/gen_loss": train_result.gen_loss,
                "train/dis_loss": train_result.dis_loss,
                "train/gen_acc": train_result.gen_acc,
                "train/dis_acc": train_result.dis_acc,
            }
            all_metrics = {**train_metrics}
            if do_validation:
                val_metrics = {
                    "val/loss": val_result.loss,
                    "val/gen_loss": val_result.gen_loss,
                    "val/dis_loss": val_result.dis_loss,
                    "val/gen_acc": val_result.gen_acc,
                    "val/dis_acc": val_result.dis_acc,
                }
                all_metrics = {**all_metrics, **val_metrics}
            if do_log:
                all_metrics = {"it_s": it_s, **all_metrics}

            if is_wandb_available():
                if wandb_run_name is None:
                    config = {
                        **asdict(model_args),
                        **asdict(data_args),
                        **asdict(train_args),
                        **asdict(log_args),
                        **asdict(path_args),
                        "global_batch_size": train_args.per_gpu_batch_size * hvd.size(),
                        "n_gpus": hvd.size(),
                    }
                    wandb.init(config=config, project="electra")
                    wandb.run.save()
                    wandb_run_name = wandb.run.name
                wandb.log({"step": step, **all_metrics})

            # Create summary_writer after the first step
            if summary_writer is None:
                summary_writer = tf.summary.create_file_writer(
                    os.path.join(path_args.filesystem_prefix, path_args.log_dir, run_name)
                )
                config = {
                    **asdict(model_args),
                    **asdict(data_args),
                    **asdict(train_args),
                    **asdict(log_args),
                    **asdict(path_args),
                    "global_batch_size": train_args.per_gpu_batch_size * hvd.size(),
                    "n_gpus": hvd.size(),
                }

            # Log to TensorBoard
            with summary_writer.as_default():
                for name, val in all_metrics.items():
                    tf.summary.scalar(name, val, step=step)

            if do_checkpoint:
                dis_model_ckpt = os.path.join(
                    path_args.filesystem_prefix,
                    path_args.checkpoint_dir,
                    f"{run_name}-step{step}-discriminator.ckpt",
                )
                gen_model_ckpt = os.path.join(
                    path_args.filesystem_prefix,
                    path_args.checkpoint_dir,
                    f"{run_name}-step{step}-generator.ckpt",
                )
                optimizer_ckpt = os.path.join(
                    path_args.filesystem_prefix,
                    path_args.checkpoint_dir,
                    f"{run_name}-step{step}-optimizer.npy",
                )
                logger.info(
                    f"Saving discriminator model at {dis_model_ckpt}, "
                    f"generator model at {gen_model_ckpt}, optimizer at {optimizer_ckpt}"
                )
                dis.save_weights(dis_model_ckpt)
                gen.save_weights(gen_model_ckpt)
                np.save(optimizer_ckpt, optimizer.get_weights())

        step += 1
        if is_final_step:
            break
def load(cls, pretrained_model_name_or_path, revision=None, tokenizer_class=None, use_fast=True, **kwargs):
    """
    Enables loading of different Tokenizer classes with a uniform interface.
    Either infer the class from the model config or define it manually via `tokenizer_class`.

    :param pretrained_model_name_or_path: The path of the saved pretrained model or its name (e.g. `bert-base-uncased`)
    :type pretrained_model_name_or_path: str
    :param revision: The version of the model to use from the HuggingFace model hub. Can be a tag name, branch name, or commit hash.
    :type revision: str
    :param tokenizer_class: (Optional) Name of the tokenizer class to load (e.g. `BertTokenizer`)
    :type tokenizer_class: str
    :param use_fast: (Optional, True by default) Indicates if FARM should try to load the fast version of the tokenizer (True) or use the Python one (False). Only DistilBERT, BERT and ELECTRA fast tokenizers are supported.
    :type use_fast: bool
    :param kwargs:
    :return: Tokenizer
    """
    pretrained_model_name_or_path = str(pretrained_model_name_or_path)
    kwargs["revision"] = revision

    if tokenizer_class is None:
        tokenizer_class = cls._infer_tokenizer_class(pretrained_model_name_or_path)

    logger.info(f"Loading tokenizer of type '{tokenizer_class}'")
    # return the appropriate tokenizer object
    ret = None
    if "AlbertTokenizer" in tokenizer_class:
        if use_fast:
            ret = AlbertTokenizerFast.from_pretrained(
                pretrained_model_name_or_path, keep_accents=True, **kwargs)
        else:
            ret = AlbertTokenizer.from_pretrained(
                pretrained_model_name_or_path, keep_accents=True, **kwargs)
    elif "XLMRobertaTokenizer" in tokenizer_class:
        if use_fast:
            ret = XLMRobertaTokenizerFast.from_pretrained(
                pretrained_model_name_or_path, **kwargs)
        else:
            ret = XLMRobertaTokenizer.from_pretrained(
                pretrained_model_name_or_path, **kwargs)
    elif "RobertaTokenizer" in tokenizer_class:
        if use_fast:
            ret = RobertaTokenizerFast.from_pretrained(
                pretrained_model_name_or_path, **kwargs)
        else:
            ret = RobertaTokenizer.from_pretrained(
                pretrained_model_name_or_path, **kwargs)
    elif "DistilBertTokenizer" in tokenizer_class:
        if use_fast:
            ret = DistilBertTokenizerFast.from_pretrained(
                pretrained_model_name_or_path, **kwargs)
        else:
            ret = DistilBertTokenizer.from_pretrained(
                pretrained_model_name_or_path, **kwargs)
    elif "BertTokenizer" in tokenizer_class:
        if use_fast:
            ret = BertTokenizerFast.from_pretrained(
                pretrained_model_name_or_path, **kwargs)
        else:
            ret = BertTokenizer.from_pretrained(
                pretrained_model_name_or_path, **kwargs)
    elif "XLNetTokenizer" in tokenizer_class:
        if use_fast:
            ret = XLNetTokenizerFast.from_pretrained(
                pretrained_model_name_or_path, keep_accents=True, **kwargs)
        else:
            ret = XLNetTokenizer.from_pretrained(
                pretrained_model_name_or_path, keep_accents=True, **kwargs)
    elif "ElectraTokenizer" in tokenizer_class:
        if use_fast:
            ret = ElectraTokenizerFast.from_pretrained(
                pretrained_model_name_or_path, **kwargs)
        else:
            ret = ElectraTokenizer.from_pretrained(
                pretrained_model_name_or_path, **kwargs)
    elif tokenizer_class == "EmbeddingTokenizer":
        if use_fast:
            logger.error(
                'EmbeddingTokenizerFast is not supported! Using EmbeddingTokenizer instead.')
            ret = EmbeddingTokenizer.from_pretrained(
                pretrained_model_name_or_path, **kwargs)
        else:
            ret = EmbeddingTokenizer.from_pretrained(
                pretrained_model_name_or_path, **kwargs)
    elif "CamembertTokenizer" in tokenizer_class:
        if use_fast:
            ret = CamembertTokenizerFast.from_pretrained(
                pretrained_model_name_or_path, **kwargs)
        else:
            ret = CamembertTokenizer.from_pretrained(
                pretrained_model_name_or_path, **kwargs)
    elif "DPRQuestionEncoderTokenizer" in tokenizer_class:
        if use_fast:
            ret = DPRQuestionEncoderTokenizerFast.from_pretrained(
                pretrained_model_name_or_path, **kwargs)
        else:
            ret = DPRQuestionEncoderTokenizer.from_pretrained(
                pretrained_model_name_or_path, **kwargs)
    elif "DPRContextEncoderTokenizer" in tokenizer_class:
        if use_fast:
            ret = DPRContextEncoderTokenizerFast.from_pretrained(
                pretrained_model_name_or_path, **kwargs)
        else:
            ret = DPRContextEncoderTokenizer.from_pretrained(
                pretrained_model_name_or_path, **kwargs)
    if ret is None:
        raise Exception("Unable to load tokenizer")
    return ret
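# Usage sketch (added for illustration; not part of the original snippet).
# Assuming the enclosing class is FARM's `Tokenizer`, the classmethod can
# either infer the tokenizer class from the model config or be forced:
tok = Tokenizer.load("bert-base-uncased")                  # inferred -> BertTokenizerFast
slow_tok = Tokenizer.load("bert-base-uncased", use_fast=False)
electra_tok = Tokenizer.load("google/electra-base-discriminator",
                             tokenizer_class="ElectraTokenizer")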
from transformers import (ElectraForSequenceClassification, ElectraTokenizerFast,
                          Trainer, TrainingArguments)
from nlp import load_dataset
import torch
import numpy as np
from sklearn.metrics import accuracy_score, precision_recall_fscore_support

model = ElectraForSequenceClassification.from_pretrained(
    'models/ELECTRA_last_line')
tokenizer = ElectraTokenizerFast.from_pretrained(
    'google/electra-small-discriminator')


def tokenize(batch):
    return tokenizer(batch['text'],
                     truncation=True,
                     max_length=128,
                     add_special_tokens=True,
                     padding='max_length',
                     return_attention_mask=True)


test_dataset = load_dataset(
    'json',
    data_files={'test': 'dataset_last_line/quanta_test.json'},
    field='questions')['test']
test_dataset = test_dataset.map(
    lambda example: {'label': [0 if example['difficulty'] == 'School' else 1]})
test_dataset = test_dataset.map(tokenize, batched=True,
                                batch_size=len(test_dataset))
test_dataset.set_format('torch',
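# Sketch (added for illustration; not part of the original snippet, which is
# truncated above). The sklearn imports suggest an evaluation via Trainer
# with a metrics callback along these lines:
def compute_metrics(pred):
    labels = np.array(pred.label_ids).flatten()
    preds = np.argmax(pred.predictions, axis=-1)
    precision, recall, f1, _ = precision_recall_fscore_support(
        labels, preds, average='binary')
    return {'accuracy': accuracy_score(labels, preds),
            'precision': precision, 'recall': recall, 'f1': f1}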
def load_model_tokenizer(path):
    return ElectraForQuestionAnswering.from_pretrained(path), \
        ElectraTokenizerFast.from_pretrained(path)
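# Usage sketch (added for illustration; not part of the original snippet).
# 'deepset/electra-base-squad2' is a hypothetical choice of checkpoint here;
# any directory or hub id holding an ElectraForQuestionAnswering works.
import torch

model, tokenizer = load_model_tokenizer('deepset/electra-base-squad2')
inputs = tokenizer("What does ELECTRA replace?",
                   "ELECTRA replaces masked language modeling with replaced token detection.",
                   return_tensors="pt")
with torch.no_grad():
    outputs = model(**inputs)
start = outputs.start_logits.argmax().item()
end = outputs.end_logits.argmax().item()
print(tokenizer.decode(inputs["input_ids"][0][start:end + 1]))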
from transformers import (ElectraConfig, ElectraTokenizerFast,
                          TFElectraForMaskedLM, create_optimizer)
from transformers import (DataCollatorForLanguageModeling, TFTrainer,
                          TFTrainingArguments, LineByLineTextDataset)
#from tokenizers import Tokenizer
#from tokenizers.models import WordPiece
from transformers.modeling_tf_utils import TFMaskedLanguageModelingLoss
import deco
from deco.sources import Dataset
import sys
from functools import partial
import tensorflow as tf

tokenizer = ElectraTokenizerFast("/data/pubmed/model/vocab.txt")
#tokens = tokenizer.tokenize("hello world")
#res = tokenizer(tokens, is_split_into_words=True)
#print(res)
tokenizer_func = partial(tokenizer, max_length=128, truncation=True,
                         padding='max_length', return_token_type_ids=True,
                         return_attention_mask=True)
#res = tokenizer_func("Hi. What's up.")
data_collator = DataCollatorForLanguageModeling(tokenizer=tokenizer, mlm=True,
                                                mlm_probability=0.15)

#ds = Dataset.from_lines("abstracts/*.tsv").map(str.strip) \
#    .where(lambda a: a).map(tokenizer.tokenize).whole_word_mask().top(10)
#for item in ds:
#    print(item)
#sys.exit()
ds = Dataset.from_lines("abstracts/*.tsv").map(str.strip) \
    .where(lambda a: a).map(tokenizer_func).batch(32).map(data_collator)
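# Sketch (added for illustration; not part of the original snippet).
# One way the collated batches above could drive an MLM training step,
# assuming each batch is a dict of tensors with "input_ids",
# "attention_mask" and "labels" (the collator marks unmasked positions
# with -100):
config = ElectraConfig(vocab_size=tokenizer.vocab_size)
mlm_model = TFElectraForMaskedLM(config)
optimizer = tf.keras.optimizers.Adam(learning_rate=1e-4)

def mlm_train_step(batch):
    with tf.GradientTape() as tape:
        outputs = mlm_model(input_ids=batch["input_ids"],
                            attention_mask=batch["attention_mask"],
                            labels=batch["labels"])
        loss = tf.reduce_mean(outputs.loss)
    grads = tape.gradient(loss, mlm_model.trainable_variables)
    optimizer.apply_gradients(zip(grads, mlm_model.trainable_variables))
    return loss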