def get_bpe_tokenizer(vocab, uppercase=False):
    return tokenizers.ByteLevelBPETokenizer(
        vocab=vocab,
        add_prefix_space=True,
        lowercase=not uppercase,
        trim_offsets=True,
    )
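# A minimal encode sketch (not part of the original snippets). It assumes a local
# RoBERTa-style 'vocab.json' / 'merges.txt' pair; note the helper above only wires up
# the vocab, whereas a pretrained byte-level BPE normally needs the merges file as well.
tok = tokenizers.ByteLevelBPETokenizer(
    "vocab.json", "merges.txt",
    add_prefix_space=True, lowercase=True)
enc = tok.encode("Hello world")
print(enc.ids)      # integer token ids
print(enc.tokens)   # byte-level BPE pieces
print(enc.offsets)  # (start, end) character spans into the original string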
def __init__(self, data_path, max_length=64, qa=False, model='bert'):
    super(InferDataset, self).__init__()
    assert model in ['bert', 'roberta']
    self.data_path = data_path
    self.id_list = list()
    self.text_list = list()
    self.label = list()
    self.max_length = max_length
    self.model = model
    self.label_map = {'neutral': 0, 'negative': 1, 'positive': 2}
    if self.model == 'bert':
        self.sent_id = {0: 8699, 1: 4997, 2: 3893}
        self.tokenizer = tokenizers.BertWordPieceTokenizer(
            '/home/liu/DL_workstation/tweet-sent/tweet-pytorch/tools/vocab.txt',
            lowercase=True)
    elif self.model == 'roberta':
        self.sent_id = {2: 1313, 1: 2430, 0: 7974}
        self.tokenizer = tokenizers.ByteLevelBPETokenizer(
            vocab_file='/home/liu/DL_workstation/tweet-sent/tweet-pytorch/tools/vocab.json',
            merges_file='/home/liu/DL_workstation/tweet-sent/tweet-pytorch/tools/merges.txt',
            lowercase=True,
            add_prefix_space=True)
    self.qa = qa
    if self.qa:
        print('\n\n Inferring with Question Answering Mode...')
    self.parse(self.data_path)
    self.invalid_cnt = 0
def create_tokenizer(self):
    tokenizer = tokenizers.ByteLevelBPETokenizer(
        vocab_file='/Users/ccw/PycharmProjects/Wilfred/tf-roberta/roberta-base-vocab.json',
        merges_file='/Users/ccw/PycharmProjects/Wilfred/tf-roberta/roberta-base-merges.txt',
        lowercase=True,
        add_prefix_space=True)
    return tokenizer
def __init__(self, df, max_len=96):
    self.df = df
    self.max_len = max_len
    self.labeled = 'selected_text' in df
    self.tokenizer = tokenizers.ByteLevelBPETokenizer(
        vocab_file=config.vocab_file,
        merges_file=config.merges_file,
        lowercase=True,
        add_prefix_space=True)
def __init__(self, df, max_len=96):
    self.df = df
    self.max_len = max_len
    self.labeled = 'selected_text' in df
    self.tokenizer = tokenizers.ByteLevelBPETokenizer(
        vocab_file='roberta/vocab.json',
        merges_file='roberta/merges.txt',
        lowercase=True,
        add_prefix_space=True)
def get_tokenizer(self):
    tokenizer = tokenizers.ByteLevelBPETokenizer(
        vocab_file=self.config.data.roberta.path + self.config.data.roberta.vocab,
        merges_file=self.config.data.roberta.path + self.config.data.roberta.merges,
        lowercase=self.config.data.roberta.lowercase,
        add_prefix_space=self.config.data.roberta.add_prefix_space)
    return tokenizer
def __init__(self, data_df, config):
    self.data_df, self.config = data_df, config
    self.maxlen = self.config['maxlen']
    self.labeled = 'selected_text' in data_df
    self.tokenizer = tokenizers.ByteLevelBPETokenizer(
        vocab_file=self.config['vocab_file_path'],
        merges_file=self.config['merge_file_path'],
        lowercase=True,
        add_prefix_space=True)
def __init__(self, df, max_len=96, use_fifth=True):
    self.df = df
    self.max_len = max_len
    self.labeled = 'selected_text' in df
    self.tokenizer = tokenizers.ByteLevelBPETokenizer(
        vocab_file=ROOT_PATH + '/input/roberta-base/vocab.json',
        merges_file=ROOT_PATH + '/input/roberta-base/merges.txt',
        lowercase=True,
        add_prefix_space=True)
    self.use_fifth = use_fifth
def __init__(self, df, max_len=int(config['MODEL']['MAXLEN'])):
    self.df = df
    self.max_len = max_len
    self.labeled = 'selected_text' in df
    self.tokenizer = tokenizers.ByteLevelBPETokenizer(
        vocab_file=os.path.join(config['PATHS']['ROBERTA_PATH'], 'vocab.json'),
        merges_file=os.path.join(config['PATHS']['ROBERTA_PATH'], 'merges.txt'),
        lowercase=True,
        add_prefix_space=True)
def __init__(self, save_tokenizer_path: str, training_files, special_tokens,
             min_frequency: int, lowercase: bool, vocab_size: int):
    super(CustomTokenizerTrainer, self).__init__()
    self.save_tokenizer_path = save_tokenizer_path
    self.training_files = training_files
    self.special_tokens = special_tokens
    self.min_frequency = min_frequency
    self.lowercase = lowercase
    self.VOCAB_SIZE = vocab_size
    self.tokenizer = tokenizers.ByteLevelBPETokenizer(lowercase=self.lowercase)
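# Hypothetical companion method (not shown in the original class): a sketch of how the
# stored attributes would typically be passed to ByteLevelBPETokenizer.train, with the
# result written out via save_model; assumes `os` is imported in the surrounding module.
def train(self):
    self.tokenizer.train(
        files=self.training_files,
        vocab_size=self.VOCAB_SIZE,
        min_frequency=self.min_frequency,
        special_tokens=self.special_tokens)
    os.makedirs(self.save_tokenizer_path, exist_ok=True)
    self.tokenizer.save_model(self.save_tokenizer_path)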
class config:
    MAX_LEN = 128
    TRAIN_BATCH_SIZE = 2
    EPOCH = 5
    BERT_PATH = './roberta_input'
    SAVE_PATH = './output'
    TOKENIZER = tokenizers.ByteLevelBPETokenizer(
        vocab_file=os.path.join(BERT_PATH, 'vocab.json'),
        merges_file=os.path.join(BERT_PATH, 'merges.txt'),
        lowercase=True,
        # add_prefix_space=True
    )
def main(text_path, out_directory):
    Path(out_directory).mkdir(exist_ok=True, parents=True)
    english_tokenizer = GPT2TokenizerFast.from_pretrained("gpt2")
    german_tokenizer = tokenizers.ByteLevelBPETokenizer()
    german_tokenizer.train(
        [text_path],
        vocab_size=english_tokenizer.vocab_size,
        special_tokens=["<|endoftext|>"],
        show_progress=True,
    )
    german_tokenizer.save_model(out_directory)
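# Follow-up sketch (assumed, not from the source): save_model() writes vocab.json and
# merges.txt into out_directory, so the trained German tokenizer can be reloaded later.
def load_german_tokenizer(out_directory):
    return tokenizers.ByteLevelBPETokenizer(
        str(Path(out_directory) / "vocab.json"),
        str(Path(out_directory) / "merges.txt"))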
class config:
    TRAINING_FILE = "../input/twe-myfolds/train_folds_20200425.csv"
    TRAIN_BATCH_SIZE = 96
    VALID_BATCH_SIZE = 96
    MAX_LEN = 128
    EPOCHS = 6
    conf_file = '/kaggle/input/all-weights/config.json'
    MODEL_PATH = "/kaggle/input/all-weights/"
    TOKENIZER = tokenizers.ByteLevelBPETokenizer(
        vocab_file=f"{MODEL_PATH}/vocab.json",
        merges_file="/kaggle/input/tweeter-offline-eval/merges-1.txt",
        lowercase=True,
        add_prefix_space=True)
class config:
    MAX_LEN = 128
    TRAIN_BATCH_SIZE = 32
    VALID_BATCH_SIZE = 8
    EPOCHS = 5
    ROBERTA_PATH = "../input/roberta-base"
    MODEL_PATH = "model.bin"
    TRAINING_FILE = "../input/tweet-train-folds-v2/train_folds.csv"
    TOKENIZER = tokenizers.ByteLevelBPETokenizer(
        vocab_file=f"{ROBERTA_PATH}/vocab.json",
        merges_file=f"{ROBERTA_PATH}/merges.txt",
        lowercase=True,
        add_prefix_space=True
    )
def __init__(self, df, max_len=128):
    # DataFrame holding the data
    self.df = df
    # Maximum sentence length
    self.max_len = max_len
    # Labels
    self.labeled = 'selected_text' in df
    # Initialize the BPE tokenizer
    self.tokenizer = tokenizers.ByteLevelBPETokenizer(
        vocab_file='./roberta.base.torch/vocab.json',
        merges_file='./roberta.base.torch/merges.txt',
        lowercase=True,
        add_prefix_space=True
    )
def __init__(self):
    self.MAX_LEN = 192
    self.TRAIN_BATCH_SIZE = 32
    self.VALID_BATCH_SIZE = 8
    self.EPOCHS = 5
    self.WEIGHTS_DIR = 'weights'
    self.BERT_PATH = "/data/tweet-sentiment-extraction/roberta-base"
    self.MODEL_PATH = "model.bin"
    self.TRAINING_FILE = "/data/tweet-sentiment-extraction/train_folds.csv"
    self.TEST_FILE = "/data/tweet-sentiment-extraction/test.csv"
    self.TRAIN_FILE = "/data/tweet-sentiment-extraction/train.csv"
    self.SAMPLE_FILE = "/data/tweet-sentiment-extraction/sample_submission.csv"
    self.TOKENIZER = tokenizers.ByteLevelBPETokenizer(
        vocab_file=f"{self.BERT_PATH}/vocab.json",
        merges_file=f"{self.BERT_PATH}/merges.txt",
        lowercase=True,
        add_prefix_space=True)
def get_tokenizer(self, model_path):
    tokenizer = None
    if 'roberta' in self.transformer_type:
        tokenizer = tokenizers.ByteLevelBPETokenizer(
            vocab_file=model_path + 'vocab.json',
            merges_file=model_path + 'merges.txt',
            lowercase=True,
            add_prefix_space=True)
    elif 'bert' in self.transformer_type:
        vocab_path = os.path.join(model_path, 'vocab.txt')
        tokenizer = tokenizers.BertWordPieceTokenizer(vocab_path, lowercase=True)
    else:
        raise RuntimeError(f'{self.transformer_type} is not supported')
    return tokenizer
def main():
    tokenizer = RobertaTokenizer.from_pretrained('roberta-base')
    tokenizer.save_vocabulary('.')
    MAX_LEN = 96
    tokenizer = tokenizers.ByteLevelBPETokenizer(
        vocab_file='vocab.json',
        merges_file='merges.txt',
        lowercase=True,
        add_prefix_space=True
    )
    sentiment_id = {'positive': 1313, 'negative': 2430, 'neutral': 7974}
    twitter_train = pd.read_csv('./kaggle/input/tweet-sentiment-extraction/train.csv', delimiter=',')
    twitter_test = pd.read_csv('./kaggle/input/tweet-sentiment-extraction/test.csv', delimiter=',')
    twitter_train = twitter_train.dropna()
    sentimentExtract = Sentiment(twitter_train[0:21984])
    sentimentExtract.train()
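# Hedged sanity check (not in the original): the hard-coded ids in sentiment_id are
# intended to be the single roberta-base token ids of ' positive', ' negative' and
# ' neutral' (the leading space comes from add_prefix_space=True). Assuming main()
# has already written vocab.json / merges.txt to the working directory:
def check_sentiment_ids():
    tok = tokenizers.ByteLevelBPETokenizer(
        vocab_file='vocab.json', merges_file='merges.txt',
        lowercase=True, add_prefix_space=True)
    for word, expected in {'positive': 1313, 'negative': 2430, 'neutral': 7974}.items():
        assert tok.encode(word).ids == [expected], word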
def __init__(self, df, max_len=96,
             vocab_file='../input/roberta-base/vocab.json',
             merges_file='../input/roberta-base/merges.txt',
             change_sentiment_p=0.0,
             premake_dataset=False):
    self.df = df
    self.max_len = max_len
    self.labeled = 'selected_text' in df
    self.tokenizer = tokenizers.ByteLevelBPETokenizer(
        vocab_file=vocab_file,
        merges_file=merges_file,
        lowercase=True,
        add_prefix_space=True)
    self.change_sentiment_p = change_sentiment_p
    self.uniq_sentiment = np.unique(self.df['sentiment'].values)
    self.premake_dataset = premake_dataset
    if self.premake_dataset:
        self.dataset = [self.make_data(i) for i in range(len(self.df))]
class config:
    LEARNING_RATE = 4e-5
    MAX_LEN = 128
    TRAIN_BATCH_SIZE = 50
    VALID_BATCH_SIZE = 32
    EPOCHS = 3
    INPUT_PATH = "/content/drive/My Drive/Tweet Sentiment Extraction/input/"
    TRAINING_FILE = f"{INPUT_PATH}tweet-sentiment-extraction/train_8folds.csv"
    ROBERTA_PATH = f"{INPUT_PATH}roberta-base/"
    TOKENIZER_N = transformers.RobertaTokenizer(
        vocab_file=f'{ROBERTA_PATH}vocab.json',
        merges_file=f'{ROBERTA_PATH}merges.txt',
        lowercase=True,
        add_prefix_space=True
    )
    # OLD
    TOKENIZER = tokenizers.ByteLevelBPETokenizer(
        vocab_file=f"{ROBERTA_PATH}/vocab.json",
        merges_file=f"{ROBERTA_PATH}/merges.txt",
        lowercase=True,
        add_prefix_space=True
    )
def __init__(
    self,
    args,
    dataset,
    source_dictionary,
    dropout=0.1,
    seed=1,
):
    super().__init__(dataset)
    self.source_dictionary = source_dictionary
    self.epoch = 0
    self.seed = seed
    self.dropout = dropout
    import tokenizers
    self.hf_tokenizer = tokenizers.ByteLevelBPETokenizer(
        args.gpt2_encoder_json,
        args.gpt2_vocab_bpe,
        add_prefix_space=True,
        dropout=self.dropout,
    )
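# Hedged illustration (not from the source): the dropout argument above enables
# BPE-dropout, so repeated encodings of the same string can produce different subword
# segmentations; the GPT-2-style file names below are placeholders only.
def show_bpe_dropout(encoder_json='encoder.json', vocab_bpe='vocab.bpe'):
    import tokenizers
    tok = tokenizers.ByteLevelBPETokenizer(encoder_json, vocab_bpe, dropout=0.1)
    # two encodings of the same word may split differently, because individual
    # merges are randomly skipped with probability 0.1
    print(tok.encode('regularization').tokens)
    print(tok.encode('regularization').tokens)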
def custom_bpe_tokenizer(corpus, text_filepath, tokenizer_save_path,
                         vocab_size=10000, min_frequency=3):
    if type(corpus[0]) == list:
        corpus = [" ".join(i) for i in corpus]
    try:
        os.makedirs(text_filepath)
    except OSError:
        pass
    tokenizer = tokenizers.ByteLevelBPETokenizer(
        vocab_file=None,
        merges_file=None,
    )  # SentencePieceBPETokenizer()
    df = pd.DataFrame()
    df['text'] = corpus
    df.to_csv(os.path.join(text_filepath, 'file.txt'), header=False, index=False)
    try:
        os.makedirs(tokenizer_save_path)
    except OSError:
        pass
    tokenizer.train(
        files=os.path.join(text_filepath, 'file.txt'),
        vocab_size=vocab_size,
        min_frequency=min_frequency,
        special_tokens=['[PAD]', '[UNK]', '[CLS]', '[MASK]', '[SEP]'])
    tokenizer.save(directory=tokenizer_save_path, name='bpe')
    os.remove(os.path.join(text_filepath, 'file.txt'))
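# Reload sketch (assumed, not from the source): with the older tokenizers API used
# above, save(directory=..., name='bpe') is expected to produce 'bpe-vocab.json' and
# 'bpe-merges.txt', from which the trained tokenizer can be rebuilt.
def load_custom_bpe_tokenizer(tokenizer_save_path):
    return tokenizers.ByteLevelBPETokenizer(
        vocab_file=os.path.join(tokenizer_save_path, 'bpe-vocab.json'),
        merges_file=os.path.join(tokenizer_save_path, 'bpe-merges.txt'))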
import numpy as np
import pandas as pd
from sklearn.model_selection import StratifiedKFold
from tqdm import tqdm
import tokenizers

DIR = "./inputs/datasets/tkm/"
MAX_LEN = 120
# PATH = '../input/tf-roberta/'
PATH = './inputs/datasets/roberta/tokenizer/'
tokenizer = tokenizers.ByteLevelBPETokenizer(
    # vocab_file=PATH + 'vocab-roberta-base.json',
    # merges_file=PATH + 'merges-roberta-base.txt',
    vocab_file=PATH + 'vocab.json',
    merges_file=PATH + 'merges.txt',
    lowercase=True,
    add_prefix_space=True)
SEED = 88888
np.random.seed(SEED)
sentiment_id = {'positive': 1313, 'negative': 2430, 'neutral': 7974}


def proc(train):
    # Preprocessing
    ct = train.shape[0]
    input_ids = np.ones((ct, MAX_LEN), dtype='int32')
    attention_mask = np.zeros((ct, MAX_LEN), dtype='int32')
    token_type_ids = np.zeros((ct, MAX_LEN), dtype='int32')
    start_tokens = np.zeros((ct, MAX_LEN), dtype='int32')
def run(args):
    train_df = load_data.load_custom_text_as_pd(args.train_data, sep='\t', header=True,
                                                text_column=['Text'], target_column=['Label'])
    val_df = load_data.load_custom_text_as_pd(args.val_data, sep='\t', header=True,
                                              text_column=['Text'], target_column=['Label'])
    train_df = pd.DataFrame(train_df, copy=False)
    val_df = pd.DataFrame(val_df, copy=False)
    model_save_dir = args.model_save_path
    try:
        os.makedirs(model_save_dir)
    except OSError:
        pass
    train_df.labels, label2idx = data_utils.convert_categorical_label_to_int(
        train_df.labels, save_path=os.path.join(model_save_dir, 'label2idx.pkl'))
    val_df.labels, _ = data_utils.convert_categorical_label_to_int(
        val_df.labels, save_path=os.path.join(model_save_dir, 'label2idx.pkl'))
    if args.berttweettokenizer_path:
        tokenizer = BERTweetTokenizer(args.berttweettokenizer_path)
    else:
        tokenizer = AutoTokenizer.from_pretrained(args.transformer_model_pretrained_path)
    if not args.berttweettokenizer_path:
        bpetokenizer = tokenizers.ByteLevelBPETokenizer(args.bpe_vocab_path,
                                                        args.bpe_merges_path)
    else:
        bpetokenizer = None
    if bpetokenizer:
        train_corpus = data_utils.Corpus(train_df.copy(), tokenizer=bpetokenizer.encode)
        val_corpus = data_utils.Corpus(val_df.copy(), tokenizer=bpetokenizer.encode)
    else:
        train_corpus = data_utils.Corpus(train_df.copy(), tokenizer=tokenizer.tokenize)
        val_corpus = data_utils.Corpus(val_df.copy(), tokenizer=tokenizer.tokenize)
    train_dataset = data_utils.TransformerDataset(
        train_corpus.data.words, bpetokenizer=bpetokenizer, tokenizer=tokenizer,
        MAX_LEN=args.max_text_len, target_label=train_corpus.data.labels,
        sequence_target=False, target_text=None, conditional_label=None,
        conditional_all_labels=None)
    val_dataset = data_utils.TransformerDataset(
        val_corpus.data.words, bpetokenizer=bpetokenizer, tokenizer=tokenizer,
        MAX_LEN=args.max_text_len, target_label=val_corpus.data.labels,
        sequence_target=False, target_text=None, conditional_label=None,
        conditional_all_labels=None)
    if _torch_tpu_available and args.use_TPU:
        train_sampler = torch.utils.data.distributed.DistributedSampler(
            train_dataset, num_replicas=xm.xrt_world_size(), rank=xm.get_ordinal(),
            shuffle=True)
        val_sampler = torch.utils.data.distributed.DistributedSampler(
            val_dataset, num_replicas=xm.xrt_world_size(), rank=xm.get_ordinal(),
            shuffle=False)
    if _torch_tpu_available and args.use_TPU:
        train_data_loader = torch.utils.data.DataLoader(
            train_dataset, batch_size=args.train_batch_size, sampler=train_sampler,
            drop_last=True, num_workers=2)
        val_data_loader = torch.utils.data.DataLoader(
            val_dataset, batch_size=args.eval_batch_size, sampler=val_sampler,
            drop_last=False, num_workers=1)
    else:
        train_data_loader = torch.utils.data.DataLoader(
            train_dataset, batch_size=args.train_batch_size)
        val_data_loader = torch.utils.data.DataLoader(
            val_dataset, batch_size=args.eval_batch_size)
    config = AutoConfig.from_pretrained(args.transformer_config_path,
                                        output_hidden_states=True,
                                        output_attentions=True)
    basemodel = AutoModel.from_pretrained(args.transformer_model_pretrained_path,
                                          config=config)
    model = transformer_models.TransformerWithCLS(basemodel)
    if args.use_torch_trainer:
        device = torch.device(
            "cuda" if _torch_gpu_available and args.use_gpu else "cpu")
        if _torch_tpu_available and args.use_TPU:
            device = xm.xla_device()
        if args.use_TPU and _torch_tpu_available and args.num_tpus > 1:
            train_data_loader = torch_xla.distributed.parallel_loader.ParallelLoader(
                train_data_loader, [device])
            train_data_loader = train_data_loader.per_device_loader(device)
        trainer = BasicTrainer(model, train_data_loader, val_data_loader, device,
                               args.transformer_model_pretrained_path,
                               test_data_loader=val_data_loader)
        param_optimizer = list(trainer.model.named_parameters())
        no_decay = ["bias", "LayerNorm.bias", "LayerNorm.weight"]
        optimizer_parameters = [
            {
                "params": [p for n, p in param_optimizer
                           if not any(nd in n for nd in no_decay)],
                "weight_decay": 0.001,
            },
            {
                "params": [p for n, p in param_optimizer
                           if any(nd in n for nd in no_decay)],
                "weight_decay": 0.0,
            },
        ]
        num_train_steps = int(len(train_data_loader) * args.epochs)
        if _torch_tpu_available and args.use_TPU:
            optimizer = AdamW(optimizer_parameters, lr=args.lr * xm.xrt_world_size())
        else:
            optimizer = AdamW(optimizer_parameters, lr=args.lr)
        if args.use_apex and _has_apex:
            model, optimizer = amp.initialize(model, optimizer, opt_level="O1")
        scheduler = get_linear_schedule_with_warmup(
            optimizer, num_warmup_steps=0, num_training_steps=num_train_steps)

        def _mp_fn(rank, flags, trainer, epochs, lr, metric, loss_function, optimizer,
                   scheduler, model_save_path, num_gpus, num_tpus, max_grad_norm,
                   early_stopping_rounds, snapshot_ensemble, is_amp, seed):
            torch.set_default_tensor_type('torch.FloatTensor')
            a = trainer.train(epochs, lr, metric, loss_function, optimizer, scheduler,
                              model_save_path, num_gpus, num_tpus, max_grad_norm,
                              early_stopping_rounds, snapshot_ensemble, is_amp, seed)

        FLAGS = {}
        if _torch_tpu_available and args.use_TPU:
            xmp.spawn(_mp_fn,
                      args=(FLAGS, trainer, args.epochs, args.lr, args.metric,
                            args.loss_function, optimizer, scheduler,
                            args.model_save_path, args.num_gpus, args.num_tpus,
                            1, 3, False, args.use_apex, args.seed),
                      nprocs=8, start_method='fork')
        else:
            trainer.train(args.epochs, args.lr, args.metric, args.loss_function,
                          optimizer, scheduler, args.model_save_path, args.num_gpus,
                          args.num_tpus, max_grad_norm=1, early_stopping_rounds=3,
                          snapshot_ensemble=False, is_amp=args.use_apex, seed=args.seed)
        test_output = trainer.test_output
    elif args.use_lightning_trainer and _torch_lightning_available:
        from pytorch_lightning import Trainer, seed_everything
        seed_everything(args.seed)
        log_args = {
            'description': args.transformer_model_pretrained_path,
            'loss': args.loss_function,
            'epochs': args.epochs,
            'learning_rate': args.lr
        }
        if _has_wandb and not _torch_tpu_available and args.wandb_logging:
            wandb.init(project="WNUT-Task-2", config=log_args)
            wandb_logger = WandbLogger()
        checkpoint_callback = ModelCheckpoint(filepath=args.model_save_path,
                                              save_top_k=1, verbose=True,
                                              monitor='val_loss', mode='min')
        earlystop = EarlyStopping(monitor='val_loss', patience=3, verbose=False,
                                  mode='min')
        if args.use_gpu and _torch_gpu_available:
            print("using GPU")
            if args.wandb_logging:
                if _has_apex:
                    trainer = Trainer(gpus=args.num_gpus, max_epochs=args.epochs,
                                      logger=wandb_logger, precision=16,
                                      checkpoint_callback=checkpoint_callback,
                                      callbacks=[earlystop])
                else:
                    trainer = Trainer(gpus=args.num_gpus, max_epochs=args.epochs,
                                      logger=wandb_logger,
                                      checkpoint_callback=checkpoint_callback,
                                      callbacks=[earlystop])
            else:
                if _has_apex:
                    trainer = Trainer(gpus=args.num_gpus, max_epochs=args.epochs,
                                      precision=16,
                                      checkpoint_callback=checkpoint_callback,
                                      callbacks=[earlystop])
                else:
                    trainer = Trainer(gpus=args.num_gpus, max_epochs=args.epochs,
                                      checkpoint_callback=checkpoint_callback,
                                      callbacks=[earlystop])
        elif args.use_TPU and _torch_tpu_available:
            print("using TPU")
            if _has_apex:
                trainer = Trainer(num_tpu_cores=args.num_tpus, max_epochs=args.epochs,
                                  precision=16,
                                  checkpoint_callback=checkpoint_callback,
                                  callbacks=[earlystop])
            else:
                trainer = Trainer(num_tpu_cores=args.num_tpus, max_epochs=args.epochs,
                                  checkpoint_callback=checkpoint_callback,
                                  callbacks=[earlystop])
        else:
            print("using CPU")
            if args.wandb_logging:
                if _has_apex:
                    trainer = Trainer(max_epochs=args.epochs, logger=wandb_logger,
                                      precision=16,
                                      checkpoint_callback=checkpoint_callback,
                                      callbacks=[earlystop])
                else:
                    trainer = Trainer(max_epochs=args.epochs, logger=wandb_logger,
                                      checkpoint_callback=checkpoint_callback,
                                      callbacks=[earlystop])
            else:
                if _has_apex:
                    trainer = Trainer(max_epochs=args.epochs, precision=16,
                                      checkpoint_callback=checkpoint_callback,
                                      callbacks=[earlystop])
                else:
                    trainer = Trainer(max_epochs=args.epochs,
                                      checkpoint_callback=checkpoint_callback,
                                      callbacks=[earlystop])
        num_train_steps = int(len(train_data_loader) * args.epochs)
        pltrainer = PLTrainer(num_train_steps, model, args.metric, args.loss_function,
                              args.lr, seed=42)
        # try:
        #     print("Loaded model from previous checkpoint")
        #     pltrainer = PLTrainer.load_from_checkpoint(args.model_save_path)
        # except:
        #     pass
        trainer.fit(pltrainer, train_data_loader, val_data_loader)
        test_output = (pltrainer, val_data_loader)
    idx2label = {value: key for key, value in label2idx.items()}
    test_output = [idx2label[i] for i in test_output]
    return test_output
import tokenizers

DEVICE = 'cpu'  # cuda
MAX_LEN = 128
TRAIN_BATCH_SIZE = 8
VALID_BATCH_SIZE = 8
EPOCHS = 6
BERT_PATH = "../pretrained_models/roberta-base/"
MODEL_PATH = "pytorch_model.bin"
TRAINING_FILE = "../data/train.csv"
TEST_FILE = "../data/test.csv"
TOKENIZER = tokenizers.ByteLevelBPETokenizer(
    vocab_file="../pretrained_models/roberta-base/vocab.json",
    merges_file="../pretrained_models/roberta-base/merges.txt",
    lowercase=True,
    add_prefix_space=True)
os.environ['PYTHONHASHSEED'] = str(seed_value)
if torch.cuda.is_available():
    torch.cuda.manual_seed(seed_value)
    torch.cuda.manual_seed_all(seed_value)
    torch.backends.cudnn.deterministic = True
    torch.backends.cudnn.benchmark = False  # benchmarking disabled so cuDNN stays deterministic


seed = 42
seed_everything(seed)

data_save = {}
# The tokenizer is used again elsewhere; it could probably be cached, but that has not been tried
bptokenizer = tokenizers.ByteLevelBPETokenizer(
    vocab_file='roberta/vocab.json',
    merges_file='roberta/merges.txt',
    lowercase=True,
    add_prefix_space=True)
# fw = open("testfin.txt", "w", encoding='utf-8')


class TweetDataset(torch.utils.data.Dataset):
    def __init__(self, df, max_len=96):
        self.df = df
        self.max_len = max_len
        self.labeled = 'selected_text' in df
        self.tokenizer = tokenizers.ByteLevelBPETokenizer(
            vocab_file='roberta/vocab.json',
            merges_file='roberta/merges.txt',
            lowercase=True,
            add_prefix_space=True)
# Reading the data from the csv files
train_data = pd.read_csv(training_path)
test_data = pd.read_csv(test_path)
sentiment_id = {'positive': 1313, 'negative': 2430, 'neutral': 7974}

# Preprocessing the data
import os

TOKENIZE_PATH = './RoBERTA Files/'
MAX_LEN = 100
# Initializing the tokenizer used for tokenization
tokenizer = tokenizers.ByteLevelBPETokenizer(
    vocab_file=TOKENIZE_PATH + 'vocab.json',
    merges_file=TOKENIZE_PATH + 'merges.txt',
    lowercase=True,
    add_prefix_space=True)
train_data = train_data.dropna(axis=0)  # dropna returns a new frame, so assign it back
# For the RoBERTa model we need to tokenize the data to fit our needs.
# For tokenizing the data we use a pretrained tokenizer from the Hugging Face RoBERTa release.
# Assume the maximum length of a tweet to be 100 tokens.
MAX_LEN = 100
instances = train_data.shape[0]
# Initializing the tokenization arrays
input_ids = np.ones((instances, MAX_LEN), dtype='int32')
attention_mask = np.zeros((instances, MAX_LEN), dtype='int32')
token_type_ids = np.zeros((instances, MAX_LEN), dtype='int32')
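# Hedged sketch of how one row could be packed into these arrays (the exact layout used
# downstream is not shown here): RoBERTa uses <s>=0 and </s>=2, the tweet tokens come
# from the byte-level BPE encoder, and the sentiment token id comes from sentiment_id.
k = 0
text = " " + " ".join(train_data.loc[k, 'text'].split())
enc = tokenizer.encode(text)
s_tok = sentiment_id[train_data.loc[k, 'sentiment']]
row = [0] + enc.ids + [2, 2, s_tok, 2]
input_ids[k, :len(row)] = row
attention_mask[k, :len(row)] = 1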
import gc

gc.collect()
torch.cuda.empty_cache()

# max_len = 160
# train_batch_size = 16
# valid_batch_size = 8
# epochs = 3
# roberta_path = "./roberta-base"
roberta_path = "./roberta-base"
training_file = "./train-kfolds/train_5folds.csv"
tokenizer = tokenizers.ByteLevelBPETokenizer(
    vocab_file="./roberta-base/vocab.json",
    merges_file="./roberta-base/merges.txt",
    lowercase=True,
    add_prefix_space=True
)
# train(0, epochs, training_file, tokenizer, max_len, train_batch_size, valid_batch_size, roberta_path)
# train(1, epochs, training_file, tokenizer, max_len, train_batch_size, valid_batch_size, roberta_path)
# train(2, epochs, training_file, tokenizer, max_len, train_batch_size, valid_batch_size, roberta_path)
# train(3, epochs, training_file, tokenizer, max_len, train_batch_size, valid_batch_size, roberta_path)
# train(4, epochs, training_file, tokenizer, max_len, train_batch_size, valid_batch_size, roberta_path)
print("fold: ", args.fold)
train(args.fold, epochs, training_file, tokenizer, max_len, train_batch_size,
      valid_batch_size, roberta_path, args.lr, args.patience, args.num_warmup_steps)
# train(1, epochs, training_file, tokenizer, max_len, train_batch_size, valid_batch_size, roberta_path, args.lr, args.patience, args.num_warmup_steps)
# train(2, epochs, training_file, tokenizer, max_len, train_batch_size, valid_batch_size, roberta_path, args.lr, args.patience, args.num_warmup_steps)
# train(3, epochs, training_file, tokenizer, max_len, train_batch_size, valid_batch_size, roberta_path, args.lr, args.patience, args.num_warmup_steps)
# train(4, epochs, training_file, tokenizer, max_len, train_batch_size, valid_batch_size, roberta_path, args.lr, args.patience, args.num_warmup_steps)
import os
import tokenizers

MAX_LEN = 192
TRAIN_BATCH_SIZE = 32
VALID_BATCH_SIZE = 8
EPOCHS = 10
SEED = 43
ROBERTA_PATH = '/home/mikhail/workspace/roberta-base'
MODEL_PATH = 'model.bin'
TRAINING_FILE_WITHOUT_FOLDS = '../input/train.csv'
TRAINING_FILE = '../input/train_folds.csv'
VALID_FILE = '../input/valid.csv'
TOKENIZER = tokenizers.ByteLevelBPETokenizer(
    vocab_file=f"{ROBERTA_PATH}/vocab.json",
    merges_file=f"{ROBERTA_PATH}/merges.txt",
    lowercase=True,
    add_prefix_space=True
)
print('model_path: %s' % model_path)
""" model config """

train_df = pd.read_csv(train_file)
test_df = pd.read_csv(test_file)
sub_df = pd.read_csv(sub_file)
train_df.dropna(inplace=True)
train_df = train_df.reset_index(drop=True)
print(train_df.shape, test_df.shape, sub_df.shape)
""" load data """

tokenizer = tokenizers.ByteLevelBPETokenizer(
    vocab_file=os.path.join(roberta_path, 'vocab.json'),
    merges_file=os.path.join(roberta_path, 'merges.txt'),
    lowercase=True,
    add_prefix_space=True)
roberta_config = transformers.RobertaConfig.from_pretrained(roberta_path)
roberta_config.output_hidden_states = True
sentiment_d = {'positive': 1313, 'negative': 2430, 'neutral': 7974}
""" roberta config """

""" training """
n_splits = 5
max_epochs = 5
initial_lr = 3e-5
is_gpu = torch.cuda.is_available()
device = torch.device('cuda' if is_gpu else 'cpu')