        padding='max_length', max_length=arguments['target_max_len'])

    encodings = {
        'input_ids': input_encodings['input_ids'],
        'attention_mask': input_encodings['attention_mask'],
        'target_ids': target_encodings['input_ids'],
        'target_attention_mask': target_encodings['attention_mask']
    }
    return encodings


print('Getting data from huggingface datasets')
# Use the following to load only a percentage of data for sample efficiency tests
train_dataset = nlp.load_dataset(arguments['dataset_name'], split='train[:60%]')
valid_dataset = nlp.load_dataset(arguments['dataset_name'], split='validation[:100%]')
# train_dataset = nlp.load_dataset(arguments['dataset_name'], split=nlp.Split.TRAIN)
# valid_dataset = nlp.load_dataset(arguments['dataset_name'], split=nlp.Split.VALIDATION)

train_dataset = train_dataset.map(format_example, load_from_cache_file=False)
train_dataset = train_dataset.map(convert_to_features, batched=True, load_from_cache_file=False)
valid_dataset = valid_dataset.map(format_example, load_from_cache_file=False)
valid_dataset = valid_dataset.map(convert_to_features, batched=True, load_from_cache_file=False)
), f"Dataset name '{args.dataset}' should be part of the TFRecords directory name '{args.tfrecords_dir}', don't mix datasets!" assert ( args.skip_tfrecords or str(args.max_seq_length) in args.tfrecords_dir ), f"Sequence length '{args.max_seq_length}' should be part of the TFRecords directory name '{args.tfrecords_dir}', don't mix datasets!" if not os.path.exists(args.cache_dir): os.makedirs(args.cache_dir, exist_ok=True) if not args.skip_tfrecords and not os.path.exists(args.tfrecords_dir): os.makedirs(args.tfrecords_dir, exist_ok=True) start_time = time.perf_counter() print(f"Loading dataset: {args.dataset}") if args.dataset.startswith("wikitext"): dset = nlp.load_dataset("wikitext", f"{args.dataset}-raw-v1", split="train", cache_dir=args.cache_dir) elif args.dataset == "wikipedia": dset = nlp.load_dataset("wikipedia", "20200501.en", split="train", cache_dir=args.cache_dir) dset.drop(columns=["title"]) dset.features.pop("title") elif args.dataset == "bookcorpus": dset = nlp.load_dataset("bookcorpus", split="train", cache_dir=args.cache_dir) elif args.dataset == "wikibooks": dset_wikipedia = nlp.load_dataset("wikipedia", "20200501.en",
        max_length=2)

    encodings = {
        'input_ids': input_encodings['input_ids'],
        'attention_mask': input_encodings['attention_mask'],
        'target_ids': target_encodings['input_ids'],
        'target_attention_mask': target_encodings['attention_mask']
    }
    return encodings


print('Loading commonsense_qa train and valid datasets')
# Load the commonsense_qa datasets
train_dataset = nlp.load_dataset('commonsense_qa', split=nlp.Split.TRAIN)
valid_dataset = nlp.load_dataset('commonsense_qa', split=nlp.Split.VALIDATION)

# Load the commonsense_qa concept sentences
train_concept_sentences = get_sentences(TRAIN_SENTENCES_FILE, train_dataset)
valid_concept_sentences = get_sentences(VALID_SENTENCES_FILE, valid_dataset)

train_dataset = train_dataset.map(format_example_train, with_indices=True, load_from_cache_file=False)
train_dataset = train_dataset.map(convert_to_features, batched=True, load_from_cache_file=False)
valid_dataset = valid_dataset.map(format_example_valid, with_indices=True,
from transformers import BertForSequenceClassification, BertTokenizerFast, Trainer, TrainingArguments
from nlp import load_dataset
import torch
from config import *

model = BertForSequenceClassification.from_pretrained('bert-base-uncased')
tokenizer = BertTokenizerFast.from_pretrained('bert-base-uncased')


def tokenize(batch):
    return tokenizer(batch['text'], padding=True, truncation=True)


train_dataset, test_dataset = load_dataset('imdb', split=['train', 'test'])
train_dataset = train_dataset.map(tokenize, batched=True, batch_size=len(train_dataset))
# Use the length of the test set here (the original used len(train_dataset), a copy-paste slip)
test_dataset = test_dataset.map(tokenize, batched=True, batch_size=len(test_dataset))
train_dataset.set_format('torch', columns=['input_ids', 'attention_mask', 'label'])
test_dataset.set_format('torch', columns=['input_ids', 'attention_mask', 'label'])

training_args = TrainingArguments(
    output_dir='./results',
    num_train_epochs=EPOCHS,
    per_device_train_batch_size=BATCH_SIZE,
    per_device_eval_batch_size=BATCH_SIZE,
    warmup_steps=500,
    def __init__(self):
        self.dataset = nlp.load_dataset('imdb')
        self.counter = 0
import nlp

ELI5 = nlp.load_dataset('eli5')
WIKI40B_SNIPPETS = nlp.load_dataset('wiki_snippets', name='wiki40b_en_100_0')['train']

SAVED_RETRIEVER = "retriever_models/eli5_retriever_model_l-8_h-768_b-512-512"
SAVED_REPRESENTATIONS = "wiki_index/wiki40b_passages_reps_32_l-8_h-768_b-512-512.dat"


class ArgumentsQAR():
    def __init__(self):
        self.batch_size = 512
        self.max_length = 128
        self.checkpoint_batch_size = 32
        self.print_freq = 100
        self.pretrained_model_name = "google/bert_uncased_L-8_H-768_A-12"
        self.model_save_name = SAVED_RETRIEVER
        self.learning_rate = 2e-4
        self.num_epochs = 10
def get_dataset(name, tokenizer, split):
    if name == 'mnli':
        dataset = load_dataset('glue', 'mnli', split=split)
    else:
        dataset = load_dataset(name, split=split)

    # Pre-allocated numpy arrays are used instead of lists because repeated list.append
    # gets slower and slower: https://hant-kb.kutu66.com/others/post_544244
    input_ids = np.zeros(shape=(len(dataset), 512))
    token_type_ids = np.zeros(shape=(len(dataset), 512))
    attention_mask = np.zeros(shape=(len(dataset), 512))
    answer = np.zeros(shape=(len(dataset)))

    if name == 'boolq':
        for i in range(len(dataset)):
            tensor_features = tokenizer(dataset[i]['question'], dataset[i]['passage'],
                                        stride=128, return_tensors='np', max_length=512,
                                        padding='max_length', truncation=True,
                                        return_overflowing_tokens=True)
            input_ids[i] = tensor_features['input_ids']
            token_type_ids[i] = tensor_features['token_type_ids']
            attention_mask[i] = tensor_features['attention_mask']
            if dataset[i]['answer'] == True:
                answer[i] = 1
            elif dataset[i]['answer'] == False:
                answer[i] = 0
        input_ids = torch.LongTensor(input_ids)
        token_type_ids = torch.LongTensor(token_type_ids)
        attention_mask = torch.LongTensor(attention_mask)
        answer = torch.LongTensor(answer)

    elif name == 'snli' or name == 'mnli':
        # label 0: entailment, label 1: neutral, label 2: contradiction
        for i in tqdm(range(len(dataset))):
            tensor_features = tokenizer(dataset[i]['premise'], dataset[i]['hypothesis'],
                                        return_tensors='np', stride=128, max_length=512,
                                        padding='max_length', truncation=True,
                                        return_overflowing_tokens=True)
            input_ids[i] = tensor_features['input_ids']
            token_type_ids[i] = tensor_features['token_type_ids']
            attention_mask[i] = tensor_features['attention_mask']
            if dataset[i]['label'] == -1:
                answer[i] = 3
            else:
                answer[i] = dataset[i]['label']
        input_ids = torch.LongTensor(input_ids)
        token_type_ids = torch.LongTensor(token_type_ids)
        attention_mask = torch.LongTensor(attention_mask)
        answer = torch.LongTensor(answer)

    return TensorDataset(input_ids, token_type_ids, attention_mask, answer)
ap.add_argument("-data_folder", "--data_folder", required=True, help="Path to the dataset") ap.add_argument("-model_folder", "--model_folder", required=True, help="Path to the model") args = vars(ap.parse_args()) DATA_FOLDER = args['data_folder'] MODEL_FOLDER = args['model_folder'] #Load dataset #dataset = load_dataset(os.path.join(DATA_FOLDER, 'de_politik_news.py'), cache_dir=os.path.join(DATA_FOLDER, '.de-politic-news')) dataset = load_dataset('de_politik_news.py', cache_dir=DATA_FOLDER) #Tokenize test dataset tokenizer = BertTokenizer.from_pretrained('bert-base-german-cased') encoded_test = dataset['test'].map(lambda examples: tokenizer( examples['text'], padding='max_length', truncation=True), batched=True) #Process labels label_dict = { 'far-left': 0, 'center-left': 1, 'center': 2, 'center-right': 3, 'far-right': 4 } encoded_test = encoded_test.map(
def _prepare_ds(split): data = nlp.load_dataset("imdb", split=f"{split}[:{FLAGS.batch_size if FLAGS.debug else f'{FLAGS.percent}%'}]") data = data.map(_tokenize, batched=True) data.set_format(type="torch", columns=["input_ids", "label"]) return data
    'google/electra-small-discriminator')

import random


def tokenize(batch):
    return tokenizer(batch['text'], truncation=True, max_length=256,
                     add_special_tokens=True, padding='max_length',
                     return_attention_mask=True)


train_dataset = load_dataset(
    'json',
    data_files={'train': 'dataset_full_question/quanta_train.json'},
    field='questions')['train']
train_dataset = train_dataset.map(
    lambda example: {'label': [0 if example['difficulty'] == 'School' else 1]})
train_dataset = train_dataset.map(tokenize, batched=True, batch_size=len(train_dataset))
train_dataset.set_format('torch', columns=['input_ids', 'attention_mask', 'label'])

training_args = TrainingArguments(
    output_dir='./results',
    num_train_epochs=2,
    per_device_train_batch_size=16,
    per_device_eval_batch_size=16,
    # warmup_steps=0,
def _prepare_ds(split):
    ds = nlp.load_dataset('imdb', split=f'{split}[:{FLAGS.batch_size if FLAGS.debug else f"{FLAGS.percent}%"}]')
    ds = ds.map(_tokenize, batched=True)
    ds.set_format(type='torch', columns=['input_ids', 'label'])
    return ds
datasets = nlp.list_datasets()
metrics = nlp.list_metrics()

print(f"🤩 Currently {len(datasets)} datasets are available on HuggingFace AWS bucket: \n"
      + '\n'.join(dataset.id for dataset in datasets) + '\n')
print(f"🤩 Currently {len(metrics)} metrics are available on HuggingFace AWS bucket: \n"
      + '\n'.join(metric.id for metric in metrics))

"""## Loading a dataset

### Load Dataset
"""

import nlp

mnli = nlp.load_dataset(path='glue', name='mnli', split='train[:10%]')

"""## What's in a dataset object

### Features and columns
"""

print(mnli.shape)
print(mnli.num_columns)
print(mnli.num_rows)
print(len(mnli))
print(mnli.column_names)
print(mnli.features)

dataset = mnli
print(dataset.features['label'].num_classes)
def main(args_dict=None):
    # See all possible arguments in src/transformers/training_args.py
    # or by passing the --help flag to this script.
    # We now keep distinct sets of args, for a cleaner separation of concerns.
    parser = HfArgumentParser(
        (ModelArguments, DataTrainingArguments, TrainingArguments))

    if args_dict is not None:
        model_args, data_args, training_args = parser.parse_dict(args_dict)
    elif len(sys.argv) == 2 and sys.argv[1].endswith(".json"):
        # If we pass only one argument to the script and it's the path to a json file,
        # let's parse it to get our arguments.
        model_args, data_args, training_args = parser.parse_json_file(
            json_file=os.path.abspath(sys.argv[1]))
    else:
        model_args, data_args, training_args = parser.parse_args_into_dataclasses()

    if (os.path.exists(training_args.output_dir) and os.listdir(training_args.output_dir)
            and training_args.do_train and not training_args.overwrite_output_dir):
        raise ValueError(
            f"Output directory ({training_args.output_dir}) already exists and is not empty. Use --overwrite_output_dir to overcome."
        )

    # Setup logging
    logging.basicConfig(
        format="%(asctime)s - %(levelname)s - %(name)s - %(message)s",
        datefmt="%m/%d/%Y %H:%M:%S",
        level=logging.INFO if training_args.local_rank in [-1, 0] else logging.WARN,
    )
    logger.warning(
        "Process rank: %s, device: %s, n_gpu: %s, distributed training: %s, 16-bits training: %s",
        training_args.local_rank,
        training_args.device,
        training_args.n_gpu,
        bool(training_args.local_rank != -1),
        training_args.fp16,
    )
    logger.info("Training/evaluation parameters %s", training_args)

    # Set seed
    set_seed(training_args.seed)

    # Set project name
    os.environ["WANDB_PROJECT"] = "multilingual_zeroshot"

    num_labels = 3
    labels = ['entailment', 'neutral', 'contradiction']

    # Load pretrained model and tokenizer
    #
    # Distributed training:
    # The .from_pretrained methods guarantee that only one local process can concurrently
    # download model & vocab.
    config = MBartConfig.from_pretrained(
        model_args.config_name if model_args.config_name else model_args.model_name_or_path,
        num_labels=num_labels,
        dropout=model_args.dropout,
        attention_dropout=model_args.attention_dropout,
        finetuning_task="mnli",
        cache_dir=model_args.cache_dir,
    )
    tokenizer = MBartTokenizer.from_pretrained(
        model_args.tokenizer_name if model_args.tokenizer_name else model_args.model_name_or_path,
        cache_dir=model_args.cache_dir,
    )
    model = MBartForSequenceClassification.from_pretrained(
        model_args.model_name_or_path,
        from_tf=bool(".ckpt" in model_args.model_name_or_path),
        config=config,
        cache_dir=model_args.cache_dir,
    )

    # Get datasets
    columns = ['input_ids', 'attention_mask', 'labels']
    map_fn = get_mnli_map_fn(data_args.lang, data_args.max_seq_length, tokenizer)

    train_dataset = nlp.load_dataset("multi_nli", split="train")
    train_dataset = train_dataset.map(map_fn, batched=True, batch_size=512)
    train_dataset.set_format(type='torch', columns=columns)

    eval_dataset = (nlp.load_dataset("multi_nli", split="validation_matched")
                    if training_args.do_eval else None)
    eval_dataset = eval_dataset.map(map_fn, batched=True, batch_size=512)
    eval_dataset.set_format(type='torch', columns=columns)

    def compute_metrics_fn(p: EvalPrediction):
        preds = np.argmax(p.predictions, axis=1)
        return glue_compute_metrics("classification", preds, p.label_ids)

    # Initialize our Trainer
    trainer = Trainer(
        model=model,
        args=training_args,
        train_dataset=train_dataset,
        eval_dataset=eval_dataset,
        compute_metrics=compute_metrics_fn,
        data_collator=DataCollator(tokenizer),
    )

    # disable wandb console logs
    logging.getLogger('wandb.run_manager').setLevel(logging.WARNING)

    # Training
    if training_args.do_train:
        trainer.train(model_path=model_args.model_name_or_path
                      if os.path.isdir(model_args.model_name_or_path) else None)
        trainer.save_model()
        # For convenience, we also re-save the tokenizer to the same directory,
        # so that you can share your model easily on huggingface.co/models =)
        if trainer.is_world_master():
            tokenizer.save_pretrained(training_args.output_dir)

    # Evaluation
    eval_results = {}
    if training_args.do_eval:
        logger.info("*** Evaluate ***")

        # Loop to handle MNLI double evaluation (matched, mis-matched)
        mis_matched_dataset = nlp.load_dataset("multi_nli", split="validation_mismatched")
        mis_matched_dataset = mis_matched_dataset.map(map_fn, batched=True, batch_size=512)
        mis_matched_dataset.set_format(type='torch', columns=columns)
        eval_datasets = [eval_dataset, mis_matched_dataset]

        for eval_dataset in eval_datasets:
            trainer.compute_metrics = compute_metrics_fn
            eval_result = trainer.evaluate(eval_dataset=eval_dataset)

            output_eval_file = os.path.join(training_args.output_dir, "eval_results.txt")
            if trainer.is_world_master():
                with open(output_eval_file, "w") as writer:
                    logger.info("***** Eval results *****")
                    for key, value in eval_result.items():
                        logger.info(" %s = %s", key, value)
                        writer.write("%s = %s\n" % (key, value))

            eval_results.update(eval_result)
        json_entries_new.append(entry)
    else:
        json_entries_new.append(entry)

# Save data
random.seed(42)
random.shuffle(json_entries_new)

test_sources = [
    'jungle.world', 'tichyseinblick.de', 'taz.de', 'blogs.taz.de',
    'vice.com', 'freitag.de', 'deutsch.rt.com', 'br.de',
    'handelsblatt.com', 'stern.de', 'wdr.de'
]

train_entries_new = [
    entry for entry in json_entries_new
    if entry['source_domain'] not in test_sources
]
test_entries_new = [
    entry for entry in json_entries_new
    if entry['source_domain'] in test_sources
]

# ensure_dir('data')
with jsonlines.open('train.jsonl', 'w') as writer_train:
    writer_train.write_all(train_entries_new)
with jsonlines.open('test.jsonl', 'w') as writer_test:
    writer_test.write_all(test_entries_new)

dataset = load_dataset('de_politik_news.py', cache_dir='data')

os.remove('train.jsonl')
os.remove('test.jsonl')
model = BertForSequenceClassification.from_pretrained(
    'models/BERT_full_question')
tokenizer = BertTokenizerFast.from_pretrained('bert-base-uncased')


def tokenize(batch):
    return tokenizer(batch['text'], truncation=True, max_length=256,
                     add_special_tokens=True, padding='max_length',
                     return_attention_mask=True)


test_dataset = load_dataset(
    'json',
    data_files={'test': 'dataset_full_question/quanta_test.json'},
    field='questions')['test']
test_dataset = test_dataset.map(
    lambda example: {'label': [0 if example['difficulty'] == 'School' else 1]})
test_dataset = test_dataset.map(tokenize, batched=True, batch_size=len(test_dataset))
test_dataset.set_format('torch', columns=['input_ids', 'attention_mask', 'label'])


def compute_metrics(pred):
    labels = pred.label_ids
    # print(labels)
    preds = pred.predictions.argmax(-1)
    # print(preds)
    target_encodings = tokenizer.batch_encode_plus(
        example_batch['target_text'], pad_to_max_length=True, max_length=16)

    encodings = {
        'input_ids': input_encodings['input_ids'],
        'attention_mask': input_encodings['attention_mask'],
        'target_ids': target_encodings['input_ids'],
        'target_attention_mask': target_encodings['attention_mask']
    }
    return encodings


# Load the train and validation splits of SQuAD
train_dataset = load_dataset("./squad.py", ignore_verifications=True, split="train")
valid_dataset = load_dataset("./squad.py", split="validation")

# Map the add_eos_to_examples function over the dataset example-wise
train_dataset = train_dataset.map(add_eos_to_examples)
# Map convert_to_features batch-wise
train_dataset = train_dataset.map(convert_to_features, batched=True)

valid_dataset = valid_dataset.map(add_eos_to_examples, load_from_cache_file=False)
valid_dataset = valid_dataset.map(convert_to_features, batched=True, load_from_cache_file=False)

# Set the tensor type and the columns which the dataset should return
from typing import List, Tuple

import nlp
import tqdm

from config import model_config, gpu_name
from generate_counterfactuals import generate_counterfactuals
from search_utils.Query import Query
from search_utils.Result import Result

logger = logging.getLogger(__name__)
info = logger.info

model_config.load("imdb", None)

imdb = nlp.load_dataset("imdb")
imdb_train, imdb_test = imdb["train"], imdb["test"]
dataset = imdb_test.shuffle(seed=42)  # otherwise labels are sorted

try:
    # noinspection PyUnresolvedReferences
    from google.colab import drive
    drive.mount('/content/drive')
except ModuleNotFoundError:
    info("probably not running on colab")

results: List[Tuple[int, Result]] = []
new_start = 0
for enm in tqdm.tqdm(range(new_start, len(dataset))):
import numpy as np
import pandas as pd
from nlp import load_dataset

dataset = load_dataset("social_bias_frames")           # 2020
dataset = load_dataset("hyperpartisan_news_detection")  # 2019
dataset = load_dataset("event2Mind")                    # 2018
dataset = load_dataset("emotion")                       # 2018
dataset = load_dataset("sentiment140")                  # 2009
dataset = load_dataset("squad_v2")
dataset = load_dataset("squadshifts")                   # 2020
dataset = load_dataset("webis/tl_dr")                   # 2017
dataset = load_dataset("wiki_dpr")                      # 2020
dataset = load_dataset("wiki_snippets")
dataset = load_dataset("wikipedia")
dataset = load_dataset("wikitext")                      # 2016
dataset = load_dataset("yelp_polarity")
dataset = load_dataset("newsroom")                      # 2018
dataset = load_dataset("multi_news")                    # 2019
dataset = load_dataset("imdb")                          # 2011
dataset = load_dataset("cnn_dailymail")                 # 2017
dataset = load_dataset("civil_comments")                # 2017

# Maybe
dataset = load_dataset("social_i_qa")
dataset = load_dataset("sogou_news")
dataset = load_dataset("super_glue")                    # 2019
dataset = load_dataset("rotten_tomatoes")               # 2005
dataset = load_dataset("reddit")                        # 2017
dataset = load_dataset("quora")
dataset = load_dataset("opinosis")                      # 2010
def main():
    parser = HfArgumentParser((DataTrainingArguments,))
    data_args = parser.parse_args_into_dataclasses()[0]

    logging.basicConfig(
        format="%(asctime)s - %(levelname)s - %(name)s - %(message)s",
        datefmt="%m/%d/%Y %H:%M:%S",
        level=logging.INFO
    )

    if data_args.model_type == 't5':
        tokenizer = T5Tokenizer.from_pretrained("t5-base")
    else:
        # NOTE: the original line loaded T5Tokenizer with the checkpoint name "bart-base",
        # which is not a valid hub id; a BART tokenizer is presumably intended here
        # (BartTokenizer must be imported alongside T5Tokenizer).
        tokenizer = BartTokenizer.from_pretrained("facebook/bart-base")

    tokenizer.add_tokens(['<sep>', '<hl>'])

    train_dataset = nlp.load_dataset(data_args.dataset_path, name=data_args.qg_format, split=nlp.Split.TRAIN)
    valid_dataset = nlp.load_dataset(data_args.dataset_path, name=data_args.qg_format, split=nlp.Split.VALIDATION)

    processor = DataProcessor(
        tokenizer,
        model_type=data_args.model_type,
        max_source_length=data_args.max_source_length,
        max_target_length=data_args.max_target_length
    )

    train_dataset = train_dataset.filter(TASK_TO_FILTER_FN[data_args.task])
    if data_args.task == 'multi' and data_args.valid_for_qg_only:
        logger.info("processing valid data only for qg task")
        valid_dataset = valid_dataset.filter(filter_qg)
    else:
        valid_dataset = valid_dataset.filter(TASK_TO_FILTER_FN[data_args.task])

    train_dataset = processor.process(train_dataset)
    valid_dataset = processor.process(valid_dataset)

    columns = ["source_ids", "target_ids", "attention_mask"]
    train_dataset.set_format(type='torch', columns=columns)
    valid_dataset.set_format(type='torch', columns=columns)

    if data_args.train_file_name is None:
        train_file_name = f"train_data_{data_args.task}_{data_args.qg_format}_{data_args.model_type}.pt"
        train_path = os.path.join("data", train_file_name)

        valid_file_name = f"valid_data_{data_args.task}_{data_args.qg_format}_{data_args.model_type}.pt"
        valid_path = os.path.join("data", valid_file_name)
    else:
        train_path = os.path.join("data", data_args.train_file_name)
        valid_path = os.path.join("data", data_args.valid_file_name)

    torch.save(train_dataset, train_path)
    logger.info(f"saved train dataset at {train_path}")

    torch.save(valid_dataset, valid_path)
    logger.info(f"saved validation dataset at {valid_path}")

    tokenizer_path = f"{data_args.model_type}_qg_tokenizer"
    if not os.path.exists(tokenizer_path):
        os.mkdir(tokenizer_path)
    tokenizer.save_pretrained(tokenizer_path)
    logger.info(f"saved tokenizer at {tokenizer_path}")
    token_type = [0] * len(tokens)
    if tokens_b:
        tokens += [*tokens_b, hf_tokenizer.sep_token]
        token_type += [1] * (len(tokens_b) + 1)
    example['inp_ids'] = hf_tokenizer.convert_tokens_to_ids(tokens)
    example['attn_mask'] = [1] * len(tokens)
    example['token_type_ids'] = token_type
    return example

# %%
glue_dsets = {}
glue_dls = {}
for task in ['cola', 'sst2', 'mrpc', 'qqp', 'stsb', 'mnli', 'qnli', 'rte', 'wnli', 'ax']:

    # Load / download datasets.
    dsets = nlp.load_dataset('glue', task, cache_dir='./datasets')

    # There are two broken samples in the QQP training set
    if task == 'qqp':
        dsets['train'] = dsets['train'].filter(
            lambda e: e['question2'] != '',
            cache_file_name='./datasets/glue/qqp/1.0.0/fixed_train.arrow')

    # Load / make tokenized datasets
    tok_func = partial(tokenize_sents_max_len, cols=TEXT_COLS[task], max_len=c.max_length)
    glue_dsets[task] = dsets.my_map(tok_func, cache_file_names=f"tokenized_{c.max_length}_{{split}}")

    if c.double_unordered and task in ['mrpc', 'stsb']:
        swap_tok_func = partial(tokenize_sents_max_len, cols=TEXT_COLS[task],
                                max_len=c.max_length, swap=True)
        swapped_train = dsets['train'].my_map(
            swap_tok_func, cache_file_name=f"swapped_tokenized_{c.max_length}_train")
        glue_dsets[task]['train'] = HF_MergedDataset(glue_dsets[task]['train'], swapped_train)
def _prepare_ds(split):
    # Load the dataset from the nlp library
    ds = nlp.load_dataset(FLAGS.dataset, split=f'{split}')
    ds = ds.map(_tokenize, batched=True)
    # Output sample type
    ds.set_format(type='torch', columns=['input_ids', 'label'])
    return ds
print(f"🤩 Currently {len(datasets)} datasets are available on HuggingFace AWS bucket: \n" + '\n'.join(dataset.id for dataset in datasets) + '\n') print(f"🤩 Currently {len(metrics)} metrics are available on HuggingFace AWS bucket: \n" + '\n'.join(metric.id for metric in metrics)) # You can read a few attributes of the datasets before loading them (they are python dataclasses) from dataclasses import asdict for key, value in asdict(datasets[6]).items(): print('👉 ' + key + ': ' + str(value)) """## An example with SQuAD""" # Downloading and loading a dataset dataset = nlp.load_dataset('squad', split='validation[:10%]') """This call to `nlp.load_dataset()` does the following steps under the hood: 1. Download and import in the library the **SQuAD python processing script** from HuggingFace AWS bucket if it's not already stored in the library. You can find the SQuAD processing script [here](https://github.com/huggingface/nlp/tree/master/datasets/squad/squad.py) for instance. Processing scripts are small python scripts which define the info (citation, description) and format of the dataset and contain the URL to the original SQuAD JSON files and the code to load examples from the original SQuAD JSON files. 2. Run the SQuAD python processing script which will: - **Download the SQuAD dataset** from the original URL (see the script) if it's not already downloaded and cached. - **Process and cache** all SQuAD in a structured Arrow table for each standard splits stored on the drive. Arrow table are arbitrarly long tables, typed with types that can be mapped to numpy/pandas/python standard types and can store nested objects. They can be directly access from drive, loaded in RAM or even streamed over the web.
def train(args):
    train_data, dev_data = nlp.load_dataset(
        'math_dataset',
        split=['train[:95%]', 'train[95%:]'],
        cache_dir="D:/.cache/nlp/")

    vocab = Vocab.load(args['--vocab'])

    embed_size = int(args['--embed-size'])
    hidden_size = int(args['--hidden-size'])
    dropout_rate = float(args['--dropout'])
    train_batch_size = int(args['--batch-size'])
    clip_grad = float(args['--clip-grad'])
    valid_niter = int(args['--valid-niter'])
    log_every = int(args['--log-every'])
    model_save_path = args['--save-to']

    model = MathBaselineLSTMwAttention(embed_size, hidden_size, vocab, dropout_rate)
    size = get_size_of_model(model)
    print("model size %d parameters" % size)
    model.train()

    uniform_init = float(args['--uniform-init'])
    if np.abs(uniform_init) > 0.:
        print('uniformly initialize parameters [-%f, +%f]' % (uniform_init, uniform_init), file=sys.stderr)
        for p in model.parameters():
            p.data.uniform_(-uniform_init, uniform_init)
    else:
        print('Xavier initialize parameters', file=sys.stderr)
        for p in model.parameters():
            p.data.xavier_uniform_()

    device = torch.device("cuda:0" if args['--cuda'] else "cpu")
    print('use device: %s' % device, file=sys.stderr)
    model = model.to(device)

    optimizer = torch.optim.Adam(model.parameters(), lr=float(args['--lr']))

    num_trial = 0
    train_iter = patience = cum_loss = report_loss = cum_tgt_words = report_tgt_words = 0
    cum_examples = report_examples = epoch = valid_num = 0
    hist_valid_scores = []
    train_time = begin_time = time.time()
    print('begin Maximum Likelihood training')

    while True:
        epoch += 1

        for src_sents, tgt_sents in batch_iter(train_data, batch_size=train_batch_size, shuffle=True):
            train_iter += 1

            optimizer.zero_grad()

            batch_size = len(src_sents)

            example_losses = -model(src_sents, tgt_sents)  # (batch_size,)
            batch_loss = example_losses.sum()
            loss = batch_loss / batch_size

            loss.backward()

            # clip gradient
            grad_norm = torch.nn.utils.clip_grad_norm_(model.parameters(), clip_grad)

            optimizer.step()

            batch_losses_val = batch_loss.item()
            report_loss += batch_losses_val
            cum_loss += batch_losses_val

            tgt_words_num_to_predict = sum(len(s[1:]) for s in tgt_sents)  # omitting leading `<sos>`
            report_tgt_words += tgt_words_num_to_predict
            cum_tgt_words += tgt_words_num_to_predict
            report_examples += batch_size
            cum_examples += batch_size

            if train_iter % log_every == 0:
                print('epoch %d, iter %d, avg. loss %.2f, avg. ppl %.2f '
                      'cum. examples %d, speed %.2f words/sec, time elapsed %.2f sec'
                      % (epoch, train_iter,
                         report_loss / report_examples,
                         math.exp(report_loss / report_tgt_words),
                         cum_examples,
                         report_tgt_words / (time.time() - train_time),
                         time.time() - begin_time), file=sys.stderr)

                train_time = time.time()
                report_loss = report_tgt_words = report_examples = 0.

            # perform validation
            if train_iter % valid_niter == 0:
                print('epoch %d, iter %d, cum. loss %.2f, cum. ppl %.2f cum. examples %d'
                      % (epoch, train_iter,
                         cum_loss / cum_examples,
                         np.exp(cum_loss / cum_tgt_words),
                         cum_examples), file=sys.stderr)

                cum_loss = cum_examples = cum_tgt_words = 0.
                valid_num += 1

                print('begin validation ...', file=sys.stderr)

                # compute dev. ppl and bleu
                dev_ppl = evaluate_ppl(model, dev_data, batch_size=128)  # dev batch size can be a bit larger
                valid_metric = -dev_ppl

                print('validation: iter %d, dev. ppl %f' % (train_iter, dev_ppl), file=sys.stderr)

                is_better = len(hist_valid_scores) == 0 or valid_metric > max(hist_valid_scores)
                hist_valid_scores.append(valid_metric)

                if is_better:
                    patience = 0
                    print('save currently the best model to [%s]' % model_save_path, file=sys.stderr)
                    model.save(model_save_path)

                    # also save the optimizers' state
                    torch.save(optimizer.state_dict(), model_save_path + '.optim')
                elif patience < int(args['--patience']):
                    patience += 1
                    print('hit patience %d' % patience, file=sys.stderr)

                    if patience == int(args['--patience']):
                        num_trial += 1
                        print('hit #%d trial' % num_trial, file=sys.stderr)
                        if num_trial == int(args['--max-num-trial']):
                            print('early stop!', file=sys.stderr)
                            exit(0)

                        # decay lr, and restore from previously best checkpoint
                        lr = optimizer.param_groups[0]['lr'] * float(args['--lr-decay'])
                        print('load previously best model and decay learning rate to %f' % lr, file=sys.stderr)

                        # load model
                        params = torch.load(model_save_path, map_location=lambda storage, loc: storage)
                        model.load_state_dict(params['state_dict'])
                        model = model.to(device)

                        print('restore parameters of the optimizers', file=sys.stderr)
                        optimizer.load_state_dict(torch.load(model_save_path + '.optim'))

                        # set new lr
                        for param_group in optimizer.param_groups:
                            param_group['lr'] = lr

                        # reset patience
                        patience = 0

                if epoch == int(args['--max-epoch']):
                    print('reached maximum number of epochs!', file=sys.stderr)
                    exit(0)
    def __init__(self, max_len, batch_size, max_epochs, device, unsup_proportion, sup_proportion,
                 dev_index=1, pretrained=False):
        text_field = data.Field(
            lower=True,
            batch_first=True,
            fix_length=max_len,
            pad_token='<pad>',
            init_token='<go>',
            is_target=True
        )  # init_token='<go>', eos_token='<eos>', unk_token='<unk>', pad_token='<unk>')
        label_field = data.Field(fix_length=max_len - 1, batch_first=True, unk_token=None)

        print('Current working directory:', os.getcwd())
        yelp_data = load_dataset('csv',
                                 data_files={
                                     'train': os.path.join('.data', 'yelp', 'train.csv'),
                                     'test': os.path.join('.data', 'yelp', 'test.csv')
                                 },
                                 column_names=['label', 'text'],
                                 version='0.0.2')  # download_mode=FORCE_REDOWNLOAD)
        start = time()
        train_data, test_data = yelp_data['train'], yelp_data['test']

        def expand_labels(datum):
            datum['label'] = [str(datum['label'])] * (max_len - 1)
            return datum

        lens = [len(sample['text'].split(' ')) for sample in train_data]
        train_data, test_data = train_data.map(expand_labels), test_data.map(expand_labels)

        fields1 = {'text': text_field, 'label': label_field}
        fields2 = {'text': ('text', text_field), 'label': ('label', label_field)}
        fields3 = {'text': text_field}
        fields4 = {'text': ('text', text_field)}

        len_train = int(len(train_data) / 3)
        dev_start, dev_end = int(len_train / 5 * (dev_index - 1)), int(len_train / 5 * dev_index)
        train_start1, train_start2 = 0, dev_end
        train_end1 = int(dev_start * sup_proportion)
        train_end2 = int(dev_end + (len_train - dev_end) * sup_proportion)
        unsup_start, unsup_end = len_train, int(len_train + len_train * 2 * unsup_proportion)

        # Since the datasets are originally sorted with the label as key, we shuffle them before
        # reducing the supervised or the unsupervised data to the first few examples. We use a
        # fixed seed to keep the same data for all experiments.
        np.random.seed(42)
        train_examples = [Example.fromdict(ex, fields2) for ex in train_data]
        unsup_examples = [Example.fromdict(ex, fields4) for ex in train_data]
        np.random.shuffle(train_examples)
        np.random.shuffle(unsup_examples)

        train = Dataset(train_examples[train_start1:train_end1]
                        + train_examples[train_start2:train_end2], fields1)
        val = Dataset(train_examples[dev_start:dev_end], fields1)
        test = Dataset([Example.fromdict(ex, fields2) for ex in test_data], fields1)
        unsup_train = Dataset(unsup_examples[unsup_start:unsup_end], fields3)
        vocab_dataset = Dataset(train_examples, fields1)
        unsup_test, unsup_val = test, test
        print('data loading took', time() - start)

        # build the vocabulary
        text_field.build_vocab(vocab_dataset, max_size=VOCAB_LIMIT)  # , vectors="fasttext.simple.300d")
        label_field.build_vocab(train)

        # make iterator for splits
        self.train_iter, _, _ = data.BucketIterator.splits(
            (unsup_train, unsup_val, unsup_test),
            batch_size=batch_size, device=device, shuffle=True, sort=False)
        _, self.unsup_val_iter, _ = data.BucketIterator.splits(
            (unsup_train, unsup_val, unsup_test),
            batch_size=int(batch_size), device=device, shuffle=False, sort=False)
        self.sup_iter, _, _ = data.BucketIterator.splits(
            (train, val, test),
            batch_size=batch_size, device=device, shuffle=True, sort=False)
        _, self.val_iter, self.test_iter = data.BucketIterator.splits(
            (train, val, test),
            batch_size=int(batch_size), device=device, shuffle=False, sort=False)

        self.vocab = text_field.vocab
        self.tags = label_field.vocab
        self.text_field = text_field
        self.label_field = label_field
        self.device = device
        self.batch_size = batch_size
        self.n_epochs = 0
        self.max_epochs = max_epochs

        if pretrained:
            ftxt = FastText()
            self.wvs = ftxt.get_vecs_by_tokens(self.vocab.itos)
        else:
            self.wvs = None
    def __init__(self):
        self.dataset = nlp.load_dataset('boolq')
        self.counter = 0
disc_hparam = electra_hparam_from_hf(disc_config, hf_tokenizer)
disc_hparam.update(hparam_update)

# %% [markdown]
# # 1. Load Data

# %%
if c.size in ['small', 'base']:

    # wiki
    if (wiki_cache_dir / f"wiki_electra_{c.max_length}.arrow").exists():
        print('loading the electra data (wiki)')
        wiki = nlp.Dataset.from_file(str(wiki_cache_dir / f"wiki_electra_{c.max_length}.arrow"))
    else:
        print('load/download wiki dataset')
        wiki = nlp.load_dataset('wikipedia', '20200501.en', cache_dir='./datasets')['train']
        print('create data from wiki dataset for ELECTRA')
        wiki = ELECTRADataTransform(
            wiki, is_docs=True, text_col='text', max_length=c.max_length, hf_toker=hf_tokenizer
        ).map(cache_file_name=str(wiki_cache_dir / f"wiki_electra_{c.max_length}.arrow"))

    # bookcorpus
    if (book_cache_dir / f"book_electra_{c.max_length}.arrow").exists():
        print('loading the electra data (BookCorpus)')
        book = nlp.Dataset.from_file(str(book_cache_dir / f"book_electra_{c.max_length}.arrow"))
    else:
        print('load/download BookCorpus dataset')
        book = nlp.load_dataset('bookcorpus', cache_dir='./datasets')['train']
        print('create data from BookCorpus dataset for ELECTRA')
        book = ELECTRADataTransform(
            book, is_docs=False, text_col='text', max_length=c.max_length, hf_toker=hf_tokenizer
        ).map(cache_file_name=str(book_cache_dir / f"book_electra_{c.max_length}.arrow"))
def main():
    batch_size = 4
    vocab_size = 16384
    max_source_length = 1024
    max_target_length = 1024
    num_workers = 3

    dataset = nlp.load_dataset("iwslt2017.py", "nl-en")

    # Train tokenizer
    tokenizer_filename = "tokenizer.json"
    if os.path.exists(tokenizer_filename):
        tokenizer = Tokenizer.from_file(tokenizer_filename)
    else:
        data_filename = "whole_data.txt"
        with open(data_filename, "w") as f:
            for item in dataset["train"]:
                f.write(item["source"] + "\n")
                f.write(item["target"] + "\n\n")
        tokenizer = CharBPETokenizer()
        tokenizer.train([data_filename], vocab_size=vocab_size)
        pad_token = AddedToken("[PAD]", lstrip=False, rstrip=False)
        tokenizer.add_tokens([pad_token])
        tokenizer.save(tokenizer_filename)
    tokenizer.pad_token_id = vocab_size

    # Loaders
    train_dataset = Seq2SeqDataset(tokenizer, dataset["train"], max_source_length, max_target_length)
    val_dataset = Seq2SeqDataset(tokenizer, dataset["validation"], max_source_length, max_target_length)
    train_loader = DataLoader(
        train_dataset,
        batch_size=batch_size,
        shuffle=True,
        collate_fn=train_dataset.collate_fn,
        num_workers=num_workers,
    )
    val_loader = DataLoader(
        val_dataset,
        batch_size=batch_size,
        collate_fn=val_dataset.collate_fn,
        num_workers=num_workers,
    )

    # Train model
    config = BartConfig(
        vocab_size=vocab_size + 1,  # Pad
        d_model=1024,
        encoder_ffn_dim=1024,
        encoder_layers=6,
        encoder_attention_heads=4,
        decoder_ffn_dim=1024,
        decoder_layers=6,
        decoder_attention_heads=4,
    )
    model = BartForConditionalGeneration(config)
    translator = Translate(model, tokenizer)
    trainer = pl.Trainer(gpus=1)
    trainer.fit(translator, train_loader, val_loader)
                                 num_beams=4, early_stopping=True)
    else:
        outputs = model.generate(inputs, max_length=max_length, min_length=min_length,
                                 length_penalty=2.0, num_beams=4, early_stopping=True)
    return tokenizer.decode(outputs[0], skip_special_tokens=True, clean_up_tokenization_spaces=False)


en_dataset = load_dataset('en_wiki_multi_news.py', cache_dir='dataset/.en-wiki-multi-news-cache', split='test')
de_dataset = load_dataset('de_wiki_multi_news.py', cache_dir='dataset/.de-wiki-multi-news-cache', split='test')
fr_dataset = load_dataset('fr_wiki_multi_news.py', cache_dir='dataset/.fr-wiki-multi-news-cache', split='test')

# Part 0: Title
st.markdown("""
# Generate long summaries in multiple languages using Transformers

We use BART or T5 to generate long summaries in English, German or French.
This application is a demo of a research paper.
plt.yticks(list(range(0, 6)), labels=classes)
plt.colorbar(ctx)
plt.show()

print('Using TensorFlow version', tf.__version__)


# ## Task 3: Importing Data
#
# 1. Importing the Tweet Emotion dataset
# 2. Creating train, validation and test sets
# 3. Extracting tweets and labels from the examples

# In[2]:

dataset = nlp.load_dataset('emotion')

# In[3]:

train = dataset['train']
val = dataset['validation']
test = dataset['test']

# In[4]:

def get_tweet(data):
    tweets = [x['text'] for x in data]
    labels = [x['label'] for x in data]
    return tweets, labels
def main(
    output_filepath: str,
    min_length: Optional[int] = None,
    max_documents: Optional[int] = None,
    pretrained_model_name_or_path: Optional[str] = None,
) -> None:
    """Lightly pre-processes the "scientific_papers" dataset (PubMed and ArXiv articles) loaded
    via the HuggingFace `nlp` library.

    If `min_length` is not None, only documents with at least this many tokens are retained.
    If `pretrained_model_name_or_path` is not None, the tokenizer will be loaded as
    `AutoTokenizer.from_pretrained(pretrained_model_name_or_path)` using the HuggingFace
    Transformers library. Otherwise `str.split()` is used. This argument has no effect if
    `min_length` is None.
    """
    # Collect the raw text from the "scientific_papers" dataset.
    pubmed = nlp.load_dataset("scientific_papers", "pubmed")
    arxiv = nlp.load_dataset("scientific_papers", "arxiv")

    # Create a generator over both datasets to avoid storing things in memory.
    pubmed_text = (article["article"] for partition in pubmed.values() for article in partition)
    arxiv_text = (article["article"] for partition in arxiv.values() for article in partition)
    scientific_text = itertools.chain(pubmed_text, arxiv_text)

    # Setup the pre-trained tokenizer, if specified.
    if min_length is not None:
        if pretrained_model_name_or_path is not None:
            # Import transformers here to prevent ImportError errors if the
            # user doesn't want to use it.
            from transformers import AutoTokenizer

            tokenizer = AutoTokenizer.from_pretrained(pretrained_model_name_or_path).tokenize
        else:
            tokenizer = lambda x: x.split()  # noqa
    else:
        tokenizer = None

    documents = []
    typer.secho(
        (f' {MINING} Scraping {max_documents or "all"} documents'
         f' {f"with a minimum token length of {min_length}" if min_length else ""}'),
        fg=typer.colors.WHITE,
        bold=True,
    )

    with typer.progressbar(scientific_text, label="Preprocessing text") as progress:
        for doc in progress:
            doc = _sanitize(doc)
            if not doc:
                continue

            # Retain only documents whose token count is equal to or greater than
            # the minimum specified length.
            if tokenizer is not None:
                num_tokens = len(tokenizer(doc))
                if num_tokens < min_length:
                    continue

            documents.append(doc)

    _write_output_to_disk(documents, output_filepath)