Example #1
        padding='max_length',
        max_length=arguments['target_max_len'])

    encodings = {
        'input_ids': input_encodings['input_ids'],
        'attention_mask': input_encodings['attention_mask'],
        'target_ids': target_encodings['input_ids'],
        'target_attention_mask': target_encodings['attention_mask']
    }

    return encodings


print('Getting data from huggingface datasets')
# Use the following to load only a percentage of data for sample efficiency tests
train_dataset = nlp.load_dataset(arguments['dataset_name'],
                                 split='train[:60%]')
valid_dataset = nlp.load_dataset(arguments['dataset_name'],
                                 split='validation[:100%]')

#train_dataset = nlp.load_dataset(arguments['dataset_name'], split = nlp.Split.TRAIN)
#valid_dataset = nlp.load_dataset(arguments['dataset_name'], split = nlp.Split.VALIDATION)

train_dataset = train_dataset.map(format_example, load_from_cache_file=False)
train_dataset = train_dataset.map(convert_to_features,
                                  batched=True,
                                  load_from_cache_file=False)

valid_dataset = valid_dataset.map(format_example, load_from_cache_file=False)
valid_dataset = valid_dataset.map(convert_to_features,
                                  batched=True,
                                  load_from_cache_file=False)
), f"Dataset name '{args.dataset}' should be part of the TFRecords directory name '{args.tfrecords_dir}', don't mix datasets!"
assert (
    args.skip_tfrecords or str(args.max_seq_length) in args.tfrecords_dir
), f"Sequence length '{args.max_seq_length}' should be part of the TFRecords directory name '{args.tfrecords_dir}', don't mix datasets!"

if not os.path.exists(args.cache_dir):
    os.makedirs(args.cache_dir, exist_ok=True)
if not args.skip_tfrecords and not os.path.exists(args.tfrecords_dir):
    os.makedirs(args.tfrecords_dir, exist_ok=True)

start_time = time.perf_counter()

print(f"Loading dataset: {args.dataset}")
if args.dataset.startswith("wikitext"):
    dset = nlp.load_dataset("wikitext",
                            f"{args.dataset}-raw-v1",
                            split="train",
                            cache_dir=args.cache_dir)
elif args.dataset == "wikipedia":
    dset = nlp.load_dataset("wikipedia",
                            "20200501.en",
                            split="train",
                            cache_dir=args.cache_dir)
    dset.drop(columns=["title"])
    dset.features.pop("title")
elif args.dataset == "bookcorpus":
    dset = nlp.load_dataset("bookcorpus",
                            split="train",
                            cache_dir=args.cache_dir)
elif args.dataset == "wikibooks":
    dset_wikipedia = nlp.load_dataset("wikipedia",
                                      "20200501.en",
Example #3
        max_length=2)

    encodings = {
        'input_ids': input_encodings['input_ids'],
        'attention_mask': input_encodings['attention_mask'],
        'target_ids': target_encodings['input_ids'],
        'target_attention_mask': target_encodings['attention_mask']
    }

    return encodings


print('Loading commonsense_qa train and valid datasets')

# Load the commonsense_qa datasets
train_dataset = nlp.load_dataset('commonsense_qa', split=nlp.Split.TRAIN)
valid_dataset = nlp.load_dataset('commonsense_qa', split=nlp.Split.VALIDATION)

# Load the commonsense_qa concept sentences
train_concept_sentences = get_sentences(TRAIN_SENTENCES_FILE, train_dataset)
valid_concept_sentences = get_sentences(VALID_SENTENCES_FILE, valid_dataset)

train_dataset = train_dataset.map(format_example_train,
                                  with_indices=True,
                                  load_from_cache_file=False)
train_dataset = train_dataset.map(convert_to_features,
                                  batched=True,
                                  load_from_cache_file=False)

valid_dataset = valid_dataset.map(format_example_valid,
                                  with_indices=True,
Example #4
from transformers import BertForSequenceClassification, BertTokenizerFast, Trainer, TrainingArguments
from nlp import load_dataset
import torch
from config import *

model = BertForSequenceClassification.from_pretrained('bert-base-uncased')
tokenizer = BertTokenizerFast.from_pretrained('bert-base-uncased')


def tokenize(batch):
    return tokenizer(batch['text'], padding=True, truncation=True)


train_dataset, test_dataset = load_dataset('imdb', split=['train', 'test'])
train_dataset = train_dataset.map(tokenize,
                                  batched=True,
                                  batch_size=len(train_dataset))
test_dataset = test_dataset.map(tokenize,
                                batched=True,
                                batch_size=len(test_dataset))
train_dataset.set_format('torch',
                         columns=['input_ids', 'attention_mask', 'label'])
test_dataset.set_format('torch',
                        columns=['input_ids', 'attention_mask', 'label'])

training_args = TrainingArguments(
    output_dir='./results',
    num_train_epochs=EPOCHS,
    per_device_train_batch_size=BATCH_SIZE,
    per_device_eval_batch_size=BATCH_SIZE,
    warmup_steps=500,
Example #5
File: data_utils.py Project: yyht/smyrf
 def __init__(self):
     self.dataset = nlp.load_dataset('imdb')
     self.counter = 0
Example #6
import nlp

ELI5 = nlp.load_dataset('eli5')
WIKI40B_SNIPPETS = nlp.load_dataset('wiki_snippets', name='wiki40b_en_100_0')['train']
SAVED_RETRIEVER = "retriever_models/eli5_retriever_model_l-8_h-768_b-512-512"
SAVED_REPRESENTATIONS = "wiki_index/wiki40b_passages_reps_32_l-8_h-768_b-512-512.dat"


class ArgumentsQAR():
    def __init__(self):
        self.batch_size = 512
        self.max_length = 128
        self.checkpoint_batch_size = 32
        self.print_freq = 100
        self.pretrained_model_name = "google/bert_uncased_L-8_H-768_A-12"
        self.model_save_name = SAVED_RETRIEVER
        self.learning_rate = 2e-4
        self.num_epochs = 10
Example #7
def get_dataset(name, tokenizer, split):     
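    # Tokenize every example to a fixed length of 512 tokens and return a
    # TensorDataset of (input_ids, token_type_ids, attention_mask, answer).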
    if name == 'mnli':
        dataset = load_dataset('glue','mnli', split=split)
    else:
        dataset = load_dataset(name, split=split)      

    input_ids = np.zeros(shape=(len(dataset),512))  
    token_type_ids = np.zeros(shape=(len(dataset),512)) 
    attention_mask = np.zeros(shape=(len(dataset),512))  
    answer = np.zeros(shape=(len(dataset)))
    # input_ids = []       
    # token_type_ids = []
    # attention_mask = []    
    # answer = []

    if name=='boolq':
        for i in range(len(dataset)):         
            tensor_features = tokenizer(dataset[i]['question'], dataset[i]['passage'], stride=128, return_tensors='np', max_length=512, padding='max_length', truncation=True, return_overflowing_tokens=True)
            input_ids[i] = tensor_features['input_ids']                           
            token_type_ids[i] = tensor_features['token_type_ids']                      
            attention_mask[i] = tensor_features['attention_mask']
            # .append() gets slower and slower (hence the preallocated arrays): https://hant-kb.kutu66.com/others/post_544244
            

            if dataset[i]['answer']==True:             
                # answer.append(1) 
                answer[i] = 1        
            elif dataset[i]['answer']==False:             
                # answer.append(0)
                answer[i] = 0
            
            # if i == 1000:                 
            #     break
        input_ids = torch.LongTensor(input_ids)     
        token_type_ids = torch.LongTensor(token_type_ids)      
        attention_mask = torch.LongTensor(attention_mask)  
        answer = torch.LongTensor(answer) 

    elif name=='snli' or name=='mnli':
        # label 0: entailment, label 1: neutral, label 2: contradiction
        for i in tqdm(range(len(dataset))):
            tensor_features = tokenizer(dataset[i]['premise'], dataset[i]['hypothesis'], return_tensors='np', stride=128, max_length=512, padding='max_length', truncation=True, return_overflowing_tokens=True)
            
            input_ids[i] = tensor_features['input_ids']                           
            token_type_ids[i] = tensor_features['token_type_ids']                      
            attention_mask[i] = tensor_features['attention_mask']

            if dataset[i]['label']==-1:
                answer[i] = 3
            else:
                answer[i] = dataset[i]['label']

            # if i == 1000:
            #     break


        input_ids = torch.LongTensor(input_ids)              
        token_type_ids = torch.LongTensor(token_type_ids)               
        attention_mask = torch.LongTensor(attention_mask)         
        answer = torch.LongTensor(answer)
     
        

    return TensorDataset(input_ids, token_type_ids, attention_mask, answer)
Example #8
ap.add_argument("-data_folder",
                "--data_folder",
                required=True,
                help="Path to the dataset")
ap.add_argument("-model_folder",
                "--model_folder",
                required=True,
                help="Path to the model")

args = vars(ap.parse_args())

DATA_FOLDER = args['data_folder']
MODEL_FOLDER = args['model_folder']
#Load dataset
#dataset = load_dataset(os.path.join(DATA_FOLDER, 'de_politik_news.py'), cache_dir=os.path.join(DATA_FOLDER, '.de-politic-news'))
dataset = load_dataset('de_politik_news.py', cache_dir=DATA_FOLDER)
#Tokenize test dataset
tokenizer = BertTokenizer.from_pretrained('bert-base-german-cased')
encoded_test = dataset['test'].map(lambda examples: tokenizer(
    examples['text'], padding='max_length', truncation=True),
                                   batched=True)

#Process labels
label_dict = {
    'far-left': 0,
    'center-left': 1,
    'center': 2,
    'center-right': 3,
    'far-right': 4
}
encoded_test = encoded_test.map(
 def _prepare_ds(split):
     data = nlp.load_dataset("imdb", split=f"{split}[:{FLAGS.batch_size if FLAGS.debug else f'{FLAGS.percent}%'}]")
     data = data.map(_tokenize, batched=True)
     data.set_format(type="torch", columns=["input_ids", "label"])
     return data
    'google/electra-small-discriminator')

import random


def tokenize(batch):
    return tokenizer(batch['text'],
                     truncation=True,
                     max_length=256,
                     add_special_tokens=True,
                     padding='max_length',
                     return_attention_mask=True)


train_dataset = load_dataset(
    'json',
    data_files={'train': 'dataset_full_question/quanta_train.json'},
    field='questions')['train']
train_dataset = train_dataset.map(
    lambda example: {'label': [0 if example['difficulty'] == 'School' else 1]})
train_dataset = train_dataset.map(tokenize,
                                  batched=True,
                                  batch_size=len(train_dataset))
train_dataset.set_format('torch',
                         columns=['input_ids', 'attention_mask', 'label'])

training_args = TrainingArguments(
    output_dir='./results',
    num_train_epochs=2,
    per_device_train_batch_size=16,
    per_device_eval_batch_size=16,
    # warmup_steps = 0,
Example #11
 def _prepare_ds(split):
     ds = nlp.load_dataset('imdb', split=f'{split}[:{FLAGS.batch_size if FLAGS.debug else f"{FLAGS.percent}%"}]')
     ds = ds.map(_tokenize, batched=True)
     ds.set_format(type='torch', columns=['input_ids', 'label'])
     return ds
Example #12
datasets = nlp.list_datasets()
metrics = nlp.list_metrics()

print(f"🤩 Currently {len(datasets)} datasets are available on HuggingFace AWS bucket: \n" 
      + '\n'.join(dataset.id for dataset in datasets) + '\n')
print(f"🤩 Currently {len(metrics)} metrics are available on HuggingFace AWS bucket: \n" 
      + '\n'.join(metric.id for metric in metrics))

"""## Loading a dataset

### Load Dataset
"""

import nlp

mnli = nlp.load_dataset(path='glue', name='mnli', split='train[:10%]')

"""## Whats in a dataset object

### Features and columns
"""

print(mnli.shape)
print(mnli.num_columns)
print(mnli.num_rows)
print(len(mnli))
print(mnli.column_names)
print(mnli.features)

dataset = mnli
print(dataset.features['label'].num_classes)
def main(args_dict=None):
    # See all possible arguments in src/transformers/training_args.py
    # or by passing the --help flag to this script.
    # We now keep distinct sets of args, for a cleaner separation of concerns.

    parser = HfArgumentParser(
        (ModelArguments, DataTrainingArguments, TrainingArguments))

    if args_dict is not None:
        model_args, data_args, training_args = parser.parse_dict(args_dict)
    elif len(sys.argv) == 2 and sys.argv[1].endswith(".json"):
        # If we pass only one argument to the script and it's the path to a json file,
        # let's parse it to get our arguments.
        model_args, data_args, training_args = parser.parse_json_file(
            json_file=os.path.abspath(sys.argv[1]))
    else:
        model_args, data_args, training_args = parser.parse_args_into_dataclasses(
        )

    if (os.path.exists(training_args.output_dir)
            and os.listdir(training_args.output_dir) and training_args.do_train
            and not training_args.overwrite_output_dir):
        raise ValueError(
            f"Output directory ({training_args.output_dir}) already exists and is not empty. Use --overwrite_output_dir to overcome."
        )

    # Setup logging
    logging.basicConfig(
        format="%(asctime)s - %(levelname)s - %(name)s -   %(message)s",
        datefmt="%m/%d/%Y %H:%M:%S",
        level=logging.INFO
        if training_args.local_rank in [-1, 0] else logging.WARN,
    )
    logger.warning(
        "Process rank: %s, device: %s, n_gpu: %s, distributed training: %s, 16-bits training: %s",
        training_args.local_rank,
        training_args.device,
        training_args.n_gpu,
        bool(training_args.local_rank != -1),
        training_args.fp16,
    )
    logger.info("Training/evaluation parameters %s", training_args)

    # Set seed
    set_seed(training_args.seed)

    # Set project name
    os.environ["WANDB_PROJECT"] = "multilingual_zeroshot"

    num_labels = 3
    labels = ['entailment', 'neutral', 'contradiction']

    # Load pretrained model and tokenizer
    #
    # Distributed training:
    # The .from_pretrained methods guarantee that only one local process can concurrently
    # download model & vocab.

    config = MBartConfig.from_pretrained(
        model_args.config_name
        if model_args.config_name else model_args.model_name_or_path,
        num_labels=num_labels,
        dropout=model_args.dropout,
        attention_dropout=model_args.attention_dropout,
        finetuning_task="mnli",
        cache_dir=model_args.cache_dir,
    )
    tokenizer = MBartTokenizer.from_pretrained(
        model_args.tokenizer_name
        if model_args.tokenizer_name else model_args.model_name_or_path,
        cache_dir=model_args.cache_dir,
    )
    model = MBartForSequenceClassification.from_pretrained(
        model_args.model_name_or_path,
        from_tf=bool(".ckpt" in model_args.model_name_or_path),
        config=config,
        cache_dir=model_args.cache_dir,
    )

    # Get datasets
    columns = ['input_ids', 'attention_mask', 'labels']
    map_fn = get_mnli_map_fn(data_args.lang, data_args.max_seq_length,
                             tokenizer)

    train_dataset = nlp.load_dataset("multi_nli", split="train")
    train_dataset = train_dataset.map(map_fn, batched=True, batch_size=512)
    train_dataset.set_format(type='torch', columns=columns)

    eval_dataset = (nlp.load_dataset("multi_nli", split="validation_matched")
                    if training_args.do_eval else None)
    eval_dataset = eval_dataset.map(map_fn, batched=True, batch_size=512)
    eval_dataset.set_format(type='torch', columns=columns)

    def compute_metrics_fn(p: EvalPrediction):
        preds = np.argmax(p.predictions, axis=1)
        return glue_compute_metrics("classification", preds, p.label_ids)

    # Initialize our Trainer
    trainer = Trainer(
        model=model,
        args=training_args,
        train_dataset=train_dataset,
        eval_dataset=eval_dataset,
        compute_metrics=compute_metrics_fn,
        data_collator=DataCollator(tokenizer),
    )

    # disable wandb console logs
    logging.getLogger('wandb.run_manager').setLevel(logging.WARNING)

    # Training
    if training_args.do_train:
        trainer.train(model_path=model_args.model_name_or_path if os.path.
                      isdir(model_args.model_name_or_path) else None)
        trainer.save_model()
        # For convenience, we also re-save the tokenizer to the same directory,
        # so that you can share your model easily on huggingface.co/models =)
        if trainer.is_world_master():
            tokenizer.save_pretrained(training_args.output_dir)

    # Evaluation
    eval_results = {}
    if training_args.do_eval:
        logger.info("*** Evaluate ***")

        # Loop to handle MNLI double evaluation (matched, mis-matched)
        mis_matched_dataset = nlp.load_dataset("multi_nli",
                                               split="validation_mismatched")
        mis_matched_dataset = mis_matched_dataset.map(map_fn,
                                                      batched=True,
                                                      batch_size=512)
        mis_matched_dataset.set_format(type='torch', columns=columns)
        eval_datasets = [eval_dataset, mis_matched_dataset]

        for eval_dataset in eval_datasets:
            trainer.compute_metrics = compute_metrics_fn
            eval_result = trainer.evaluate(eval_dataset=eval_dataset)

            output_eval_file = os.path.join(training_args.output_dir,
                                            f"eval_results.txt")
            if trainer.is_world_master():
                with open(output_eval_file, "w") as writer:
                    logger.info("***** Eval results *****")
                    for key, value in eval_result.items():
                        logger.info("  %s = %s", key, value)
                        writer.write("%s = %s\n" % (key, value))

            eval_results.update(eval_result)
Example #14
            json_entries_new.append(entry)
    else:
        json_entries_new.append(entry)

#Save data
random.seed(42)
random.shuffle(json_entries_new)
test_sources = [
    'jungle.world', 'tichyseinblick.de', 'taz.de', 'blogs.taz.de', 'vice.com',
    'freitag.de', 'deutsch.rt.com', 'br.de', 'handelsblatt.com', 'stern.de',
    'wdr.de'
]

train_entries_new = [
    entry for entry in json_entries_new
    if entry['source_domain'] not in test_sources
]
test_entries_new = [
    entry for entry in json_entries_new
    if entry['source_domain'] in test_sources
]

#ensure_dir('data')
with jsonlines.open('train.jsonl', 'w') as writer_train:
    writer_train.write_all(train_entries_new)
with jsonlines.open('test.jsonl', 'w') as writer_test:
    writer_test.write_all(test_entries_new)
dataset = load_dataset('de_politik_news.py', cache_dir='data')
os.remove('train.jsonl')
os.remove('test.jsonl')
model = BertForSequenceClassification.from_pretrained(
    'models/BERT_full_question')
tokenizer = BertTokenizerFast.from_pretrained('bert-base-uncased')


def tokenize(batch):
    return tokenizer(batch['text'],
                     truncation=True,
                     max_length=256,
                     add_special_tokens=True,
                     padding='max_length',
                     return_attention_mask=True)


test_dataset = load_dataset(
    'json',
    data_files={'test': 'dataset_full_question/quanta_test.json'},
    field='questions')['test']
test_dataset = test_dataset.map(
    lambda example: {'label': [0 if example['difficulty'] == 'School' else 1]})
test_dataset = test_dataset.map(tokenize,
                                batched=True,
                                batch_size=len(test_dataset))
test_dataset.set_format('torch',
                        columns=['input_ids', 'attention_mask', 'label'])


def compute_metrics(pred):
    labels = pred.label_ids
    # print(labels)
    preds = pred.predictions.argmax(-1)
    # print(preds)
Example #16
    target_encodings = tokenizer.batch_encode_plus(
        example_batch['target_text'], pad_to_max_length=True, max_length=16)

    encodings = {
        'input_ids': input_encodings['input_ids'],
        'attention_mask': input_encodings['attention_mask'],
        'target_ids': target_encodings['input_ids'],
        'target_attention_mask': target_encodings['attention_mask']
    }

    return encodings


# load train and validation split of squad
train_dataset = load_dataset("./squad.py",
                             ignore_verifications=True,
                             split="train")
valid_dataset = load_dataset("./squad.py", split="validation")

# map add_eos_to_examples function to the dataset example wise
train_dataset = train_dataset.map(add_eos_to_examples)
# map convert_to_features batch wise
train_dataset = train_dataset.map(convert_to_features, batched=True)

valid_dataset = valid_dataset.map(add_eos_to_examples,
                                  load_from_cache_file=False)
valid_dataset = valid_dataset.map(convert_to_features,
                                  batched=True,
                                  load_from_cache_file=False)

# set the tensor type and the columns which the dataset should return
from typing import List, Tuple

import nlp
import tqdm

from config import model_config, gpu_name
from generate_counterfactuals import generate_counterfactuals
from search_utils.Query import Query
from search_utils.Result import Result

logger = logging.getLogger(__name__)
info = logger.info

model_config.load("imdb", None)

imdb = nlp.load_dataset("imdb")
imdb_train, imdb_test = imdb["train"], imdb["test"]
dataset = imdb_test.shuffle(seed=42)  # otherwise labels are sorted

try:
    # noinspection PyUnresolvedReferences
    from google.colab import drive

    drive.mount('/content/drive')
except ModuleNotFoundError:
    info("probably not running on colab")

results: List[Tuple[int, Result]] = []

new_start = 0
for enm in tqdm.tqdm(range(new_start, len(dataset))):
Example #18
import numpy as np
import pandas as pd
from nlp import load_dataset

dataset = load_dataset("social_bias_frames")  # 2020
dataset = load_dataset("hyperpartisan_news_detection")  # 2019
dataset = load_dataset("event2Mind")  # 2018
dataset = load_dataset("emotion")  # 2018
dataset = load_dataset("sentiment140")  # 2009
dataset = load_dataset("squad_v2")
dataset = load_dataset("squadshifts")  # 2020
dataset = load_dataset("webis/tl_dr")  # 2017
dataset = load_dataset("wiki_dpr")  # 2020
dataset = load_dataset("wiki_snippets")
dataset = load_dataset("wikipedia")
dataset = load_dataset("wikitext")  # 2016
dataset = load_dataset("yelp_polarity")
dataset = load_dataset("newsroom")  # 2018
dataset = load_dataset("multi_news")  # 2019
dataset = load_dataset("imdb")  # 2011
dataset = load_dataset("cnn_dailymail")  # 2017
dataset = load_dataset("civil_comments")  # 2017

# Maybe
dataset = load_dataset("social_i_qa")
dataset = load_dataset("sogou_news")
dataset = load_dataset("super_glue")  # 2019
dataset = load_dataset("rotten_tomatoes")  # 2005
dataset = load_dataset("reddit")  # 2017
dataset = load_dataset("quora")
dataset = load_dataset("opinosis")  # 2010
Example #19
def main():
    parser = HfArgumentParser((DataTrainingArguments,))

    data_args = parser.parse_args_into_dataclasses()[0]

    logging.basicConfig(
        format="%(asctime)s - %(levelname)s - %(name)s -   %(message)s",
        datefmt="%m/%d/%Y %H:%M:%S",
        level=logging.INFO
    )

    if data_args.model_type == 't5':
        tokenizer = T5Tokenizer.from_pretrained("t5-base")
    else:
        tokenizer = T5Tokenizer.from_pretrained("bart-base")
    
    tokenizer.add_tokens(['<sep>', '<hl>'])
    
    train_dataset = nlp.load_dataset(data_args.dataset_path, name=data_args.qg_format, split=nlp.Split.TRAIN)
    valid_dataset = nlp.load_dataset(data_args.dataset_path, name=data_args.qg_format, split=nlp.Split.VALIDATION)

    processor = DataProcessor(
        tokenizer,
        model_type=data_args.model_type,
        max_source_length=data_args.max_source_length,
        max_target_length=data_args.max_target_length
    )

    train_dataset = train_dataset.filter(TASK_TO_FILTER_FN[data_args.task])
    if data_args.task == 'multi' and data_args.valid_for_qg_only:
        logger.info("processing valid data only for qg task")
        valid_dataset = valid_dataset.filter(filter_qg)
    else:
        valid_dataset = valid_dataset.filter(TASK_TO_FILTER_FN[data_args.task])

    
    train_dataset = processor.process(train_dataset)
    valid_dataset = processor.process(valid_dataset)

    columns = ["source_ids", "target_ids", "attention_mask"]
    train_dataset.set_format(type='torch', columns=columns)
    valid_dataset.set_format(type='torch', columns=columns)

    if data_args.train_file_name is None:
        train_file_name = f"train_data_{data_args.task}_{data_args.qg_format}_{data_args.model_type}.pt"
        train_path = os.path.join("data", train_file_name)

        valid_file_name = f"valid_data_{data_args.task}_{data_args.qg_format}_{data_args.model_type}.pt"
        valid_path = os.path.join("data", valid_file_name)
    else:
        train_path = os.path.join("data", data_args.train_file_name)
        valid_path = os.path.join("data", data_args.valid_file_name)
    
    torch.save(train_dataset, train_path)
    logger.info(f"saved train dataset at {train_path}")
    
    torch.save(valid_dataset, valid_path)
    logger.info(f"saved validation dataset at {valid_path}")
    
    tokenizer_path = f"{data_args.model_type}_qg_tokenizer"
    if not os.path.exists(tokenizer_path):
        os.mkdir(tokenizer_path)
    tokenizer.save_pretrained(tokenizer_path)
    logger.info(f"saved tokenizer at {tokenizer_path}")
Example #20
  token_type = [0]*len(tokens)
  if tokens_b: 
    tokens += [*tokens_b, hf_tokenizer.sep_token]
    token_type += [1]*(len(tokens_b)+1)
  example['inp_ids'] = hf_tokenizer.convert_tokens_to_ids(tokens)
  example['attn_mask'] = [1] * len(tokens)
  example['token_type_ids'] = token_type
  return example


# %%
glue_dsets = {}; glue_dls = {}
for task in ['cola', 'sst2', 'mrpc', 'qqp', 'stsb', 'mnli', 'qnli', 'rte', 'wnli', 'ax']:

  # Load / download datasets.
  dsets = nlp.load_dataset('glue', task, cache_dir='./datasets')

  # There are two broken samples in the QQP training set
  if task=='qqp': dsets['train'] = dsets['train'].filter(lambda e: e['question2']!='',
                                          cache_file_name='./datasets/glue/qqp/1.0.0/fixed_train.arrow')

  # Load / Make tokenized datasets
  tok_func = partial(tokenize_sents_max_len, cols=TEXT_COLS[task], max_len=c.max_length)
  glue_dsets[task] = dsets.my_map(tok_func, cache_file_names=f"tokenized_{c.max_length}_{{split}}")

  if c.double_unordered and task in ['mrpc', 'stsb']:
    swap_tok_func = partial(tokenize_sents_max_len, cols=TEXT_COLS[task], max_len=c.max_length, swap=True)
    swapped_train = dsets['train'].my_map(swap_tok_func, 
                                          cache_file_name=f"swapped_tokenized_{c.max_length}_train")
    glue_dsets[task]['train'] = HF_MergedDataset(glue_dsets[task]['train'], swapped_train)
Example #21
 def _prepare_ds(split):
     #loading dataset from the nlp library
     ds = nlp.load_dataset(FLAGS.dataset, split=f'{split}')
     ds = ds.map(_tokenize, batched=True)
     ds.set_format(type='torch', columns=['input_ids', 'label']) #output sample type
     return ds
print(f"🤩 Currently {len(datasets)} datasets are available on HuggingFace AWS bucket: \n" 
      + '\n'.join(dataset.id for dataset in datasets) + '\n')
print(f"🤩 Currently {len(metrics)} metrics are available on HuggingFace AWS bucket: \n" 
      + '\n'.join(metric.id for metric in metrics))

# You can read a few attributes of the datasets before loading them (they are python dataclasses)
from dataclasses import asdict

for key, value in asdict(datasets[6]).items():
    print('👉 ' + key + ': ' + str(value))

"""## An example with SQuAD"""

# Downloading and loading a dataset

dataset = nlp.load_dataset('squad', split='validation[:10%]')

"""This call to `nlp.load_dataset()` does the following steps under the hood:

1. Download and import into the library the **SQuAD python processing script** from the HuggingFace AWS bucket, if it's not already stored in the library. You can find the SQuAD processing script [here](https://github.com/huggingface/nlp/tree/master/datasets/squad/squad.py) for instance.

   Processing scripts are small python scripts that define the info (citation, description) and format of the dataset, and contain the URL to the original SQuAD JSON files as well as the code to load examples from them.


2. Run the SQuAD python processing script which will:
    - **Download the SQuAD dataset** from the original URL (see the script) if it's not already downloaded and cached.
    - **Process and cache** all of SQuAD in a structured Arrow table for each standard split, stored on the drive.

      Arrow tables are arbitrarily long tables, typed with types that can be mapped to numpy/pandas/python standard types, and can store nested objects. They can be accessed directly from drive, loaded in RAM or even streamed over the web.
    
Example #23
def train(args):
    train_data, dev_data = nlp.load_dataset(
        'math_dataset',
        split=['train[:95%]', 'train[95%:]'],
        cache_dir="D:/.cache/nlp/")
    vocab = Vocab.load(args['--vocab'])
    embed_size = int(args['--embed-size'])
    hidden_size = int(args['--hidden-size'])
    dropout_rate = float(args['--dropout'])

    train_batch_size = int(args['--batch-size'])

    clip_grad = float(args['--clip-grad'])
    valid_niter = int(args['--valid-niter'])
    log_every = int(args['--log-every'])
    model_save_path = args['--save-to']

    model = MathBaselineLSTMwAttention(embed_size, hidden_size, vocab,
                                       dropout_rate)
    size = get_size_of_model(model)

    print("model size %d parameters" % size)
    model.train()

    uniform_init = float(args['--uniform-init'])
    if np.abs(uniform_init) > 0.:
        print('uniformly initialize parameters [-%f, +%f]' %
              (uniform_init, uniform_init),
              file=sys.stderr)
        for p in model.parameters():
            p.data.uniform_(-uniform_init, uniform_init)
    else:
        print('Xavier initialize parameters', file=sys.stderr)
        for p in model.parameters():
            torch.nn.init.xavier_uniform_(p.data)

    device = torch.device("cuda:0" if args['--cuda'] else "cpu")
    print('use device: %s' % device, file=sys.stderr)

    model = model.to(device)

    optimizer = torch.optim.Adam(model.parameters(), lr=float(args['--lr']))

    num_trial = 0
    train_iter = patience = cum_loss = report_loss = cum_tgt_words = report_tgt_words = 0
    cum_examples = report_examples = epoch = valid_num = 0
    hist_valid_scores = []
    train_time = begin_time = time.time()
    print('begin Maximum Likelihood training')

    while True:
        epoch += 1
        for src_sents, tgt_sents in batch_iter(train_data,
                                               batch_size=train_batch_size,
                                               shuffle=True):
            train_iter += 1

            optimizer.zero_grad()

            batch_size = len(src_sents)

            example_losses = -model(src_sents, tgt_sents)  # (batch_size,)
            batch_loss = example_losses.sum()
            loss = batch_loss / batch_size

            loss.backward()

            # clip gradient
            grad_norm = torch.nn.utils.clip_grad_norm_(model.parameters(),
                                                       clip_grad)

            optimizer.step()

            batch_losses_val = batch_loss.item()
            report_loss += batch_losses_val
            cum_loss += batch_losses_val

            tgt_words_num_to_predict = sum(
                len(s[1:]) for s in tgt_sents)  # omitting leading `<sos>`
            report_tgt_words += tgt_words_num_to_predict
            cum_tgt_words += tgt_words_num_to_predict
            report_examples += batch_size
            cum_examples += batch_size

            if train_iter % log_every == 0:
                print('epoch %d, iter %d, avg. loss %.2f, avg. ppl %.2f ' \
                      'cum. examples %d, speed %.2f words/sec, time elapsed %.2f sec' % (epoch, train_iter,
                                                                                         report_loss / report_examples,
                                                                                         math.exp(report_loss / report_tgt_words),
                                                                                         cum_examples,
                                                                                         report_tgt_words / (time.time() - train_time),
                                                                                         time.time() - begin_time), file=sys.stderr)

                train_time = time.time()
                report_loss = report_tgt_words = report_examples = 0.

            # perform validation
            if train_iter % valid_niter == 0:
                print(
                    'epoch %d, iter %d, cum. loss %.2f, cum. ppl %.2f cum. examples %d'
                    % (epoch, train_iter, cum_loss / cum_examples,
                       np.exp(cum_loss / cum_tgt_words), cum_examples),
                    file=sys.stderr)

                cum_loss = cum_examples = cum_tgt_words = 0.
                valid_num += 1

                print('begin validation ...', file=sys.stderr)

                # compute dev. ppl and bleu
                dev_ppl = evaluate_ppl(
                    model, dev_data,
                    batch_size=128)  # dev batch size can be a bit larger
                valid_metric = -dev_ppl

                print('validation: iter %d, dev. ppl %f' %
                      (train_iter, dev_ppl),
                      file=sys.stderr)

                is_better = len(hist_valid_scores
                                ) == 0 or valid_metric > max(hist_valid_scores)
                hist_valid_scores.append(valid_metric)

                if is_better:
                    patience = 0
                    print('save currently the best model to [%s]' %
                          model_save_path,
                          file=sys.stderr)
                    model.save(model_save_path)

                    # also save the optimizers' state
                    torch.save(optimizer.state_dict(),
                               model_save_path + '.optim')
                elif patience < int(args['--patience']):
                    patience += 1
                    print('hit patience %d' % patience, file=sys.stderr)

                    if patience == int(args['--patience']):
                        num_trial += 1
                        print('hit #%d trial' % num_trial, file=sys.stderr)
                        if num_trial == int(args['--max-num-trial']):
                            print('early stop!', file=sys.stderr)
                            exit(0)

                        # decay lr, and restore from previously best checkpoint
                        lr = optimizer.param_groups[0]['lr'] * float(
                            args['--lr-decay'])
                        print(
                            'load previously best model and decay learning rate to %f'
                            % lr,
                            file=sys.stderr)

                        # load model
                        params = torch.load(
                            model_save_path,
                            map_location=lambda storage, loc: storage)
                        model.load_state_dict(params['state_dict'])
                        model = model.to(device)

                        print('restore parameters of the optimizers',
                              file=sys.stderr)
                        optimizer.load_state_dict(
                            torch.load(model_save_path + '.optim'))

                        # set new lr
                        for param_group in optimizer.param_groups:
                            param_group['lr'] = lr

                        # reset patience
                        patience = 0

            if epoch == int(args['--max-epoch']):
                print('reached maximum number of epochs!', file=sys.stderr)
                exit(0)
Example #24
    def __init__(self,
                 max_len,
                 batch_size,
                 max_epochs,
                 device,
                 unsup_proportion,
                 sup_proportion,
                 dev_index=1,
                 pretrained=False):
        text_field = data.Field(
            lower=True,
            batch_first=True,
            fix_length=max_len,
            pad_token='<pad>',
            init_token='<go>',
            is_target=True
        )  # init_token='<go>', eos_token='<eos>', unk_token='<unk>', pad_token='<unk>')
        label_field = data.Field(fix_length=max_len - 1,
                                 batch_first=True,
                                 unk_token=None)

        print('Current working directory:', os.getcwd())
        yelp_data = load_dataset('csv',
                                 data_files={
                                     'train':
                                     os.path.join('.data', 'yelp',
                                                  'train.csv'),
                                     'test':
                                     os.path.join('.data', 'yelp', 'test.csv')
                                 },
                                 column_names=['label', 'text'],
                                 version='0.0.2')
        #download_mode=FORCE_REDOWNLOAD)

        start = time()
        train_data, test_data = yelp_data['train'], yelp_data['test']

        def expand_labels(datum):
            datum['label'] = [str(datum['label'])] * (max_len - 1)
            return datum

        lens = [len(sample['text'].split(' ')) for sample in train_data]

        train_data, test_data = train_data.map(expand_labels), test_data.map(
            expand_labels)
        fields1 = {'text': text_field, 'label': label_field}
        fields2 = {
            'text': ('text', text_field),
            'label': ('label', label_field)
        }
        fields3 = {'text': text_field}
        fields4 = {'text': ('text', text_field)}

        len_train = int(len(train_data) / 3)
        dev_start, dev_end = int(len_train/5*(dev_index-1)), \
                             int(len_train/5*(dev_index))
        train_start1, train_start2, train_end1, train_end2 = 0, dev_end, int(dev_start*sup_proportion),\
                                                             int(dev_end+(len_train-dev_end)*sup_proportion)
        unsup_start, unsup_end = len_train, int(len_train + len_train * 2 *
                                                unsup_proportion)
        # Since the datasets are originally sorted with the label as key, we shuffle them before reducing the supervised
        # or the unsupervised data to the first few examples. We use a fixed seed to keep the same data for all
        # experiments.
        np.random.seed(42)
        train_examples = [Example.fromdict(ex, fields2) for ex in train_data]
        unsup_examples = [Example.fromdict(ex, fields4) for ex in train_data]
        np.random.shuffle(train_examples)
        np.random.shuffle(unsup_examples)
        train = Dataset(
            train_examples[train_start1:train_end1] +
            train_examples[train_start2:train_end2], fields1)
        val = Dataset(train_examples[dev_start:dev_end], fields1)
        test = Dataset([Example.fromdict(ex, fields2) for ex in test_data],
                       fields1)
        unsup_train = Dataset(unsup_examples[unsup_start:unsup_end], fields3)

        vocab_dataset = Dataset(train_examples, fields1)
        unsup_test, unsup_val = test, test

        print('data loading took', time() - start)

        # build the vocabulary
        text_field.build_vocab(
            vocab_dataset,
            max_size=VOCAB_LIMIT)  # , vectors="fasttext.simple.300d")
        label_field.build_vocab(train)
        # make iterator for splits
        self.train_iter, _, _ = data.BucketIterator.splits(
            (unsup_train, unsup_val, unsup_test),
            batch_size=batch_size,
            device=device,
            shuffle=True,
            sort=False)
        _, self.unsup_val_iter, _ = data.BucketIterator.splits(
            (unsup_train, unsup_val, unsup_test),
            batch_size=int(batch_size),
            device=device,
            shuffle=False,
            sort=False)
        self.sup_iter, _, _ = data.BucketIterator.splits((train, val, test),
                                                         batch_size=batch_size,
                                                         device=device,
                                                         shuffle=True,
                                                         sort=False)
        _, self.val_iter, self.test_iter = data.BucketIterator.splits(
            (train, val, test),
            batch_size=int(batch_size),
            device=device,
            shuffle=False,
            sort=False)

        self.vocab = text_field.vocab
        self.tags = label_field.vocab
        self.text_field = text_field
        self.label_field = label_field
        self.device = device
        self.batch_size = batch_size
        self.n_epochs = 0
        self.max_epochs = max_epochs
        if pretrained:
            ftxt = FastText()
            self.wvs = ftxt.get_vecs_by_tokens(self.vocab.itos)
        else:
            self.wvs = None
Example #25
File: data_utils.py Project: yyht/smyrf
 def __init__(self):
     self.dataset = nlp.load_dataset('boolq')
     self.counter = 0
Example #26
  disc_hparam = electra_hparam_from_hf(disc_config, hf_tokenizer)
  disc_hparam.update(hparam_update)

# %% [markdown]
# # 1. Load Data

# %%
if c.size in ['small', 'base']:
  
  # wiki
  if (wiki_cache_dir/f"wiki_electra_{c.max_length}.arrow").exists():
    print('loading the electra data (wiki)')
    wiki = nlp.Dataset.from_file(str(wiki_cache_dir/f"wiki_electra_{c.max_length}.arrow"))
  else:
    print('load/download wiki dataset')
    wiki = nlp.load_dataset('wikipedia', '20200501.en', cache_dir='./datasets')['train']
  
    print('create data from wiki dataset for ELECTRA')
    wiki = ELECTRADataTransform(wiki, is_docs=True, text_col='text', max_length=c.max_length, hf_toker=hf_tokenizer).map(cache_file_name=str(wiki_cache_dir/f"wiki_electra_{c.max_length}.arrow"))

  # bookcorpus
  if (book_cache_dir/f"book_electra_{c.max_length}.arrow").exists():
    print('loading the electra data (BookCorpus)')
    book = nlp.Dataset.from_file(str(book_cache_dir/f"book_electra_{c.max_length}.arrow"))
  else:
    print('load/download BookCorpus dataset')
    book = nlp.load_dataset('bookcorpus', cache_dir='./datasets')['train']
  
    print('create data from BookCorpus dataset for ELECTRA')
    book = ELECTRADataTransform(book, is_docs=False, text_col='text', max_length=c.max_length, hf_toker=hf_tokenizer).map(cache_file_name=str(book_cache_dir/f"book_electra_{c.max_length}.arrow"))
Example #27
def main():
    batch_size = 4
    vocab_size = 16384
    max_source_length = 1024
    max_target_length = 1024
    num_workers = 3

    dataset = nlp.load_dataset("iwslt2017.py", "nl-en")

    # Train tokenizer
    tokenizer_filename = "tokenizer.json"
    if os.path.exists(tokenizer_filename):
        tokenizer = Tokenizer.from_file(tokenizer_filename)
    else:
        data_filename = "whole_data.txt"
        with open(data_filename, "w") as f:
            for item in dataset["train"]:
                f.write(item["source"] + "\n")
                f.write(item["target"] + "\n\n")

        tokenizer = CharBPETokenizer()
        tokenizer.train([data_filename], vocab_size=vocab_size)
        pad_token = AddedToken("[PAD]", lstrip=False, rstrip=False)
        tokenizer.add_tokens([pad_token])
        tokenizer.save(tokenizer_filename)

    tokenizer.pad_token_id = vocab_size
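    # The [PAD] token added above receives id == vocab_size, which is why the
    # BartConfig below is built with vocab_size + 1.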

    # Loaders
    train_dataset = Seq2SeqDataset(tokenizer, dataset["train"],
                                   max_source_length, max_target_length)
    val_dataset = Seq2SeqDataset(tokenizer, dataset["validation"],
                                 max_source_length, max_target_length)
    train_loader = DataLoader(
        train_dataset,
        batch_size=batch_size,
        shuffle=True,
        collate_fn=train_dataset.collate_fn,
        num_workers=num_workers,
    )
    val_loader = DataLoader(
        val_dataset,
        batch_size=batch_size,
        collate_fn=val_dataset.collate_fn,
        num_workers=num_workers,
    )

    # Train model
    config = BartConfig(
        vocab_size=vocab_size + 1,  # Pad
        d_model=1024,
        encoder_ffn_dim=1024,
        encoder_layers=6,
        encoder_attention_heads=4,
        decoder_ffn_dim=1024,
        decoder_layers=6,
        decoder_attention_heads=4,
    )
    model = BartForConditionalGeneration(config)
    translator = Translate(model, tokenizer)

    trainer = pl.Trainer(gpus=1)
    trainer.fit(translator, train_loader, val_loader)
Example #28
                                 num_beams=4,
                                 early_stopping=True)
    else:
        outputs = model.generate(inputs,
                                 max_length=max_length,
                                 min_length=min_length,
                                 length_penalty=2.0,
                                 num_beams=4,
                                 early_stopping=True)
    return tokenizer.decode(outputs[0],
                            skip_special_tokens=True,
                            clean_up_tokenization_spaces=False)


en_dataset = load_dataset('en_wiki_multi_news.py',
                          cache_dir='dataset/.en-wiki-multi-news-cache',
                          split='test')
de_dataset = load_dataset('de_wiki_multi_news.py',
                          cache_dir='dataset/.de-wiki-multi-news-cache',
                          split='test')
fr_dataset = load_dataset('fr_wiki_multi_news.py',
                          cache_dir='dataset/.fr-wiki-multi-news-cache',
                          split='test')

# Part 0: Title

st.markdown("""
    # Generate long summaries in multiple languages using Transformers

    We use BART or T5 to generate long summaries in English, German or French. This application is a demo of a research paper.
Example #29
    plt.yticks(list(range(0, 6)), labels=classes)
    plt.colorbar(ctx)
    plt.show()


print('Using TensorFlow version', tf.__version__)

# ## Task 3: Importing Data
#
# 1. Importing the Tweet Emotion dataset
# 2. Creating train, validation and test sets
# 3. Extracting tweets and labels from the examples

# In[2]:

dataset = nlp.load_dataset('emotion')

# In[3]:

train = dataset['train']
val = dataset['validation']
test = dataset['test']

# In[4]:


def get_tweet(data):
    tweets = [x['text'] for x in data]
    labels = [x['label'] for x in data]
    return tweets, labels
Example #30
def main(
    output_filepath: str,
    min_length: Optional[int] = None,
    max_documents: Optional[int] = None,
    pretrained_model_name_or_path: Optional[str] = None,
) -> None:
    """Lightly pre-processes an OpenWebText dump obtained from
    https://skylion007.github.io/OpenWebTextCorpus/. If `min_length` is not None, only documents
    with at least this many tokens are retained. If `pretrained_model_name_or_path` is not None, the
    tokenizer will be loaded as `AutoTokenizer.from_pretrained(pretrained_model_name_or_path)`
    using the HuggingFace Transformers library. Otherwise `str.split()` is used. This argument has
    no effect if `min-length is None`.
    """
    # Collect the raw text from the "scientific_papers" dataset.
    pubmed = nlp.load_dataset("scientific_papers", "pubmed")
    arxiv = nlp.load_dataset("scientific_papers", "arxiv")
    # Create a generator over both datasets to avoid storing things in memory.
    pubmed_text = (article["article"] for partition in pubmed.values()
                   for article in partition)
    arxiv_text = (article["article"] for partition in arxiv.values()
                  for article in partition)
    scientific_text = itertools.chain(pubmed_text, arxiv_text)

    # Setup the pre-trained tokenizer, if specified.
    if min_length is not None:
        if pretrained_model_name_or_path is not None:
            # Import transformers here to prevent ImportError errors if the
            # user doesn't want to use it.
            from transformers import AutoTokenizer

            tokenizer = AutoTokenizer.from_pretrained(
                pretrained_model_name_or_path).tokenize
        else:
            tokenizer = lambda x: x.split()  # noqa
    else:
        tokenizer = None

    documents = []
    typer.secho(
        (f' {MINING} Scraping {max_documents or "all"} documents'
         f' {f"with a minimum token length of {min_length}" if min_length else ""}'
         ),
        fg=typer.colors.WHITE,
        bold=True,
    )

    with typer.progressbar(scientific_text,
                           label="Preprocessing text") as progress:
        for doc in progress:
            doc = _sanitize(doc)
            if not doc:
                continue

            # Retain documents if the length of their shortest document is
            # equal to or greater than the minimum specified length
            if tokenizer is not None:
                num_tokens = len(tokenizer(doc))
                if num_tokens < min_length:
                    continue
            documents.append(doc)
    _write_output_to_disk(documents, output_filepath)
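

# Usage sketch, assuming the full script wires `main` up as a typer command (the
# typer.secho / typer.progressbar calls above suggest it does); the file name and
# the minimum length shown here are hypothetical placeholders.
if __name__ == "__main__":
    import typer

    # Invoked e.g. as: python preprocess.py scientific_text.txt --min-length 200
    typer.run(main)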