Example #1
def __init__(self, filename, tokenizer, max_seq_length, short_seq_prob,
             masked_lm_prob, max_predictions_per_seq, vocab):
    logging.debug('start to load file %s ...', filename)
    instances = create_training_instances([filename], tokenizer, max_seq_length,
                                          short_seq_prob, masked_lm_prob,
                                          max_predictions_per_seq, vocab,
                                          nworker=1)
    super(BERTPretrainDataset, self).__init__(*instances)
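
A minimal construction sketch, assuming GluonNLP is installed and that the BERTPretrainDataset class above is importable; the model name, dataset name, and file path are illustrative placeholders:

import gluonnlp as nlp

# Assumed setup: fetch a BERT vocabulary and a matching WordPiece tokenizer.
_, vocab = nlp.model.get_model('bert_12_768_12',
                               dataset_name='book_corpus_wiki_en_uncased',
                               pretrained=False)
tokenizer = nlp.data.BERTTokenizer(vocab, lower=True)

# Build an in-memory pretraining dataset from a single raw text shard (hypothetical path).
dataset = BERTPretrainDataset('corpus_shard_0.txt', tokenizer,
                              max_seq_length=128, short_seq_prob=0.1,
                              masked_lm_prob=0.15, max_predictions_per_seq=20,
                              vocab=vocab)
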
Example #2
def generate_dev_set(tokenizer, vocab, cache_file, args):
    """Generate validation set."""
    # set random seed to generate dev data deterministically
    np.random.seed(0)
    random.seed(0)
    mx.random.seed(0)
    worker_pool = multiprocessing.Pool()
    eval_files = nlp.utils.glob(args.data_eval)
    num_files = len(eval_files)
    assert num_files > 0, 'Number of eval files must be greater than 0. ' \
                          'Only found %d files at %s' % (num_files, args.data_eval)
    logging.info('Generating validation set from %d files on rank 0.', len(eval_files))
    create_training_instances((eval_files, tokenizer, args.max_seq_length,
                               args.short_seq_prob, args.masked_lm_prob,
                               args.max_predictions_per_seq,
                               args.whole_word_mask, vocab,
                               1, args.num_data_workers,
                               worker_pool, cache_file))
    logging.info('Done generating validation set on rank 0.')
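
A sketch of how generate_dev_set could be invoked, assuming tokenizer and vocab are prepared as in the sketch under Example #1; the argument bundle only mirrors the fields the function reads, and every value is illustrative:

from types import SimpleNamespace

# Hypothetical argument bundle; generate_dev_set only touches these fields.
args = SimpleNamespace(data_eval='eval_shards/*.txt',
                       max_seq_length=128,
                       short_seq_prob=0.1,
                       masked_lm_prob=0.15,
                       max_predictions_per_seq=20,
                       whole_word_mask=True,
                       num_data_workers=4)
generate_dev_set(tokenizer, vocab, cache_file='dev_cache.npz', args=args)
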
Example #3
def __init__(self, filename, tokenizer, max_seq_length, short_seq_prob,
             masked_lm_prob, max_predictions_per_seq, vocab, num_workers=1, worker_pool=None):
    logging.debug('start to load file %s ...', filename)
    dupe_factor = 1
    instances = create_training_instances(([filename], tokenizer, max_seq_length,
                                           short_seq_prob, masked_lm_prob,
                                           max_predictions_per_seq, vocab,
                                           dupe_factor, num_workers,
                                           worker_pool, None))
    super(BERTPretrainDataset, self).__init__(*instances)
Example #4
def prepare_pretrain_text_dataset(filename, tokenizer, max_seq_length, short_seq_prob,
                                  masked_lm_prob, max_predictions_per_seq, whole_word_mask,
                                  random_next_sentence, vocab):
    """Create dataset based on the raw text files"""
    dupe_factor = 1
    if not isinstance(filename, (list, tuple)):
        filename = [filename]
    logging.debug('start to load files %s ...', filename)
    instances = create_training_instances((filename, tokenizer, max_seq_length,
                                           short_seq_prob, masked_lm_prob,
                                           max_predictions_per_seq,
                                           whole_word_mask, vocab,
                                           dupe_factor, 1, None, None, random_next_sentence))
    return mx.gluon.data.ArrayDataset(*instances)
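
A short call sketch, again assuming the GluonNLP tokenizer/vocab setup from the Example #1 sketch; the file name is a placeholder:

# Build a Gluon ArrayDataset of pretraining instances from one raw text file.
dataset = prepare_pretrain_text_dataset('corpus_shard_0.txt', tokenizer,
                                        max_seq_length=128, short_seq_prob=0.1,
                                        masked_lm_prob=0.15,
                                        max_predictions_per_seq=20,
                                        whole_word_mask=True,
                                        random_next_sentence=True,
                                        vocab=vocab)
print('number of training instances:', len(dataset))
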
Example #5
def loop(files):
    client = storage.Client()
    bucket = client.bucket('mesolitica-tpu-general')
    files, index, postfix = files
    output_files = f'{directory}/albert-{index}-{postfix}.tfrecord'
    print(f'Output filename: {output_files}')
    files = ','.join(files)
    tokenizer = tokenization.FullTokenizer(
        vocab_file='sp10m.cased.albert.vocab',
        do_lower_case=False,
        spm_model_file='sp10m.cased.albert.model',
    )

    input_files = []
    for input_pattern in files.split(','):
        input_files.extend(tf.gfile.Glob(input_pattern))

    tf.logging.info('*** Reading from input files ***')
    for input_file in input_files:
        tf.logging.info('  %s', input_file)

    rng = random.Random(random.randint(1, 999999))
    instances = create_training_instances(
        input_files,
        tokenizer,
        max_seq_length=128,
        dupe_factor=2,
        short_seq_prob=0.1,
        masked_lm_prob=0.15,
        max_predictions_per_seq=20,
        rng=rng,
    )

    tf.logging.info('number of instances: %i', len(instances))

    write_instance_to_example_files(
        instances,
        tokenizer,
        max_seq_length=128,
        max_predictions_per_seq=20,
        output_files=output_files.split(','),
    )

    blob = bucket.blob(f'albert-data/{output_files}')
    blob.upload_from_filename(output_files)
    os.system(f'rm {output_files}')
Example #6
def loop(files):
    client = storage.Client()
    bucket = client.bucket('mesolitica-tpu-general')
    input_files, index = files
    output_file = f'{directory}/bert-{index}.tfrecord'

    print('*** Reading from input files ***')
    for input_file in input_files:
        print(input_file)

    max_seq_length = 128
    dupe_factor = 5
    max_predictions_per_seq = 20
    masked_lm_prob = 0.15
    short_seq_prob = 0.1
    rng = random.Random(12345)
    instances = create_training_instances(
        input_files,
        tokenizer,
        max_seq_length,
        dupe_factor,
        short_seq_prob,
        masked_lm_prob,
        max_predictions_per_seq,
        rng,
    )

    print('*** Writing to output files ***')

    write_instance_to_example_files(
        instances,
        tokenizer,
        max_seq_length,
        max_predictions_per_seq,
        [output_file],
    )

    blob = bucket.blob(f'bert-data/{output_file}')
    blob.upload_from_filename(output_file)
    os.system(f'rm {output_file}')
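
A sketch of a driver that could feed loop, assuming loop, tokenizer, and directory are defined at module level as in the example above; the shard pattern, group size, and pool size are illustrative:

import multiprocessing
from glob import glob

# Group the raw text shards into (input_files, index) tuples, one per call to loop.
shards = sorted(glob('corpus-shards/*.txt'))
group_size = 10
jobs = [(shards[i:i + group_size], idx)
        for idx, i in enumerate(range(0, len(shards), group_size))]

with multiprocessing.Pool(processes=4) as pool:
    pool.map(loop, jobs)
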
Example #7
def create_pretraining_data_from_docs(docs,
                                      save_path,
                                      vocab_path,
                                      token_method='wordpiece',
                                      language='en',
                                      max_seq_length=128,
                                      dupe_factor=10,
                                      short_seq_prob=0.1,
                                      masked_lm_prob=0.15,
                                      max_predictions_per_seq=20):
    """docs: sequence of sequence of sentences.

    Args:
        docs: Sequence of sequence. Docs is a sequence of documents.
            A document is a sequence of sentences.
        save_path: path to save pretraining data.
        vocab_path: The vocabulary file that the BERT model was trained on.
            Only used when token_method='wordpiece'.
        token_method: string. 'wordpiece' or 'spacy'
        language: string. 'en' or 'chn'
        max_seq_length: integer. Maximum sequence length.
        dupe_factor: integer. Number of times to duplicate the input data (with different masks).
        short_seq_prob: float. Probability of creating sequences which are shorter than the maximum length.
        masked_lm_prob: float. Masked LM probability.
        max_predictions_per_seq: integer. Maximum number of masked LM predictions per sequence.
    """

    if not hasattr(docs, '__len__') or not hasattr(docs[0], '__len__'):
        raise ValueError("`docs` should be a sequence of sequences.")
    if token_method not in ['wordpiece', 'spacy']:
        raise ValueError(
            "`token_method` must be either `wordpiece` or `spacy`.")
    if language not in ['en', 'chn']:
        raise ValueError("`language` must be either `en` or `chn`.")

    if token_method == "spacy" and language == "chn":
        raise ValueError(
            "The spacy tokenizer is only available when `language` is `en`.")

    if token_method == "wordpiece":
        tokenizer = FullTokenizer(vocab_path, do_lower_case=True)
    else:
        tokenizer = SpacyTokenizer(vocab_path, do_lower_case=True)

    instances = create_training_instances(
        docs,
        tokenizer=tokenizer,
        max_seq_length=max_seq_length,
        dupe_factor=dupe_factor,
        short_seq_prob=short_seq_prob,
        masked_lm_prob=masked_lm_prob,
        max_predictions_per_seq=max_predictions_per_seq)

    pretraining_data = dict(tokens=[],
                            segment_ids=[],
                            is_random_next=[],
                            masked_lm_positions=[],
                            masked_lm_labels=[])

    for i, instance in enumerate(instances):
        if i < 10:
            print("num-{}: {}".format(i, instance))
        pretraining_data['tokens'].append(instance.tokens)
        pretraining_data['segment_ids'].append(instance.segment_ids)
        pretraining_data['is_random_next'].append(int(instance.is_random_next))
        pretraining_data['masked_lm_positions'].append(
            instance.masked_lm_positions)
        pretraining_data['masked_lm_labels'].append(instance.masked_lm_labels)

    tokens_ids = []
    tokens_mask = []
    for tokens in pretraining_data['tokens']:
        sub_ids = tokenizer.convert_tokens_to_ids(tokens)
        sub_mask = [1] * len(sub_ids)
        tokens_ids.append(sub_ids)
        tokens_mask.append(sub_mask)

    masked_lm_ids = []
    for mask_labels in pretraining_data['masked_lm_labels']:
        sub_masked_lm_ids = tokenizer.convert_tokens_to_ids(mask_labels)
        masked_lm_ids.append(sub_masked_lm_ids)

    # input
    tokens_ids = pad_sequences(tokens_ids,
                               maxlen=max_seq_length,
                               padding='post',
                               truncating='post')
    tokens_mask = pad_sequences(tokens_mask,
                                maxlen=max_seq_length,
                                padding='post',
                                truncating='post')
    segment_ids = pad_sequences(pretraining_data['segment_ids'],
                                maxlen=max_seq_length,
                                padding='post',
                                truncating='post')
    masked_lm_positions = pad_sequences(
        pretraining_data['masked_lm_positions'],
        maxlen=max_predictions_per_seq,
        padding='post',
        truncating='post')
    # label
    is_random_next = to_categorical(pretraining_data['is_random_next'],
                                    num_classes=2)
    masked_lm_labels = pad_sequences(masked_lm_ids,
                                     maxlen=max_predictions_per_seq,
                                     padding='post',
                                     truncating='post')

    # save
    np.savez(file=save_path,
             tokens_ids=tokens_ids,
             tokens_mask=tokens_mask,
             segment_ids=segment_ids,
             is_random_next=is_random_next,
             masked_lm_positions=masked_lm_positions,
             masked_lm_labels=masked_lm_labels)

    print("[INFO] number of train data:", len(tokens_ids))
    print("[INFO] is_random_next ratio:",
          np.sum(pretraining_data['is_random_next']) / len(is_random_next))
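
A minimal usage sketch for the WordPiece path; the documents, vocabulary path, and save path below are toy placeholders:

# Each document is a list of sentence strings (toy content).
docs = [
    ['the quick brown fox jumps over the lazy dog.',
     'it then naps in the sun.'],
    ['bert is pretrained with a masked language modeling objective.',
     'next sentence prediction is the second objective.'],
]

create_pretraining_data_from_docs(docs,
                                  save_path='pretraining_data.npz',
                                  vocab_path='vocab.txt',
                                  token_method='wordpiece',
                                  language='en',
                                  max_seq_length=128,
                                  dupe_factor=2)
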
Example #8
from tokenization import BertTokenizer
import random
from create_pretraining_data import create_training_instances, write_instance_to_example_file
from glob import glob
# line='The switches between clarity and intoxication gave me a headache, but at least the silver-haired faery’s explanation of the queens’ “gifts” helped me understand why I could want to wrap my legs around a creature who terrified me.'
vocab_file = '/workspace/bert/download/google_pretrained_weights/uncased_L-24_H-1024_A-16/vocab.txt'
base_dir = '/home/haris/share/git/DeepLearningExamples/PyTorch/LanguageModeling/BERT/sharded_training_shards_256_test_shards_256_fraction_0.2/books_wiki_en_corpus/'
random_seed = 123

tokenizer = BertTokenizer(vocab_file)
rng = random.Random(random_seed)
max_seq_length = 128
dupe_factor = 5
short_seq_prob = 0.1
masked_lm_prob = 0.15
max_predictions_per_seq = 20
# tokens=tokenizer.tokenize(line)

# input_files=glob(base_dir+'*.txt')
input_files = [
    '/home/haris/share/git/DeepLearningExamples/PyTorch/LanguageModeling/BERT/test_file.txt'
]
output_file = '/home/haris/share/git/DeepLearningExamples/PyTorch/LanguageModeling/BERT/test_file.h5'

instances = create_training_instances(input_files, tokenizer, max_seq_length,
                                      dupe_factor, short_seq_prob,
                                      masked_lm_prob, max_predictions_per_seq,
                                      rng)

write_instance_to_example_file(instances, tokenizer, max_seq_length,
                               max_predictions_per_seq, output_file)
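
A quick sanity check of the generated HDF5 file; the datasets are simply enumerated rather than assuming particular key names:

import h5py

with h5py.File(output_file, 'r') as f:
    for key in f.keys():
        print(key, f[key].shape, f[key].dtype)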