Example #1
# Imports assumed by this snippet; the helpers _get_file_len,
# _get_inputs_from_text and _make_features are defined elsewhere in the
# same script and are not shown here.
import gzip
import json
import os
import shelve
from os.path import dirname, exists, join

import torch
from tqdm import tqdm
from transformers import GPT2Tokenizer  # the original may import this from another package


def main(args):
    toker = GPT2Tokenizer.from_pretrained('gpt2')
    attrs = []
    if args.reverse:
        attrs.append('reverse')
    if args.two_turn:
        attrs.append('2turn')
    if attrs:
        db_path = (args.corpus[:-4] + '.' + str(args.max_seq_len) + 'len.'
                   + '.'.join(attrs) + '.db/db')
    else:
        db_path = args.corpus[:-4] + '.' + str(args.max_seq_len) + 'len.db/db'
    if exists(dirname(db_path)):
        raise ValueError('Found existing DB, please backup')
    else:
        os.makedirs(dirname(db_path))
    with open(args.corpus, "r", encoding="utf-8") as reader, \
            shelve.open(db_path, 'n') as db:
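        # Accumulate tokenized examples and flush them to the shelve DB as
        # gzip-compressed JSON blobs of args.chunk_size examples each.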
        chunk = []
        n_chunk = 0
        n_example = 0
        for line in tqdm(reader, total=_get_file_len(args.corpus)):
            try:
                if len(chunk) >= args.chunk_size:
                    # save and renew chunk
                    db['chunk_' + str(n_chunk)] = gzip.compress(
                        json.dumps(chunk[:args.chunk_size]).encode('utf-8'))
                    chunk = chunk[args.chunk_size:]
                    n_chunk += 1

                weights, inputs = _get_inputs_from_text(line, toker)
                if args.reverse:
                    weights = list(reversed(weights))
                    inputs = list(reversed(inputs))
                if args.two_turn:
                    weights = weights[:2]
                    inputs = inputs[:2]
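                # skip lines that do not yield at least two turns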
                if len(weights) < 2:
                    continue
                features = _make_features(n_example, weights, inputs,
                                          toker, args.max_seq_len)
                for feature in features:
                    chunk.append(vars(feature))
                    n_example += 1
            except Exception as e:
                print('!!! prepro exception !!!', e)
                continue
        # save last chunk
        db['chunk_' + str(n_chunk)] = gzip.compress(
            json.dumps(chunk).encode('utf-8'))
    # save relevant information to reproduce
    meta = {'n_example': n_example,
            'chunk_size': args.chunk_size,
            'max_seq_len': args.max_seq_len,
            'reverse': args.reverse,
            'two_turn': args.two_turn}
    with open(join(dirname(db_path), 'meta.json'), 'w') as writer:
        json.dump(meta, writer, indent=4)
    torch.save(toker, join(dirname(db_path), 'tokenizer.pt'))
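
A minimal command-line driver for main, as a sketch: the flag names mirror the args.* attributes the snippet reads (corpus, max_seq_len, chunk_size, reverse, two_turn), but the defaults and help strings are illustrative rather than taken from the original script.

import argparse

if __name__ == '__main__':
    parser = argparse.ArgumentParser()
    parser.add_argument('--corpus', required=True,
                        help='input text corpus; the DB is created next to it')
    parser.add_argument('--max_seq_len', type=int, default=128)
    parser.add_argument('--chunk_size', type=int, default=65536,
                        help='examples per compressed DB chunk')
    parser.add_argument('--reverse', action='store_true',
                        help='store the turns of each line in reversed order')
    parser.add_argument('--two_turn', action='store_true',
                        help='keep only the first two turns')
    main(parser.parse_args())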
Example #2
# Assumed reconstruction of the statement this excerpt starts in the middle of:
# output_dir is built from args.output_dir and the run hyperparameters.
output_dir = join(args.output_dir,
                  'GPT2.{}.{}.{}gpu.{}'.format(args.learning_rate,
                                               args.train_batch_size,
                                               n_gpu, timestamp))
log_dir = (args.log_dir
           if args.log_dir is not None and len(args.log_dir) > 0
           else output_dir)
if args.local_rank == -1:
    os.makedirs(output_dir, exist_ok=True)

logger.info('Input Argument Information')
args_dict = vars(args)
for a in args_dict:
    logger.info('%-28s  %s', a, args_dict[a])

#########################################################################
# Prepare Data Set
#########################################################################
enc = GPT2Tokenizer.from_pretrained(args.model_name_or_path)

config = GPT2Config.from_json_file(join(args.model_name_or_path,
                                        'config.json'))

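# local_rank == -1 means single-process (non-distributed) training, so the
# plain bucketing loader reads the whole training file.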
if args.local_rank == -1:
    train_dataloader = BucketingDataLoader(args.train_input_file,
                                           args.train_batch_size,
                                           args.max_seq_length)
else:
    pass
    # train_dataloader = DistributedBucketingDataLoader(
    #     get_rank(), get_world_size(),
    #     args.train_input_file, args.train_batch_size,
    #     args.max_seq_length)
Example #3
# Assumed reconstruction of the statement this excerpt starts in the middle of:
# output_dir is built from args.output_dir and the run hyperparameters.
output_dir = join(args.output_dir,
                  'GPT2.{}.{}.{}gpu.{}'.format(args.learning_rate,
                                               args.train_batch_size, n_gpu,
                                               timestamp))
log_dir = (args.log_dir
           if args.log_dir is not None and len(args.log_dir) > 0
           else output_dir)
if args.local_rank == -1 or get_rank() == 0:
    os.makedirs(output_dir, exist_ok=True)

logger.info('Input Argument Information')
args_dict = vars(args)
for a in args_dict:
    logger.info('%-28s  %s', a, args_dict[a])


#########################################################################
# Prepare Data Set
#########################################################################
enc = GPT2Tokenizer.from_pretrained('gpt2')

config = GPT2Config.from_json_file(
    join(args.model_name_or_path, 'config.json'))

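# Non-distributed runs use the plain bucketing loader; distributed runs shard
# the data across workers by rank and world size.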
if args.local_rank == -1:
    train_dataloader = BucketingDataLoader(args.train_input_file,
                                           args.train_batch_size,
                                           args.max_seq_length)
else:
    train_dataloader = DistributedBucketingDataLoader(
        get_rank(), get_world_size(),
        args.train_input_file, args.train_batch_size,
        args.max_seq_length)

eval_dataloader_loss = DynamicBatchingLoader(