def main(args):
    toker = GPT2Tokenizer.from_pretrained('gpt2')
    attrs = []
    if args.reverse:
        attrs.append('reverse')
    if args.two_turn:
        attrs.append('2turn')
    if attrs:
        temp = '.'.join(attrs)
        db_path = (str(args.corpus[:-4]) + '.' + str(args.max_seq_len)
                   + 'len.' + temp + '.db/db')
    else:
        db_path = (str(args.corpus[:-4]) + '.' + str(args.max_seq_len)
                   + 'len.db/db')
    if exists(dirname(db_path)):
        raise ValueError('Found existing DB, please backup')
    else:
        os.makedirs(dirname(db_path))

    with open(args.corpus, "r", encoding="utf-8") as reader, \
            shelve.open(db_path, 'n') as db:
        chunk = []
        n_chunk = 0
        n_example = 0
        for line in tqdm(reader, total=_get_file_len(args.corpus)):
            try:
                if len(chunk) >= args.chunk_size:
                    # save and renew chunk
                    db['chunk_' + str(n_chunk)] = gzip.compress(
                        json.dumps(chunk[:args.chunk_size]).encode('utf-8'))
                    chunk = chunk[args.chunk_size:]
                    n_chunk += 1

                weights, inputs = _get_inputs_from_text(line, toker)
                if args.reverse:
                    weights = list(reversed(weights))
                    inputs = list(reversed(inputs))
                if args.two_turn:
                    weights = weights[:2]
                    inputs = inputs[:2]
                if len(weights) < 2:
                    continue

                features = _make_features(n_example, weights, inputs,
                                           toker, args.max_seq_len)
                for feature in features:
                    chunk.append(vars(feature))
                    n_example += 1
            except Exception as e:
                print('!!! prepro exception !!!', e)
                continue

        # save last chunk
        db['chunk_' + str(n_chunk)] = gzip.compress(
            json.dumps(chunk).encode('utf-8'))

    # save relevant information to reproduce
    meta = {'n_example': n_example,
            'chunk_size': args.chunk_size,
            'max_seq_len': args.max_seq_len,
            'reverse': args.reverse,
            'two_turn': args.two_turn}
    with open(join(dirname(db_path), 'meta.json'), 'w') as writer:
        json.dump(meta, writer, indent=4)
    torch.save(toker, join(dirname(db_path), 'tokenizer.pt'))
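# The sketch below is NOT part of the original script; it is a minimal illustration
# of how one chunk written by main() could be read back. main() stores each
# 'chunk_<i>' key in the shelve DB as a gzip-compressed JSON list of feature dicts.
# The helper name and the example path are hypothetical.
import gzip
import json
import shelve


def load_chunk(db_path, i):
    """Decompress and decode one chunk of feature dicts from the shelve DB."""
    with shelve.open(db_path, 'r') as db:
        return json.loads(
            gzip.decompress(db['chunk_' + str(i)]).decode('utf-8'))


# Example, assuming a corpus named 'train.tsv' preprocessed with max_seq_len=128,
# which main() would store under 'train.128len.db/db':
# features = load_chunk('train.128len.db/db', 0)
# print(len(features), sorted(features[0].keys()))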
output_dir = join(args.output_dir,
                  'GPT2.{}.{}.{}gpu.{}'.format(args.learning_rate,
                                               args.train_batch_size,
                                               n_gpu, timestamp))
log_dir = args.log_dir if args.log_dir is not None and len(args.log_dir) > 0 \
    else output_dir
if args.local_rank == -1:
    os.makedirs(output_dir, exist_ok=True)

logger.info('Input Argument Information')
args_dict = vars(args)
for a in args_dict:
    logger.info('%-28s %s' % (a, args_dict[a]))

#########################################################################
# Prepare Data Set
#########################################################################
enc = GPT2Tokenizer.from_pretrained(args.model_name_or_path)

config = GPT2Config.from_json_file(
    join(args.model_name_or_path, 'config.json'))

if args.local_rank == -1:
    train_dataloader = BucketingDataLoader(args.train_input_file,
                                           args.train_batch_size,
                                           args.max_seq_length)
else:
    pass
    # train_dataloader = DistributedBucketingDataLoader(
    #     get_rank(), get_world_size(),
    #     args.train_input_file, args.train_batch_size,
    #     args.max_seq_length)
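# BucketingDataLoader is project code whose internals are not shown above. The
# generic idea behind length bucketing is sketched below purely for illustration:
# sort examples by length, cut consecutive runs into batches so sequences in a
# batch need little padding, then shuffle batch order. Names and shapes here are
# assumptions, not the actual implementation.
import random


def bucket_batches(examples, batch_size, key=len):
    """Group length-sorted examples into batches, then shuffle batch order."""
    ordered = sorted(examples, key=key)
    batches = [ordered[i:i + batch_size]
               for i in range(0, len(ordered), batch_size)]
    random.shuffle(batches)  # keep some randomness across training steps
    return batches


# toy usage: token-id lists of varying length end up grouped with similar lengths
data = [[0] * n for n in (3, 17, 5, 12, 8, 20)]
for b in bucket_batches(data, batch_size=2):
    print([len(x) for x in b])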
output_dir = join(args.output_dir,
                  'GPT2.{}.{}.{}gpu.{}'.format(args.learning_rate,
                                               args.train_batch_size,
                                               n_gpu, timestamp))
log_dir = args.log_dir if args.log_dir is not None and len(args.log_dir) > 0 \
    else output_dir
if args.local_rank == -1 or get_rank() == 0:
    os.makedirs(output_dir, exist_ok=True)

logger.info('Input Argument Information')
args_dict = vars(args)
for a in args_dict:
    logger.info('%-28s %s' % (a, args_dict[a]))

#########################################################################
# Prepare Data Set
#########################################################################
enc = GPT2Tokenizer.from_pretrained('gpt2')

config = GPT2Config.from_json_file(
    join(args.model_name_or_path, 'config.json'))

if args.local_rank == -1:
    train_dataloader = BucketingDataLoader(args.train_input_file,
                                           args.train_batch_size,
                                           args.max_seq_length)
else:
    train_dataloader = DistributedBucketingDataLoader(
        get_rank(), get_world_size(),
        args.train_input_file, args.train_batch_size,
        args.max_seq_length)

eval_dataloader_loss = DynamicBatchingLoader(
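# DistributedBucketingDataLoader is likewise project code. A common way such a
# loader divides work, sketched here only as an assumption about the technique,
# is to hand every world_size-th chunk of the shelve DB to each rank so that no
# two processes read the same examples. The helper below is hypothetical.
def chunk_indices_for_rank(n_chunks, rank, world_size):
    """Return the chunk ids a given process should consume."""
    return [i for i in range(n_chunks) if i % world_size == rank]


# toy usage: 10 chunks split across 4 workers
for r in range(4):
    print(r, chunk_indices_for_rank(10, rank=r, world_size=4))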