from torch.utils.data import DataLoader
# PretrainedTokenizer, Tokenizer, TOKENIZER_CLASSES, create_examples and Trainer
# are project-local modules from this repo (import paths omitted in the original).


def main(args):
    print(args)

    # Load tokenizer: either a pretrained SentencePiece model or one of the
    # registered tokenizer classes wrapped with the project's Tokenizer
    if args.tokenizer == 'sentencepiece':
        tokenizer = PretrainedTokenizer(pretrained_model=args.pretrained_model,
                                        vocab_file=args.vocab_file)
    else:
        tokenizer = TOKENIZER_CLASSES[args.tokenizer]()
        tokenizer = Tokenizer(tokenizer=tokenizer, vocab_file=args.vocab_file)

    # Build DataLoader
    train_dataset = create_examples(args, tokenizer, mode='train')
    test_dataset = create_examples(args, tokenizer, mode='test')
    train_loader = DataLoader(train_dataset, batch_size=args.batch_size, shuffle=True)
    test_loader = DataLoader(test_dataset, batch_size=args.batch_size, shuffle=False)  # no need to shuffle for evaluation

    # Build Trainer
    trainer = Trainer(args, train_loader, test_loader, tokenizer)

    # Train & Validate
    for epoch in range(1, args.epochs + 1):
        trainer.train(epoch)
        trainer.validate(epoch)
        trainer.save(epoch, args.output_model_prefix)
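# --- Usage sketch (hypothetical) ----------------------------------------------
# The original script defines its own argument parser elsewhere; the minimal
# entry point below only mirrors the attributes main() reads from `args`, so
# every flag name and default here is an assumption, not the repo's actual CLI.
if __name__ == '__main__':
    import argparse

    parser = argparse.ArgumentParser()
    parser.add_argument('--tokenizer', default='sentencepiece',
                        help="'sentencepiece' or a key of TOKENIZER_CLASSES")
    parser.add_argument('--pretrained_model', required=True)
    parser.add_argument('--vocab_file', required=True)
    parser.add_argument('--batch_size', type=int, default=32)
    parser.add_argument('--epochs', type=int, default=10)
    parser.add_argument('--output_model_prefix', default='model')
    main(parser.parse_args())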
import torch
import torch.distributed as dist
from torch.utils.data import DataLoader, RandomSampler
from torch.utils.data.distributed import DistributedSampler
# PretrainedTokenizer, create_examples and Trainer are project-local modules
# from this repo (import paths omitted in the original).


def main(args):
    print(args)

    # Setup CUDA, GPU & distributed training
    if args.distributed:
        torch.cuda.set_device(args.local_rank)
        dist.init_process_group(backend='nccl')

    # Load pretrained tokenizer
    tokenizer = PretrainedTokenizer(pretrained_model=args.pretrained_sp_model,
                                    vocab_file=args.vocab_file)

    # Build DataLoader. local_rank == -1 means single-process training; any
    # other rank belongs to a distributed run and must use DistributedSampler
    # so that each process sees a distinct shard of the data.
    train_dataset = create_examples(args, tokenizer, mode='train')
    train_sampler = (RandomSampler(train_dataset) if args.local_rank == -1
                     else DistributedSampler(train_dataset))
    train_loader = DataLoader(train_dataset,
                              sampler=train_sampler,
                              batch_size=args.batch_size,
                              num_workers=args.n_workers)
    if args.do_eval:
        test_dataset = create_examples(args, tokenizer, mode='test')
        test_sampler = (RandomSampler(test_dataset) if args.local_rank == -1
                        else DistributedSampler(test_dataset))
        test_loader = DataLoader(test_dataset,
                                 sampler=test_sampler,
                                 batch_size=args.batch_size,
                                 num_workers=args.n_workers)

    # Build Trainer
    trainer = Trainer(args=args,
                      train_loader=train_loader,
                      test_loader=test_loader if args.do_eval else None,
                      tokenizer=tokenizer)

    # Train
    for epoch in range(1, args.epochs + 1):
        # NOTE: with DistributedSampler, call train_sampler.set_epoch(epoch)
        # here if the Trainer does not already do so, or every epoch reuses
        # the same shuffling order.
        trainer.train(epoch)
        trainer.save(epoch, args.output_model_prefix)
        if args.do_eval:
            trainer.evaluate(epoch)
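# --- Launch sketch (assumption) -------------------------------------------------
# The distributed branch above expects one process per GPU, each started by a
# launcher that supplies --local_rank. With torch's legacy launcher that is:
#
#   python -m torch.distributed.launch --nproc_per_node=4 \
#       train.py --distributed --batch_size 32 --n_workers 4 ...
#
# A plain single-GPU run keeps local_rank at -1 so RandomSampler is used. The
# script name and flag values here are placeholders, not the repo's documented CLI.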
from torch.utils.data import DataLoader
# PretrainedTokenizer, create_examples and Trainer are project-local modules
# from this repo (import paths omitted in the original).


def main(args):
    print(args)

    # Load one tokenizer per side of the translation pair
    tokenizer_src = PretrainedTokenizer(pretrained_model=args.pretrained_model_src,
                                        vocab_file=args.vocab_file_src)
    tokenizer_tgt = PretrainedTokenizer(pretrained_model=args.pretrained_model_tgt,
                                        vocab_file=args.vocab_file_tgt)

    # Build DataLoader
    train_dataset = create_examples(args, tokenizer_src, tokenizer_tgt, mode='train')
    test_dataset = create_examples(args, tokenizer_src, tokenizer_tgt, mode='test')
    train_loader = DataLoader(train_dataset, batch_size=args.batch_size, shuffle=True)
    test_loader = DataLoader(test_dataset, batch_size=args.batch_size, shuffle=False)  # no need to shuffle for evaluation

    # Build Trainer
    trainer = Trainer(args, train_loader, test_loader, tokenizer_src, tokenizer_tgt)

    # Train & Validate
    for epoch in range(1, args.epochs + 1):
        trainer.train(epoch)
        trainer.validate(epoch)
        trainer.save(epoch, args.output_model_prefix)
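# --- Tokenizer round-trip sketch (assumption) -----------------------------------
# PretrainedTokenizer wraps a pretrained SentencePiece model (see the
# 'sentencepiece' option in the first script). The underlying round trip with the
# real `sentencepiece` package looks like this; the model path is a placeholder:
import sentencepiece as spm

sp_src = spm.SentencePieceProcessor()
sp_src.Load('src.model')                  # e.g. the file behind args.pretrained_model_src
ids = sp_src.EncodeAsIds('hello world')   # source text -> subword ids for the encoder
text = sp_src.DecodeIds(ids)              # ids -> text; the target side decodes the same way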
from torch.utils.data import DataLoader
# set_seeds, create_examples and Trainer are project-local helpers from this
# repo (import paths omitted in the original).


def main(args):
    print(args)
    set_seeds()  # fix RNG seeds for reproducibility

    # Build DataLoader
    train_dataset, test_dataset = create_examples(args)
    train_loader = DataLoader(train_dataset, batch_size=args.batch_size, shuffle=True)
    test_loader = DataLoader(test_dataset, batch_size=args.batch_size, shuffle=False)  # no need to shuffle for evaluation

    # Build Trainer
    trainer = Trainer(args, train_loader, test_loader)

    # Warm up: pretraining epochs before the main training loop
    for epoch in range(1, args.pretrain + 1):
        trainer.pretrain(epoch)

    # Train & Validate
    for epoch in range(1, args.epochs + 1):
        trainer.train(epoch)
        trainer.validate(epoch)
        trainer.save(epoch, args.output_model_prefix)
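# --- set_seeds sketch (assumption) ----------------------------------------------
# set_seeds() is defined elsewhere in the repo; a typical implementation seeds
# every RNG the training loop touches. The seed value and the exact knobs below
# are assumptions, not the repo's actual code:
import random
import numpy as np
import torch

def set_seeds(seed=42):
    random.seed(seed)                  # Python's built-in RNG
    np.random.seed(seed)               # NumPy RNG
    torch.manual_seed(seed)            # PyTorch CPU RNG
    torch.cuda.manual_seed_all(seed)   # all CUDA RNGs (no-op without a GPU)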