def main():
    parser = argparse.ArgumentParser(formatter_class=argparse.ArgumentDefaultsHelpFormatter)

    finetune_opts(parser)

    parser.add_argument("--max_choices_num", default=4, type=int,
                        help="The maximum number of candidate answers; instances with fewer choices are padded to this number.")

    tokenizer_opts(parser)

    adv_opts(parser)

    args = parser.parse_args()

    args.labels_num = args.max_choices_num

    # Load the hyperparameters from the config file.
    args = load_hyperparam(args)

    set_seed(args.seed)

    # Build tokenizer.
    args.tokenizer = str2tokenizer[args.tokenizer](args)

    # Build multiple choice model.
    model = MultipleChoice(args)

    # Load or initialize parameters.
    load_or_initialize_parameters(args, model)

    # Get logger.
    args.logger = init_logger(args)

    args.device = torch.device("cuda" if torch.cuda.is_available() else "cpu")
    model = model.to(args.device)

    # Training phase.
    trainset = read_dataset(args, args.train_path)
    instances_num = len(trainset)
    batch_size = args.batch_size

    args.train_steps = int(instances_num * args.epochs_num / batch_size) + 1

    args.logger.info("Batch size: {}".format(batch_size))
    args.logger.info("The number of training instances: {}".format(instances_num))

    optimizer, scheduler = build_optimizer(args, model)

    if args.fp16:
        try:
            from apex import amp
        except ImportError:
            raise ImportError("Please install apex from https://www.github.com/nvidia/apex to use fp16 training.")
        model, optimizer = amp.initialize(model, optimizer, opt_level=args.fp16_opt_level)
        args.amp = amp

    if torch.cuda.device_count() > 1:
        args.logger.info("{} GPUs are available. Let's use them.".format(torch.cuda.device_count()))
        model = torch.nn.DataParallel(model)
    args.model = model

    if args.use_adv:
        args.adv_method = str2adv[args.adv_type](model)

    total_loss, result, best_result = 0.0, 0.0, 0.0

    args.logger.info("Start training.")

    for epoch in range(1, args.epochs_num + 1):
        random.shuffle(trainset)
        src = torch.LongTensor([example[0] for example in trainset])
        tgt = torch.LongTensor([example[1] for example in trainset])
        seg = torch.LongTensor([example[2] for example in trainset])

        model.train()
        for i, (src_batch, tgt_batch, seg_batch, _) in enumerate(batch_loader(batch_size, src, tgt, seg)):
            loss = train_model(args, model, optimizer, scheduler, src_batch, tgt_batch, seg_batch)
            total_loss += loss.item()
            if (i + 1) % args.report_steps == 0:
                args.logger.info("Epoch id: {}, Training steps: {}, Avg loss: {:.3f}".format(epoch, i + 1, total_loss / args.report_steps))
                total_loss = 0.0

        result = evaluate(args, read_dataset(args, args.dev_path))
        if result[0] > best_result:
            best_result = result[0]
            save_model(model, args.output_model_path)

    # Evaluation phase.
    if args.test_path is not None:
        args.logger.info("Test set evaluation.")
        if torch.cuda.device_count() > 1:
            args.model.module.load_state_dict(torch.load(args.output_model_path))
        else:
            args.model.load_state_dict(torch.load(args.output_model_path))
        evaluate(args, read_dataset(args, args.test_path))
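# A minimal sketch of the batch_loader generator consumed in the loop above,
# written as a hypothetical helper (the repo's own implementation may differ):
# it slices the pre-built src/tgt/seg tensors into consecutive batches and
# yields a trailing placeholder, which other scripts use for soft targets and
# which is discarded here via "_".
def batch_loader_sketch(batch_size, src, tgt, seg):
    instances_num = src.size(0)
    for i in range(0, instances_num, batch_size):
        yield (src[i:i + batch_size],
               tgt[i:i + batch_size],
               seg[i:i + batch_size],
               None)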
def main():
    parser = argparse.ArgumentParser(formatter_class=argparse.ArgumentDefaultsHelpFormatter)

    finetune_opts(parser)

    parser.add_argument("--vocab_path", default=None, type=str,
                        help="Path of the vocabulary file.")
    parser.add_argument("--spm_model_path", default=None, type=str,
                        help="Path of the sentence piece model.")
    parser.add_argument("--doc_stride", default=128, type=int,
                        help="When splitting up a long document into chunks, how much stride to take between chunks.")

    args = parser.parse_args()

    # Load the hyperparameters from the config file.
    args = load_hyperparam(args)

    set_seed(args.seed)

    # Build tokenizer.
    args.tokenizer = CharTokenizer(args)

    # Build machine reading comprehension model.
    model = MachineReadingComprehension(args)

    # Load or initialize parameters.
    load_or_initialize_parameters(args, model)

    # Get logger.
    args.logger = init_logger(args)

    args.device = torch.device("cuda" if torch.cuda.is_available() else "cpu")
    model = model.to(args.device)

    # Training phase.
    batch_size = args.batch_size
    args.logger.info("Batch size: {}".format(batch_size))
    trainset, _ = read_dataset(args, args.train_path)
    instances_num = len(trainset)

    args.train_steps = int(instances_num * args.epochs_num / batch_size) + 1

    args.logger.info("The number of training instances: {}".format(instances_num))

    optimizer, scheduler = build_optimizer(args, model)

    if args.fp16:
        try:
            from apex import amp
        except ImportError:
            raise ImportError("Please install apex from https://www.github.com/nvidia/apex to use fp16 training.")
        model, optimizer = amp.initialize(model, optimizer, opt_level=args.fp16_opt_level)

    if torch.cuda.device_count() > 1:
        args.logger.info("{} GPUs are available. Let's use them.".format(torch.cuda.device_count()))
        model = torch.nn.DataParallel(model)
    args.model = model

    total_loss = 0.0
    result = 0.0
    best_result = 0.0

    args.logger.info("Start training.")

    for epoch in range(1, args.epochs_num + 1):
        random.shuffle(trainset)
        src = torch.LongTensor([sample[0] for sample in trainset])
        seg = torch.LongTensor([sample[1] for sample in trainset])
        start_position = torch.LongTensor([sample[2] for sample in trainset])
        end_position = torch.LongTensor([sample[3] for sample in trainset])

        model.train()
        for i, (src_batch, seg_batch, start_position_batch, end_position_batch) in enumerate(
                batch_loader(batch_size, src, seg, start_position, end_position)):
            loss = train(args, model, optimizer, scheduler,
                         src_batch, seg_batch, start_position_batch, end_position_batch)
            total_loss += loss.item()
            if (i + 1) % args.report_steps == 0:
                args.logger.info("Epoch id: {}, Training steps: {}, Avg loss: {:.3f}".format(epoch, i + 1, total_loss / args.report_steps))
                total_loss = 0.0

        result = evaluate(args, *read_dataset(args, args.dev_path))
        if result > best_result:
            best_result = result
            save_model(model, args.output_model_path)

    # Evaluation phase.
    if args.test_path is not None:
        args.logger.info("Test set evaluation.")
        if torch.cuda.device_count() > 1:
            args.model.module.load_state_dict(torch.load(args.output_model_path))
        else:
            args.model.load_state_dict(torch.load(args.output_model_path))
        evaluate(args, *read_dataset(args, args.test_path))
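# An illustrative sketch of the sliding-window chunking that --doc_stride
# controls (the helper name and exact splitting logic are assumptions; the
# repo's read_dataset may differ): a long context is cut into overlapping
# chunks of at most max_len tokens, and each new chunk starts doc_stride
# tokens after the previous one, so an answer near a chunk boundary is still
# fully contained in at least one chunk.
def split_context_sketch(tokens, max_len, doc_stride):
    chunks = []
    start = 0
    while True:
        chunks.append(tokens[start:start + max_len])
        if start + max_len >= len(tokens):
            break
        start += doc_stride
    return chunks

# Example: 300 tokens with max_len=256 and doc_stride=128 give two chunks,
# starting at positions 0 and 128.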
def main():
    parser = argparse.ArgumentParser(formatter_class=argparse.ArgumentDefaultsHelpFormatter)

    # Path options.
    parser.add_argument("--pretrained_model_path", default=None, type=str,
                        help="Path of the pretrained model.")
    parser.add_argument("--dataset_path_list", default=[], nargs='+', type=str,
                        help="Dataset path list.")
    parser.add_argument("--output_model_path", default="models/multitask_classifier_model.bin", type=str,
                        help="Path of the output model.")
    parser.add_argument("--config_path", default="models/bert/base_config.json", type=str,
                        help="Path of the config file.")

    # Model options.
    model_opts(parser)
    parser.add_argument("--pooling", choices=["mean", "max", "first", "last"], default="first",
                        help="Pooling type.")

    # Tokenizer options.
    tokenizer_opts(parser)

    # Optimizer options.
    optimization_opts(parser)

    # Training options.
    training_opts(parser)

    adv_opts(parser)

    args = parser.parse_args()

    args.soft_targets = False

    # Load the hyperparameters from the config file.
    args = load_hyperparam(args)

    set_seed(args.seed)

    # Count the number of labels.
    args.labels_num_list = [count_labels_num(os.path.join(path, "train.tsv")) for path in args.dataset_path_list]

    args.datasets_num = len(args.dataset_path_list)

    # Build tokenizer.
    args.tokenizer = str2tokenizer[args.tokenizer](args)

    # Build multi-task classification model.
    model = MultitaskClassifier(args)

    # Load or initialize parameters.
    load_or_initialize_parameters(args, model)

    # Get logger.
    args.logger = init_logger(args)

    args.device = torch.device("cuda" if torch.cuda.is_available() else "cpu")
    model = model.to(args.device)
    args.model = model

    if args.use_adv:
        args.adv_method = str2adv[args.adv_type](model)

    # Training phase.
    dataset_list = [read_dataset(args, os.path.join(path, "train.tsv")) for path in args.dataset_path_list]
    packed_dataset_list = [pack_dataset(dataset, i, args.batch_size) for i, dataset in enumerate(dataset_list)]

    packed_dataset_all = []
    for packed_dataset in packed_dataset_list:
        packed_dataset_all += packed_dataset

    instances_num = sum([len(dataset) for dataset in dataset_list])
    batch_size = args.batch_size

    args.train_steps = int(instances_num * args.epochs_num / batch_size) + 1

    args.logger.info("Batch size: {}".format(batch_size))
    args.logger.info("The number of training instances: {}".format(instances_num))

    optimizer, scheduler = build_optimizer(args, model)

    if args.fp16:
        try:
            from apex import amp
        except ImportError:
            raise ImportError("Please install apex from https://www.github.com/nvidia/apex to use fp16 training.")
        model, optimizer = amp.initialize(model, optimizer, opt_level=args.fp16_opt_level)
        args.amp = amp

    if torch.cuda.device_count() > 1:
        args.logger.info("{} GPUs are available. Let's use them.".format(torch.cuda.device_count()))
        model = torch.nn.DataParallel(model)

    total_loss, result, best_result = 0.0, 0.0, 0.0

    args.logger.info("Start training.")

    for epoch in range(1, args.epochs_num + 1):
        random.shuffle(packed_dataset_all)
        model.train()
        for i, (dataset_id, src_batch, tgt_batch, seg_batch) in enumerate(packed_dataset_all):
            if hasattr(model, "module"):
                model.module.change_dataset(dataset_id)
            else:
                model.change_dataset(dataset_id)
            loss = train_model(args, model, optimizer, scheduler, src_batch, tgt_batch, seg_batch, None)
            total_loss += loss.item()
            if (i + 1) % args.report_steps == 0:
                args.logger.info("Epoch id: {}, Training steps: {}, Avg loss: {:.3f}".format(epoch, i + 1, total_loss / args.report_steps))
                total_loss = 0.0

        for dataset_id, path in enumerate(args.dataset_path_list):
            args.labels_num = args.labels_num_list[dataset_id]
            if hasattr(model, "module"):
                model.module.change_dataset(dataset_id)
            else:
                model.change_dataset(dataset_id)
            result = evaluate(args, read_dataset(args, os.path.join(path, "dev.tsv")))

        save_model(model, args.output_model_path)
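# A minimal sketch of what pack_dataset is assumed to produce: the training
# loop above consumes (dataset_id, src_batch, tgt_batch, seg_batch) tuples, so
# the helper presumably groups each task's instances into fixed-size batches
# tagged with the dataset id. The name and details below are hypothetical; the
# repo's own implementation may differ.
def pack_dataset_sketch(dataset, dataset_id, batch_size):
    packed = []
    for i in range(0, len(dataset), batch_size):
        batch = dataset[i:i + batch_size]
        src = torch.LongTensor([sample[0] for sample in batch])
        tgt = torch.LongTensor([sample[1] for sample in batch])
        seg = torch.LongTensor([sample[2] for sample in batch])
        packed.append((dataset_id, src, tgt, seg))
    return packed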
def main():
    parser = argparse.ArgumentParser(formatter_class=argparse.ArgumentDefaultsHelpFormatter)

    finetune_opts(parser)

    tokenizer_opts(parser)

    parser.add_argument("--temperature", type=float, default=0.05)
    parser.add_argument("--eval_steps", type=int, default=200, help="Evaluate frequency.")

    args = parser.parse_args()

    # Load the hyperparameters from the config file.
    args = load_hyperparam(args)

    set_seed(args.seed)

    # Build tokenizer.
    args.tokenizer = str2tokenizer[args.tokenizer](args)

    # Build classification model.
    model = SimCSE(args)

    # Load or initialize parameters.
    load_or_initialize_parameters(args, model)

    # Get logger.
    args.logger = init_logger(args)

    args.device = torch.device("cuda" if torch.cuda.is_available() else "cpu")
    model = model.to(args.device)

    # Training phase.
    trainset = read_dataset(args, args.train_path)
    instances_num = len(trainset)
    batch_size = args.batch_size

    args.train_steps = int(instances_num * args.epochs_num / batch_size) + 1

    args.logger.info("Batch size: {}".format(batch_size))
    args.logger.info("The number of training instances: {}".format(instances_num))

    optimizer, scheduler = build_optimizer(args, model)

    if args.fp16:
        try:
            from apex import amp
        except ImportError:
            raise ImportError("Please install apex from https://www.github.com/nvidia/apex to use fp16 training.")
        model, optimizer = amp.initialize(model, optimizer, opt_level=args.fp16_opt_level)
        args.amp = amp

    if torch.cuda.device_count() > 1:
        args.logger.info("{} GPUs are available. Let's use them.".format(torch.cuda.device_count()))
        model = torch.nn.DataParallel(model)
    args.model = model

    total_loss, result, best_result = 0.0, 0.0, 0.0

    args.logger.info("Start training.")

    for epoch in range(1, args.epochs_num + 1):
        random.shuffle(trainset)
        src_a = torch.LongTensor([example[0][0] for example in trainset])
        src_b = torch.LongTensor([example[0][1] for example in trainset])
        tgt = torch.FloatTensor([example[1] for example in trainset])
        seg_a = torch.LongTensor([example[2][0] for example in trainset])
        seg_b = torch.LongTensor([example[2][1] for example in trainset])

        model.train()
        for i, (src_batch, tgt_batch, seg_batch) in enumerate(
                batch_loader(batch_size, (src_a, src_b), tgt, (seg_a, seg_b))):
            model.zero_grad()

            src_a_batch, src_b_batch = src_batch
            seg_a_batch, seg_b_batch = seg_batch

            src_a_batch = src_a_batch.to(args.device)
            src_b_batch = src_b_batch.to(args.device)
            seg_a_batch = seg_a_batch.to(args.device)
            seg_b_batch = seg_b_batch.to(args.device)

            features_0, features_1 = model((src_a_batch, src_b_batch), (seg_a_batch, seg_b_batch))

            similarity_matrix = similarity(features_0, features_1, args.temperature)
            tgt_batch = torch.arange(similarity_matrix.size(0), device=similarity_matrix.device, dtype=torch.long)
            loss = nn.CrossEntropyLoss()(similarity_matrix, tgt_batch)

            if args.fp16:
                with args.amp.scale_loss(loss, optimizer) as scaled_loss:
                    scaled_loss.backward()
            else:
                loss.backward()

            optimizer.step()
            scheduler.step()

            total_loss += loss.item()

            if (i + 1) % args.report_steps == 0:
                args.logger.info("Epoch id: {}, Training steps: {}, Avg loss: {:.3f}".format(epoch, i + 1, total_loss / args.report_steps))
                total_loss = 0.0

            if (i + 1) % args.eval_steps == 0 or (i + 1) == math.ceil(instances_num / batch_size):
                result = evaluate(args, read_dataset(args, args.dev_path))
                args.logger.info("Epoch id: {}, Training steps: {}, Evaluate result: {}, Best result: {}".format(epoch, i + 1, result, best_result))
                if result > best_result:
                    best_result = result
                    save_model(model, args.output_model_path)
                    args.logger.info("It is the best model until now. Save it to {}".format(args.output_model_path))
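# A minimal sketch of the `similarity` call used above, assuming it returns a
# temperature-scaled cosine-similarity matrix between the two views of the
# batch (the repo's own implementation may differ). Because the targets are
# set to torch.arange(batch_size), the cross-entropy loss treats the diagonal
# entries (each sentence paired with its own second view) as positives and
# every other entry in the same row as an in-batch negative.
def similarity_sketch(features_0, features_1, temperature):
    # features_0, features_1: [batch_size, hidden_size]
    features_0 = nn.functional.normalize(features_0, dim=-1)
    features_1 = nn.functional.normalize(features_1, dim=-1)
    return torch.matmul(features_0, features_1.transpose(0, 1)) / temperature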
def worker(proc_id, gpu_ranks, args, model):
    """
    Args:
        proc_id: The id of GPU for single GPU mode;
                 the id of process (and GPU) for multiprocessing distributed mode.
        gpu_ranks: List of ranks of each process.
    """
    set_seed(args.seed)

    # Get logger.
    args.logger = init_logger(args)

    if args.deepspeed:
        import deepspeed
        deepspeed.init_distributed(dist_backend=args.backend)
        rank = dist.get_rank()
        gpu_id = proc_id
    elif args.dist_train:
        rank = gpu_ranks[proc_id]
        gpu_id = proc_id
    elif args.single_gpu:
        rank = None
        gpu_id = proc_id
    else:
        rank = None
        gpu_id = None

    if args.dist_train:
        train_loader = str2dataloader[args.data_processor](args, args.dataset_path, args.batch_size, rank, args.world_size, True)
    else:
        train_loader = str2dataloader[args.data_processor](args, args.dataset_path, args.batch_size, 0, 1, True)

    # Build optimizer.
    param_optimizer = list(model.named_parameters())
    no_decay = ["bias", "gamma", "beta"]
    optimizer_grouped_parameters = [
        {"params": [p for n, p in param_optimizer if not any(nd in n for nd in no_decay)], "weight_decay": 0.01},
        {"params": [p for n, p in param_optimizer if any(nd in n for nd in no_decay)], "weight_decay": 0.0}
    ]

    if args.optimizer in ["adamw"]:
        custom_optimizer = str2optimizer[args.optimizer](optimizer_grouped_parameters, lr=args.learning_rate, correct_bias=False)
    else:
        custom_optimizer = str2optimizer[args.optimizer](optimizer_grouped_parameters, lr=args.learning_rate,
                                                         scale_parameter=False, relative_step=False)

    if args.scheduler in ["constant"]:
        custom_scheduler = str2scheduler[args.scheduler](custom_optimizer)
    elif args.scheduler in ["constant_with_warmup"]:
        custom_scheduler = str2scheduler[args.scheduler](custom_optimizer, args.total_steps * args.warmup)
    else:
        custom_scheduler = str2scheduler[args.scheduler](custom_optimizer, args.total_steps * args.warmup, args.total_steps)

    if args.deepspeed:
        model, optimizer, _, scheduler = deepspeed.initialize(
            model=model,
            model_parameters=optimizer_grouped_parameters,
            args=args,
            optimizer=custom_optimizer,
            lr_scheduler=custom_scheduler,
            mpu=None,
            dist_init_required=False)
    else:
        if gpu_id is not None:
            model.cuda(gpu_id)
        optimizer = custom_optimizer
        scheduler = custom_scheduler
        if args.fp16:
            try:
                from apex import amp
            except ImportError:
                raise ImportError("Please install apex from https://www.github.com/nvidia/apex to use fp16 training.")
            model, optimizer = amp.initialize(model, optimizer, opt_level=args.fp16_opt_level)
            args.amp = amp

        if args.dist_train:
            # Initialize multiprocessing distributed training environment.
            dist.init_process_group(backend=args.backend,
                                    init_method=args.master_ip,
                                    world_size=args.world_size,
                                    rank=rank)
            model = DistributedDataParallel(model, device_ids=[gpu_id], find_unused_parameters=True)
            args.logger.info("Worker %d is training ... " % rank)
        else:
            args.logger.info("Worker is training ...")

    trainer = str2trainer[args.data_processor](args)
    trainer.train(args, gpu_id, rank, train_loader, model, optimizer, scheduler)
def main():
    parser = argparse.ArgumentParser(formatter_class=argparse.ArgumentDefaultsHelpFormatter)

    finetune_opts(parser)

    parser.add_argument("--vocab_path", default=None, type=str,
                        help="Path of the vocabulary file.")
    parser.add_argument("--spm_model_path", default=None, type=str,
                        help="Path of the sentence piece model.")
    parser.add_argument("--label2id_path", type=str, required=True,
                        help="Path of the label2id file.")
    parser.add_argument("--crf_target", action="store_true",
                        help="Use CRF loss as the target function or not, default False.")

    args = parser.parse_args()

    # Load the hyperparameters of the config file.
    args = load_hyperparam(args)

    # Get logger.
    args.logger = init_logger(args)

    set_seed(args.seed)

    args.begin_ids = []

    with open(args.label2id_path, mode="r", encoding="utf-8") as f:
        l2i = json.load(f)
        args.logger.info("Labels: {}".format(l2i))
        l2i["[PAD]"] = len(l2i)
        for label in l2i:
            if label.startswith("B"):
                args.begin_ids.append(l2i[label])

    args.l2i = l2i

    args.labels_num = len(l2i)

    args.tokenizer = SpaceTokenizer(args)

    # Build sequence labeling model.
    model = NerTagger(args)

    # Load or initialize parameters.
    load_or_initialize_parameters(args, model)

    args.device = torch.device("cuda" if torch.cuda.is_available() else "cpu")
    model = model.to(args.device)

    # Training phase.
    instances = read_dataset(args, args.train_path)
    instances_num = len(instances)
    batch_size = args.batch_size

    args.train_steps = int(instances_num * args.epochs_num / batch_size) + 1

    args.logger.info("Batch size: {}".format(batch_size))
    args.logger.info("The number of training instances: {}".format(instances_num))

    optimizer, scheduler = build_optimizer(args, model)

    if args.fp16:
        try:
            from apex import amp
        except ImportError:
            raise ImportError("Please install apex from https://www.github.com/nvidia/apex to use fp16 training.")
        model, optimizer = amp.initialize(model, optimizer, opt_level=args.fp16_opt_level)

    if torch.cuda.device_count() > 1:
        args.logger.info("{} GPUs are available. Let's use them.".format(torch.cuda.device_count()))
        model = torch.nn.DataParallel(model)
    args.model = model

    total_loss, f1, best_f1 = 0.0, 0.0, 0.0

    args.logger.info("Start training.")

    for epoch in range(1, args.epochs_num + 1):
        random.shuffle(instances)
        src = torch.LongTensor([ins[0] for ins in instances])
        tgt = torch.LongTensor([ins[1] for ins in instances])
        seg = torch.LongTensor([ins[2] for ins in instances])

        model.train()
        for i, (src_batch, tgt_batch, seg_batch) in enumerate(batch_loader(batch_size, src, tgt, seg)):
            loss = train(args, model, optimizer, scheduler, src_batch, tgt_batch, seg_batch)
            total_loss += loss.item()
            if (i + 1) % args.report_steps == 0:
                args.logger.info("Epoch id: {}, Training steps: {}, Avg loss: {:.3f}".format(epoch, i + 1, total_loss / args.report_steps))
                total_loss = 0.0

        f1 = evaluate(args, read_dataset(args, args.dev_path))
        if f1 > best_f1:
            best_f1 = f1
            save_model(model, args.output_model_path)

    # Evaluation phase.
    if args.test_path is not None:
        args.logger.info("Test set evaluation.")
        if torch.cuda.device_count() > 1:
            args.model.module.load_state_dict(torch.load(args.output_model_path))
        else:
            args.model.load_state_dict(torch.load(args.output_model_path))
        evaluate(args, read_dataset(args, args.test_path))
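# An illustrative label2id mapping for the script above, using hypothetical
# BIO labels: every label whose name starts with "B" is collected into
# args.begin_ids, and "[PAD]" is appended with the next free id.
example_l2i = {"O": 0, "B-PER": 1, "I-PER": 2, "B-LOC": 3, "I-LOC": 4}
example_begin_ids = [i for label, i in example_l2i.items() if label.startswith("B")]  # [1, 3]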
def main():
    parser = argparse.ArgumentParser(formatter_class=argparse.ArgumentDefaultsHelpFormatter)

    # Path options.
    parser.add_argument("--dataset_path", type=str, default="dataset.pt",
                        help="Path of the preprocessed dataset.")
    parser.add_argument("--pretrained_model_path", type=str, default=None,
                        help="Path of the pretrained model.")
    parser.add_argument("--output_model_path", type=str, required=True,
                        help="Path of the output model.")
    parser.add_argument("--config_path", type=str, default="models/bert/base_config.json",
                        help="Config file of model hyper-parameters.")

    # Training and saving options.
    parser.add_argument("--total_steps", type=int, default=100000,
                        help="Total training steps.")
    parser.add_argument("--save_checkpoint_steps", type=int, default=10000,
                        help="Specific steps to save model checkpoint.")
    parser.add_argument("--report_steps", type=int, default=100,
                        help="Specific steps to report training progress.")
    parser.add_argument("--accumulation_steps", type=int, default=1,
                        help="Specific steps to accumulate gradient.")
    parser.add_argument("--batch_size", type=int, default=32,
                        help="Training batch size. The actual batch_size is [batch_size x world_size x accumulation_steps].")
    parser.add_argument("--instances_buffer_size", type=int, default=25600,
                        help="The buffer size of instances in memory.")
    parser.add_argument("--labels_num", type=int, required=False,
                        help="Number of prediction labels.")
    parser.add_argument("--dropout", type=float, default=0.1, help="Dropout value.")
    parser.add_argument("--seed", type=int, default=7, help="Random seed.")

    # Preprocess options.
    tokenizer_opts(parser)
    tgt_tokenizer_opts(parser)

    # Model options.
    model_opts(parser)
    parser.add_argument("--tgt_embedding", choices=["word", "word_pos", "word_pos_seg", "word_sinusoidalpos"],
                        default="word_pos_seg", help="Target embedding type.")
    parser.add_argument("--decoder", choices=["transformer"], default="transformer",
                        help="Decoder type.")
    parser.add_argument("--pooling", choices=["mean", "max", "first", "last"], default="first",
                        help="Pooling type.")
    parser.add_argument("--target",
                        choices=["bert", "lm", "mlm", "bilm", "albert", "seq2seq", "t5", "cls", "prefixlm", "gsg", "bart"],
                        default="bert",
                        help="The training target of the pretraining model.")
    parser.add_argument("--tie_weights", action="store_true",
                        help="Tie the word embedding and softmax weights.")
    parser.add_argument("--has_lmtarget_bias", action="store_true",
                        help="Add bias on output_layer for lm target.")
    parser.add_argument("--deep_init", action="store_true",
                        help="Scale initialization of projection layers by a factor of 1/sqrt(2N). Necessary for large models.")

    # Masking options.
    parser.add_argument("--whole_word_masking", action="store_true", help="Whole word masking.")
    parser.add_argument("--span_masking", action="store_true", help="Span masking.")
    parser.add_argument("--span_geo_prob", type=float, default=0.2,
                        help="Hyperparameter of geometric distribution for span masking.")
    parser.add_argument("--span_max_length", type=int, default=10,
                        help="Max length for span masking.")

    # Optimizer options.
    optimization_opts(parser)

    # GPU options.
    parser.add_argument("--world_size", type=int, default=1,
                        help="Total number of processes (GPUs) for training.")
    parser.add_argument("--gpu_ranks", default=[], nargs='+', type=int,
                        help="List of ranks of each process."
                             " Each process has a unique integer rank whose value is in the interval [0, world_size), and runs on a single GPU.")
    parser.add_argument("--master_ip", default="tcp://localhost:12345", type=str,
                        help="IP-Port of master for training.")
    parser.add_argument("--backend", choices=["nccl", "gloo"], default="nccl", type=str,
                        help="Distributed backend.")

    # Deepspeed options.
    deepspeed_opts(parser)

    # Log options.
    log_opts(parser)

    args = parser.parse_args()

    if args.target == "cls":
        assert args.labels_num is not None, "The cls target requires the number of labels (--labels_num)."

    # Load hyper-parameters from config file.
    if args.config_path:
        args = load_hyperparam(args)

    # Get logger.
    args.logger = init_logger(args)

    ranks_num = len(args.gpu_ranks)

    if args.deepspeed:
        if args.world_size > 1:
            args.dist_train = True
        else:
            args.dist_train = False
    else:
        if args.world_size > 1:
            # Multiprocessing distributed mode.
            assert torch.cuda.is_available(), "No available GPUs."
            assert ranks_num <= args.world_size, "Started processes exceed `world_size` upper limit."
            assert ranks_num <= torch.cuda.device_count(), "Started processes exceed the number of available GPUs."
            args.dist_train = True
            args.ranks_num = ranks_num
            args.logger.info("Using distributed mode for training.")
        elif args.world_size == 1 and ranks_num == 1:
            # Single GPU mode.
            assert torch.cuda.is_available(), "No available GPUs."
            args.gpu_id = args.gpu_ranks[0]
            assert args.gpu_id < torch.cuda.device_count(), "Invalid specified GPU device."
            args.dist_train = False
            args.single_gpu = True
            args.logger.info("Using GPU %d for training." % args.gpu_id)
        else:
            # CPU mode.
            assert ranks_num == 0, "GPUs are specified, please check the arguments."
            args.dist_train = False
            args.single_gpu = False
            args.logger.info("Using CPU mode for training.")

    trainer.train_and_validate(args)
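# Illustrative argument combinations for the three modes handled above
# (rank values are hypothetical examples, derived from the branching logic):
#   CPU mode:         --world_size 1                       (no --gpu_ranks)
#   Single-GPU mode:  --world_size 1 --gpu_ranks 0
#   Distributed mode: --world_size 4 --gpu_ranks 0 1 2 3   (one process per GPU)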