def main():
    parser = argparse.ArgumentParser()

    ## Required parameters
    parser.add_argument("--train_file", default=None, type=str, required=True,
                        help="The input train corpus.")
    parser.add_argument("--bert_model", default=None, type=str, required=True,
                        help="Bert pre-trained model selected in the list: bert-base-uncased, "
                             "bert-large-uncased, bert-base-cased, bert-base-multilingual, bert-base-chinese.")
    parser.add_argument("--output_dir", default=None, type=str, required=True,
                        help="The output directory where the model checkpoints will be written.")

    ## Other parameters
    parser.add_argument("--max_seq_length", default=128, type=int,
                        help="The maximum total input sequence length after WordPiece tokenization. \n"
                             "Sequences longer than this will be truncated, and sequences shorter \n"
                             "than this will be padded.")
    parser.add_argument("--do_train", action='store_true',
                        help="Whether to run training.")
    parser.add_argument("--train_batch_size", default=32, type=int,
                        help="Total batch size for training.")
    parser.add_argument("--eval_batch_size", default=8, type=int,
                        help="Total batch size for eval.")
    parser.add_argument("--learning_rate", default=3e-5, type=float,
                        help="The initial learning rate for Adam.")
    parser.add_argument("--num_train_epochs", default=3.0, type=float,
                        help="Total number of training epochs to perform.")
    parser.add_argument("--warmup_proportion", default=0.1, type=float,
                        help="Proportion of training to perform linear learning rate warmup for. "
                             "E.g., 0.1 = 10%% of training.")
    parser.add_argument("--no_cuda", action='store_true',
                        help="Whether not to use CUDA when available")
    parser.add_argument("--on_memory", action='store_true',
                        help="Whether to load train samples into memory or use disk")
    parser.add_argument("--do_lower_case", action='store_true',
                        help="Whether to lower case the input text. True for uncased models, False for cased models.")
    parser.add_argument("--local_rank", type=int, default=-1,
                        help="local_rank for distributed training on gpus")
    parser.add_argument('--seed', type=int, default=42,
                        help="random seed for initialization")
    parser.add_argument('--gradient_accumulation_steps', type=int, default=1,
                        help="Number of update steps to accumulate before performing a backward/update pass.")
    parser.add_argument('--fp16', action='store_true',
                        help="Whether to use 16-bit float precision instead of 32-bit")
    parser.add_argument('--loss_scale', type=float, default=0,
                        help="Loss scaling to improve fp16 numeric stability. Only used when fp16 set to True.\n"
                             "0 (default value): dynamic loss scaling.\n"
                             "Positive power of 2: static loss scaling value.\n")

    args = parser.parse_args()

    if args.local_rank == -1 or args.no_cuda:
        device = torch.device("cuda" if torch.cuda.is_available() and not args.no_cuda else "cpu")
        n_gpu = torch.cuda.device_count()
    else:
        torch.cuda.set_device(args.local_rank)
        device = torch.device("cuda", args.local_rank)
        n_gpu = 1
        # Initializes the distributed backend which will take care of synchronizing nodes/GPUs
        torch.distributed.init_process_group(backend='nccl')
    logger.info("device: {} n_gpu: {}, distributed training: {}, 16-bits training: {}".format(
        device, n_gpu, bool(args.local_rank != -1), args.fp16))

    if args.gradient_accumulation_steps < 1:
        raise ValueError("Invalid gradient_accumulation_steps parameter: {}, should be >= 1".format(
            args.gradient_accumulation_steps))

    args.train_batch_size = int(args.train_batch_size / args.gradient_accumulation_steps)

    random.seed(args.seed)
    np.random.seed(args.seed)
    torch.manual_seed(args.seed)
    if n_gpu > 0:
        torch.cuda.manual_seed_all(args.seed)

    if not args.do_train:
        raise ValueError("Training is currently the only implemented execution option. Please set `do_train`.")

    if os.path.exists(args.output_dir) and os.listdir(args.output_dir):
        raise ValueError("Output directory ({}) already exists and is not empty.".format(args.output_dir))
    os.makedirs(args.output_dir, exist_ok=True)

    tokenizer = BertTokenizer.from_pretrained(args.bert_model, do_lower_case=args.do_lower_case)

    #train_examples = None
    num_train_steps = None
    if args.do_train:
        print("Loading Train Dataset", args.train_file)
        train_dataset = BERTDataset(args.train_file, tokenizer, seq_len=args.max_seq_length,
                                    corpus_lines=None, on_memory=args.on_memory)
        num_train_steps = int(
            len(train_dataset) / args.train_batch_size / args.gradient_accumulation_steps * args.num_train_epochs)

    # Prepare model
    #model = BertForPreTraining.from_pretrained(args.bert_model)
    bert_config = BertConfig.from_json_file('bert_config.json')
    model = BertForPreTraining(bert_config)
    if args.fp16:
        model.half()
    model.to(device)
    if args.local_rank != -1:
        try:
            from apex.parallel import DistributedDataParallel as DDP
        except ImportError:
            raise ImportError(
                "Please install apex from https://www.github.com/nvidia/apex to use distributed and fp16 training.")
        model = DDP(model)
    elif n_gpu > 1:
        model = torch.nn.DataParallel(model)

    # Prepare optimizer
    param_optimizer = list(model.named_parameters())
    no_decay = ['bias', 'LayerNorm.bias', 'LayerNorm.weight']
    optimizer_grouped_parameters = [
        {'params': [p for n, p in param_optimizer if not any(nd in n for nd in no_decay)], 'weight_decay': 0.01},
        {'params': [p for n, p in param_optimizer if any(nd in n for nd in no_decay)], 'weight_decay': 0.0}
    ]
    if args.fp16:
        try:
            from apex.optimizers import FP16_Optimizer
            from apex.optimizers import FusedAdam
        except ImportError:
            raise ImportError(
                "Please install apex from https://www.github.com/nvidia/apex to use distributed and fp16 training.")
        optimizer = FusedAdam(optimizer_grouped_parameters,
                              lr=args.learning_rate,
                              bias_correction=False,
                              max_grad_norm=1.0)
        if args.loss_scale == 0:
            optimizer = FP16_Optimizer(optimizer, dynamic_loss_scale=True)
        else:
            optimizer = FP16_Optimizer(optimizer, static_loss_scale=args.loss_scale)
    else:
        optimizer = BertAdam(optimizer_grouped_parameters,
                             lr=args.learning_rate,
                             warmup=args.warmup_proportion,
                             t_total=num_train_steps)

    global_step = 0
    if args.do_train:
        logger.info("***** Running training *****")
        logger.info("  Num examples = %d", len(train_dataset))
        logger.info("  Batch size = %d", args.train_batch_size)
        logger.info("  Num steps = %d", num_train_steps)

        if args.local_rank == -1:
            train_sampler = RandomSampler(train_dataset)
        else:
            #TODO: check if this works with current data generator from disk that relies on file.__next__
            # (it doesn't return item back by index)
            train_sampler = DistributedSampler(train_dataset)
        train_dataloader = DataLoader(train_dataset, sampler=train_sampler, batch_size=args.train_batch_size)

        model.train()
        for _ in trange(int(args.num_train_epochs), desc="Epoch"):
            tr_loss = 0
            nb_tr_examples, nb_tr_steps = 0, 0
            for step, batch in enumerate(tqdm(train_dataloader, desc="Iteration")):
                batch = tuple(t.to(device) for t in batch)
                input_ids, input_mask, segment_ids, lm_label_ids, is_next = batch
                loss = model(input_ids, segment_ids, input_mask, lm_label_ids, is_next)
                if n_gpu > 1:
                    loss = loss.mean()  # mean() to average on multi-gpu.
                if args.gradient_accumulation_steps > 1:
                    loss = loss / args.gradient_accumulation_steps
                if args.fp16:
                    optimizer.backward(loss)
                else:
                    loss.backward()
                tr_loss += loss.item()
                nb_tr_examples += input_ids.size(0)
                nb_tr_steps += 1
                if (step + 1) % args.gradient_accumulation_steps == 0:
                    if args.fp16:
                        # modify learning rate with special warm up BERT uses
                        # if args.fp16 is False, BertAdam is used, which handles the warmup schedule internally
                        lr_this_step = args.learning_rate * warmup_linear(
                            global_step / num_train_steps, args.warmup_proportion)
                        for param_group in optimizer.param_groups:
                            param_group['lr'] = lr_this_step
                    optimizer.step()
                    optimizer.zero_grad()
                    global_step += 1

    # Save a trained model
    logger.info("** ** * Saving fine-tuned model ** ** * ")
    model_to_save = model.module if hasattr(model, 'module') else model  # Only save the model itself
    output_model_file = os.path.join(args.output_dir, "pytorch_model.bin")
    if args.do_train:
        torch.save(model_to_save.state_dict(), output_model_file)
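# The training loops in these scripts scale the learning rate with warmup_linear(), which is
# never defined in the code shown here. A minimal reference sketch follows, assuming the
# linear warmup / linear decay schedule used by pytorch-pretrained-bert; if your copy of the
# package already exposes it (pytorch_pretrained_bert.optimization.warmup_linear), prefer that
# import instead of redefining it.
def warmup_linear(x, warmup=0.002):
    """Return the LR multiplier at training progress x in [0, 1]: ramp up until `warmup`, then decay linearly."""
    if x < warmup:
        return x / warmup
    return 1.0 - x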
def main():
    parser = argparse.ArgumentParser()

    ## Required parameters
    parser.add_argument("--train_file", default=None, type=str, required=True,
                        help="The input train corpus.")
    parser.add_argument("--output_dir", default='../trained_model', type=str,
                        help="The output directory where the model checkpoints will be written.")
    parser.add_argument("--bert_model", default='bert-base-cased', type=str,
                        help="Bert pre-trained model selected in the list: bert-base-uncased, "
                             "bert-large-uncased, bert-base-cased, bert-base-multilingual, bert-base-chinese.")

    ## Other parameters
    parser.add_argument("--max_seq_length", default=128, type=int,
                        help="The maximum total input sequence length after WordPiece tokenization. \n"
                             "Sequences longer than this will be truncated, and sequences shorter \n"
                             "than this will be padded.")
    parser.add_argument("--do_train", action='store_true',
                        help="Whether to run training.")
    parser.add_argument("--train_batch_size", default=32, type=int,
                        help="Total batch size for training.")
    parser.add_argument("--learning_rate", default=3e-5, type=float,
                        help="The initial learning rate for Adam.")
    parser.add_argument("--num_train_epochs", default=3.0, type=float,
                        help="Total number of training epochs to perform.")
    parser.add_argument("--warmup_proportion", default=0.1, type=float,
                        help="Proportion of training to perform linear learning rate warmup for. "
                             "E.g., 0.1 = 10%% of training.")
    parser.add_argument("--no_cuda", action='store_true',
                        help="Whether not to use CUDA when available")
    parser.add_argument("--on_memory", action='store_true',
                        help="Whether to load train samples into memory or use disk")
    parser.add_argument("--do_lower_case", action='store_true',
                        help="Whether to lower case the input text. True for uncased models, False for cased models.")
    parser.add_argument("--local_rank", type=int, default=-1,
                        help="local_rank for distributed training on gpus")
    parser.add_argument('--seed', type=int, default=42,
                        help="random seed for initialization")
    parser.add_argument('--gradient_accumulation_steps', type=int, default=1,
                        help="Number of update steps to accumulate before performing a backward/update pass.")
    parser.add_argument('--fp16', action='store_true',
                        help="Whether to use 16-bit float precision instead of 32-bit")
    parser.add_argument('--loss_scale', type=float, default=0,
                        help="Loss scaling to improve fp16 numeric stability. Only used when fp16 set to True.\n"
                             "0 (default value): dynamic loss scaling.\n"
                             "Positive power of 2: static loss scaling value.\n")

    args = parser.parse_args()

    device = torch.device("cuda" if torch.cuda.is_available() and not args.no_cuda else "cpu")
    n_gpu = torch.cuda.device_count()
    logger.info("device: {} n_gpu: {}, distributed training: {}, 16-bits training: {}".format(
        device, n_gpu, bool(args.local_rank != -1), args.fp16))

    random.seed(args.seed)
    np.random.seed(args.seed)
    torch.manual_seed(args.seed)
    if n_gpu > 0:
        torch.cuda.manual_seed_all(args.seed)

    if not args.do_train:
        raise ValueError("Training is currently the only implemented execution option. Please set `do_train`.")

    if not os.path.exists(args.output_dir):
        logger.info('creating model output dir {}'.format(args.output_dir))
        os.makedirs(args.output_dir)

    dataset = args.train_file.split('/')[-3]
    # vocab.txt is expected two directory levels above the training file
    voc_fname = '/'.join(args.train_file.split('/')[:-2] + ['vocab.txt'])
    tokenizer = BertTokenizer(voc_fname, do_lower_case=args.do_lower_case)

    num_train_optimization_steps = None
    print("Loading Train Dataset", args.train_file)
    train_dataset = BERTDataset(args.train_file, tokenizer, seq_len=args.max_seq_length,
                                corpus_lines=None, on_memory=args.on_memory)
    num_train_optimization_steps = int(
        len(train_dataset) / args.train_batch_size / args.gradient_accumulation_steps) * args.num_train_epochs

    # Prepare model. TODO: change to create fresh model.
    vocab = utils.read(voc_fname)
    config = BertConfig(vocab_size_or_config_json_file=len(vocab),
                        hidden_size=768,
                        num_hidden_layers=12,
                        num_attention_heads=12,
                        intermediate_size=3072)
    model = BertForPreTraining(config)
    model.to(device)
    if n_gpu > 1:  # true most of the time
        model = torch.nn.DataParallel(model)

    # Prepare optimizer
    param_optimizer = list(model.named_parameters())
    no_decay = ['bias', 'LayerNorm.bias', 'LayerNorm.weight']
    optimizer_grouped_parameters = [
        {'params': [p for n, p in param_optimizer if not any(nd in n for nd in no_decay)], 'weight_decay': 0.01},
        {'params': [p for n, p in param_optimizer if any(nd in n for nd in no_decay)], 'weight_decay': 0.0}
    ]
    optimizer = BertAdam(optimizer_grouped_parameters,
                         lr=args.learning_rate,
                         warmup=args.warmup_proportion,
                         t_total=num_train_optimization_steps)

    global_step = 0
    if args.do_train:
        logger.info("***** Running training *****")
        logger.info("  Num examples = %d", len(train_dataset))
        logger.info("  Batch size = %d", args.train_batch_size)
        logger.info("  Num steps = %d", num_train_optimization_steps)

        # true most of the time
        train_sampler = RandomSampler(train_dataset)
        train_dataloader = DataLoader(train_dataset, sampler=train_sampler, batch_size=args.train_batch_size)

        model.train()
        for _ in trange(int(args.num_train_epochs), desc="Epoch"):
            tr_loss = 0
            nb_tr_examples, nb_tr_steps = 0, 0
            for step, batch in enumerate(tqdm(train_dataloader, desc="Iteration")):
                batch = tuple(t.to(device) for t in batch)
                input_ids, input_mask, segment_ids, lm_label_ids, is_next = batch
                loss = model(input_ids, segment_ids, input_mask, lm_label_ids, is_next)
                if n_gpu > 1:
                    loss = loss.mean()  # mean() to average on multi-gpu.
                loss.backward()
                tr_loss += loss.item()
                nb_tr_examples += input_ids.size(0)  # batch_size
                nb_tr_steps += 1
                if (step + 1) % args.gradient_accumulation_steps == 0:
                    optimizer.step()
                    optimizer.zero_grad()
                    global_step += 1

    # Save a trained model
    logger.info("** ** * Saving fine-tuned model ** ** * ")
    model_to_save = model.module if hasattr(model, 'module') else model  # Only save the model itself
    output_model_file = os.path.join(
        args.output_dir,
        "pretraining_{}.bin".format(time.strftime("%Y_%m_%d_%H_%M_%S", time.localtime())))
    if args.do_train:
        torch.save(model_to_save.state_dict(), output_model_file)
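# Example invocation for the from-scratch pre-training script above. This is illustrative
# only: the script name "pretrain_on_corpus.py" and the data paths are placeholders, and the
# script expects vocab.txt to sit two directory levels above the training file
# (e.g. data/my_corpus/vocab.txt for data/my_corpus/train/shard_0.txt).
#
#   python pretrain_on_corpus.py \
#       --train_file data/my_corpus/train/shard_0.txt \
#       --output_dir ../trained_model \
#       --do_train --on_memory \
#       --train_batch_size 32 --num_train_epochs 3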
def main():
    parser = argparse.ArgumentParser()

    ## Required parameters
    parser.add_argument("--bert_model_or_config_file", default=None, type=str, required=True,
                        help="Directory containing pre-trained BERT model or path of configuration file (if no pre-training).")
    parser.add_argument("--train_file", default=None, type=str, required=True,
                        help="The input train corpus.")
    parser.add_argument("--output_dir", default=None, type=str, required=True,
                        help="The output directory where the model checkpoints will be written.")

    ## Other parameters
    parser.add_argument("--max_seq_length", default=128, type=int,
                        help="The maximum total input sequence length after WordPiece tokenization. \n"
                             "Sequences longer than this will be truncated, and sequences shorter \n"
                             "than this will be padded.")
    parser.add_argument("--do_train", action='store_true',
                        help="Whether to run training.")
    parser.add_argument("--train_batch_size", default=32, type=int,
                        help="Total batch size for training.")
    parser.add_argument("--learning_rate", default=3e-5, type=float,
                        help="The initial learning rate for Adam.")
    parser.add_argument("--num_train_epochs", default=3.0, type=float,
                        help="Total number of training epochs to perform.")
    parser.add_argument("--warmup_proportion", default=0.1, type=float,
                        help="Proportion of training to perform linear learning rate warmup for. "
                             "E.g., 0.1 = 10%% of training.")
    parser.add_argument("--on_memory", action='store_true',
                        help="Whether to load train samples into memory or use disk")
    parser.add_argument("--local_rank", type=int, default=-1,
                        help="local_rank for distributed training on gpus")
    parser.add_argument("--num_gpus", type=int, default=-1,
                        help="Num GPUs to use for training (0 for none, -1 for all available)")
    parser.add_argument('--seed', type=int, default=42,
                        help="random seed for initialization")
    parser.add_argument('--gradient_accumulation_steps', type=int, default=1,
                        help="Number of update steps to accumulate before performing a backward/update pass.")
    parser.add_argument('--fp16', action='store_true',
                        help="Whether to use 16-bit float precision instead of 32-bit")
    parser.add_argument('--loss_scale', type=float, default=0,
                        help="Loss scaling to improve fp16 numeric stability. Only used when fp16 set to True.\n"
                             "0 (default value): dynamic loss scaling.\n"
                             "Positive power of 2: static loss scaling value.\n")

    args = parser.parse_args()

    # Check whether bert_model_or_config_file is a file or directory
    if os.path.isdir(args.bert_model_or_config_file):
        pretrained = True
        targets = [WEIGHTS_NAME, CONFIG_NAME, "tokenizer.pkl"]
        for t in targets:
            path = os.path.join(args.bert_model_or_config_file, t)
            if not os.path.exists(path):
                msg = "File '{}' not found".format(path)
                raise ValueError(msg)
        fp = os.path.join(args.bert_model_or_config_file, CONFIG_NAME)
        config = BertConfig(fp)
    else:
        pretrained = False
        config = BertConfig(args.bert_model_or_config_file)

    # What GPUs do we use?
    if args.num_gpus == -1:
        device = torch.device("cuda" if torch.cuda.is_available() else "cpu")
        n_gpu = torch.cuda.device_count()
        device_ids = None
    else:
        device = torch.device("cuda" if torch.cuda.is_available() and args.num_gpus > 0 else "cpu")
        n_gpu = args.num_gpus
        if n_gpu > 1:
            device_ids = list(range(n_gpu))
    if args.local_rank != -1:
        torch.cuda.set_device(args.local_rank)
        device = torch.device("cuda", args.local_rank)
        n_gpu = 1
        # Initializes the distributed backend which will take care of synchronizing nodes/GPUs
        torch.distributed.init_process_group(backend='nccl')
    logger.info("device: {} n_gpu: {}, distributed training: {}, 16-bits training: {}".format(
        device, n_gpu, bool(args.local_rank != -1), args.fp16))

    # Check some other args
    if args.gradient_accumulation_steps < 1:
        raise ValueError("Invalid gradient_accumulation_steps parameter: {}, should be >= 1".format(
            args.gradient_accumulation_steps))
    args.train_batch_size = args.train_batch_size // args.gradient_accumulation_steps
    if not args.do_train:
        raise ValueError("Training is currently the only implemented execution option. Please set `do_train`.")

    # Seed RNGs
    random.seed(args.seed)
    np.random.seed(args.seed)
    torch.manual_seed(args.seed)
    if n_gpu > 0:
        torch.cuda.manual_seed_all(args.seed)

    # Prepare output directory
    if os.path.exists(args.output_dir) and os.listdir(args.output_dir):
        raise ValueError("Output directory ({}) already exists and is not empty.".format(args.output_dir))
    if not os.path.exists(args.output_dir):
        os.makedirs(args.output_dir)

    # Make tokenizer
    if pretrained:
        fp = os.path.join(args.bert_model_or_config_file, "tokenizer.pkl")
        with open(fp, "rb") as f:
            tokenizer = pickle.load(f)
    else:
        training_data = [line.strip() for line in open(args.train_file).readlines()]
        tokenizer = CuneiformCharTokenizer(training_data=training_data)
        tokenizer.trim_vocab(config.min_freq)
        # Adapt vocab size in config
        config.vocab_size = len(tokenizer.vocab)
    print("Size of vocab: {}".format(len(tokenizer.vocab)))

    # Get training data
    num_train_optimization_steps = None
    if args.do_train:
        print("Loading Train Dataset", args.train_file)
        train_dataset = BERTDataset(args.train_file, tokenizer, seq_len=args.max_seq_length,
                                    corpus_lines=None, on_memory=args.on_memory)
        num_train_optimization_steps = int(
            len(train_dataset) / args.train_batch_size / args.gradient_accumulation_steps) * args.num_train_epochs
        if args.local_rank != -1:
            num_train_optimization_steps = num_train_optimization_steps // torch.distributed.get_world_size()

    # Prepare model
    if pretrained:
        model = BertForPreTraining.from_pretrained(args.bert_model_or_config_file)
    else:
        model = BertForPreTraining(config)
    if args.fp16:
        model.half()
    model.to(device)
    if args.local_rank != -1:
        try:
            from apex.parallel import DistributedDataParallel as DDP
        except ImportError:
            raise ImportError(
                "Please install apex from https://www.github.com/nvidia/apex to use distributed and fp16 training.")
        model = DDP(model)
    elif n_gpu > 1:
        model = torch.nn.DataParallel(model, device_ids=device_ids)

    # Prepare optimizer
    param_optimizer = list(model.named_parameters())
    no_decay = ['bias', 'LayerNorm.bias', 'LayerNorm.weight']
    optimizer_grouped_parameters = [
        {'params': [p for n, p in param_optimizer if not any(nd in n for nd in no_decay)], 'weight_decay': 0.01},
        {'params': [p for n, p in param_optimizer if any(nd in n for nd in no_decay)], 'weight_decay': 0.0}
    ]
    if args.fp16:
        try:
            from apex.optimizers import FP16_Optimizer
            from apex.optimizers import FusedAdam
        except ImportError:
            raise ImportError(
                "Please install apex from https://www.github.com/nvidia/apex to use distributed and fp16 training.")
        optimizer = FusedAdam(optimizer_grouped_parameters,
                              lr=args.learning_rate,
                              bias_correction=False,
                              max_grad_norm=1.0)
        if args.loss_scale == 0:
            optimizer = FP16_Optimizer(optimizer, dynamic_loss_scale=True)
        else:
            optimizer = FP16_Optimizer(optimizer, static_loss_scale=args.loss_scale)
    else:
        optimizer = BertAdam(optimizer_grouped_parameters,
                             lr=args.learning_rate,
                             warmup=args.warmup_proportion,
                             t_total=num_train_optimization_steps)

    # Prepare training log
    output_log_file = os.path.join(args.output_dir, "training_log.txt")
    with open(output_log_file, "w") as f:
        f.write("Steps\tTrainLoss\n")

    # Start training
    global_step = 0
    total_tr_steps = 0
    if args.do_train:
        logger.info("***** Running training *****")
        logger.info("  Num examples = %d", len(train_dataset))
        logger.info("  Batch size = %d", args.train_batch_size)
        logger.info("  Num steps = %d", num_train_optimization_steps)

        if args.local_rank == -1:
            train_sampler = RandomSampler(train_dataset)
        else:
            #TODO: check if this works with current data generator from disk that relies on next(file)
            # (it doesn't return item back by index)
            train_sampler = DistributedSampler(train_dataset)
        train_dataloader = DataLoader(train_dataset, sampler=train_sampler, batch_size=args.train_batch_size)

        model.train()
        for _ in trange(int(args.num_train_epochs), desc="Epoch"):
            tr_loss = 0
            nb_tr_examples, nb_tr_steps = 0, 0
            for step, batch in enumerate(tqdm(train_dataloader, desc="Iteration")):
                batch = tuple(t.to(device) for t in batch)
                input_ids, input_mask, segment_ids, lm_label_ids, is_next = batch
                loss = model(input_ids, segment_ids, input_mask, lm_label_ids, is_next)
                if n_gpu > 1:
                    loss = loss.mean()  # mean() to average on multi-gpu.
                if args.gradient_accumulation_steps > 1:
                    loss = loss / args.gradient_accumulation_steps
                if args.fp16:
                    optimizer.backward(loss)
                else:
                    loss.backward()
                tr_loss += loss.item()
                nb_tr_examples += input_ids.size(0)
                nb_tr_steps += 1
                if (step + 1) % args.gradient_accumulation_steps == 0:
                    if args.fp16:
                        # modify learning rate with special warm up BERT uses
                        # if args.fp16 is False, BertAdam is used that handles this automatically
                        lr_this_step = args.learning_rate * warmup_linear(
                            global_step / num_train_optimization_steps, args.warmup_proportion)
                        for param_group in optimizer.param_groups:
                            param_group['lr'] = lr_this_step
                    optimizer.step()
                    optimizer.zero_grad()
                    global_step += 1
            avg_loss = tr_loss / nb_tr_examples

            # Update training log
            total_tr_steps += nb_tr_steps
            log_data = [str(total_tr_steps), "{:.5f}".format(avg_loss)]
            with open(output_log_file, "a") as f:
                f.write("\t".join(log_data) + "\n")

    # Save model
    logger.info("** ** * Saving model ** ** * ")
    model_to_save = model.module if hasattr(model, 'module') else model  # Only save the model itself
    output_model_file = os.path.join(args.output_dir, WEIGHTS_NAME)
    torch.save(model_to_save.state_dict(), output_model_file)
    output_config_file = os.path.join(args.output_dir, CONFIG_NAME)
    with open(output_config_file, 'w') as f:
        f.write(model_to_save.config.to_json_string())
    fn = os.path.join(args.output_dir, "tokenizer.pkl")
    with open(fn, "wb") as f:
        pickle.dump(tokenizer, f)
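# The script above writes three artifacts to --output_dir: the model weights (WEIGHTS_NAME),
# the config (CONFIG_NAME), and the pickled tokenizer. A minimal reload sketch follows, assuming
# the same pytorch-pretrained-bert classes used above; the file names mirror the save code,
# while the helper itself is illustrative rather than part of the original script.
def load_pretrained(output_dir):
    # BertConfig also accepts a path to a JSON config file, as used earlier in this script
    config = BertConfig(os.path.join(output_dir, CONFIG_NAME))
    model = BertForPreTraining(config)
    model.load_state_dict(torch.load(os.path.join(output_dir, WEIGHTS_NAME), map_location="cpu"))
    with open(os.path.join(output_dir, "tokenizer.pkl"), "rb") as f:
        tokenizer = pickle.load(f)
    return model, tokenizer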
def main():
    parser = argparse.ArgumentParser()

    ## Required parameters
    parser.add_argument("--data_dir", default=None, type=str, required=True,
                        help="The input data dir. Should contain the .tsv files (or other data files) for the task.")
    parser.add_argument("--dataset_name", default="top300_kl", type=str, required=True,
                        help="The name of the dataset to run inference on (without extension, e.g. top300_kl)")
    parser.add_argument("--model_type", default="baseline_tfidf", type=str, required=True,
                        help="baseline, baseline_tfidf, ir-v0, ir-v1")
    parser.add_argument("--model_path", default=None, type=str, required=True,
                        help="path to model dir")
    parser.add_argument("--output_dir", default=None, type=str, required=True,
                        help="save_path")

    ## Other parameters
    parser.add_argument("--bert_model", default="bert-base-multilingual-cased", type=str,
                        help="Default: bert-base-multilingual-cased. "
                             "Bert pre-trained model selected in the list: bert-base-uncased, "
                             "bert-large-uncased, bert-base-cased, bert-large-cased, bert-base-multilingual-uncased, "
                             "bert-base-multilingual-cased, bert-base-chinese.")
    parser.add_argument("--model_file", default="pytorch_model.bin", type=str,
                        help="The model file (.bin); default is pytorch_model.bin.\n"
                             "Set a different name here if a specific checkpoint file is needed.")
    parser.add_argument("--max_seq_length", default=384, type=int,
                        help="The maximum total input sequence length after WordPiece tokenization. \n"
                             "Sequences longer than this will be truncated, and sequences shorter \n"
                             "than this will be padded.")
    parser.add_argument("--eval_batch_size", default=8, type=int,
                        help="Total batch size for eval.")
    parser.add_argument("--no_cuda", action='store_true',
                        help="Whether not to use CUDA when available")
    parser.add_argument("--local_rank", type=int, default=-1,
                        help="local_rank for distributed training on gpus")
    parser.add_argument('--seed', type=int, default=42,
                        help="random seed for initialization")
    parser.add_argument('--fp16', action='store_true',
                        help="Whether to use 16-bit float precision instead of 32-bit")

    args = parser.parse_args()

    if args.local_rank == -1 or args.no_cuda:
        device = torch.device("cuda" if torch.cuda.is_available() and not args.no_cuda else "cpu")
        n_gpu = torch.cuda.device_count()
    else:
        torch.cuda.set_device(args.local_rank)
        device = torch.device("cuda", args.local_rank)
        n_gpu = 1
        # Initializes the distributed backend which will take care of synchronizing nodes/GPUs
        torch.distributed.init_process_group(backend='nccl')
    logger.info("device: {} n_gpu: {}, distributed training: {}, 16-bits training: {}".format(
        device, n_gpu, bool(args.local_rank != -1), args.fp16))

    random.seed(args.seed)
    np.random.seed(args.seed)
    torch.manual_seed(args.seed)
    if n_gpu > 0:
        torch.cuda.manual_seed_all(args.seed)

    processor = IRProcessor()
    label_list = processor.get_labels()
    num_labels = len(label_list)

    print("model:", args.model_type)
    if args.model_type == "baseline":
        # load model (finetuned baseline on IR)
        tokenizer = BertTokenizer.from_pretrained(args.bert_model, do_lower_case=False)
        config = BertConfig(os.path.join(args.model_path, "bert_config.json"))
        model = BertForPreTraining(config)
        model.load_state_dict(torch.load(os.path.join(args.model_path, args.model_file)))
    elif args.model_type == "baseline_tfidf":
        # load model (baseline_tfidf)
        tokenizer = BertTFIDFTokenizer.from_pretrained(args.bert_model, do_lower_case=False, do_tf_idf=True)
        TFIDFconfig = modeling.BertConfig(os.path.join(args.model_path, "bert_config.json"))
        model = modeling.BertTFIDFForPreTraining(TFIDFconfig)
        model.load_state_dict(torch.load(os.path.join(args.model_path, args.model_file)))
    elif args.model_type == "ir-v0":
        # load model (*-head)
        tokenizer = BertTFIDFTokenizer.from_pretrained(args.bert_model, do_lower_case=False, do_tf_idf=True)
        head_config = modeling_ir.BertForIRConfig(os.path.join(args.model_path, "bert_config.json"))
        model = modeling_ir.BertForIRForPreTraining(head_config)
        model.load_state_dict(torch.load(os.path.join(args.model_path, args.model_file)))
    elif args.model_type == "ir-v1":
        # load model (*-head)
        tokenizer = BertTFIDFTokenizer.from_pretrained(args.bert_model, do_lower_case=False, do_tf_idf=True)
        head_config = modeling_ir_2.BertForIRConfig(os.path.join(args.model_path, "bert_config.json"))
        model = modeling_ir_2.BertForIRForPreTraining(head_config)
        model.load_state_dict(torch.load(os.path.join(args.model_path, args.model_file)))
    else:
        raise ValueError("Unknown model_type: {}".format(args.model_type))

    if args.fp16:
        model.half()
    model.to(device)

    tfidf_dict = pickle_load(os.path.join(args.data_dir, args.dataset_name + '_tfidf.pkl'))

    results_softmax = dict()
    eval_set, documents, queries = processor.make_eval_set(args.data_dir, args.dataset_name)
    logger.info("***** Running evaluation *****")
    logger.info("  Batch size = %d", args.eval_batch_size)
    for q_num, query in tqdm(enumerate(queries), total=len(queries), desc="Evaluating"):
        # for query in queries[0:1]:  # for testing
        logger.info(f"Current Query Num : {q_num}")
        eval_examples = processor._create_examples(eval_set, query, documents)
        # logger.info("  Num examples = %d", len(eval_examples))
        if args.model_type == "baseline":  # baseline or baseline_finetuned
            eval_features = convert_examples_to_features_for_vanilla(
                eval_examples, label_list, args.max_seq_length, tokenizer)
            all_input_ids = torch.tensor([f.input_ids for f in eval_features], dtype=torch.long)
            all_input_mask = torch.tensor([f.input_mask for f in eval_features], dtype=torch.long)
            all_segment_ids = torch.tensor([f.segment_ids for f in eval_features], dtype=torch.long)
            all_label_ids = torch.tensor([f.label for f in eval_features], dtype=torch.long)
            eval_data = TensorDataset(all_input_ids, all_input_mask, all_segment_ids, all_label_ids)

            eval_sampler = SequentialSampler(eval_data)
            eval_dataloader = DataLoader(eval_data, sampler=eval_sampler, batch_size=args.eval_batch_size)

            model.eval()
            eval_loss = 0
            nb_eval_steps = 0
            preds = []
            for input_ids, input_mask, segment_ids, label_ids in tqdm(eval_dataloader, desc="Query"):
                input_ids = input_ids.to(device)
                input_mask = input_mask.to(device)
                segment_ids = segment_ids.to(device)
                label_ids = label_ids.to(device)
                with torch.no_grad():
                    _, logits = model(input_ids, segment_ids, input_mask)
                # loss_fct = CrossEntropyLoss()
                # tmp_eval_loss = loss_fct(logits.view(-1, num_labels), label_ids.view(-1))
                # eval_loss += tmp_eval_loss.mean().item()
                # nb_eval_steps += 1
                if len(preds) == 0:
                    preds.append(logits.detach().cpu().numpy())
                else:
                    preds[0] = np.append(preds[0], logits.detach().cpu().numpy(), axis=0)
        else:  # baseline_tfidf or *-head model
            eval_data = LazyDatasetClassifier(eval_examples, label_list, args.max_seq_length, tokenizer, tfidf_dict)
            eval_sampler = SequentialSampler(eval_data)
            eval_dataloader = DataLoader(eval_data, sampler=eval_sampler, batch_size=args.eval_batch_size)

            model.eval()
            eval_loss = 0
            nb_eval_steps = 0
            preds = []
            for batch in tqdm(eval_dataloader, desc="Query"):
                batch = tuple(t.to(device) for t in batch)
                input_ids, input_weights, input_mask, segment_ids, label_ids = batch
                with torch.no_grad():
                    _, logits = model(input_ids, input_weights, segment_ids, input_mask)
                # loss_fct = CrossEntropyLoss()
                # tmp_eval_loss = loss_fct(logits.view(-1, num_labels), label_ids.view(-1))
                # eval_loss += tmp_eval_loss.mean().item()
                nb_eval_steps += 1
                if len(preds) == 0:
                    preds.append(logits.detach().cpu().numpy())
                else:
                    preds[0] = np.append(preds[0], logits.detach().cpu().numpy(), axis=0)

        # eval_loss = eval_loss / nb_eval_steps
        preds = preds[0]

        results_softmax[query] = []
        for i, pred in enumerate(softmax(preds)):  # using softmax
            pair = dict()
            pair["score"] = pred[1]
            pair["doc_id"] = list(documents.keys())[i]
            results_softmax[query].append(pair)
        results_softmax[query].sort(reverse=True, key=lambda x: x["score"])

        ranked_doc_list = []
        for doc in results_softmax[query]:
            ranked_doc_list.append(doc["doc_id"])
        results_softmax[query] = ranked_doc_list

    save_name2 = args.model_path.split('/')[0] + '_' + args.model_file.split('.')[0] \
                 + '_' + args.dataset_name + '_output.json'
    path2 = os.path.join(args.output_dir, save_name2)
    with open(path2, 'w', encoding="utf8") as f:
        json.dump(results_softmax, f, indent=4, sort_keys=True, ensure_ascii=False)
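# The ranking code above calls softmax(preds) on a [num_docs, 2] logit matrix and keeps
# column 1 as the relevance score. A minimal NumPy sketch of that helper, assuming it is a
# plain row-wise softmax (the actual helper in this codebase may differ):
def softmax(logits):
    logits = np.asarray(logits, dtype=np.float64)
    logits = logits - logits.max(axis=-1, keepdims=True)  # subtract row max for numerical stability
    exp = np.exp(logits)
    return exp / exp.sum(axis=-1, keepdims=True)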
def main():
    parser = argparse.ArgumentParser()

    ## Required parameters
    parser.add_argument("--data_dir", default=None, type=str, required=True,
                        help="The input data dir. Should contain the .tsv files (or other data files) for the task.")
    parser.add_argument("--bert_model", default=None, type=str, required=True,
                        help="Bert pre-trained model selected in the list: bert-base-uncased, "
                             "bert-large-uncased, bert-base-cased, bert-base-multilingual, bert-base-chinese.")
    parser.add_argument("--task_name", default=None, type=str, required=True,
                        help="The name of the task to train.")
    parser.add_argument("--output_dir", default=None, type=str, required=True,
                        help="The output directory where the model predictions and checkpoints will be written.")

    ## Other parameters
    parser.add_argument("--max_seq_length", default=128, type=int,
                        help="The maximum total input sequence length after WordPiece tokenization. \n"
                             "Sequences longer than this will be truncated, and sequences shorter \n"
                             "than this will be padded.")
    parser.add_argument("--do_train", default=False, action='store_true',
                        help="Whether to run training.")
    parser.add_argument("--do_eval", default=False, action='store_true',
                        help="Whether to run eval on the dev set.")
    parser.add_argument("--do_lower_case", default=False, action='store_true',
                        help="Set this flag if you are using an uncased model.")
    parser.add_argument("--train_batch_size", default=32, type=int,
                        help="Total batch size for training.")
    parser.add_argument("--eval_batch_size", default=8, type=int,
                        help="Total batch size for eval.")
    parser.add_argument("--learning_rate", default=1e-4, type=float,
                        help="The initial learning rate for Adam.")
    parser.add_argument("--num_train_epochs", default=3.0, type=float,
                        help="Total number of training epochs to perform.")
    parser.add_argument("--warmup_proportion", default=0.01, type=float,
                        help="Proportion of training to perform linear learning rate warmup for. "
                             "E.g., 0.1 = 10%% of training.")
    parser.add_argument("--no_cuda", default=False, action='store_true',
                        help="Whether not to use CUDA when available")
    parser.add_argument("--local_rank", type=int, default=-1,
                        help="local_rank for distributed training on gpus")
    parser.add_argument('--seed', type=int, default=42,
                        help="random seed for initialization")
    parser.add_argument('--gradient_accumulation_steps', type=int, default=1,
                        help="Number of update steps to accumulate before performing a backward/update pass.")
    parser.add_argument('--fp16', default=False, action='store_true',
                        help="Whether to use 16-bit float precision instead of 32-bit")
    parser.add_argument('--loss_scale', type=float, default=0,
                        help="Loss scaling to improve fp16 numeric stability. Only used when fp16 set to True.\n"
                             "0 (default value): dynamic loss scaling.\n"
                             "Positive power of 2: static loss scaling value.\n")
    parser.add_argument('--checkpoint_filename', default=None, type=str,
                        help="checkpoint_filename")

    args = parser.parse_args()

    os.system("clear")
    # os.system('bash /data1/private/linyankai/code/run_create.sh 0 0')

    if args.local_rank == -1 or args.no_cuda:
        device = torch.device("cuda" if torch.cuda.is_available() and not args.no_cuda else "cpu")
        n_gpu = torch.cuda.device_count()
    else:
        torch.cuda.set_device(args.local_rank)
        device = torch.device("cuda", args.local_rank)
        n_gpu = 1
        # Initializes the distributed backend which will take care of synchronizing nodes/GPUs
        torch.distributed.init_process_group(backend='nccl')
    logger.info("device: {} n_gpu: {}, distributed training: {}, 16-bits training: {}".format(
        device, n_gpu, bool(args.local_rank != -1), args.fp16))

    if args.gradient_accumulation_steps < 1:
        raise ValueError("Invalid gradient_accumulation_steps parameter: {}, should be >= 1".format(
            args.gradient_accumulation_steps))

    args.train_batch_size = int(args.train_batch_size / args.gradient_accumulation_steps)

    random.seed(args.seed)
    np.random.seed(args.seed)
    torch.manual_seed(args.seed)
    if n_gpu > 0:
        torch.cuda.manual_seed_all(args.seed)

    if not args.do_train and not args.do_eval:
        raise ValueError("At least one of `do_train` or `do_eval` must be True.")

    if os.path.exists(args.output_dir) and os.listdir(args.output_dir):
        raise ValueError("Output directory ({}) already exists and is not empty.".format(args.output_dir))
    os.makedirs(args.output_dir, exist_ok=True)

    task_name = args.task_name.lower()

    train_data = None
    num_train_steps = None
    if args.do_train:
        # TODO
        import indexed_dataset
        from torch.utils.data import TensorDataset, DataLoader, RandomSampler, SequentialSampler, BatchSampler
        import iterators

        train_data = indexed_dataset.ZhxIndexedDataset(args.data_dir)
        if args.local_rank == -1:
            # train_sampler = RandomSampler(train_data)
            train_sampler = SequentialSampler(train_data)
        else:
            train_sampler = DistributedSampler(train_data)
        train_sampler = BatchSampler(train_sampler, args.train_batch_size, True)

        def collate_fn(x):
            # Each item packs input_ids, input_mask, segment_ids and masked_lm_labels
            # (max_seq_length each) plus the next-sentence label (1) into one vector.
            x = torch.LongTensor([xx.numpy() for xx in x])
            return (x[:, :args.max_seq_length],
                    x[:, args.max_seq_length:2 * args.max_seq_length],
                    x[:, 2 * args.max_seq_length:3 * args.max_seq_length],
                    x[:, 3 * args.max_seq_length:4 * args.max_seq_length],
                    x[:, 4 * args.max_seq_length:4 * args.max_seq_length + 1])

        train_iterator = iterators.EpochBatchIterator(train_data, collate_fn, train_sampler)
        num_train_steps = int(
            len(train_data) / args.train_batch_size / args.gradient_accumulation_steps * args.num_train_epochs)

    # Prepare model
    config_file = os.path.join(args.bert_model, "bert_config.json")
    config = BertConfig.from_json_file(config_file)
    model = BertForPreTraining(config)
    # model = BertForPreTraining.from_pretrained(args.bert_model,
    #     cache_dir=PYTORCH_PRETRAINED_BERT_CACHE / 'distributed_{}'.format(args.local_rank))
    logger.info(config.to_json_string())

    checkpoint_file = args.checkpoint_filename
    old_iter_id = 0
    if checkpoint_file is not None:
        _, global_step, old_iter_id, model, optimizer_dict = load_checkpoint(checkpoint_file, device, n_gpu)

    if args.fp16:
        model.half()
    model.to(device)
    # if True:
    if args.local_rank != -1:
        try:
            from apex.parallel import DistributedDataParallel as DDP
        except ImportError:
            raise ImportError(
                "Please install apex from https://www.github.com/nvidia/apex to use distributed and fp16 training.")
        model = DDP(model)
    elif n_gpu > 1:
        model = torch.nn.DataParallel(model)

    # Prepare optimizer
    param_optimizer = list(model.named_parameters())
    no_decay = ['bias', 'LayerNorm.bias', 'LayerNorm.weight']
    optimizer_grouped_parameters = [
        {'params': [p for n, p in param_optimizer if not any(nd in n for nd in no_decay)], 'weight_decay': 0.01},
        {'params': [p for n, p in param_optimizer if any(nd in n for nd in no_decay)], 'weight_decay': 0.0}
    ]
    t_total = num_train_steps
    if args.local_rank != -1:
        t_total = t_total // torch.distributed.get_world_size()
    if args.fp16:
        try:
            from apex.optimizers import FP16_Optimizer
            from apex.optimizers import FusedAdam
        except ImportError:
            raise ImportError(
                "Please install apex from https://www.github.com/nvidia/apex to use distributed and fp16 training.")
        optimizer = FusedAdam(optimizer_grouped_parameters,
                              lr=args.learning_rate,
                              bias_correction=False,
                              max_grad_norm=1.0)
        if args.loss_scale == 0:
            optimizer = FP16_Optimizer(optimizer, dynamic_loss_scale=True)
        else:
            optimizer = FP16_Optimizer(optimizer, static_loss_scale=args.loss_scale)
    else:
        optimizer = BertAdam(optimizer_grouped_parameters,
                             lr=args.learning_rate,
                             warmup=args.warmup_proportion,
                             t_total=t_total)

    if checkpoint_file is None:
        global_step = 0
    # optimizer.load_state_dict(optimizer_dict)

    tid = 0
    if args.do_train:
        logger.info("***** Running training *****")
        logger.info("  Num examples = %d", len(train_data))
        logger.info("  Batch size = %d", args.train_batch_size)
        logger.info("  Num steps = %d", num_train_steps)
        model.train()

        import datetime
        f_config_out = open(os.path.join(args.output_dir, "config.{}".format(datetime.datetime.now())), "w")
        f_config_out.write(str(args))
        f_config_out.close()

        fout = open(os.path.join(args.output_dir, "loss.txt"), 'w')
        f_para_out = open(os.path.join(args.output_dir, "para.txt"), 'w')

        for iter_id in enumerate(tqdm(range(old_iter_id, int(args.num_train_epochs)), desc="Epoch")):
            tr_loss = 0
            nb_tr_examples, nb_tr_steps = 0, 0
            checkpoint(args, global_step, iter_id[0] + old_iter_id, model, optimizer)
            # tid = 1 - tid
            # os.system('bash /data1/private/linyankai/code/run_create.sh ' + str(random.randint(0, 255)) + " " + str(tid + 1) + " &")
            all_loss = 0
            for step, batch in enumerate(tqdm(train_iterator.next_epoch_itr(), desc="Iteration")):
                batch = tuple(t.to(device) - 1 for t in batch)
                input_ids, input_mask, segment_ids, masked_lm_labels, next_sentence_label = batch
                loss, masked_lm_loss, next_sentence_loss = model(
                    input_ids, segment_ids, input_mask, masked_lm_labels, next_sentence_label)
                # loss = masked_lm_loss
                if n_gpu > 1:
                    loss = loss.mean()  # mean() to average on multi-gpu.
                    masked_lm_loss = masked_lm_loss.mean()
                    next_sentence_loss = next_sentence_loss.mean()
                if args.gradient_accumulation_steps > 1:
                    loss = loss / args.gradient_accumulation_steps
                if not math.isnan(loss.item()):
                    if args.fp16:
                        optimizer.backward(loss)
                    else:
                        loss.backward()
                    all_loss += loss.item()
                if global_step % 1000 == 0:
                    f_para_out.write("global_step:" + str(global_step) + '\n')
                    f_para_out.write("loss:\t" + str(loss) + "\n")
                    '''
                    for param_group in optimizer.param_groups:
                        f_para_out.write("lr:\t" + str(param_group['lr']) + '\n')
                        break
                    param_all = list(model.named_parameters())
                    for n, p in param_optimizer:
                        f_para_out.write(str(n) + "\t" + str(p) + "\tGradient:\t" + str(p.grad.data.float() / optimizer.cur_scale) + "\n")
                    f_para_out.flush()
                    '''
                    print_param_stats(model, f_para_out)
                if global_step % 10 == 0:
                    fout.write("iter_id:" + str(iter_id) + "\t")
                    fout.write("global_step:" + str(global_step) + '\t')
                    fout.write("lr_this_step:" + str(args.learning_rate * warmup_linear(
                        global_step / t_total, args.warmup_proportion)) + '\t')
                    fout.write("step:" + str(step) + '\t')
                    fout.write("loss:" + str(loss.item()) + '\t')
                    fout.write(str(masked_lm_loss.item()) + '\t' + str(next_sentence_loss.item()) + '\n')
                if global_step % 10000 == 0:
                    checkpoint(args, global_step, iter_id[0] + old_iter_id, model, optimizer)
                    model_to_save = model.module if hasattr(model, 'module') else model  # Only save the model itself
                    output_model_file = os.path.join(args.output_dir, "pytorch_model%d.bin" % global_step)
                    torch.save(model_to_save.state_dict(), output_model_file)
                # fout.flush()
                tr_loss += loss.item()
                nb_tr_examples += input_ids.size(0)
                nb_tr_steps += 1
                if (step + 1) % args.gradient_accumulation_steps == 0:
                    # modify learning rate with special warm up BERT uses
                    lr_this_step = args.learning_rate * warmup_linear(global_step / t_total, args.warmup_proportion)
                    for param_group in optimizer.param_groups:
                        param_group['lr'] = lr_this_step
                    optimizer.step()
                    optimizer.zero_grad()
                    global_step += 1
            '''
            train_iterator = None
            train_sampler = None
            train_data = None
            import indexed_dataset
            from torch.utils.data import TensorDataset, DataLoader, RandomSampler, SequentialSampler, BatchSampler
            import iterators
            train_data = indexed_dataset.IndexedCachedDataset('/data' + str(tid + 1) + args.data_dir[6:])
            if args.local_rank == -1:
                train_sampler = RandomSampler(train_data)
            else:
                train_sampler = DistributedSampler(train_data)
            train_sampler = BatchSampler(train_sampler, args.train_batch_size, True)
            def collate_fn(x):
                x = torch.LongTensor([xx.numpy() for xx in x])
                return x[:, :args.max_seq_length], x[:, args.max_seq_length:2 * args.max_seq_length], x[:, 2 * args.max_seq_length:3 * args.max_seq_length], x[:, 3 * args.max_seq_length:4 * args.max_seq_length], x[:, 4 * args.max_seq_length:4 * args.max_seq_length + 1]
            train_iterator = iterators.EpochBatchIterator(train_data, collate_fn, train_sampler)
            '''
        fout.close()
        f_para_out.close()

    # Save a trained model
    model_to_save = model.module if hasattr(model, 'module') else model  # Only save the model itself
    output_model_file = os.path.join(args.output_dir, "pytorch_model.bin")
    torch.save(model_to_save.state_dict(), output_model_file)