def __init__(self, args): self.args = args self.processor = MrpcProcessor() self.label_list = self.processor.get_labels() self.tokenizer = BertTokenizer( args.vocab_file, do_lower_case=args.do_lower_case )
def preprocess_text_input( context='Danielle is a girl who really loves her cat, Steve.', question='What cat does Danielle love?', vocab_file='DeepLearningExamples/PyTorch/LanguageModeling/BERT/vocab/vocab', max_seq_length=384, max_query_length=64, n_best_size=1, max_answer_length=30, null_score_diff_threshold=-11.0): tokenizer = BertTokenizer(vocab_file, do_lower_case=True, max_len=512) doc_tokens = context.split() query_tokens = tokenizer.tokenize(question) feature = preprocess_tokenized_text(doc_tokens, query_tokens, tokenizer, max_seq_length=max_seq_length, max_query_length=max_query_length) tensors_for_inference, tokens_for_postprocessing = feature input_ids = torch.tensor(tensors_for_inference.input_ids, dtype=torch.long).unsqueeze(0) segment_ids = torch.tensor(tensors_for_inference.segment_ids, dtype=torch.long).unsqueeze(0) input_mask = torch.tensor(tensors_for_inference.input_mask, dtype=torch.long).unsqueeze(0) return (input_ids, segments_ids, input_mask)
def predict(cls, context, question, bing_key=None, max_seq_length=384, max_query_length=64, n_best_size=3, do_lower_case=True, can_give_negative_answer=True, max_answer_length=30, null_score_diff_threshold=-11.0): """For the input, do the predictions and return them. Args: input (a pandas dataframe): The data on which to do the predictions. There will be one prediction per row in the dataframe""" predictor_model = cls.get_predictor_model() doc_tokens = context.split() tokenizer = BertTokenizer(vocab_file, do_lower_case=True, max_len=max_seq_length) query_tokens = tokenizer.tokenize(question) feature = preprocess_tokenized_text(doc_tokens, query_tokens, tokenizer, max_seq_length=max_seq_length, max_query_length=max_query_length) tensors_for_inference, tokens_for_postprocessing = feature input_ids = torch.tensor(tensors_for_inference.input_ids, dtype=torch.long, device=device).unsqueeze(0) segment_ids = torch.tensor(tensors_for_inference.segment_ids, dtype=torch.long, device=device).unsqueeze(0) input_mask = torch.tensor(tensors_for_inference.input_mask, dtype=torch.long, device=device).unsqueeze(0) # run prediction with torch.no_grad(): start_logits, end_logits = predictor_model(input_ids, segment_ids, input_mask) # post-processing start_logits = start_logits[0].detach().cpu().tolist() end_logits = end_logits[0].detach().cpu().tolist() prediction = get_predictions(doc_tokens, tokens_for_postprocessing, start_logits, end_logits, n_best_size, max_answer_length, do_lower_case, can_give_negative_answer, null_score_diff_threshold) return prediction
def __init__( self, eval_script: str = "data/squad/v1.1/evaluate-v1.1.py", predict_file: str = "", output_dir: str = "./", n_best_size: int = 20, max_answer_length: int = 30, version_2_with_negative: bool = False, max_seq_length: int = 384, doc_stride: int = 128, max_query_length: int = 64, vocab_file: str = "", do_lower_case: bool = True, max_len: int = 512, ): tokenizer = BertTokenizer(vocab_file, do_lower_case=do_lower_case, max_len=max_len) # for bert large self.eval_examples = read_squad_examples( input_file=predict_file, is_training=False, version_2_with_negative=version_2_with_negative) self.eval_features = convert_examples_to_features( examples=self.eval_examples, tokenizer=tokenizer, max_seq_length=max_seq_length, doc_stride=doc_stride, max_query_length=max_query_length, is_training=False, ) self.output_dir = output_dir self.eval_script = eval_script self.predict_file = predict_file args = Namespace( version_2_with_negative=version_2_with_negative, n_best_size=n_best_size, max_answer_length=max_answer_length, verbose_logging=False, do_lower_case=do_lower_case, ) self.args = args self.all_results: List[RawResult] = []
def get_dataloader(args): ''' return dataloader for inference ''' # Preprocess input data tokenizer = BertTokenizer(args.vocab_file, do_lower_case=args.do_lower_case, max_len=512) # for bert large cached_features_file = args.predict_file + '_{}_{}.bin'.format(args.max_seq_length, args.doc_stride) try: with open(cached_features_file, "rb") as reader: eval_features = pickle.load(reader) except: eval_examples = read_squad_examples( input_file=args.predict_file, is_training=False, version_2_with_negative=args.version_2_with_negative) eval_features = convert_examples_to_features( examples=eval_examples, tokenizer=tokenizer, max_seq_length=args.max_seq_length, doc_stride=args.doc_stride, max_query_length=args.max_query_length, is_training=False) with open(cached_features_file, "wb") as writer: pickle.dump(eval_features, writer) data = [] for feature in eval_features: input_ids = torch.tensor(feature.input_ids, dtype=torch.int64) input_mask = torch.tensor(feature.input_mask, dtype=torch.int64) segment_ids = torch.tensor(feature.segment_ids, dtype=torch.int64) inp = (input_ids, segment_ids, input_mask) data.append(inp) if args.nbatches > 0: data = data[:args.nbatches*args.batch_size] test_loader = torch.utils.data.DataLoader( data, batch_size=args.batch_size, shuffle=False, num_workers=1, pin_memory=True) return test_loader
def main(): parser = argparse.ArgumentParser() ## Required parameters parser.add_argument("--vocab_file", default=None, type=str, required=True, help="The vocabulary the BERT model will train on.") parser.add_argument( "--input_file", default=None, type=str, required=True, help= "The input train corpus. can be directory with .txt files or a path to a single file" ) parser.add_argument( "--output_file", default=None, type=str, required=True, help="The output file where the model checkpoints will be written.") ## Other parameters # int parser.add_argument( "--max_seq_length", default=128, type=int, help= "The maximum total input sequence length after WordPiece tokenization. \n" "Sequences longer than this will be truncated, and sequences shorter \n" "than this will be padded.") parser.add_argument( "--dupe_factor", default=10, type=int, help= "Number of times to duplicate the input data (with different masks).") parser.add_argument("--max_predictions_per_seq", default=20, type=int, help="Maximum sequence length.") # floats parser.add_argument("--masked_lm_prob", default=0.15, type=float, help="Masked LM probability.") parser.add_argument( "--short_seq_prob", default=0.1, type=float, help= "Probability to create a sequence shorter than maximum sequence length" ) parser.add_argument( "--do_lower_case", action='store_true', default=True, help= "Whether to lower case the input text. True for uncased models, False for cased models." ) parser.add_argument('--random_seed', type=int, default=12345, help="random seed for initialization") args = parser.parse_args() tokenizer = BertTokenizer(args.vocab_file, do_lower_case=args.do_lower_case) input_files = [] if os.path.isfile(args.input_file): input_files.append(args.input_file) elif os.path.isdir(args.input_file): input_files = [ os.path.join(args.input_file, f) for f in os.listdir(args.input_file) if (os.path.isfile(os.path.join(args.input_file, f)) and f.endswith('.txt')) ] else: raise ValueError("{} is not a valid path".format(args.input_file)) rng = random.Random(args.random_seed) instances = create_training_instances(input_files, tokenizer, args.max_seq_length, args.dupe_factor, args.short_seq_prob, args.masked_lm_prob, args.max_predictions_per_seq, rng) output_files = args.output_file.split(",") print("*** Writing to output files ***") for output_file in output_files: print(output_file) write_instance_to_example_files(instances, tokenizer, args.max_seq_length, args.max_predictions_per_seq, output_files)
def main(): parser = argparse.ArgumentParser() ## Required parameters parser.add_argument( "--data_dir", default=None, type=str, required=True, help= "The input data dir. Should contain the .tsv files (or other data files) for the task." ) parser.add_argument( "--bert_model", default=None, type=str, required=True, help="Bert pre-trained model selected in the list: bert-base-uncased, " "bert-large-uncased, bert-base-cased, bert-large-cased, bert-base-multilingual-uncased, " "bert-base-multilingual-cased, bert-base-chinese.") parser.add_argument("--task_name", default=None, type=str, required=True, help="The name of the task to train.") parser.add_argument( "--output_dir", default=None, type=str, required=True, help= "The output directory where the model predictions and checkpoints will be written." ) parser.add_argument("--init_checkpoint", default=None, type=str, required=True, help="The checkpoint file from pretraining") ## Other parameters parser.add_argument( "--cache_dir", default="", type=str, help= "Where do you want to store the pre-trained models downloaded from s3") parser.add_argument( "--max_seq_length", default=128, type=int, help= "The maximum total input sequence length after WordPiece tokenization. \n" "Sequences longer than this will be truncated, and sequences shorter \n" "than this will be padded.") parser.add_argument("--do_train", action='store_true', help="Whether to run training.") parser.add_argument("--do_eval", action='store_true', help="Whether to run eval on the dev set.") parser.add_argument( "--do_lower_case", action='store_true', help="Set this flag if you are using an uncased model.") parser.add_argument("--train_batch_size", default=32, type=int, help="Total batch size for training.") parser.add_argument("--eval_batch_size", default=8, type=int, help="Total batch size for eval.") parser.add_argument("--learning_rate", default=5e-5, type=float, help="The initial learning rate for Adam.") parser.add_argument("--num_train_epochs", default=3.0, type=float, help="Total number of training epochs to perform.") parser.add_argument("--google_pretrained", action='store_true', help="Whether not to use CUDA when available") parser.add_argument("--max_steps", default=-1.0, type=float, help="Total number of training steps to perform.") parser.add_argument( "--warmup_proportion", default=0.1, type=float, help= "Proportion of training to perform linear learning rate warmup for. " "E.g., 0.1 = 10%% of training.") parser.add_argument("--no_cuda", action='store_true', help="Whether not to use CUDA when available") parser.add_argument("--local_rank", type=int, default=-1, help="local_rank for distributed training on gpus") parser.add_argument('--seed', type=int, default=1, help="random seed for initialization") parser.add_argument( '--gradient_accumulation_steps', type=int, default=1, help= "Number of updates steps to accumulate before performing a backward/update pass." ) parser.add_argument('--fp16', default=False, action='store_true', help="Mixed precision training") parser.add_argument('--amp', default=False, action='store_true', help="Mixed precision training") parser.add_argument( '--loss_scale', type=float, default=0, help= "Loss scaling to improve fp16 numeric stability. Only used when fp16 set to True.\n" "0 (default value): dynamic loss scaling.\n" "Positive power of 2: static loss scaling value.\n") parser.add_argument('--server_ip', type=str, default='', help="Can be used for distant debugging.") parser.add_argument('--server_port', type=str, default='', help="Can be used for distant debugging.") parser.add_argument("--old", action='store_true', help="use old fp16 optimizer") parser.add_argument( '--vocab_file', type=str, default=None, required=True, help="Vocabulary mapping/file BERT was pretrainined on") parser.add_argument("--config_file", default=None, type=str, required=True, help="The BERT model config") args = parser.parse_args() args.fp16 = args.fp16 or args.amp if args.server_ip and args.server_port: # Distant debugging - see https://code.visualstudio.com/docs/python/debugging#_attach-to-a-local-script import ptvsd print("Waiting for debugger attach") ptvsd.enable_attach(address=(args.server_ip, args.server_port), redirect_output=True) ptvsd.wait_for_attach() processors = { "cola": ColaProcessor, "mnli": MnliProcessor, "mrpc": MrpcProcessor, } num_labels_task = { "cola": 2, "mnli": 3, "mrpc": 2, } if args.local_rank == -1 or args.no_cuda: device = torch.device("cuda" if torch.cuda.is_available() and not args.no_cuda else "cpu") n_gpu = torch.cuda.device_count() else: torch.cuda.set_device(args.local_rank) device = torch.device("cuda", args.local_rank) n_gpu = 1 # Initializes the distributed backend which will take care of sychronizing nodes/GPUs torch.distributed.init_process_group(backend='nccl') logger.info( "device: {} n_gpu: {}, distributed training: {}, 16-bits training: {}". format(device, n_gpu, bool(args.local_rank != -1), args.fp16)) if args.gradient_accumulation_steps < 1: raise ValueError( "Invalid gradient_accumulation_steps parameter: {}, should be >= 1" .format(args.gradient_accumulation_steps)) args.train_batch_size = args.train_batch_size // args.gradient_accumulation_steps random.seed(args.seed) np.random.seed(args.seed) torch.manual_seed(args.seed) if n_gpu > 0: torch.cuda.manual_seed_all(args.seed) if not args.do_train and not args.do_eval: raise ValueError( "At least one of `do_train` or `do_eval` must be True.") if os.path.exists(args.output_dir) and os.listdir( args.output_dir) and args.do_train: print( "WARNING: Output directory ({}) already exists and is not empty.". format(args.output_dir)) if not os.path.exists(args.output_dir) and is_main_process(): os.makedirs(args.output_dir) task_name = args.task_name.lower() if task_name not in processors: raise ValueError("Task not found: %s" % (task_name)) processor = processors[task_name]() num_labels = num_labels_task[task_name] label_list = processor.get_labels() #tokenizer = BertTokenizer.from_pretrained(args.bert_model, do_lower_case=args.do_lower_case) tokenizer = BertTokenizer(args.vocab_file, do_lower_case=args.do_lower_case, max_len=512) # for bert large train_examples = None num_train_optimization_steps = None if args.do_train: train_examples = processor.get_train_examples(args.data_dir) num_train_optimization_steps = int( len(train_examples) / args.train_batch_size / args.gradient_accumulation_steps) * args.num_train_epochs if args.local_rank != -1: num_train_optimization_steps = num_train_optimization_steps // torch.distributed.get_world_size( ) # Prepare model config = modeling.BertConfig.from_json_file(args.config_file) # Padding for divisibility by 8 if config.vocab_size % 8 != 0: config.vocab_size += 8 - (config.vocab_size % 8) modeling.ACT2FN["bias_gelu"] = modeling.bias_gelu_training model = modeling.BertForSequenceClassification(config, num_labels=num_labels) print("USING CHECKPOINT from", args.init_checkpoint) model.load_state_dict(torch.load(args.init_checkpoint, map_location='cpu')["model"], strict=False) print("USED CHECKPOINT from", args.init_checkpoint) model.to(device) # Prepare optimizer param_optimizer = list(model.named_parameters()) no_decay = ['bias', 'LayerNorm.bias', 'LayerNorm.weight'] optimizer_grouped_parameters = [{ 'params': [p for n, p in param_optimizer if not any(nd in n for nd in no_decay)], 'weight_decay': 0.01 }, { 'params': [p for n, p in param_optimizer if any(nd in n for nd in no_decay)], 'weight_decay': 0.0 }] if args.fp16: print("using fp16") try: from apex.optimizers import FusedAdam except ImportError: raise ImportError( "Please install apex from https://www.github.com/nvidia/apex to use distributed and fp16 training." ) optimizer = FusedAdam(optimizer_grouped_parameters, lr=args.learning_rate, bias_correction=False) if args.loss_scale == 0: model, optimizer = amp.initialize(model, optimizer, opt_level="O2", keep_batchnorm_fp32=False, loss_scale="dynamic") else: model, optimizer = amp.initialize(model, optimizer, opt_level="O2", keep_batchnorm_fp32=False, loss_scale=args.loss_scale) scheduler = LinearWarmUpScheduler( optimizer, warmup=args.warmup_proportion, total_steps=num_train_optimization_steps) else: print("using fp32") optimizer = BertAdam(optimizer_grouped_parameters, lr=args.learning_rate, warmup=args.warmup_proportion, t_total=num_train_optimization_steps) if args.local_rank != -1: try: from apex.parallel import DistributedDataParallel as DDP except ImportError: raise ImportError( "Please install apex from https://www.github.com/nvidia/apex to use distributed and fp16 training." ) model = DDP(model) elif n_gpu > 1: model = torch.nn.DataParallel(model) global_step = 0 nb_tr_steps = 0 tr_loss = 0 if args.do_train: print("data prep") cached_train_features_file = args.data_dir + '_{0}_{1}_{2}'.format( list(filter(None, args.bert_model.split('/'))).pop(), str(args.max_seq_length), str(args.do_lower_case)) train_features = None try: with open(cached_train_features_file, "rb") as reader: train_features = pickle.load(reader) except: train_features = convert_examples_to_features( train_examples, label_list, args.max_seq_length, tokenizer) if args.local_rank == -1 or torch.distributed.get_rank() == 0: logger.info(" Saving train features into cached file %s", cached_train_features_file) with open(cached_train_features_file, "wb") as writer: pickle.dump(train_features, writer) logger.info("***** Running training *****") logger.info(" Num examples = %d", len(train_examples)) logger.info(" Batch size = %d", args.train_batch_size) logger.info(" Num steps = %d", num_train_optimization_steps) all_input_ids = torch.tensor([f.input_ids for f in train_features], dtype=torch.long) all_input_mask = torch.tensor([f.input_mask for f in train_features], dtype=torch.long) all_segment_ids = torch.tensor([f.segment_ids for f in train_features], dtype=torch.long) all_label_ids = torch.tensor([f.label_id for f in train_features], dtype=torch.long) train_data = TensorDataset(all_input_ids, all_input_mask, all_segment_ids, all_label_ids) if args.local_rank == -1: train_sampler = RandomSampler(train_data) else: train_sampler = DistributedSampler(train_data) train_dataloader = DataLoader(train_data, sampler=train_sampler, batch_size=args.train_batch_size) model.train() for _ in trange(int(args.num_train_epochs), desc="Epoch"): tr_loss = 0 nb_tr_examples, nb_tr_steps = 0, 0 for step, batch in enumerate( tqdm(train_dataloader, desc="Iteration")): if args.max_steps > 0 and global_step > args.max_steps: break batch = tuple(t.to(device) for t in batch) input_ids, input_mask, segment_ids, label_ids = batch loss = model(input_ids, segment_ids, input_mask, label_ids) if n_gpu > 1: loss = loss.mean() # mean() to average on multi-gpu. if args.gradient_accumulation_steps > 1: loss = loss / args.gradient_accumulation_steps if args.fp16: with amp.scale_loss(loss, optimizer) as scaled_loss: scaled_loss.backward() else: loss.backward() tr_loss += loss.item() nb_tr_examples += input_ids.size(0) nb_tr_steps += 1 if (step + 1) % args.gradient_accumulation_steps == 0: if args.fp16: # modify learning rate with special warm up for BERT which FusedAdam doesn't do scheduler.step() optimizer.step() optimizer.zero_grad() global_step += 1 if args.do_eval and (args.local_rank == -1 or torch.distributed.get_rank() == 0): eval_examples = processor.get_dev_examples(args.data_dir) eval_features = convert_examples_to_features(eval_examples, label_list, args.max_seq_length, tokenizer) logger.info("***** Running evaluation *****") logger.info(" Num examples = %d", len(eval_examples)) logger.info(" Batch size = %d", args.eval_batch_size) all_input_ids = torch.tensor([f.input_ids for f in eval_features], dtype=torch.long) all_input_mask = torch.tensor([f.input_mask for f in eval_features], dtype=torch.long) all_segment_ids = torch.tensor([f.segment_ids for f in eval_features], dtype=torch.long) all_label_ids = torch.tensor([f.label_id for f in eval_features], dtype=torch.long) eval_data = TensorDataset(all_input_ids, all_input_mask, all_segment_ids, all_label_ids) # Run prediction for full data eval_sampler = SequentialSampler(eval_data) eval_dataloader = DataLoader(eval_data, sampler=eval_sampler, batch_size=args.eval_batch_size) model.eval() eval_loss, eval_accuracy = 0, 0 nb_eval_steps, nb_eval_examples = 0, 0 preds = None out_label_ids = None for input_ids, input_mask, segment_ids, label_ids in tqdm( eval_dataloader, desc="Evaluating"): input_ids = input_ids.to(device) input_mask = input_mask.to(device) segment_ids = segment_ids.to(device) label_ids = label_ids.to(device) with torch.no_grad(): tmp_eval_loss = model(input_ids, segment_ids, input_mask, label_ids) logits = model(input_ids, segment_ids, input_mask) eval_loss += tmp_eval_loss.mean().item() nb_eval_steps += 1 if preds is None: preds = logits.detach().cpu().numpy() out_label_ids = label_ids.detach().cpu().numpy() else: preds = np.append(preds, logits.detach().cpu().numpy(), axis=0) out_label_ids = np.append(out_label_ids, label_ids.detach().cpu().numpy(), axis=0) eval_loss = eval_loss / nb_eval_steps preds = np.argmax(preds, axis=1) eval_loss = eval_loss / nb_eval_steps loss = tr_loss / nb_tr_steps if args.do_train else None results = { 'eval_loss': eval_loss, 'global_step': global_step, 'loss': loss } result = compute_metrics(task_name, preds, out_label_ids) results.update(result) print(results) output_eval_file = os.path.join(args.output_dir, "eval_results.txt") with open(output_eval_file, "w") as writer: logger.info("***** Eval results *****") for key in sorted(results.keys()): logger.info(" %s = %s", key, str(results[key])) writer.write("%s = %s\n" % (key, str(results[key])))
def main(): parser = argparse.ArgumentParser() # Required parameters parser.add_argument( "--output_dir", default='output', type=str, help= "The output directory where the model checkpoints and predictions will be written." ) parser.add_argument("--checkpoint", default='pretrain_ckpt/bert_small_ckpt.bin', type=str, help="checkpoint") parser.add_argument("--model_config", default='data/bert_small.json', type=str) # Other parameters parser.add_argument("--train_file", default='data/KorQuAD_v1.0_train.json', type=str, help="SQuAD json for training. E.g., train-v1.1.json") parser.add_argument( "--max_seq_length", default=512, type=int, help= "The maximum total input sequence length after WordPiece tokenization. Sequences " "longer than this will be truncated, and sequences shorter than this will be padded." ) parser.add_argument( "--doc_stride", default=128, type=int, help= "When splitting up a long document into chunks, how much stride to take between chunks." ) parser.add_argument( "--max_query_length", default=96, type=int, help= "The maximum number of tokens for the question. Questions longer than this will " "be truncated to this length.") parser.add_argument("--train_batch_size", default=32, type=int, help="Total batch size for training.") parser.add_argument("--learning_rate", default=5e-5, type=float, help="The initial learning rate for Adam.") parser.add_argument("--num_train_epochs", default=8.0, type=float, help="Total number of training epochs to perform.") parser.add_argument("--max_grad_norm", default=1.0, type=float, help="Max gradient norm.") parser.add_argument("--adam_epsilon", default=1e-6, type=float, help="Epsilon for Adam optimizer.") parser.add_argument( "--warmup_proportion", default=0.1, type=float, help= "Proportion of training to perform linear learning rate warmup for. E.g., 0.1 = 10%% " "of training.") parser.add_argument( "--n_best_size", default=20, type=int, help= "The total number of n-best predictions to generate in the nbest_predictions.json " "output file.") parser.add_argument( "--max_answer_length", default=30, type=int, help= "The maximum length of an answer that can be generated. This is needed because the start " "and end predictions are not conditioned on one another.") parser.add_argument( "--verbose_logging", action='store_true', help= "If true, all of the warnings related to data processing will be printed. " "A number of warnings are expected for a normal SQuAD evaluation.") parser.add_argument("--no_cuda", action='store_true', help="Whether not to use CUDA when available") parser.add_argument('--seed', type=int, default=42, help="random seed for initialization") parser.add_argument( '--fp16', action='store_true', help="Whether to use 16-bit float precision instead of 32-bit") parser.add_argument( '--fp16_opt_level', type=str, default='O2', help= "For fp16: Apex AMP optimization level selected in ['O0', 'O1', 'O2', and 'O3']." "See details at https://nvidia.github.io/apex/amp.html") parser.add_argument( '--null_score_diff_threshold', type=float, default=0.0, help= "If null_score - best_non_null is greater than the threshold predict null." ) args = parser.parse_args() device = torch.device( "cuda" if torch.cuda.is_available() and not args.no_cuda else "cpu") n_gpu = torch.cuda.device_count() logger.info("device: {} n_gpu: {}, 16-bits training: {}".format( device, n_gpu, args.fp16)) random.seed(args.seed) np.random.seed(args.seed) torch.manual_seed(args.seed) if n_gpu > 0: torch.cuda.manual_seed_all(args.seed) if not os.path.exists(args.output_dir): os.makedirs(args.output_dir) tokenizer = BertTokenizer('data/ko_vocab_32k.txt', max_len=args.max_seq_length, do_basic_tokenize=True) # Prepare model config = Config.from_json_file(args.model_config) model = QuestionAnswering(config) model.bert.load_state_dict(torch.load(args.checkpoint)) num_params = count_parameters(model) logger.info("Total Parameter: %d" % num_params) model.to(device) model = torch.nn.DataParallel(model) cached_train_features_file = args.train_file + '_{0}_{1}_{2}'.format( str(args.max_seq_length), str(args.doc_stride), str(args.max_query_length)) train_examples = read_squad_examples(input_file=args.train_file, is_training=True, version_2_with_negative=False) try: with open(cached_train_features_file, "rb") as reader: train_features = pickle.load(reader) except: train_features = convert_examples_to_features( examples=train_examples, tokenizer=tokenizer, max_seq_length=args.max_seq_length, doc_stride=args.doc_stride, max_query_length=args.max_query_length, is_training=True) logger.info(" Saving train features into cached file %s", cached_train_features_file) with open(cached_train_features_file, "wb") as writer: pickle.dump(train_features, writer) num_train_optimization_steps = int( len(train_features) / args.train_batch_size) * args.num_train_epochs # Prepare optimizer param_optimizer = list(model.named_parameters()) no_decay = ['bias', 'LayerNorm.weight'] optimizer_grouped_parameters = [{ 'params': [p for n, p in param_optimizer if not any(nd in n for nd in no_decay)], 'weight_decay': 0.01 }, { 'params': [p for n, p in param_optimizer if any(nd in n for nd in no_decay)], 'weight_decay': 0.0 }] optimizer = AdamW(optimizer_grouped_parameters, lr=args.learning_rate, eps=args.adam_epsilon) scheduler = WarmupLinearSchedule( optimizer, warmup_steps=num_train_optimization_steps * 0.1, t_total=num_train_optimization_steps) if args.fp16: try: from apex import amp except ImportError: raise ImportError( "Please install apex from https://www.github.com/nvidia/apex to use distributed and fp16 training." ) model, optimizer = amp.initialize(model, optimizer, opt_level=args.fp16_opt_level) logger.info("***** Running training *****") logger.info(" Num orig examples = %d", len(train_examples)) logger.info(" Num split examples = %d", len(train_features)) logger.info(" Batch size = %d", args.train_batch_size) logger.info(" Num steps = %d", num_train_optimization_steps) num_train_step = num_train_optimization_steps input_ids = np.load('input_ids2.npy') input_mask = np.load('input_mask.npy') input_segments = np.load('input_segments.npy') start_prob = np.load('start_prob.npy') end_prob = np.load('end_prob.npy') start_label = np.load('input_start.npy') stop_label = np.load('input_stop.npy') """ for i in range(1000): print(input_ids[i]) print(max(start_prob[i])) print(sum(start_prob[i])) input() """ paragraph = torch.tensor(input_ids.astype( np.int64)).type(dtype=torch.long).cuda() paragraph_mask = torch.tensor(input_mask.astype( np.int64)).type(dtype=torch.long).cuda() paragraph_segments = torch.tensor(input_segments.astype( np.int64)).type(dtype=torch.long).cuda() start_prob = torch.tensor(start_prob.astype( np.float32)).type(dtype=torch.float32).cuda() end_prob = torch.tensor(end_prob.astype( np.float32)).type(dtype=torch.float32).cuda() start_label = torch.tensor(start_label.astype( np.int64)).type(dtype=torch.long).cuda() stop_label = torch.tensor(stop_label.astype( np.int64)).type(dtype=torch.long).cuda() train_data = TensorDataset(paragraph, paragraph_mask, paragraph_segments, start_label, stop_label, start_prob, end_prob) train_sampler = RandomSampler(train_data) train_dataloader = DataLoader(train_data, sampler=train_sampler, batch_size=args.train_batch_size) model.train() global_step = 0 epoch = 0 for _ in trange(int(args.num_train_epochs), desc="Epoch"): iter_bar = tqdm( train_dataloader, desc="Train(XX Epoch) Step(XX/XX) (Mean loss=X.X) (loss=X.X)") tr_step, total_loss, mean_loss = 0, 0., 0. for step, batch in enumerate(iter_bar): if n_gpu == 1: batch = tuple( t.to(device) for t in batch) # multi-gpu does scattering it-self input_ids, input_mask, segment_ids, start_positions, end_positions, start_probs, end_probs = batch loss = model(input_ids, segment_ids, input_mask, start_positions, end_positions, start_probs, end_probs) if n_gpu > 1: loss = loss.mean() # mean() to average on multi-gpu. if args.fp16: with amp.scale_loss(loss, optimizer) as scaled_loss: scaled_loss.backward() torch.nn.utils.clip_grad_norm_(amp.master_params(optimizer), args.max_grad_norm) else: loss.backward() torch.nn.utils.clip_grad_norm_(model.parameters(), args.max_grad_norm) scheduler.step() optimizer.step() optimizer.zero_grad() global_step += 1 tr_step += 1 total_loss += loss mean_loss = total_loss / tr_step iter_bar.set_description( "Train Step(%d / %d) (Mean loss=%5.5f) (loss=%5.5f)" % (global_step, num_train_step, mean_loss, loss.item())) logger.info("** ** * Saving file * ** **") model_checkpoint = "korquad_%d.bin" % (epoch) logger.info(model_checkpoint) output_model_file = os.path.join(args.output_dir, model_checkpoint) if n_gpu > 1: torch.save(model.module.state_dict(), output_model_file) else: torch.save(model.state_dict(), output_model_file) epoch += 1
args = parser.parse_args() # TRITON client setup protocol = ProtocolType.from_str(args.protocol) model_version = -1 infer_ctx = InferContext(args.url, protocol, args.model_name, model_version, http_headers=args.http_headers, verbose=args.verbose) # Preprocess input data tokenizer = BertTokenizer(args.vocab_file, do_lower_case=args.do_lower_case, max_len=512) # for bert large cached_features_file = args.predict_file + '_{}_{}.bin'.format( args.max_seq_length, args.doc_stride) eval_examples = read_squad_examples( input_file=args.predict_file, is_training=False, version_2_with_negative=args.version_2_with_negative) try: with open(cached_features_file, "rb") as reader: eval_features = pickle.load(reader) except: eval_features = convert_examples_to_features( examples=eval_examples,
def main(args): args.fp16 = args.fp16 or args.amp if args.server_ip and args.server_port: # Distant debugging - see # https://code.visualstudio.com/docs/python/debugging#_attach-to-a-local-script import ptvsd logger.info("Waiting for debugger attach") ptvsd.enable_attach( address=(args.server_ip, args.server_port), redirect_output=True, ) ptvsd.wait_for_attach() if args.local_rank == -1 or args.no_cuda: device = torch.device("cuda" if torch.cuda.is_available() and not args.no_cuda else "cpu") n_gpu = torch.cuda.device_count() else: torch.cuda.set_device(args.local_rank) device = torch.device("cuda", args.local_rank) n_gpu = 1 # Initializes the distributed backend which will take care of # sychronizing nodes/GPUs. if not torch.distributed.is_initialized(): torch.distributed.init_process_group(backend='nccl') logger.info("device: {} n_gpu: {}, distributed training: {}, " "16-bits training: {}".format( device, n_gpu, bool(args.local_rank != -1), args.fp16, )) if not args.do_train and not args.do_eval and not args.do_predict: raise ValueError("At least one of `do_train`, `do_eval` or " "`do_predict` must be True.") if is_main_process(): if (os.path.exists(args.output_dir) and os.listdir(args.output_dir) and args.do_train): logger.warning("Output directory ({}) already exists and is not " "empty.".format(args.output_dir)) mkdir_by_main_process(args.output_dir) if is_main_process(): dllogger.init(backends=[ dllogger.JSONStreamBackend( verbosity=dllogger.Verbosity.VERBOSE, filename=os.path.join(args.output_dir, 'dllogger.json'), ), dllogger.StdOutBackend( verbosity=dllogger.Verbosity.VERBOSE, step_format=format_step, ), ]) else: dllogger.init(backends=[]) dllogger.log(step="PARAMETER", data={"Config": [str(args)]}) if args.gradient_accumulation_steps < 1: raise ValueError("Invalid gradient_accumulation_steps parameter: {}, " "should be >= 1".format( args.gradient_accumulation_steps)) if args.gradient_accumulation_steps > args.train_batch_size: raise ValueError("gradient_accumulation_steps ({}) cannot be larger " "train_batch_size ({}) - there cannot be a fraction " "of one sample.".format( args.gradient_accumulation_steps, args.train_batch_size, )) args.train_batch_size = (args.train_batch_size // args.gradient_accumulation_steps) random.seed(args.seed) np.random.seed(args.seed) torch.manual_seed(args.seed) if n_gpu > 0: torch.cuda.manual_seed_all(args.seed) dllogger.log(step="PARAMETER", data={"SEED": args.seed}) processor = PROCESSORS[args.task_name]() num_labels = len(processor.get_labels()) #tokenizer = BertTokenizer.from_pretrained(args.bert_model, do_lower_case=args.do_lower_case) tokenizer = BertTokenizer( args.vocab_file, do_lower_case=args.do_lower_case, max_len=512, ) # for bert large num_train_optimization_steps = None if args.do_train: train_features = get_train_features( args.data_dir, args.bert_model, args.max_seq_length, args.do_lower_case, args.local_rank, args.train_batch_size, args.gradient_accumulation_steps, args.num_train_epochs, tokenizer, processor, ) num_train_optimization_steps = int( len(train_features) / args.train_batch_size / args.gradient_accumulation_steps) * args.num_train_epochs if args.local_rank != -1: num_train_optimization_steps = (num_train_optimization_steps // torch.distributed.get_world_size()) # Prepare model config = modeling.BertConfig.from_json_file(args.config_file) # Padding for divisibility by 8 if config.vocab_size % 8 != 0: config.vocab_size += 8 - (config.vocab_size % 8) # modeling.ACT2FN["bias_gelu"] = modeling.bias_gelu_training model = modeling.BertForSequenceClassification( config, num_labels=num_labels, ) logger.info("USING CHECKPOINT from {}".format(args.init_checkpoint)) checkpoint = torch.load(args.init_checkpoint, map_location='cpu') checkpoint = checkpoint["model"] if "model" in checkpoint.keys( ) else checkpoint model.load_state_dict(checkpoint, strict=False) logger.info("USED CHECKPOINT from {}".format(args.init_checkpoint)) dllogger.log( step="PARAMETER", data={ "num_parameters": sum([p.numel() for p in model.parameters() if p.requires_grad]), }, ) model.to(device) # Prepare optimizer model, optimizer, scheduler = init_optimizer_and_amp( model, args.learning_rate, args.loss_scale, args.warmup_proportion, num_train_optimization_steps, args.fp16, ) if args.local_rank != -1: try: from apex.parallel import DistributedDataParallel as DDP except ImportError: raise ImportError("Please install apex from " "https://www.github.com/nvidia/apex to use " "distributed and fp16 training.") model = DDP(model) elif n_gpu > 1: model = torch.nn.DataParallel(model) loss_fct = torch.nn.CrossEntropyLoss() results = {} if args.do_train: logger.info("***** Running training *****") logger.info(" Num examples = %d", len(train_features)) logger.info(" Batch size = %d", args.train_batch_size) logger.info(" Num steps = %d", num_train_optimization_steps) train_data = gen_tensor_dataset(train_features) if args.local_rank == -1: train_sampler = RandomSampler(train_data) else: train_sampler = DistributedSampler(train_data) train_dataloader = DataLoader( train_data, sampler=train_sampler, batch_size=args.train_batch_size, ) global_step = 0 nb_tr_steps = 0 tr_loss = 0 latency_train = 0.0 nb_tr_examples = 0 model.train() tic_train = time.perf_counter() for _ in trange(int(args.num_train_epochs), desc="Epoch"): tr_loss, nb_tr_steps = 0, 0 for step, batch in enumerate( tqdm(train_dataloader, desc="Iteration")): if args.max_steps > 0 and global_step > args.max_steps: break batch = tuple(t.to(device) for t in batch) input_ids, input_mask, segment_ids, label_ids = batch logits = model(input_ids, segment_ids, input_mask) loss = loss_fct( logits.view(-1, num_labels), label_ids.view(-1), ) if n_gpu > 1: loss = loss.mean() # mean() to average on multi-gpu. if args.gradient_accumulation_steps > 1: loss = loss / args.gradient_accumulation_steps if args.fp16: with amp.scale_loss(loss, optimizer) as scaled_loss: scaled_loss.backward() else: loss.backward() tr_loss += loss.item() nb_tr_examples += input_ids.size(0) nb_tr_steps += 1 if (step + 1) % args.gradient_accumulation_steps == 0: if args.fp16: # modify learning rate with special warm up for BERT # which FusedAdam doesn't do scheduler.step() optimizer.step() optimizer.zero_grad() global_step += 1 latency_train = time.perf_counter() - tic_train tr_loss = tr_loss / nb_tr_steps results.update({ 'global_step': global_step, 'train:loss': tr_loss, 'train:latency': latency_train, 'train:num_samples_per_gpu': nb_tr_examples, 'train:num_steps': nb_tr_steps, 'train:throughput': get_world_size() * nb_tr_examples / latency_train, }) if is_main_process() and not args.skip_checkpoint: model_to_save = model.module if hasattr(model, 'module') else model torch.save( {"model": model_to_save.state_dict()}, os.path.join(args.output_dir, modeling.WEIGHTS_NAME), ) with open( os.path.join(args.output_dir, modeling.CONFIG_NAME), 'w', ) as f: f.write(model_to_save.config.to_json_string()) if (args.do_eval or args.do_predict) and is_main_process(): eval_examples = processor.get_dev_examples(args.data_dir) eval_features, label_map = convert_examples_to_features( eval_examples, processor.get_labels(), args.max_seq_length, tokenizer, ) logger.info("***** Running evaluation *****") logger.info(" Num examples = %d", len(eval_examples)) logger.info(" Batch size = %d", args.eval_batch_size) eval_data = gen_tensor_dataset(eval_features) # Run prediction for full data eval_sampler = SequentialSampler(eval_data) eval_dataloader = DataLoader( eval_data, sampler=eval_sampler, batch_size=args.eval_batch_size, ) model.eval() preds = None out_label_ids = None eval_loss = 0 nb_eval_steps, nb_eval_examples = 0, 0 cuda_events = [(torch.cuda.Event(enable_timing=True), torch.cuda.Event(enable_timing=True)) for _ in range(len(eval_dataloader))] for i, (input_ids, input_mask, segment_ids, label_ids) in tqdm( enumerate(eval_dataloader), desc="Evaluating", ): input_ids = input_ids.to(device) input_mask = input_mask.to(device) segment_ids = segment_ids.to(device) label_ids = label_ids.to(device) with torch.no_grad(): cuda_events[i][0].record() logits = model(input_ids, segment_ids, input_mask) cuda_events[i][1].record() if args.do_eval: eval_loss += loss_fct( logits.view(-1, num_labels), label_ids.view(-1), ).mean().item() nb_eval_steps += 1 nb_eval_examples += input_ids.size(0) if preds is None: preds = logits.detach().cpu().numpy() out_label_ids = label_ids.detach().cpu().numpy() else: preds = np.append(preds, logits.detach().cpu().numpy(), axis=0) out_label_ids = np.append( out_label_ids, label_ids.detach().cpu().numpy(), axis=0, ) torch.cuda.synchronize() eval_latencies = [ event_start.elapsed_time(event_end) for event_start, event_end in cuda_events ] eval_latencies = list(sorted(eval_latencies)) def infer_latency_sli(threshold): index = int(len(eval_latencies) * threshold) - 1 index = min(max(index, 0), len(eval_latencies) - 1) return eval_latencies[index] eval_throughput = (args.eval_batch_size / (np.mean(eval_latencies) / 1000)) results.update({ 'eval:num_samples_per_gpu': nb_eval_examples, 'eval:num_steps': nb_eval_steps, 'infer:latency(ms):50%': infer_latency_sli(0.5), 'infer:latency(ms):90%': infer_latency_sli(0.9), 'infer:latency(ms):95%': infer_latency_sli(0.95), 'infer:latency(ms):99%': infer_latency_sli(0.99), 'infer:latency(ms):100%': infer_latency_sli(1.0), 'infer:latency(ms):avg': np.mean(eval_latencies), 'infer:latency(ms):std': np.std(eval_latencies), 'infer:latency(ms):sum': np.sum(eval_latencies), 'infer:throughput(samples/s):avg': eval_throughput, }) preds = np.argmax(preds, axis=1) if args.do_predict: dump_predictions( os.path.join(args.output_dir, 'predictions.json'), label_map, preds, eval_examples, ) if args.do_eval: results['eval:loss'] = eval_loss / nb_eval_steps eval_result = compute_metrics(args.task_name, preds, out_label_ids) results.update(eval_result) if is_main_process(): logger.info("***** Results *****") for key in sorted(results.keys()): logger.info(" %s = %s", key, str(results[key])) with open(os.path.join(args.output_dir, "results.txt"), "w") as writer: json.dump(results, writer) dllogger_queries_from_results = { 'exact_match': 'acc', 'F1': 'f1', 'e2e_train_time': 'train:latency', 'training_sequences_per_second': 'train:throughput', 'e2e_inference_time': ('infer:latency(ms):sum', lambda x: x / 1000), 'inference_sequences_per_second': 'infer:throughput(samples/s):avg', } for key, query in dllogger_queries_from_results.items(): results_key, convert = (query if isinstance(query, tuple) else (query, lambda x: x)) if results_key not in results: continue dllogger.log( step=tuple(), data={key: convert(results[results_key])}, ) dllogger.flush() return results
def main(): parser = argparse.ArgumentParser() ## Required parameters parser.add_argument( "--bert_model", default=None, type=str, required=True, help="Bert pre-trained model selected in the list: bert-base-uncased, " "bert-large-uncased, bert-base-cased, bert-large-cased, bert-base-multilingual-uncased, " "bert-base-multilingual-cased, bert-base-chinese.") parser.add_argument("--init_checkpoint", default=None, type=str, required=True, help="The checkpoint file from pretraining") ## Other parameters parser.add_argument( "--verbose_logging", action='store_true', help= "If true, all of the warnings related to data processing will be printed. " ) parser.add_argument("--seed", default=1, type=int) parser.add_argument( "--question", default= "Most antibiotics target bacteria and don't affect what class of organisms? ", type=str, help="question") parser.add_argument( "--context", default= "Within the genitourinary and gastrointestinal tracts, commensal flora serve as biological barriers by competing with pathogenic bacteria for food and space and, in some cases, by changing the conditions in their environment, such as pH or available iron. This reduces the probability that pathogens will reach sufficient numbers to cause illness. However, since most antibiotics non-specifically target bacteria and do not affect fungi, oral antibiotics can lead to an overgrowth of fungi and cause conditions such as a vaginal candidiasis (a yeast infection). There is good evidence that re-introduction of probiotic flora, such as pure cultures of the lactobacilli normally found in unpasteurized yogurt, helps restore a healthy balance of microbial populations in intestinal infections in children and encouraging preliminary data in studies on bacterial gastroenteritis, inflammatory bowel diseases, urinary tract infection and post-surgical infections. ", type=str, help="context") parser.add_argument( "--max_seq_length", default=384, type=int, help= "The maximum total input sequence length after WordPiece tokenization. Sequences " "longer than this will be truncated, and sequences shorter than this will be padded." ) parser.add_argument( "--max_query_length", default=64, type=int, help= "The maximum number of tokens for the question. Questions longer than this will " "be truncated to this length.") parser.add_argument( "--n_best_size", default=1, type=int, help="The total number of n-best predictions to generate. ") parser.add_argument( "--max_answer_length", default=30, type=int, help= "The maximum length of an answer that can be generated. This is needed because the start " "and end predictions are not conditioned on one another.") parser.add_argument("--no_cuda", action='store_true', help="Whether not to use CUDA when available") parser.add_argument( "--do_lower_case", action='store_true', help= "Whether to lower case the input text. True for uncased models, False for cased models." ) parser.add_argument( '--version_2_with_negative', action='store_true', help='If true, then the model can reply with "unknown". ') parser.add_argument( '--null_score_diff_threshold', type=float, default=-11.0, help= "If null_score - best_non_null is greater than the threshold predict 'unknown'. " ) parser.add_argument( '--vocab_file', type=str, default=None, required=True, help="Vocabulary mapping/file BERT was pretrainined on") parser.add_argument("--config_file", default=None, type=str, required=True, help="The BERT model config") parser.add_argument('--fp16', action='store_true', help="use mixed-precision") parser.add_argument("--local_rank", default=-1, help="ordinal of the GPU to use") args = parser.parse_args() random.seed(args.seed) np.random.seed(args.seed) torch.manual_seed(args.seed) torch.cuda.manual_seed(args.seed) if args.local_rank == -1 or args.no_cuda: device = torch.device("cuda" if torch.cuda.is_available() and not args.no_cuda else "cpu") else: torch.cuda.set_device(args.local_rank) device = torch.device("cuda", args.local_rank) tokenizer = BertTokenizer(args.vocab_file, do_lower_case=args.do_lower_case, max_len=512) # for bert large # Prepare model config = BertConfig.from_json_file(args.config_file) # Padding for divisibility by 8 if config.vocab_size % 8 != 0: config.vocab_size += 8 - (config.vocab_size % 8) # initialize model model = BertForQuestionAnswering(config) model.load_state_dict( torch.load(args.init_checkpoint, map_location='cpu')["model"]) model.to(device) if args.fp16: model.half() model.eval() print("question: ", args.question) print("context: ", args.context) print() # preprocessing doc_tokens = args.context.split() query_tokens = tokenizer.tokenize(args.question) feature = preprocess_tokenized_text(doc_tokens, query_tokens, tokenizer, max_seq_length=args.max_seq_length, max_query_length=args.max_query_length) tensors_for_inference, tokens_for_postprocessing = feature input_ids = torch.tensor(tensors_for_inference.input_ids, dtype=torch.long).unsqueeze(0) segment_ids = torch.tensor(tensors_for_inference.segment_ids, dtype=torch.long).unsqueeze(0) input_mask = torch.tensor(tensors_for_inference.input_mask, dtype=torch.long).unsqueeze(0) # load tensors to device input_ids = input_ids.to(device) input_mask = input_mask.to(device) segment_ids = segment_ids.to(device) # run prediction with torch.no_grad(): start_logits, end_logits = model(input_ids, segment_ids, input_mask) # post-processing start_logits = start_logits[0].detach().cpu().tolist() end_logits = end_logits[0].detach().cpu().tolist() answer, answers = get_answer(doc_tokens, tokens_for_postprocessing, start_logits, end_logits, args) # print result print() print(answer) print() print(json.dumps(answers, indent=4))
def get_dataloader_fn( precision: str = 'fp32', batch_size: int = 8, vocab_file: str = "", do_lower_case: bool = True, predict_file: str = "", max_len: int = 512, max_seq_length: int = 384, doc_stride: int = 128, max_query_length: int = 64, version_2_with_negative: bool = False, pad_to_batch_size: bool = True, ): # Preprocess input data tokenizer = BertTokenizer(vocab_file, do_lower_case=do_lower_case, max_len=max_len) eval_examples = read_squad_examples( input_file=predict_file, is_training=False, version_2_with_negative=version_2_with_negative) eval_features = convert_examples_to_features( examples=eval_examples, tokenizer=tokenizer, max_seq_length=max_seq_length, doc_stride=doc_stride, max_query_length=max_query_length, is_training=False, ) # get inputs all_unique_ids = [f.unique_id for f in eval_features] all_input_ids = [f.input_ids for f in eval_features] all_input_mask = [f.input_mask for f in eval_features] all_segment_ids = [f.segment_ids for f in eval_features] if pad_to_batch_size: # each batch should have a fixed size f = eval_features[-1] padding = batch_size - (len(all_unique_ids) % batch_size) all_unique_ids += [f.unique_id for _ in range(padding)] all_input_ids += [f.input_ids for _ in range(padding)] all_input_mask += [f.input_mask for _ in range(padding)] all_segment_ids += [f.segment_ids for _ in range(padding)] all_unique_ids = torch.tensor(all_unique_ids, dtype=torch.int32, requires_grad=False) all_input_ids = torch.tensor(all_input_ids, dtype=torch.int32, requires_grad=False) all_input_mask = torch.tensor(all_input_mask, dtype=torch.int32, requires_grad=False) all_segment_ids = torch.tensor(all_segment_ids, dtype=torch.int32, requires_grad=False) eval_data = torch.utils.data.TensorDataset(all_unique_ids, all_input_ids, all_input_mask, all_segment_ids) eval_sampler = torch.utils.data.SequentialSampler(eval_data) eval_dataloader = torch.utils.data.DataLoader( eval_data, sampler=eval_sampler, batch_size=batch_size, shuffle=False, num_workers=0, ) dtype = {'fp32': np.float32, 'fp16': np.float16} dtype = dtype[precision] def _get_dataloader(): """return dataloader for inference""" for unique_id, input_ids, input_mask, segment_ids in eval_dataloader: unique_id = unique_id.cpu().numpy() input_ids = input_ids.cpu().numpy() input_mask = input_mask.cpu().numpy() segment_ids = segment_ids.cpu().numpy() x = { "input__0": input_ids, "input__1": segment_ids, "input__2": input_mask } y_real = { "output__0": np.zeros([batch_size, max_seq_length], dtype=dtype), "output__1": np.zeros([batch_size, max_seq_length], dtype=dtype), } yield (unique_id, x, y_real) return _get_dataloader
from tokenization import BertTokenizer import random from create_pretraining_data import create_training_instances, write_instance_to_example_file from glob import glob # line='The switches between clarity and intoxication gave me a headache, but at least the silver-haired faery’s explanation of the queens’ “gifts” helped me understand why I could want to wrap my legs around a creature who terrified me.' vocab_file = '/workspace/bert/download/google_pretrained_weights/uncased_L-24_H-1024_A-16/vocab.txt' base_dir = '/home/haris/share/git/DeepLearningExamples/PyTorch/LanguageModeling/BERT/sharded_training_shards_256_test_shards_256_fraction_0.2/books_wiki_en_corpus/' random_seed = 123 tokenizer = BertTokenizer(vocab_file) rng = random.Random(random_seed) max_seq_length = 128 dupe_factor = 5 short_seq_prob = 0.1 masked_lm_prob = 0.15 max_predictions_per_seq = 20 # tokens=tokenizer.tokenize(line) # input_files=glob(base_dir+'*.txt') input_files = [ '/home/haris/share/git/DeepLearningExamples/PyTorch/LanguageModeling/BERT/test_file.txt' ] output_file = '/home/haris/share/git/DeepLearningExamples/PyTorch/LanguageModeling/BERT/test_file.h5' instances = create_training_instances(input_files, tokenizer, max_seq_length, dupe_factor, short_seq_prob, masked_lm_prob, max_predictions_per_seq, rng) write_instance_to_example_file(instances, tokenizer, max_seq_length, max_predictions_per_seq, output_file)
from tokenization import BertTokenizer from spacy.gold import align import spacy import os from tqdm import tqdm # from spacy.tokenizer import Tokenizer nlp = spacy.load('en_core_web_lg') tokenizer_bert=BertTokenizer() # nlp.tokenier=my_basic_tokenizer # tokenizer_spacy= Tokenizer(nlp.vocab) base_dir='/home/haris/share/git/DeepLearningExamples/PyTorch/LanguageModeling/BERT/sharded_training_shards_256_test_shards_256_fraction_0.2/books_wiki_en_corpus/' # def my_basic_tokenizer(text): # bert_tokens=tokenizer_bert.tokenize(text) # return Doc(nlp.vocab,words=bert_tokens) def test_file(file_number): filename='books_wiki_en_corpus_'+'training_'+str(file_number)+'.txt' file=os.path.join(base_dir,filename) total_diffs=0 with open(file) as f: text=f.readlines() for line in text: # spacy_doc=nlp(line.lower()) # spacy_tokens=[str(token) for token in spacy_doc]