def setup_model_and_optimizer(args, model_type=None, multi_token=True, num_labels=None, spell_length=None):
    """Build the model and, when training is configured, its optimizer and LR scheduler.

    Returns:
        A ``(model, optimizer, lr_scheduler)`` tuple. ``optimizer`` and
        ``lr_scheduler`` are both ``None`` when the training condition below
        does not hold.
    """
    model = get_model(args, model_type=model_type, multi_token=multi_token,
                      num_labels=num_labels, spell_length=spell_length)
    param_groups = get_optimizer_param_groups(model)

    # NOTE(review): `and` binds tighter than `or`, so the epochs/train_iters
    # check only gates the `data_dir` case; a non-None `train_data` always
    # enables the optimizer. The parentheses spell out how Python evaluates
    # the expression -- confirm this grouping is the intended one.
    training_configured = args.train_data is not None or (
        args.data_dir is not None and (args.epochs > 0 or args.train_iters > 0))
    if not training_configured:
        return model, None, None

    if args.deepspeed:
        print_rank_0("DeepSpeed is enabled.")
        # deepspeed.initialize hands back the model and optimizer to use from here on.
        model, optimizer, _, _ = deepspeed.initialize(
            model=model,
            model_parameters=param_groups,
            args=args,
            mpu=mpu,
            dist_init_required=False)
    else:
        optimizer = get_optimizer(param_groups, args)
    lr_scheduler = get_learning_rate_scheduler(optimizer, args)
    return model, optimizer, lr_scheduler
def prepare_tokenizer(args):
    """Create the tokenizer and store the padded vocabulary size on ``args``.

    The vocabulary size is rounded up to a multiple of
    ``args.make_vocab_size_divisible_by * model_parallel_world_size``.
    Sets ``args.tokenizer_num_tokens``, ``args.tokenizer_num_type_tokens``,
    ``args.eod_token`` and ``args.vocab_size``; returns the tokenizer.
    """
    tokenizer = make_tokenizer(
        tokenizer_type=args.tokenizer_type,
        corpus=None,
        model_path=args.tokenizer_path,
        vocab_size=args.vocab_size,
        model_type=args.tokenizer_model_type,
        cache_dir=args.cache_dir)

    original_size = tokenizer.num_tokens
    divisor = args.make_vocab_size_divisible_by * mpu.get_model_parallel_world_size()

    # Grow the size one token at a time until it divides evenly.
    padded_size = original_size
    while padded_size % divisor:
        padded_size += 1
    print_rank_0('> padded vocab (size: {}) with {} dummy '
                 'tokens (new size: {})'.format(
                     original_size, padded_size - original_size, padded_size))

    args.tokenizer_num_tokens = padded_size
    args.tokenizer_num_type_tokens = tokenizer.num_type_tokens
    args.eod_token = tokenizer.get_command('eos').Id
    args.vocab_size = padded_size
    print("prepare tokenizer done", flush=True)
    return tokenizer
def evaluate(data_iterator, model, args, timers, verbose=False):
    """Run ``args.eval_iters`` forward steps and return the mean LM loss.

    The model is put into eval mode for the duration and switched back to
    train mode before returning.
    """
    model.eval()

    loss_sum = 0
    with torch.no_grad():
        for step in range(1, args.eval_iters + 1):
            if verbose and step % args.log_interval == 0:
                print_rank_0('Evaluating iter {}/{}'.format(step, args.eval_iters))

            lm_loss = forward_step(data_iterator, model, args, timers)

            # Average the loss over all processes when the model is DDP-wrapped.
            if isinstance(model, args.DDP_type):
                reduced = lm_loss.view(1)
                torch.distributed.all_reduce(reduced.data)
                reduced.data = reduced.data / args.world_size
                lm_loss = reduced[0]

            loss_sum += lm_loss.data.detach().float().item()

    model.train()
    return loss_sum / args.eval_iters
def __init__(self, args, task_name, data_dir, seq_length, split, tokenizer, for_train=False,
             pattern_ensemble=False, pattern_text=False):
    """Load the examples of ``split`` for ``task_name`` and set up its PVP(s).

    Raises:
        ValueError: if ``split`` is not one of the known split constants.
    """
    processor = PROCESSORS[task_name](args)
    self.processor = processor
    args.variable_num_choices = processor.variable_num_choices
    print_rank_0(
        f"Creating {task_name} dataset from file at {data_dir} (split={split})"
    )
    self.dataset_name = f"{task_name}-{split}"
    self.cloze_eval = args.cloze_eval
    self.seq_length = seq_length
    self.tokenizer = tokenizer
    self.pattern_ensemble = pattern_ensemble
    self.pattern_text = pattern_text
    if pattern_text:
        assert self.cloze_eval, "Labeled examples only exist in cloze evaluation"
    self.args = args

    # Pick the loader that matches the requested split.
    if split == DEV_SET:
        example_list = processor.get_dev_examples(data_dir, for_train=for_train)
    elif split == TEST_SET:
        example_list = processor.get_test_examples(data_dir)
    elif split == TRUE_DEV_SET:
        example_list = processor.get_true_dev_examples(data_dir)
    elif split == TRAIN_SET:
        # Only the "wsc" task forwards the cloze flag to its train loader.
        if task_name == "wsc":
            example_list = processor.get_train_examples(data_dir, cloze_eval=args.cloze_eval)
        else:
            example_list = processor.get_train_examples(data_dir)
    elif split == UNLABELED_SET:
        example_list = processor.get_unlabeled_examples(data_dir)
        # Unlabeled examples are given the first label as a placeholder.
        for example in example_list:
            example.label = processor.get_labels()[0]
    else:
        raise ValueError(f"'split' must be one of {SPLIT_TYPES}, got '{split}' instead")

    self.labeled = split != TEST_SET

    label_distribution = Counter(example.label for example in example_list)
    print_rank_0(
        f"Returning {len(example_list)} {split} examples with label dist.: {list(label_distribution.items())}")
    self.samples = []
    example_list.sort(key=lambda x: x.num_choices)
    self.example_list = example_list

    def build_pvp(pattern_id):
        # Every PVP shares the same configuration apart from its pattern id.
        return PVPS[task_name](args, tokenizer, processor.get_labels(), seq_length,
                               pattern_id=pattern_id,
                               num_prompt_tokens=args.num_prompt_tokens,
                               is_multi_token=args.multi_token,
                               max_segment_length=args.segment_length,
                               fast_decode=args.fast_decode, split=split)

    if self.cloze_eval:
        if self.pattern_ensemble:
            self.pvps = [build_pvp(pattern_id)
                         for pattern_id in PVPS[task_name].available_patterns()]
        else:
            self.pvp = build_pvp(args.pattern_id)
    self.examples = {example.guid: example for example in example_list}
def setup_model_and_optimizer(args, config, need_optim=False, ckpt_path=None, do_fp16=False):
    """Build the model, optionally its optimizer/scheduler, and load a checkpoint.

    Returns:
        ``(model, optimizer, lr_scheduler, iteration)``. ``iteration`` is 0
        unless ``ckpt_path`` is given, in which case it is whatever
        ``load_checkpoint`` returns.
    """
    model = get_model(args, config, do_fp16=do_fp16)

    if need_optim:
        optimizer = get_optimizer(model, args, do_fp16=do_fp16)
        lr_scheduler = get_learning_rate_scheduler(optimizer, args)
    else:
        optimizer = None
        lr_scheduler = None

    if args.deepspeed:
        print_rank_0("DeepSpeed is enabled.")
        model, optimizer, _, lr_scheduler = deepspeed.initialize(
            model=model,
            optimizer=optimizer,
            args=args,
            lr_scheduler=lr_scheduler,
            mpu=mpu,
            dist_init_required=False)

    if ckpt_path is None:
        iteration = 0
    else:
        iteration = load_checkpoint(ckpt_path, model, optimizer, lr_scheduler, args)
    return model, optimizer, lr_scheduler, iteration
def _create_examples(path: str, set_type: str) -> List[InputExample]:
    """Read a JSON-lines file into examples, adding mirrored copies for some sets.

    For the ``'train'`` and ``'unlabeled'`` set types, every example is
    duplicated with its two choices swapped and its label flipped (the mirror
    label is 1 when the original label equals 0, otherwise 0).
    """
    examples = []
    with open(path, encoding='utf8') as f:
        for line in f:
            record = json.loads(line)
            idx = record['idx']
            examples.append(InputExample(
                guid="%s-%s" % (set_type, idx),
                text_a=record['premise'],
                label=record['label'] if 'label' in record else None,
                meta={
                    'choice1': record['choice1'],
                    'choice2': record['choice2'],
                    'question': record['question']
                },
                idx=idx))

    if set_type in ('train', 'unlabeled'):
        mirror_examples = [
            InputExample(
                guid=ex.guid + 'm',
                text_a=ex.text_a,
                label=1 if ex.label == 0 else 0,
                meta={
                    'choice1': ex.meta['choice2'],
                    'choice2': ex.meta['choice1'],
                    'question': ex.meta['question']
                })
            for ex in examples
        ]
        examples += mirror_examples
        print_rank_0(f"Added {len(mirror_examples)} mirror examples, total size is {len(examples)}...")
    return examples
def _create_examples(path: str, set_type: str) -> List[InputExample]:
    """Read a JSON-lines file of passages into one example per candidate answer.

    Each line holds a passage with nested questions and answers; every
    (passage, question, answer) triple becomes one ``InputExample`` with the
    passage as ``text_a`` and the question as ``text_b``.
    """
    examples = []
    with open(path, encoding='utf8') as f:
        for line in f:
            record = json.loads(line)
            passage_idx = record['idx']
            passage = record['passage']
            text = punctuation_standardization(passage['text'])
            for question_json in passage['questions']:
                question = punctuation_standardization(question_json["question"])
                question_idx = question_json['idx']
                for answer_json in question_json["answers"]:
                    answer_idx = answer_json["idx"]
                    examples.append(InputExample(
                        guid=f'{set_type}-p{passage_idx}-q{question_idx}-a{answer_idx}',
                        text_a=text,
                        text_b=question,
                        label=answer_json["label"] if 'label' in answer_json else None,
                        meta={
                            'passage_idx': passage_idx,
                            'question_idx': question_idx,
                            'answer_idx': answer_idx,
                            'answer': punctuation_standardization(answer_json["text"])
                        },
                        idx=[passage_idx, question_idx, answer_idx]))

    # Counts distinct 'question_idx' values across all examples.
    num_questions = len({ex.meta['question_idx'] for ex in examples})
    label_distribution = Counter(ex.label for ex in examples)
    print_rank_0(
        f"Returning {len(examples)} examples corresponding to {num_questions} questions with label "
        f"distribution {list(label_distribution.items())}")
    return examples
def setup_model_and_optimizer_C(args, model_cls=GPT2Model_C):
    """Build the model, optimizer and LR scheduler, then restore a checkpoint if asked.

    ``args.iteration`` is set to the value returned by ``load_checkpoint``
    when ``args.load`` is given, and to 0 otherwise.
    """
    model = get_model_C(args, model_cls)
    optimizer = get_optimizer(model, args)
    lr_scheduler = get_learning_rate_scheduler(optimizer, args)

    if args.deepspeed:
        print_rank_0("DeepSpeed is enabled.")
        model, optimizer, _, lr_scheduler = deepspeed.initialize(
            model=model,
            optimizer=optimizer,
            args=args,
            lr_scheduler=lr_scheduler,
            mpu=mpu,
            dist_init_required=False
        )

    if args.load is None:
        args.iteration = 0
    else:
        args.iteration = load_checkpoint(model, optimizer, lr_scheduler, args)
    return model, optimizer, lr_scheduler
def get_model(args):
    """Construct the GPT2 model, move it to the current CUDA device and wrap it in DDP."""
    print_rank_0('building GPT2 model ...')
    model = GPT2Model(
        num_layers=args.num_layers,
        vocab_size=args.vocab_size,
        hidden_size=args.hidden_size,
        num_attention_heads=args.num_attention_heads,
        embedding_dropout_prob=args.hidden_dropout,
        attention_dropout_prob=args.attention_dropout,
        output_dropout_prob=args.hidden_dropout,
        max_sequence_length=args.max_position_embeddings,
        checkpoint_activations=args.checkpoint_activations,
        checkpoint_num_layers=args.checkpoint_num_layers,
        parallel_output=False)

    # Report the parameter count once per model-parallel rank.
    if mpu.get_data_parallel_rank() == 0:
        report = ' > number of parameters on model parallel rank {}: {}'.format(
            mpu.get_model_parallel_rank(),
            sum(param.nelement() for param in model.parameters()))
        print(report, flush=True)

    model.cuda(torch.cuda.current_device())

    if args.fp16:
        model = FP16_Module(model)

    return DDP(model)
def prepare_tokenizer(args):
    """Create the tokenizer and broadcast the padded vocab size and EOD token id.

    Model-parallel rank 0 computes both values; they are then broadcast over
    the model-parallel group and written to ``args.vocab_size`` and
    ``args.eod_token``. Returns the tokenizer.
    """
    add_sentinel_token = args.max_position_embeddings if args.sentinel_token else 0
    tokenizer = make_tokenizer(
        args.tokenizer_type, None, args.tokenizer_path, args.vocab_size,
        args.tokenizer_model_type,
        add_block_symbols=args.block_lm,
        cache_dir=args.cache_dir,
        add_sentinel_token=add_sentinel_token,
        add_task_mask=args.task_mask,
        add_decoder_mask=args.block_mask_prob > 0.0 or args.context_mask_ratio > 0.0,
        fix_command_token=args.fix_command_token)

    if mpu.get_model_parallel_rank() == 0:
        original_size = tokenizer.num_tokens
        eod_token = tokenizer.get_command('eos').Id
        assert eod_token == tokenizer.get_command('pad').Id

        # Round the vocabulary size up to the requested multiple.
        divisor = args.make_vocab_size_divisible_by
        padded_size = original_size
        while padded_size % divisor:
            padded_size += 1
        print_rank_0('> padded vocab (size: {}) with {} dummy '
                     'tokens (new size: {})'.format(
                         original_size, padded_size - original_size, padded_size))
        print_rank_0('> found end-of-document token: {}'.format(eod_token))
        token_counts = torch.cuda.LongTensor([padded_size, eod_token])
    else:
        # Placeholder that the broadcast below overwrites.
        token_counts = torch.cuda.LongTensor([0, 0])

    torch.distributed.broadcast(token_counts,
                                mpu.get_model_parallel_src_rank(),
                                group=mpu.get_model_parallel_group())
    args.vocab_size = token_counts[0].item()
    args.eod_token = token_counts[1].item()
    return tokenizer
def build_lambada_dataset(tokenizer, args):
    """Create the LAMBADA validation dataset and log its size.

    Requires ``args.valid_data`` to hold exactly one entry.
    """
    assert len(args.valid_data) == 1
    val_dataset = LambadaDataset(args, tokenizer, strict=True)
    num_samples = len(val_dataset)
    num_label_tokens = sum(len(label) for label in val_dataset.labels)
    print_rank_0(' > found {} samples, {} label tokens.'.format(
        num_samples, num_label_tokens))
    return val_dataset
def __init__(self, args, split, tokenizer):
    """Select the processor for ``args.task`` and load the examples of ``split``.

    Raises:
        NotImplementedError: if the lower-cased task name is not supported.
    """
    self.args = args
    self.split = split
    self.tokenizer = tokenizer
    self.dataset_name = split
    self.task, self.data_dir = args.task.lower(), args.data_dir
    self.max_src_length, self.max_tgt_length = args.src_seq_length, args.tgt_seq_length

    task, data_dir = self.task, self.data_dir
    if task in ("gigaword", "cnn_dm", "cnn_dm_original"):
        processor = SummmaryProcessor(task, data_dir, tokenizer)
    elif task == "xsum":
        processor = XSumProcessor(data_dir, tokenizer)
    elif task == "squad_generation":
        processor = SQuADGenerationProcessor(data_dir, tokenizer)
    elif task in ("squad", "squad_v1"):
        processor = SQuADProcessor(data_dir, tokenizer, self.max_src_length, args)
    elif task == 'cmrc':
        processor = CMRCProcessor(data_dir, tokenizer)
    else:
        raise NotImplementedError(task)
    self.processor = processor

    self.example_list = processor.create_examples(split)
    # Index the examples by guid.
    self.examples = {example.guid: example for example in self.example_list}
    print_rank_0(f"Return {len(self.examples)} {split} examples")
def evaluate(model, dataloader, eval_metric, args):
    """Accumulate ``eval_metric`` over ``dataloader``.

    Returns:
        ``({eval_metric: total_output}, total_count)`` where both totals are
        summed over batches after an all-reduce across the data-parallel group.
    """
    model.eval()

    total_output, total_count, total_tokens = 0.0, 0, 0
    with torch.no_grad():
        for step, batch in enumerate(dataloader, start=1):
            if step % args.log_interval == 0:
                print_rank_0('> working on iteration: {}'.format(step - 1))

            output, _, _ = lm_forward_step(batch, model, args, None, [], eval_metric=eval_metric)
            # Batch size goes into a tensor so it can be all-reduced too.
            count = torch.cuda.LongTensor([batch['text'].size(0)])

            torch.distributed.all_reduce(output, group=mpu.get_data_parallel_group())
            torch.distributed.all_reduce(count, group=mpu.get_data_parallel_group())

            total_output += output.item()
            total_count += count.item()
            total_tokens += batch['loss_mask'].sum().item()
    # Local (non-reduced) sum of the loss mask; printed by every process.
    print(total_tokens)
    return {eval_metric: total_output}, total_count
def get_model(args, version=None):
    """Build the BERT mixture model, move it to the GPU and wrap it for distributed use.

    Args:
        args: run configuration; supplies model sizes, dropout rates, fp16 and
            deepspeed flags.
        version: ``None`` selects ``BertMixtureModel``; ``"v0"`` selects
            ``BertMixtureModel_v0``.

    Returns:
        The model wrapped in ``DDP`` (and in ``FP16_Module`` when ``args.fp16``).

    Raises:
        ValueError: if ``version`` is neither ``None`` nor ``"v0"``. Previously
            such a value left ``model`` unbound and failed later with an
            ``UnboundLocalError``.
    """
    print_rank_0('building Bert model ...')

    # Both variants take identical constructor arguments; only the class differs.
    if version is None:
        model_cls = BertMixtureModel
    elif version == "v0":
        model_cls = BertMixtureModel_v0
    else:
        raise ValueError(
            "unknown model version {!r}; expected None or 'v0'".format(version))

    model = model_cls(num_layers=args.num_layers,
                      vocab_size=args.vocab_size,
                      hidden_size=args.hidden_size,
                      num_attention_heads=args.num_attention_heads,
                      embedding_dropout_prob=args.hidden_dropout,
                      attention_dropout_prob=args.attention_dropout,
                      output_dropout_prob=args.hidden_dropout,
                      layernorm_epsilon=args.layernorm_epsilon,
                      max_sequence_length=args.max_position_embeddings,
                      checkpoint_activations=args.checkpoint_activations,
                      checkpoint_num_layers=args.checkpoint_num_layers,
                      parallel_output=True,
                      num_experts=args.num_experts,
                      type_vocab_size=2)

    if mpu.get_data_parallel_rank() == 0:
        print(' > number of parameters on model parallel rank {}: {}'.format(
            mpu.get_model_parallel_rank(),
            sum(p.nelement() for p in model.parameters())), flush=True)

    # To prevent OOM for model sizes that cannot fit in GPU memory in full
    # precision, convert to half before moving to the GPU.
    if args.deepspeed and args.fp16:
        model.half()

    # GPU allocation.
    model.cuda(torch.cuda.current_device())

    # Fp16 conversion.
    if args.fp16:
        model = FP16_Module(model)

    # Wrap model for distributed training.
    if USE_TORCH_DDP:
        device = torch.cuda.current_device()
        model = DDP(model, device_ids=[device], output_device=device,
                    process_group=mpu.get_data_parallel_group())
    else:
        model = DDP(model)
    return model
def init_indices(self):
    """Compute cumulative document lengths and the number of samples.

    Sets ``self.indices`` to the running sum of per-document lengths and
    ``self.num_samples`` to ``total_length // self.max_seq_len + 1``.
    """
    def doc_length(doc):
        # Dict documents count prompt plus text; anything else uses its own length.
        if isinstance(doc, dict):
            return len(doc['prompt']) + len(doc['text'])
        return len(doc)

    if self.is_lazy:
        lens = np.array([self.ds.get_text_len(idx) for idx in range(len(self.ds))])
    else:
        lens = np.array([doc_length(doc) for doc in self.ds])

    self.indices = list(accumulate(lens))
    total_tokens = self.indices[-1]
    print_rank_0(f"Dataset document count {len(lens)}, token count {total_tokens}")
    self.num_samples = total_tokens // self.max_seq_len + 1
def init_weighting(self):
    """Compute per-document lengths, their total, and their running sum.

    Sets ``self.total_len`` to the sum of the lengths and ``self.weighting``
    to their cumulative sum.
    """
    def doc_length(doc):
        # Dict documents are measured by their 'text' field only.
        return len(doc['text']) if isinstance(doc, dict) else len(doc)

    if self.is_lazy:
        lens = np.array([self.ds.get_text_len(idx) for idx in range(len(self.ds))])
    else:
        lens = np.array([doc_length(doc) for doc in self.ds])

    self.total_len = np.sum(lens)
    print_rank_0(
        f"Dataset document count {len(lens)}, token count {self.total_len}, non sentence start{self.non_sentence_start}")
    self.weighting = list(accumulate(lens))
def read_input_to_queue():
    """Put every line of every input path on the task queue, then the stop markers.

    One ``'STOP'`` item is enqueued per entry in ``processes``.
    """
    for path in paths:
        print_rank_0(f"Start reading {path}")
        with open(path) as file:
            for row in file:
                task_queue.put(row)
    print_rank_0("Read input complete")
    for _ in range(len(processes)):
        task_queue.put('STOP')
def evaluate(data_loader, model, args, timers, num_iterations=None):
    """Sum the LM loss over up to ``max_iters`` forward steps.

    When ``num_iterations`` is not given, the iteration count is
    ``len(data_loader)`` on model-parallel rank 0, broadcast to the rest of
    the model-parallel group. The loop ends early if ``forward_step``
    returns ``None``. The model is returned to train mode before returning.
    """
    model.eval()

    if num_iterations is None:
        # Only model-parallel rank 0 reads the loader length; others receive it.
        local_count = [len(data_loader)] if mpu.get_model_parallel_rank() == 0 else [0]
        max_iters_gpu = torch.cuda.LongTensor(local_count)
        torch.distributed.broadcast(max_iters_gpu,
                                    mpu.get_model_parallel_src_rank(),
                                    group=mpu.get_model_parallel_group())
        max_iters = max_iters_gpu[0].item()
    else:
        max_iters = num_iterations
    print_rank_0('global rank: {} | max iters: {}'.format(
        torch.distributed.get_rank(), max_iters))

    data_iterator = iter(data_loader) if data_loader is not None else None

    total_lm_loss = 0
    iteration = 0
    with torch.no_grad():
        while iteration < max_iters:
            if iteration % args.log_interval == 0:
                print_rank_0('global rank: {} | iteration: {}'.format(
                    torch.distributed.get_rank(), iteration))

            lm_loss = forward_step(data_iterator, model, args, timers)
            if lm_loss is None:
                break

            if isinstance(model, DDP):
                torch.distributed.all_reduce(lm_loss.data)
                divisor = args.world_size if args.cloze_eval else args.model_parallel_size
                lm_loss.data = lm_loss.data / divisor

            loss_value = lm_loss.data.detach().float().item()
            if args.cloze_eval:
                total_lm_loss += loss_value
            else:
                # Outside cloze evaluation each step's loss is divided by
                # (num_tokenized_tokens - 1).
                total_lm_loss += loss_value / (args.num_tokenized_tokens - 1)
            iteration += 1

    model.train()
    return total_lm_loss
def print_info(self, info):
    """Drain ``info`` without blocking and print the summed length per source.

    Each queue item is a mapping of source name to length; the totals are
    accumulated until the queue reports ``Empty``.
    """
    totals = defaultdict(int)
    try:
        while True:
            for source, length in info.get(block=False).items():
                totals[source] += length
    except Empty:
        # Queue exhausted: fall through and report what was collected.
        pass
    print_rank_0(totals)
def check_and_set_(self, cls_value, sd_value, name):
    """Choose between a value given to this instance and one from a checkpoint.

    Args:
        cls_value: value supplied to this scheduler instance.
        sd_value: value read from the checkpoint state dict.
        name: label of the setting, used only in log and assertion messages.

    Returns:
        ``cls_value`` when ``self.override_lr_scheduler`` is set, otherwise
        ``sd_value``.

    Raises:
        AssertionError: if neither ``override_lr_scheduler`` nor
            ``use_checkpoint_lr_scheduler`` is set and the two values differ.
    """
    if self.override_lr_scheduler:
        print_rank_0(' > overriding {} value to {}'.format(name, cls_value))
        return cls_value

    if not self.use_checkpoint_lr_scheduler:
        # Adjacent string literals are joined with no separator, so the space
        # between "value" and "and" must be part of the first literal.
        assert cls_value == sd_value, 'AnnealingLR: class input value ' \
            'and checkpoint values for {} do not match'.format(name)
    print_rank_0(' > using checkpoint value {} for {}'.format(sd_value, name))
    return sd_value
def __init__(self, args, split, tokenizer, for_train=False):
    """Load the examples of ``split`` for ``args.task`` and encode them into samples.

    Raises:
        ValueError: if ``split`` is not one of the known split constants.
    """
    task_name = args.task.lower()
    data_dir = args.data_dir
    processor = PROCESSORS[task_name](args)
    print_rank_0(
        f"Creating {task_name} dataset from file at {data_dir} (split={split})"
    )
    self.dataset_name = f"{task_name}-{split}"

    if split == DEV_SET:
        examples = processor.get_dev_examples(data_dir, for_train=for_train)
    elif split == TEST_SET:
        examples = processor.get_test_examples(data_dir)
    elif split == TRAIN_SET:
        examples = processor.get_train_examples(data_dir)
    elif split == UNLABELED_SET:
        examples = processor.get_unlabeled_examples(data_dir)
        # Unlabeled examples are given the first label as a placeholder.
        for example in examples:
            example.label = processor.get_labels()[0]
    else:
        raise ValueError(
            f"'split' must be one of {SPLIT_TYPES}, got '{split}' instead")

    self.labeled = split != TEST_SET

    label_distribution = Counter(example.label for example in examples)
    print_rank_0(
        f"Returning {len(examples)} {split} examples with label dist.: {list(label_distribution.items())}"
    )
    self.samples = []
    examples.sort(key=lambda x: x.num_choices)

    # Cloze evaluation encodes through a PVP; otherwise the processor encodes.
    if args.cloze_eval:
        pvp = PVPS[task_name](args, tokenizer, processor.get_labels(),
                              args.seq_length,
                              pattern_id=args.pattern_id,
                              fast_decode=args.fast_decode,
                              continuous_prompt=args.continuous_prompt)
        self.samples.extend(pvp.encode(example) for example in examples)
        print_rank_0(f"Truncate {pvp.num_truncated} examples")
    else:
        self.samples.extend(
            processor.encode(example, tokenizer, args) for example in examples)
        print_rank_0(f"Truncate {processor.num_truncated} examples")
    print_rank_0(f"Creating {len(self.samples)} samples")
    self.examples = {example.guid: example for example in examples}
def evaluate(data_iterator, student_model, teacher_model, args, timers, verbose=False):
    """Run ``args.eval_iters`` forward steps and return the mean of each loss.

    Returns:
        A ``defaultdict`` mapping each key of the per-step loss dict to its
        average over ``args.eval_iters`` iterations.
    """
    student_model.eval()
    if teacher_model is not None:
        teacher_model.eval()

    total_losses = defaultdict(int)
    with torch.no_grad():
        for step in range(args.eval_iters):
            if verbose and step % args.log_interval == 0:
                progress = 'Evaluating iter {}/{}'.format(step, args.eval_iters)
                print_rank_0(progress)
                save_rank_0(args, progress)

            losses = forward_step(data_iterator, student_model, teacher_model,
                                  args, timers)

            # When contiguous memory optimizations are enabled, the buffers
            # allocated by the optimizations are deallocated during backward
            # pass; in the absence of a backward pass the buffers should be
            # reset after each forward pass.
            if args.deepspeed and args.deepspeed_activation_checkpointing:
                deepspeed.checkpointing.reset()

            # Average every loss across processes when running under DDP.
            if isinstance(student_model, DDP):
                for key in losses:
                    torch.distributed.all_reduce(losses[key].data)
                    losses[key].data = losses[key].data / args.world_size

            for key in losses:
                total_losses[key] += losses[key].data.detach().float().item()

    student_model.train()

    for key in total_losses:
        total_losses[key] /= args.eval_iters
    return total_losses
def evaluate_and_print_results(prefix, data_iterator, model, args, writer, iteration, timers, verbose=False):
    """Evaluate, print the LM/NSP/total validation losses and log them to ``writer``.

    Returns the total validation loss (LM loss plus NSP loss).
    """
    lm_loss, nsp_loss = evaluate(data_iterator, model, args, timers, verbose)
    val_loss = lm_loss + nsp_loss

    print_rank_0('-' * 100)
    string = ''.join((
        ' validation loss at {} | '.format(prefix),
        'LM loss: {:.6E} | '.format(lm_loss),
        'NSP loss: {:.6E} | '.format(nsp_loss),
        'total loss: {:.6E}'.format(val_loss),
    ))
    rule = '-' * (len(string) + 1)
    print_rank_0(rule)
    print_rank_0(string)
    print_rank_0(rule)

    if writer and args.rank == 0:
        for tag, value in (('val_lm_loss', lm_loss),
                           ('val_nsp_loss', nsp_loss),
                           ('val_total_loss', val_loss)):
            writer.add_scalar(tag, value, iteration)
    return val_loss
def read_input_to_queue():
    """Feed the task queue from every input path, then enqueue the stop markers.

    With ``self.split_row`` set, each line of a file is one task; otherwise
    the file is parsed as JSON and each entry of its ``"RECORDS"`` list is
    one task. One ``'STOP'`` item is enqueued per entry in ``processes``.
    """
    for path in paths:
        print_rank_0(f"Start reading {path}")
        with open(path) as file:
            rows = file if self.split_row else json.load(file)["RECORDS"]
            for row in rows:
                task_queue.put(row)
    print_rank_0("Read input complete")
    for _ in range(len(processes)):
        task_queue.put('STOP')
def report_evaluate_metrics(summary_writer, prefix, loss, ppl, gpt_loss, bert_loss, sent_loss, multi_loss, step):
    """Print the validation metrics and, if a writer is given, log them as scalars.

    The GPT/BERT/Sent/Multi losses are reported only when they are non-zero.
    """
    # (print template, scalar tag, value) for each optional loss, in output order.
    optional_losses = (
        (' | GPT loss: {:.6E}', 'Train/valid_gpt_loss', gpt_loss),
        (' | BERT loss: {:.6E}', 'Train/valid_bert_loss', bert_loss),
        (' | Sent loss: {:.6E}', 'Train/valid_sent_loss', sent_loss),
        (' | Multi loss: {:.6E}', 'Train/valid_multi_loss', multi_loss),
    )
    reported = [entry for entry in optional_losses if entry[2] != 0]

    pieces = [
        ' validation loss at {}'.format(prefix),
        ' | LM loss: {:.6E}'.format(loss),
        ' | LM PPL: {:.6E}'.format(ppl),
    ]
    pieces.extend(template.format(value) for template, _, value in reported)
    string = ''.join(pieces)

    rule = '-' * (len(string) + 1)
    print_rank_0('-' * 100)
    print_rank_0(rule)
    print_rank_0(string)
    print_rank_0(rule)

    if summary_writer is None:
        return
    summary_writer.add_scalar('Train/valid_ppl', ppl, step)
    summary_writer.add_scalar('Train/valid_loss', loss, step)
    for _, tag, value in reported:
        summary_writer.add_scalar(tag, value, step)
def evaluate_and_print_results(prefix, data_iterator, model, args, timers, verbose=False, writer=None, iteration=0):
    """Evaluate, log loss and perplexity to ``writer`` on rank 0, and print them.

    Perplexity is ``exp(min(20, lm_loss))``. Returns the LM loss.
    """
    lm_loss = evaluate(data_iterator, model, args, timers, verbose)
    lm_ppl = math.exp(min(20, lm_loss))

    # Only global rank 0 of an initialised process group writes scalars.
    on_rank_zero = writer and torch.distributed.is_initialized() \
        and torch.distributed.get_rank() == 0
    if on_rank_zero:
        for tag, value in (('loss', lm_loss), ('perplexity', lm_ppl)):
            writer.add_scalar(tag, value, iteration)

    print_rank_0('-' * 100)
    string = ''.join((
        ' validation loss at {} | '.format(prefix),
        'LM loss: {:.6E} | '.format(lm_loss),
        'LM PPL: {:.6E}'.format(lm_ppl),
    ))
    rule = '-' * (len(string) + 1)
    print_rank_0(rule)
    print_rank_0(string)
    print_rank_0(rule)
    return lm_loss
def evaluate_and_print_results(prefix, data_iterator, student_model, teacher_model, args, timers, verbose=False):
    """Evaluate, then print and save every validation loss (and the LM perplexity).

    Perplexity is reported only when the losses contain ``"lm_loss"`` and is
    ``exp(min(20, lm_loss))``. Returns the dict of losses from ``evaluate``.
    """
    losses = evaluate(data_iterator, student_model, teacher_model, args, timers, verbose)

    lm_ppl = math.exp(min(20, losses["lm_loss"])) if "lm_loss" in losses else None

    print_rank_0('-' * 100)
    save_rank_0(args, '-' * 100)

    pieces = [' validation loss at {} | '.format(prefix)]
    pieces.extend('{}: {:.6} | '.format(name, losses[name]) for name in losses)
    if lm_ppl is not None:
        pieces.append('LM PPL: {:.6}'.format(lm_ppl))
    string = ''.join(pieces)

    # Printed rules follow the message length; saved rules are fixed at 100.
    rule = '-' * (len(string) + 1)
    print_rank_0(rule)
    save_rank_0(args, '-' * 100)
    print_rank_0(string)
    save_rank_0(args, string)
    print_rank_0(rule)
    save_rank_0(args, '-' * 100)
    return losses
def evaluate_and_print_results(prefix, data_iterator, model, args, writer, iteration, timers, verbose=False):
    """Evaluate, print the validation loss and perplexity, and log them to ``writer``.

    Perplexity is ``exp(min(20, lm_loss))``. Returns the LM loss.
    """
    lm_loss = evaluate(data_iterator, model, args, timers, verbose)
    lm_ppl = math.exp(min(20, lm_loss))

    print_rank_0('-' * 100)
    string = ''.join((
        ' validation loss at {} | '.format(prefix),
        'LM loss: {:.6E} | '.format(lm_loss),
        'LM PPL: {:.6E}'.format(lm_ppl),
    ))
    rule = '-' * (len(string) + 1)
    print_rank_0(rule)
    print_rank_0(string)
    print_rank_0(rule)

    if writer and args.rank == 0:
        for tag, value in (('val_loss', lm_loss), ('val_ppl', lm_ppl)):
            writer.add_scalar(tag, value, iteration)
    return lm_loss
def build_lm_dataset(tokenizer, args):
    """Tokenize every line of ``args.valid_data[0]`` and wrap the result in an LMDataset.

    Two counts are tracked: tokenizer ids per line, and space-separated
    pieces per stripped line.
    """
    documents = []
    num_tokens = 0
    num_original_tokens = 0
    with open(args.valid_data[0], encoding='utf-8') as file:
        for line in file:
            text = line.strip()
            token_ids = tokenizer.EncodeAsIds(text).tokenization
            num_tokens += len(token_ids)
            num_original_tokens += len(text.split(" "))
            documents.append(token_ids)

    val_dataset = LMDataset(args, documents, tokenizer, num_original_tokens, num_tokens)
    print_rank_0(
        ' > number of document: {}, number of original tokens {}, number of detokenized tokens: {}'.format(
            len(documents), num_original_tokens, num_tokens))
    return val_dataset
def __init__(self, args, split, tokenizer):
    """Read the ``.source``/``.target`` files of ``split`` into paired examples.

    Raises:
        NotImplementedError: if ``split`` is not "train", "dev" or "test".
    """
    self.args = args
    task, data_dir = args.task.lower(), args.data_dir
    self.max_src_length, self.max_tgt_length = args.src_seq_length, args.tgt_seq_length
    self.split = split
    self.tokenizer = tokenizer

    # The "dev" split is stored under the "val" file stem.
    for known_split, stem in (("train", "train"), ("dev", "val"), ("test", "test")):
        if split == known_split:
            filename = stem
            break
    else:
        raise NotImplementedError(split)
    print_rank_0(f"Creating {task}-{split} dataset from {data_dir}")
    self.dataset_name = split

    detokenizer = None
    if task == "gigaword":
        detokenizer = gigaword_detokenize
    elif task == "cnn_dm":
        detokenizer = cnndm_detokenize

    def load_side(extension, **detok_kwargs):
        # Return the stripped (and, if configured, detokenized) lines of one file.
        with open(os.path.join(data_dir, f"{filename}.{extension}"), encoding='utf-8') as file:
            stripped = (line.strip() for line in file)
            if detokenizer:
                return [detokenizer(line, **detok_kwargs) for line in stripped]
            return list(stripped)

    source_texts = load_side("source")
    target_texts = load_side("target", is_target=True)
    assert len(source_texts) == len(target_texts)

    self.examples, self.example_list = {}, []
    for idx, (source_text, target_text) in enumerate(zip(source_texts, target_texts)):
        if (idx + 1) % 20000 == 0:
            print_rank_0(f"Complete {idx + 1} examples")
        guid = "%s-%s" % (split, idx)
        # The reference is the target after an encode/decode pass through the tokenizer.
        reference = tokenizer.DecodeIds(tokenizer.EncodeAsIds(target_text).tokenization)
        meta = {"ref": reference}
        example = InputExample(guid=guid, text_a=source_text, text_b=target_text, meta=meta)
        if idx < 10:
            print_rank_0(
                (source_text.encode('utf-8'), target_text.encode('utf-8'), meta["ref"].encode('utf-8')))
        self.examples[guid] = example
        self.example_list.append(example)
    print_rank_0(f"Return {len(self.examples)} {split} examples")