def get_training_stats(trainer):
    """Collect training-progress meters from *trainer* into an ordered dict."""
    stats = collections.OrderedDict()
    stats['loss'] = trainer.get_meter('train_loss')
    stats['old_loss'] = trainer.get_meter('train_old_loss')
    nll_meter = trainer.get_meter('train_nll_loss')
    if nll_meter.count > 0:
        stats['nll_loss'] = nll_meter
    else:
        # no NLL updates recorded; fall back to the training loss for ppl
        nll_meter = trainer.get_meter('train_loss')
    if trainer.get_meter('train_kl_loss').count > 0:
        stats['kl_loss'] = trainer.get_meter('train_kl_loss')
    stats['ppl'] = utils.get_perplexity(nll_meter.avg)
    for name in ('wps', 'ups', 'wpb', 'bsz'):
        stats[name] = trainer.get_meter(name)
    stats['num_updates'] = trainer.get_num_updates()
    stats['lr'] = trainer.get_lr()
    stats['gs_tau'] = trainer.get_gs_tau()
    for name in ('gnorm', 'clip', 'oom'):
        stats[name] = trainer.get_meter(name)
    if trainer.get_meter('loss_scale') is not None:
        stats['loss_scale'] = trainer.get_meter('loss_scale')
    stats['wall'] = round(trainer.get_meter('wall').elapsed_time)
    stats['train_wall'] = trainer.get_meter('train_wall')
    return stats
def reduce_metrics(logging_outputs) -> None:
    """Aggregate logging outputs from data parallel training."""
    def _accum(field):
        # total of one logged field across all workers
        return sum(log.get(field, 0) for log in logging_outputs)

    loss_sum = _accum('loss')
    nll_loss_sum = _accum('nll_loss')
    critic_score_sum = _accum('critic_score')
    ntokens = _accum('ntokens')
    sample_size = _accum('sample_size')
    # losses are reported in bits (hence the division by ln 2)
    metrics.log_scalar('loss', loss_sum / sample_size / math.log(2),
                       sample_size, round=3)
    metrics.log_scalar('nll_loss', nll_loss_sum / ntokens / math.log(2),
                       ntokens, round=3)
    metrics.log_scalar('critic_score',
                       critic_score_sum / sample_size / math.log(2),
                       sample_size)
    metrics.log_derived(
        'ppl', lambda meters: utils.get_perplexity(meters['nll_loss'].avg))
    if 'reward' in logging_outputs[0]:
        metrics.log_scalar('reward', _accum('reward') / sample_size / math.log(2),
                           sample_size, round=3)
def reduce_metrics(logging_outputs) -> None:
    """Aggregate logging outputs from data parallel training."""
    loss_mask = sum(log.get('loss_mask', 0) for log in logging_outputs)
    loss_decode = sum(log.get('loss_decode', 0) for log in logging_outputs)
    sample_size_decode = sum(
        log.get('sample_size_decode', 0) for log in logging_outputs)
    sample_size_mask = sum(
        log.get('sample_size_mask', 0) for log in logging_outputs)
    accumulate_step = sum(
        log.get('sample_size_t', 0) for log in logging_outputs)
    # normalize each objective to bits per element of its own sample size
    loss_decode = loss_decode / sample_size_decode / math.log(2)
    loss_mask = loss_mask / sample_size_mask / math.log(2)
    metrics.log_scalar('loss_decode', loss_decode, sample_size_decode, round=3)
    metrics.log_scalar('loss_mask', loss_mask, sample_size_mask, round=3)
    # overall loss is an even 50/50 mix of the two objectives
    combined = 0.5 * loss_mask + 0.5 * loss_decode
    metrics.log_scalar('loss', combined,
                       0.5 * sample_size_decode + 0.5 * sample_size_mask,
                       round=3)
    metrics.log_derived(
        'ppl', lambda meters: utils.get_perplexity(meters['loss'].avg))
    metrics.log_scalar('sample_size_t', accumulate_step, accumulate_step,
                       round=3)
    token = sum(log.get('ntokens', 0) for log in logging_outputs)
    metrics.log_scalar('ntokens', token, token, round=3)
    metrics.log_scalar('sample_size_decode', sample_size_decode,
                       sample_size_decode, round=3)
    sample_size = sum(log.get('sample_size', 0) for log in logging_outputs)
    metrics.log_scalar('sample_size', sample_size, sample_size, round=3)
def reduce_metrics(logging_outputs) -> None:
    """Aggregate logging outputs from data parallel training."""
    loss_total = sum(log.get('loss', 0) for log in logging_outputs)
    ce_total = sum(log.get('ce_loss', 0) for log in logging_outputs)
    kd_total = sum(log.get('kd_loss', 0) for log in logging_outputs)
    has_teacher = 'ce_loss_teacher' in logging_outputs[0]
    if has_teacher:
        teacher_total = sum(
            log.get('ce_loss_teacher', 0) for log in logging_outputs)
    sample_size = sum(log.get('sample_size', 0) for log in logging_outputs)
    # all losses are logged in bits per sample element
    metrics.log_scalar('loss', loss_total / sample_size / math.log(2),
                       sample_size, round=3)
    metrics.log_scalar('loss_ce', ce_total / sample_size / math.log(2),
                       sample_size, round=3)
    metrics.log_scalar('loss_kd', kd_total / sample_size / math.log(2),
                       sample_size, round=3)
    if has_teacher:
        metrics.log_scalar('loss_ce_teacher',
                           teacher_total / sample_size / math.log(2),
                           sample_size, round=3)
    metrics.log_derived(
        'ppl', lambda meters: utils.get_perplexity(meters['loss'].avg))
def reduce_metrics(cls, logging_outputs) -> None:
    """Aggregate logging outputs from data parallel training."""
    def _accum(field):
        # total of one logged field across all workers
        return sum(log.get(field, 0) for log in logging_outputs)

    ntokens = _accum('ntokens')
    sample_size = _accum('sample_size')
    # report losses in bits
    metrics.log_scalar('loss', _accum('loss') / sample_size / math.log(2),
                       sample_size, round=3)
    metrics.log_scalar('nll_loss', _accum('nll_loss') / ntokens / math.log(2),
                       ntokens, round=3)
    metrics.log_derived(
        'ppl', lambda meters: utils.get_perplexity(meters['nll_loss'].avg))
    total = utils.item(_accum('total'))
    if total > 0:
        metrics.log_scalar('total', total)
        n_correct = utils.item(_accum('n_correct'))
        metrics.log_scalar('n_correct', n_correct)
        metrics.log_derived(
            'accuracy',
            lambda meters: round(
                meters['n_correct'].sum * 100.0 / meters['total'].sum, 3)
            if meters['total'].sum > 0 else float('nan'),
        )
def reduce_metrics(logging_outputs) -> None:
    """Aggregate logging outputs from data parallel training."""
    def _accum(field):
        # total of one logged field across all workers
        return sum(log.get(field, 0) for log in logging_outputs)

    ntokens = _accum("ntokens")
    sample_size = _accum("sample_size")
    # regularizer terms are logged per sample element (no bit conversion)
    metrics.log_scalar("symm_kl", _accum("symm_kl") / sample_size,
                       sample_size, round=3)
    metrics.log_scalar("self_kl", _accum("self_kl") / sample_size,
                       sample_size, round=3)
    metrics.log_scalar("self_cv", _accum("self_cv") / sample_size,
                       sample_size, round=3)
    # main losses are converted to bits
    metrics.log_scalar(
        "loss", _accum("loss") / sample_size / math.log(2), sample_size, round=3
    )
    metrics.log_scalar(
        "nll_loss", _accum("nll_loss") / ntokens / math.log(2), ntokens, round=3
    )
    metrics.log_derived(
        "ppl", lambda meters: utils.get_perplexity(meters["nll_loss"].avg)
    )
def reduce_metrics(logging_outputs) -> None:
    """Aggregate logging outputs from data parallel training."""
    sample_size = utils.item(
        sum(log.get("sample_size", 0) for log in logging_outputs))
    total_loss = utils.item(
        sum(log.get("loss", 0) for log in logging_outputs))
    total_nll = utils.item(
        sum(log.get("nll_loss", 0) for log in logging_outputs))
    # losses in bits per sample element
    metrics.log_scalar('loss', total_loss / sample_size / math.log(2),
                       sample_size, round=3)
    metrics.log_scalar('nll_loss', total_nll / sample_size / math.log(2),
                       sample_size, round=3)
    metrics.log_derived(
        'ppl', lambda meters: utils.get_perplexity(meters['loss'].avg))
    # any extra field named "<name>-loss" is logged under "<name>"
    for key in logging_outputs[0]:
        if key.endswith("-loss"):
            val = sum(log.get(key, 0) for log in logging_outputs)
            normed = val / sample_size / math.log(2) if sample_size > 0 else 0.0
            metrics.log_scalar(key[:-5], normed, sample_size, round=3)
def reduce_metrics(cls, logging_outputs) -> None:
    """Aggregate logging outputs from data parallel training."""
    def _accum(field):
        # total of one logged field across all workers
        return sum(log.get(field, 0) for log in logging_outputs)

    ntokens = _accum("ntokens")
    sample_size = _accum("sample_size")
    # NOTE: losses are deliberately logged as raw sums here (no per-token
    # normalization and no bit conversion)
    metrics.log_scalar("loss", _accum("loss"), sample_size, round=3)
    metrics.log_scalar("nll_loss", _accum("nll_loss"), ntokens, round=3)
    metrics.log_derived(
        "ppl", lambda meters: utils.get_perplexity(meters["nll_loss"].avg))
    total = utils.item(_accum("total"))
    if total > 0:
        metrics.log_scalar("total", total)
        n_correct = utils.item(_accum("n_correct"))
        metrics.log_scalar("n_correct", n_correct)
        metrics.log_derived(
            "accuracy",
            lambda meters: round(
                meters["n_correct"].sum * 100.0 / meters["total"].sum, 3)
            if meters["total"].sum > 0 else float("nan"),
        )
def reduce_metrics(logging_outputs) -> None:
    """Aggregate logging outputs from data parallel training."""
    def _accum(field):
        # total of one logged field across all workers
        return sum(log.get(field, 0) for log in logging_outputs)

    lm_total = _accum('lm_loss')
    sentence_total = _accum('sentence_loss')
    ntokens = _accum('ntokens')
    nsentences = _accum('nsentences')
    sample_size = _accum('sample_size')
    agg_loss = _accum('loss')
    # each loss is guarded against a zero denominator, then reported in bits
    metrics.log_scalar(
        'loss',
        agg_loss / sample_size / math.log(2) if sample_size > 0 else 0.,
        round=3)
    metrics.log_derived(
        'ppl', lambda meters: utils.get_perplexity(meters['loss'].avg))
    lm_bits = lm_total / ntokens / math.log(2) if ntokens > 0 else 0.
    metrics.log_scalar('lm_loss', lm_bits, round=3)
    metrics.log_scalar(
        'sentence_loss',
        sentence_total / nsentences / math.log(2) if nsentences > 0 else 0.,
        round=3)
    metrics.log_scalar('nll_loss', lm_bits, round=3)
    metrics.log_scalar('ntokens', ntokens)
    metrics.log_scalar('nsentences', nsentences)
    metrics.log_scalar('sample_size', sample_size)
def get_training_stats(trainer):
    """Collect training-progress meters plus a wall-clock timestamp."""
    stats = collections.OrderedDict()
    stats['loss'] = trainer.get_meter('train_loss')
    nll_meter = trainer.get_meter('train_nll_loss')
    if nll_meter.count > 0:
        stats['nll_loss'] = nll_meter
    else:
        # no NLL updates recorded; fall back to the training loss for ppl
        nll_meter = trainer.get_meter('train_loss')
    stats['ppl'] = utils.get_perplexity(nll_meter.avg)
    for name in ('wps', 'ups', 'wpb', 'bsz'):
        stats[name] = trainer.get_meter(name)
    stats['num_updates'] = trainer.get_num_updates()
    stats['lr'] = trainer.get_lr()
    for name in ('gnorm', 'clip', 'oom'):
        stats[name] = trainer.get_meter(name)
    if trainer.get_meter('loss_scale') is not None:
        stats['loss_scale'] = trainer.get_meter('loss_scale')
    stats['wall'] = round(trainer.get_meter('wall').elapsed_time)
    stats['train_wall'] = trainer.get_meter('train_wall')
    # human-readable local timestamp for the log line
    stats['time'] = time.strftime("%a %b %d %H:%M:%S %Y", time.localtime())
    return stats
def reduce_metrics(logging_outputs) -> None:
    """Aggregate logging outputs from data parallel training."""
    def _accum(field):
        # total of one logged field across all workers
        return sum(log.get(field, 0) for log in logging_outputs)

    loss_sum = _accum("loss")
    code_loss_sum = _accum("code_loss")
    value_loss_sum = _accum("value_loss")
    sample_size = _accum("sample_size")
    sample_size_code = _accum("sample_size_code")
    sample_size_value = _accum("sample_size_value")
    metrics.log_scalar("loss", loss_sum / sample_size, sample_size, round=3)
    # code loss is reported in bits; the others stay in nats
    metrics.log_scalar("code_loss",
                       code_loss_sum / sample_size_code / math.log(2),
                       sample_size_code, round=3)
    # NOTE(review): key is "valueloss" (no underscore), unlike the other
    # keys here — looks like a typo but downstream consumers may rely on it
    metrics.log_scalar("valueloss", value_loss_sum / sample_size_value,
                       sample_size_value, round=3)
    metrics.log_derived(
        "code_ppl",
        lambda meters: utils.get_perplexity(meters["code_loss"].avg))
def get_valid_stats(trainer, args, extra_meters=None):
    """Build an ordered dict of validation stats and track the best metric.

    Args:
        trainer: object exposing ``get_meter`` / ``get_num_updates``.
        args: namespace with ``best_checkpoint_metric`` and
            ``maximize_best_checkpoint_metric``.
        extra_meters: optional mapping of extra metric name -> meter.

    Raises:
        ValueError: if ``args.best_checkpoint_metric`` cannot be found.
    """
    # Bug fix: the original dereferenced extra_meters with `in` even when the
    # default None was passed, raising TypeError for non-'loss' metrics.
    if extra_meters is None:
        extra_meters = {}
    stats = collections.OrderedDict()
    stats['loss'] = trainer.get_meter('valid_loss')
    if trainer.get_meter('valid_nll_loss').count > 0:
        nll_loss = trainer.get_meter('valid_nll_loss')
        stats['nll_loss'] = nll_loss
    else:
        # no NLL updates recorded; use the validation loss for perplexity
        nll_loss = stats['loss']
    stats['ppl'] = utils.get_perplexity(nll_loss.avg)
    stats['num_updates'] = trainer.get_num_updates()
    if hasattr(checkpoint_utils.save_checkpoint, 'best'):
        key = 'best_{0}'.format(args.best_checkpoint_metric)
        best_function = max if args.maximize_best_checkpoint_metric else min
        # resolve the current value of the tracked metric, in priority order
        if args.best_checkpoint_metric == 'loss':
            current_metric = stats['loss'].avg
        elif args.best_checkpoint_metric in extra_meters:
            current_metric = extra_meters[args.best_checkpoint_metric].avg
        elif args.best_checkpoint_metric in stats:
            current_metric = stats[args.best_checkpoint_metric]
        else:
            raise ValueError("best_checkpoint_metric not found in logs")
        stats[key] = best_function(
            checkpoint_utils.save_checkpoint.best,
            current_metric,
        )
    return stats
def reduce_metrics(logging_outputs) -> None:
    """Aggregate logging outputs from data parallel training."""
    def _accum(field):
        # summed field across workers, converted to a Python scalar
        return utils.item(sum(log.get(field, 0) for log in logging_outputs))

    loss_sum = _accum("loss")
    nll_loss_sum = _accum("nll_loss")
    alignment_loss_sum = _accum("alignment_loss")
    ntokens = _accum("ntokens")
    sample_size = _accum("sample_size")
    # losses in bits
    metrics.log_scalar("loss", loss_sum / sample_size / math.log(2),
                       sample_size, round=3)
    metrics.log_scalar("nll_loss", nll_loss_sum / ntokens / math.log(2),
                       ntokens, round=3)
    metrics.log_scalar(
        "alignment_loss",
        alignment_loss_sum / sample_size / math.log(2),
        sample_size,
        round=3,
    )
    metrics.log_derived(
        "ppl", lambda meters: utils.get_perplexity(meters["nll_loss"].avg))
def reduce_metrics(logging_outputs) -> None:
    """Aggregate logging outputs from data parallel training."""
    total_loss = sum(log.get('loss', 0) for log in logging_outputs)
    total_size = sum(log.get('sample_size', 0) for log in logging_outputs)
    # loss in bits per sample element; ppl derived from its running average
    metrics.log_scalar('loss', total_loss / total_size / math.log(2),
                       total_size, round=3)
    metrics.log_derived(
        'ppl', lambda meters: utils.get_perplexity(meters['loss'].avg))
def reduce_metrics(logging_outputs) -> None:
    """Aggregate logging outputs from data parallel training."""
    def _accum(field):
        # total of one logged field across all workers
        return sum(log.get(field, 0) for log in logging_outputs)

    loss_sum = _accum('loss')
    nll_loss_sum = _accum('nll_loss')
    ctrl_loss_sum = _accum('ctrl_loss')
    sel_entropy = _accum('sel_entropy')
    batch_entropy = _accum('batch_entropy')
    # _EPS guards against a zero batch entropy
    ctrl_entropy_ratio = sel_entropy / (batch_entropy + _EPS)
    ntokens = _accum('ntokens')
    sample_size = _accum('sample_size')
    nsentences = _accum('nsentences')
    metrics.log_scalar('loss', loss_sum / sample_size / math.log(2),
                       sample_size, round=3)
    metrics.log_scalar('nll_loss', nll_loss_sum / ntokens / math.log(2),
                       ntokens, round=3)
    # NOTE(review): ctrl_loss is normalized by sample_size but weighted by
    # nsentences — looks intentional but worth confirming
    metrics.log_scalar('ctrl_loss', ctrl_loss_sum / sample_size / math.log(2),
                       nsentences, round=3)
    metrics.log_scalar('sel_entropy', sel_entropy, 1, round=3)
    metrics.log_scalar('batch_entropy', batch_entropy, 1, round=3)
    metrics.log_scalar('ctrl_entropy_ratio', ctrl_entropy_ratio, 1, round=3)
    metrics.log_derived(
        'ppl', lambda meters: utils.get_perplexity(meters['nll_loss'].avg))
def reduce_metrics(logging_outputs) -> None:
    """Aggregate logging outputs from data parallel training."""
    def _accum(field):
        # summed field across workers, converted to a Python scalar
        return utils.item(sum(log.get(field, 0) for log in logging_outputs))

    loss_sum = _accum('loss')
    nll_loss_sum = _accum('nll_loss')
    sync_loss_sum = _accum('sync_loss')
    ntokens = _accum('ntokens')
    sample_size = _accum('sample_size')
    # all three losses are reported in bits
    metrics.log_scalar('loss', loss_sum / sample_size / math.log(2),
                       sample_size, round=3)
    metrics.log_scalar('nll_loss', nll_loss_sum / ntokens / math.log(2),
                       ntokens, round=3)
    metrics.log_scalar('sync_loss', sync_loss_sum / sample_size / math.log(2),
                       sample_size, round=3)
    metrics.log_derived(
        'ppl', lambda meters: utils.get_perplexity(meters['nll_loss'].avg))
def get_training_stats(trainer, args):
    """Collect training-progress meters, skipping host-sync stats off-GPU."""
    stats = collections.OrderedDict()
    stats['loss'] = trainer.get_meter('train_loss')
    nll_meter = trainer.get_meter('train_nll_loss')
    if nll_meter.count > 0:
        stats['nll_loss'] = nll_meter
    else:
        nll_meter = trainer.get_meter('train_loss')
    on_gpu = getattr(args, 'use_gpu', True)
    if on_gpu:
        # computing perplexity introduces aten::_local_scalar_dense calls
        # that slow training down
        stats['ppl'] = utils.get_perplexity(nll_meter.avg)
    for name in ('wps', 'ups', 'wpb', 'bsz'):
        stats[name] = trainer.get_meter(name)
    stats['num_updates'] = trainer.get_num_updates()
    stats['lr'] = trainer.get_lr()
    stats['gnorm'] = trainer.get_meter('gnorm')
    if on_gpu:
        # computing 'clip' count introduces aten::_local_scalar_dense calls
        # that slow training down, so it's disabled, hence the meter is invalid
        stats['clip'] = trainer.get_meter('clip')
    stats['oom'] = trainer.get_meter('oom')
    if trainer.get_meter('loss_scale') is not None:
        stats['loss_scale'] = trainer.get_meter('loss_scale')
    stats['wall'] = round(trainer.get_meter('wall').elapsed_time)
    stats['train_wall'] = trainer.get_meter('train_wall')
    return stats
def reduce_metrics(logging_outputs) -> None:
    """Aggregate logging outputs from data parallel training."""
    def _accum(field):
        # total of one logged field across all workers
        return sum(log.get(field, 0) for log in logging_outputs)

    ntokens = _accum('ntokens')
    sample_size = _accum('sample_size')
    metrics.log_scalar('loss', _accum('loss') / sample_size / math.log(2),
                       sample_size, round=3)
    metrics.log_scalar('nll_loss', _accum('nll_loss') / ntokens / math.log(2),
                       ntokens, round=3)
    metrics.log_derived(
        'ppl', lambda meters: utils.get_perplexity(meters['nll_loss'].avg))
    metrics.log_scalar('total_loss',
                       _accum('total_loss') / sample_size / math.log(2),
                       sample_size, round=3)
    # optionally also track the unmodified NLL and its perplexity
    if len(logging_outputs) > 0 and "org_nll_loss" in logging_outputs[0]:
        org_nll = _accum('org_nll_loss')
        metrics.log_scalar('org_nll_loss', org_nll / ntokens / math.log(2),
                           ntokens, round=3)
        metrics.log_derived(
            'org_ppl',
            lambda meters: utils.get_perplexity(meters['org_nll_loss'].avg))
def reduce_metrics(logging_outputs) -> None:
    """Aggregate logging outputs from data parallel training."""
    loss_sum = utils.item(
        sum(log.get('loss', 0) for log in logging_outputs))
    ntokens = utils.item(
        sum(log.get('ntokens', 0) for log in logging_outputs))
    sample_size = utils.item(
        sum(log.get('sample_size', 0) for log in logging_outputs))
    metrics.log_scalar('loss', loss_sum / sample_size / math.log(2),
                       sample_size, round=3)
    # when the sample size is not token-based, log a separate per-token NLL
    if sample_size != ntokens:
        metrics.log_scalar('nll_loss', loss_sum / ntokens / math.log(2),
                           ntokens, round=3)
        metrics.log_derived(
            'ppl',
            lambda meters: utils.get_perplexity(meters['nll_loss'].avg))
    else:
        metrics.log_derived(
            'ppl', lambda meters: utils.get_perplexity(meters['loss'].avg))
def reduce_metrics(cls, logging_outputs) -> None:
    """Aggregate logging outputs from data parallel training."""
    def _accum(field):
        # total of one logged field across all workers
        return sum(log.get(field, 0) for log in logging_outputs)

    ntokens = _accum("ntokens")
    sample_size = _accum("sample_size")
    metrics.log_scalar("loss", _accum("loss") / sample_size / math.log(2),
                       sample_size, round=3)
    metrics.log_scalar("nll_loss", _accum("nll_loss") / ntokens / math.log(2),
                       ntokens, round=3)
    metrics.log_derived(
        "ppl", lambda meters: utils.get_perplexity(meters["nll_loss"].avg))
    metrics.log_scalar("classifier_loss",
                       _accum("classifier_loss") / ntokens / math.log(2),
                       ntokens, round=3)
    total = utils.item(_accum("total"))
    if total > 0:
        metrics.log_scalar("total", total)
        n_correct = utils.item(_accum("n_correct"))
        metrics.log_scalar("n_correct", n_correct)
        metrics.log_derived(
            "accuracy",
            lambda meters: round(
                meters["n_correct"].sum * 100.0 / meters["total"].sum, 3)
            if meters["total"].sum > 0 else float("nan"),
        )
    # optional per-language accuracy breakdown
    per_lang = logging_outputs[0].get("per_lang_n_correct", None)
    if per_lang is not None:
        for lang_id in per_lang:
            lang_correct = sum(
                log.get("per_lang_n_correct").get(lang_id, 0)
                for log in logging_outputs)
            lang_total = sum(
                log.get("per_lang_total").get(lang_id, 0)
                for log in logging_outputs)
            metrics.log_scalar(
                f"accuracy_lang_{lang_id}",
                round(lang_correct * 100.0 / lang_total, 3)
                if lang_total > 0 else float("nan"),
                priority=100,
            )
            metrics.log_scalar(
                f"n_total_lang_{lang_id}",
                lang_total,
                priority=100,
            )
def reduce_metrics(cls, logging_outputs) -> None:
    """Aggregate logging outputs from data parallel training."""
    def _accum(field):
        # total of one logged field across all workers
        return sum(log.get(field, 0) for log in logging_outputs)

    ntokens = _accum("ntokens")
    sample_size = _accum("sample_size")
    has_hit_stats = ('nhit' in logging_outputs[0]
                     and 'ntokens_masked' in logging_outputs[0])
    if has_hit_stats:
        nhit = sum(log['nhit'] for log in logging_outputs)
        ntokens_masked = sum(log['ntokens_masked'] for log in logging_outputs)
        assert nhit <= ntokens_masked
        # -1 signals "no masked tokens this round"
        hit_rate = nhit / ntokens_masked if ntokens_masked > 0 else -1
        #TODO: check how to fill the 3 arguments below
        metrics.log_scalar("nhit", nhit, round=3, weight=0)
        metrics.log_scalar("ntokens_masked", ntokens_masked, round=3, weight=0)
        metrics.log_scalar("hit_rate", hit_rate, round=3, weight=0)
    # May have to adjust below for CTC loss as well
    metrics.log_scalar("loss", _accum("loss") / sample_size / math.log(2),
                       sample_size, round=3)
    metrics.log_scalar("nll_loss", _accum("nll_loss") / ntokens / math.log(2),
                       ntokens, round=3)
    metrics.log_scalar("ctc_loss", _accum("ctc_loss") / ntokens / math.log(2),
                       ntokens, round=3)
    metrics.log_derived(
        "ppl", lambda meters: utils.get_perplexity(meters["nll_loss"].avg))
    total = utils.item(_accum("total"))
    if total > 0:
        metrics.log_scalar("total", total)
        n_correct = utils.item(_accum("n_correct"))
        metrics.log_scalar("n_correct", n_correct)
        metrics.log_derived(
            "accuracy",
            lambda meters: round(
                meters["n_correct"].sum * 100.0 / meters["total"].sum, 3)
            if meters["total"].sum > 0 else float("nan"),
        )
def reduce_metrics(self, logging_outputs, criterion):
    """Aggregate base metrics, then add back-translation (bt) and
    denoising-autoencoder (dae) loss/perplexity when those samples exist."""
    super().reduce_metrics(logging_outputs, criterion)

    bt_sample_size = sum(
        x.get("bt_sample_size", 0) for x in logging_outputs)
    if bt_sample_size:
        # per-sample bt loss in bits
        bt_loss = sum(x.get("bt_loss", 0) for x in logging_outputs)
        bt_loss *= 1 / bt_sample_size / math.log(2)
        metrics.log_scalar("bt_loss", bt_loss, bt_sample_size, round=3)
        # per-token bt NLL in bits, with derived perplexity
        bt_nll = sum(x.get("bt_nll_loss", 0) for x in logging_outputs)
        bt_ntokens = sum(x.get("bt_ntokens", 0) for x in logging_outputs)
        bt_nll *= 1 / bt_ntokens / math.log(2)
        metrics.log_scalar("bt_nll_loss", bt_nll, bt_ntokens, round=3)
        metrics.log_derived(
            "bt_ppl",
            lambda meters: utils.get_perplexity(meters["bt_nll_loss"].avg))

    dae_sample_size = sum(
        x.get("dae_sample_size", 0) for x in logging_outputs)
    if dae_sample_size:
        # per-sample dae loss in bits
        dae_loss = sum(x.get("dae_loss", 0) for x in logging_outputs)
        dae_loss *= 1 / dae_sample_size / math.log(2)
        metrics.log_scalar("dae_loss", dae_loss, dae_sample_size, round=3)
        # per-token dae NLL in bits, with derived perplexity
        dae_nll = sum(x.get("dae_nll_loss", 0) for x in logging_outputs)
        dae_ntokens = sum(x.get("dae_ntokens", 0) for x in logging_outputs)
        dae_nll *= 1 / dae_ntokens / math.log(2)
        metrics.log_scalar("dae_nll_loss", dae_nll, dae_ntokens, round=3)
        metrics.log_derived(
            "dae_ppl",
            lambda meters: utils.get_perplexity(meters["dae_nll_loss"].avg),
        )
def reduce_metrics(logging_outputs) -> None:
    """Aggregate logging outputs from data parallel training."""
    def _accum(field):
        # total of one logged field across all workers
        return sum(log.get(field, 0) for log in logging_outputs)

    ntokens = _accum('ntokens')
    sample_size = _accum('sample_size')
    metrics.log_scalar('loss', _accum('loss') / sample_size / math.log(2),
                       sample_size, round=3)
    metrics.log_scalar('nll_loss', _accum('nll_loss') / ntokens / math.log(2),
                       ntokens, round=3)
    metrics.log_derived(
        'ppl', lambda meters: utils.get_perplexity(meters['nll_loss'].avg))
    # per-frequency-group NLL/ppl breakdown (8 groups), when present
    if len(logging_outputs) > 0 and 'fg_gnll0' in logging_outputs[0]:
        for ii in range(8):
            g_nll = _accum(f'fg_gnll{ii}')
            g_tokens = _accum(f'fg_gcount{ii}')
            # avoid dividing by zero for empty groups
            denom = g_tokens if g_tokens > 0 else 1
            metrics.log_scalar(f'fg_gnll{ii}',
                               g_nll / denom / math.log(2), g_tokens, round=3)
            metrics.log_derived_with_key(
                f'fg_ppl{ii}',
                lambda value: utils.get_perplexity(value),
                f'fg_gnll{ii}')
def reduce_metrics(logging_outputs) -> None:
    """Aggregate logging outputs from data parallel training."""
    def _accum(field):
        # total of one logged field across all workers
        return sum(log.get(field, 0) for log in logging_outputs)

    loss_sum = _accum('loss')
    primary_loss_sum = _accum('primary_loss')
    auxiliary_loss_sum = _accum('auxiliary_loss')
    primary_nll = _accum('primary_nll_loss')
    auxiliary_nll = _accum('auxiliary_nll_loss')
    ntokens = _accum('ntokens')
    auxiliary_ntokens = _accum('auxiliary_ntokens')
    sample_size = _accum('sample_size')
    # per-sample losses in bits
    metrics.log_scalar('loss', loss_sum / sample_size / math.log(2),
                       sample_size, round=3)
    metrics.log_scalar('primary_loss',
                       primary_loss_sum / sample_size / math.log(2),
                       sample_size, round=3)
    metrics.log_scalar('auxiliary_loss',
                       auxiliary_loss_sum / sample_size / math.log(2),
                       sample_size, round=3)
    # per-token NLLs, each normalized by its own token count
    metrics.log_scalar('primary_nll_loss',
                       primary_nll / ntokens / math.log(2), ntokens, round=3)
    metrics.log_scalar('auxiliary_nll_loss',
                       auxiliary_nll / auxiliary_ntokens / math.log(2),
                       auxiliary_ntokens, round=3)
    metrics.log_derived(
        'primary_ppl',
        lambda meters: utils.get_perplexity(meters['primary_nll_loss'].avg))
    metrics.log_derived(
        'auxiliary_ppl',
        lambda meters: utils.get_perplexity(meters['auxiliary_nll_loss'].avg))
def get_valid_stats(trainer, args, extra_meters=None):
    """Collect validation loss/ppl meters into an ordered dict.

    Note: args and extra_meters are accepted for interface compatibility
    but unused here.
    """
    stats = collections.OrderedDict()
    stats['loss'] = trainer.get_meter('valid_loss')
    nll_meter = trainer.get_meter('valid_nll_loss')
    if nll_meter.count > 0:
        stats['nll_loss'] = nll_meter
    else:
        # no NLL updates recorded; use the validation loss for perplexity
        nll_meter = stats['loss']
    stats['ppl'] = utils.get_perplexity(nll_meter.avg)
    stats['num_updates'] = trainer.get_num_updates()
    return stats
def reduce_metrics(logging_outputs) -> None:
    """Aggregate logging outputs from data parallel training."""
    def _accum(field):
        # total of one logged field across all workers
        return sum(log.get(field, 0) for log in logging_outputs)

    loss_sum = _accum("loss")
    ntokens = _accum("ntokens")
    sample_size = _accum("sample_size")
    # we divide by log(2) to convert the loss from base e to base 2
    metrics.log_scalar("loss", loss_sum / sample_size / math.log(2),
                       sample_size, round=3)
    metrics.log_scalar("avg_span", _accum("avg_span") / sample_size,
                       sample_size, round=3)
    metrics.log_scalar("max_span", _accum("max_span") / sample_size,
                       sample_size, round=3)
    # total loss contains the L1 norm on adaptive-span
    metrics.log_scalar(
        "total_loss",
        _accum("total_loss") / sample_size / math.log(2),
        sample_size,
        round=3,
    )
    if sample_size != ntokens:
        metrics.log_scalar("nll_loss", loss_sum / ntokens / math.log(2),
                           ntokens, round=3)
        metrics.log_derived(
            "ppl",
            lambda meters: utils.get_perplexity(meters["nll_loss"].avg))
    else:
        metrics.log_derived(
            "ppl", lambda meters: utils.get_perplexity(meters["loss"].avg))
def get_valid_stats(args, trainer, stats):
    """Augment validation *stats* with ppl, update count and best metric."""
    # derive ppl from nll_loss unless the caller already provided it
    if 'nll_loss' in stats and 'ppl' not in stats:
        stats['ppl'] = utils.get_perplexity(stats['nll_loss'])
    stats['num_updates'] = trainer.get_num_updates()
    if hasattr(checkpoint_utils.save_checkpoint, 'best'):
        key = 'best_{0}'.format(args.best_checkpoint_metric)
        # pick max/min depending on whether the metric should be maximized
        best_function = max if args.maximize_best_checkpoint_metric else min
        stats[key] = best_function(
            checkpoint_utils.save_checkpoint.best,
            stats[args.best_checkpoint_metric],
        )
    return stats
def reduce_metrics(logging_outputs) -> None:
    """Aggregate logging outputs from data parallel training (copied from
    normal cross entropy)."""
    loss_sum = sum(log.get("loss", 0) for log in logging_outputs)
    ntokens = sum(log.get("ntokens", 0) for log in logging_outputs)
    sample_size = sum(log.get("sample_size", 0) for log in logging_outputs)
    metrics.log_scalar("loss", loss_sum / sample_size / math.log(2),
                       sample_size, round=3)
    if sample_size != ntokens:
        metrics.log_scalar("nll_loss", loss_sum / ntokens / math.log(2),
                           ntokens, round=3)
        metrics.log_derived(
            "ppl",
            lambda meters: utils.get_perplexity(meters["nll_loss"].avg))
    else:
        metrics.log_derived(
            "ppl", lambda meters: utils.get_perplexity(meters["loss"].avg))
    # pass 1: log raw counts, remembering them for the accuracy pass below
    counts = {}
    for key in logging_outputs[0]:
        if key.startswith("count_"):
            total = sum(log[key] for log in logging_outputs)
            metrics.log_scalar(key, total)
            counts[key] = total
    # pass 2: per-key losses (in bits) and accuracies (correct_/count_)
    for key in logging_outputs[0]:
        if key.startswith("loss_"):
            total = sum(log[key] for log in logging_outputs)
            metrics.log_scalar(key, total / sample_size / math.log(2), round=3)
        elif key.startswith("correct_"):
            total = sum(log[key] for log in logging_outputs)
            metrics.log_scalar(
                key, total / counts[re.sub("correct", "count", key)])
def get_valid_stats(trainer):
    """Collect validation meters and track the best loss seen so far."""
    stats = collections.OrderedDict()
    stats['loss'] = trainer.get_meter('valid_loss')
    nll_meter = trainer.get_meter('valid_nll_loss')
    if nll_meter.count > 0:
        stats['nll_loss'] = nll_meter
    else:
        # no NLL updates recorded; use the validation loss for perplexity
        nll_meter = stats['loss']
    stats['ppl'] = utils.get_perplexity(nll_meter.avg)
    stats['num_updates'] = trainer.get_num_updates()
    if hasattr(checkpoint_utils.save_checkpoint, 'best'):
        stats['best_loss'] = min(checkpoint_utils.save_checkpoint.best,
                                 stats['loss'].avg)
    return stats
def reduce_metrics(logging_outputs):
    """Aggregate logging outputs from data parallel training."""
    def _accum(field):
        # total of one logged field across all workers
        return sum(log.get(field, 0) for log in logging_outputs)

    loss_sum = _accum('loss')
    correct = _accum('correct')
    ntokens = _accum('ntokens')
    sample_size = _accum('sample_size')
    # NOTE(review): unlike siblings, these losses are not divided by ln 2,
    # and 'nll_loss' is normalized by sample_size rather than ntokens —
    # presumably intentional here; confirm against the criterion
    metrics.log_scalar('loss', loss_sum / sample_size, sample_size, round=3)
    if sample_size != ntokens:
        metrics.log_scalar('nll_loss', loss_sum / sample_size, ntokens,
                           round=3)
        metrics.log_derived(
            'ppl',
            lambda meters: utils.get_perplexity(meters['nll_loss'].avg))
    else:
        metrics.log_derived(
            'ppl', lambda meters: utils.get_perplexity(meters['loss'].avg))
    metrics.log_scalar('acc', correct * 100.0 / sample_size, round=3)