def feed_data(self, data_loader, training=True):
    """
    Runs a profiled training/validation pass over batches from data_loader.

    Tracing (CUPTI or Nsight, depending on self.cupti / self.nsight) begins
    at iteration ``self.profile_start`` and ends at ``self.profile_stop``,
    at which point the loop exits early.

    :param data_loader: data loader yielding (src, tgt, _) batches
    :param training: if True runs training else runs validation
    :returns: average loss per token over the executed iterations
    """
    if training:
        assert self.optimizer is not None
    losses_per_token = AverageMeter()
    start_time = 0
    end_time = 0
    t0 = time.time()
    for i, (src, tgt, _) in enumerate(data_loader):
        if i == self.profile_start:
            if self.cupti:
                start_cupti_tracing()
            elif self.nsight:
                torch.cuda.profiler.start()
            start_time = time.time()
        if i == self.profile_stop:
            # stop tracing and leave the loop
            # (merged the original's two duplicate `i == self.profile_stop` checks)
            end_time = time.time()
            if self.cupti:
                end_cupti_tracing()
            elif self.nsight:
                torch.cuda.profiler.stop()
            break
        self.save_counter += 1
        # do a train/evaluate iteration
        stats = self.iterate(src, tgt, training=training)
        loss_per_token, loss_per_sentence, num_toks = stats
        # record the loss so the returned average is meaningful
        # (the original never updated this meter and returned its initial value)
        losses_per_token.update(loss_per_token, num_toks['tgt'])
        save_chkpt = (self.save_counter % self.save_freq) == (self.save_freq - 1)
        if training and save_chkpt:
            self.save_counter = 0
            self.save_info['iteration'] = i
            identifier = next(self.checkpoint_counter, -1)
            if identifier != -1:
                with sync_workers() as rank:
                    # only rank 0 writes the checkpoint
                    if rank == 0:
                        self.save(identifier=identifier)
        t1 = time.time()
        print("iteration {}: {} ms".format(i, (t1 - t0) * 1000))
        t0 = t1
    # guard against ZeroDivisionError when profile_stop == profile_start
    # (and against printing a meaningless 0.00 when the window never ran)
    profiled_iters = self.profile_stop - self.profile_start
    if profiled_iters > 0:
        print("average time {:.2f} ms".format(
            (end_time - start_time) * 1000 / profiled_iters))
    return losses_per_token.avg
def feed_data(self, data_loader, training=True):
    """
    Iterates over data_loader running one train (or eval) step per batch,
    printing throughput and loss statistics every ``self.print_freq``
    iterations and checkpointing every ``self.save_freq`` steps.

    :param data_loader: data loader yielding (src, tgt, _) batches
    :param training: if True runs training else runs validation
    :returns: average loss per token
    """
    if training:
        assert self.optimizer is not None

    batch_time = AverageMeter()
    data_time = AverageMeter()
    losses_per_token = AverageMeter()
    losses_per_sentence = AverageMeter()
    tot_tok_time = AverageMeter()
    src_tok_time = AverageMeter()
    tgt_tok_time = AverageMeter()
    batch_size = data_loader.batch_size

    end = time.time()
    for i, (src, tgt, _) in enumerate(data_loader):
        print("iteration {}".format(i))
        # honor an optional step cap (num_steps <= 0 means "no cap")
        if 0 < self.num_steps <= i:
            break
        # start CUPTI tracing once warm-up iterations have passed
        if i == 5 and self.cupti:
            start_cupti_tracing()
        self.save_counter += 1

        # time spent waiting on the data loader
        data_time.update(time.time() - end)

        # one train/evaluate step
        loss_per_token, loss_per_sentence, num_toks = self.iterate(
            src, tgt, training=training)

        # record losses
        losses_per_token.update(loss_per_token, num_toks['tgt'])
        losses_per_sentence.update(loss_per_sentence, batch_size)

        # throughput bookkeeping
        elapsed = time.time() - end
        batch_time.update(elapsed)
        src_tok_time.update(num_toks['src'] / elapsed)
        tgt_tok_time.update(num_toks['tgt'] / elapsed)
        tot_tok_time.update((num_toks['tgt'] + num_toks['src']) / elapsed)
        self.loss = losses_per_token.avg
        end = time.time()

        if i % self.print_freq == 0:
            phase = 'TRAIN' if training else 'EVAL'
            log = ['{} [{}][{}/{}]'.format(phase, self.epoch, i, len(data_loader))]
            log.append('Time {:.3f} ({:.3f})'.format(batch_time.val, batch_time.avg))
            log.append('Data {:.3f} ({:.3f})'.format(data_time.val, data_time.avg))
            log.append('Tok/s {:.0f} ({:.0f})'.format(tot_tok_time.val, tot_tok_time.avg))
            if self.verbose:
                log.append('Src tok/s {:.0f} ({:.0f})'.format(src_tok_time.val, src_tok_time.avg))
                log.append('Tgt tok/s {:.0f} ({:.0f})'.format(tgt_tok_time.val, tgt_tok_time.avg))
            log.append('Loss/sentence {:.1f} ({:.1f})'.format(losses_per_sentence.val, losses_per_sentence.avg))
            log.append('Loss/tok {:.8f} ({:.8f})'.format(losses_per_token.val, losses_per_token.avg))
            print('\t'.join(log))
            #logging.info(log)

        # periodic checkpointing (rank 0 only)
        if training and (self.save_counter % self.save_freq) == (self.save_freq - 1):
            self.save_counter = 0
            self.save_info['iteration'] = i
            identifier = next(self.checkpoint_counter, -1)
            if identifier != -1:
                with sync_workers() as rank:
                    if rank == 0:
                        self.save(identifier=identifier)

    if self.cupti:
        end_cupti_tracing()
    return losses_per_token.avg
def feed_data(self, data_loader, training=True):
    """
    Runs training or validation on batches from data_loader.

    :param data_loader: data loader
    :param training: if True runs training else runs validation
    :returns: (average loss per token, average total tokens/sec)
    """
    if training:
        assert self.optimizer is not None

    # choose the iterations at which intra-epoch BLEU evaluation runs;
    # they are aligned to gradient-update boundaries (multiples of iter_size)
    eval_fractions = np.linspace(0, 1, self.intra_epoch_eval + 2)[1:-1]
    iters_with_update = len(data_loader) // self.iter_size
    eval_iters = set((eval_fractions * iters_with_update).astype(int) * self.iter_size)

    batch_time = AverageMeter()
    data_time = AverageMeter()
    losses_per_token = AverageMeter(skip_first=False)
    losses_per_sentence = AverageMeter(skip_first=False)
    tot_tok_time = AverageMeter()
    src_tok_time = AverageMeter()
    tgt_tok_time = AverageMeter()
    batch_size = data_loader.batch_size

    end = time.time()
    for i, (src, tgt) in enumerate(data_loader):
        self.save_counter += 1
        # time spent waiting on the data loader
        data_time.update(time.time() - end)

        # apply the optimizer step only on the last batch of each
        # gradient-accumulation group
        update = (i % self.iter_size) == (self.iter_size - 1)

        # one train/evaluate step
        loss_per_token, loss_per_sentence, num_toks = self.iterate(
            src, tgt, update, training=training)

        # record losses
        losses_per_token.update(loss_per_token, num_toks['tgt'])
        losses_per_sentence.update(loss_per_sentence, batch_size)

        # throughput bookkeeping
        elapsed = time.time() - end
        batch_time.update(elapsed)
        src_tok_time.update(num_toks['src'] / elapsed)
        tgt_tok_time.update(num_toks['tgt'] / elapsed)
        tot_tok_time.update((num_toks['tgt'] + num_toks['src']) / elapsed)
        self.loss = losses_per_token.avg

        if training and i in eval_iters:
            # mid-epoch BLEU evaluation; restore train mode afterwards
            test_bleu, _ = self.translator.run(calc_bleu=True,
                                               epoch=self.epoch,
                                               iteration=i)
            log = [f'TRAIN [{self.epoch}][{i}/{len(data_loader)}]']
            log.append(f'BLEU: {test_bleu:.2f}')
            logging.info('\t'.join(log))
            self.model.train()
            self.preallocate(data_loader, training=True)

        if i % self.print_freq == 0:
            phase = 'TRAIN' if training else 'VALIDATION'
            log = [f'{phase} [{self.epoch}][{i}/{len(data_loader)}]']
            log.append(f'Time {batch_time.val:.3f} ({batch_time.avg:.3f})')
            log.append(f'Data {data_time.val:.2e} ({data_time.avg:.2e})')
            log.append(f'Tok/s {tot_tok_time.val:.0f} ({tot_tok_time.avg:.0f})')
            if self.verbose:
                log.append(f'Src tok/s {src_tok_time.val:.0f} ({src_tok_time.avg:.0f})')
                log.append(f'Tgt tok/s {tgt_tok_time.val:.0f} ({tgt_tok_time.avg:.0f})')
            log.append(f'Loss/sentence {losses_per_sentence.val:.1f} ({losses_per_sentence.avg:.1f})')
            log.append(f'Loss/tok {losses_per_token.val:.4f} ({losses_per_token.avg:.4f})')
            if training:
                lr = self.optimizer.param_groups[0]['lr']
                log.append(f'LR {lr:.3e}')
            logging.info('\t'.join(log))

        # periodic checkpointing (rank 0 only)
        if training and (self.save_counter % self.save_freq) == (self.save_freq - 1):
            self.save_counter = 0
            self.save_info['iteration'] = i
            identifier = next(self.checkpoint_counter, -1)
            if identifier != -1:
                with sync_workers() as rank:
                    if rank == 0:
                        self.save(identifier=identifier)

        end = time.time()

    # aggregate metrics across distributed workers
    tot_tok_time.reduce('sum')
    losses_per_token.reduce('mean')
    return losses_per_token.avg, tot_tok_time.avg
def feed_data(self, data_loader, training=True):
    """
    Runs training or validation on batches from data_loader, logging
    throughput and loss statistics every ``self.print_freq`` iterations
    and checkpointing every ``self.save_freq`` steps.

    :param data_loader: data loader yielding (src, tgt, _) batches
    :param training: if True runs training else runs validation
    :returns: average loss per token
    """
    if training:
        assert self.optimizer is not None

    batch_time = AverageMeter()
    data_time = AverageMeter()
    losses_per_token = AverageMeter()
    losses_per_sentence = AverageMeter()
    tot_tok_time = AverageMeter()
    src_tok_time = AverageMeter()
    tgt_tok_time = AverageMeter()
    batch_size = data_loader.batch_size

    end = time.time()
    for i, (src, tgt, _) in enumerate(data_loader):
        self.save_counter += 1
        # time spent waiting on the data loader
        data_time.update(time.time() - end)

        # one train/evaluate step
        loss_per_token, loss_per_sentence, num_toks = self.iterate(
            src, tgt, training=training)

        # record losses
        losses_per_token.update(loss_per_token, num_toks['tgt'])
        losses_per_sentence.update(loss_per_sentence, batch_size)

        # throughput bookkeeping
        elapsed = time.time() - end
        batch_time.update(elapsed)
        src_tok_time.update(num_toks['src'] / elapsed)
        tgt_tok_time.update(num_toks['tgt'] / elapsed)
        tot_tok_time.update((num_toks['tgt'] + num_toks['src']) / elapsed)
        self.loss = losses_per_token.avg
        end = time.time()

        if i % self.print_freq == 0:
            phase = 'TRAIN' if training else 'EVAL'
            log = [f'{phase} [{self.epoch}][{i}/{len(data_loader)}]']
            log.append(f'Time {batch_time.val:.3f} ({batch_time.avg:.3f})')
            log.append(f'Data {data_time.val:.3f} ({data_time.avg:.3f})')
            log.append(f'Tok/s {tot_tok_time.val:.0f} ({tot_tok_time.avg:.0f})')
            if self.verbose:
                log.append(f'Src tok/s {src_tok_time.val:.0f} ({src_tok_time.avg:.0f})')
                log.append(f'Tgt tok/s {tgt_tok_time.val:.0f} ({tgt_tok_time.avg:.0f})')
            log.append(f'Loss/sentence {losses_per_sentence.val:.1f} ({losses_per_sentence.avg:.1f})')
            log.append(f'Loss/tok {losses_per_token.val:.8f} ({losses_per_token.avg:.8f})')
            logging.info('\t'.join(log))

        # periodic checkpointing (rank 0 only)
        if training and (self.save_counter % self.save_freq) == (self.save_freq - 1):
            self.save_counter = 0
            self.save_info['iteration'] = i
            identifier = next(self.checkpoint_counter, -1)
            if identifier != -1:
                with sync_workers() as rank:
                    if rank == 0:
                        self.save(identifier=identifier)

    return losses_per_token.avg
def feed_data(self, data_loader, training=True):
    """
    Runs a profiled training/validation pass over batches from data_loader.

    Builds a model summary from one sample batch, profiles per-layer
    forward/backward times with torchprofiler for up to
    NUM_STEPS_TO_PROFILE iterations, merges the averaged timings into the
    summary entries, and (when training) emits a profile graph via
    create_graph().

    :param data_loader: data loader yielding (src, tgt) batches
    :param training: if True runs training else runs validation
    :returns: (average loss per token, average total tokens/sec)
    """
    if training:
        assert self.optimizer is not None
    eval_fractions = np.linspace(0, 1, self.intra_epoch_eval + 2)[1:-1]
    eval_iters = set((eval_fractions * len(data_loader)).astype(int))
    batch_time = AverageMeter()
    data_time = AverageMeter()
    losses_per_token = AverageMeter()
    losses_per_sentence = AverageMeter()
    tot_tok_time = AverageMeter()
    src_tok_time = AverageMeter()
    tgt_tok_time = AverageMeter()
    batch_size = data_loader.batch_size
    layer_timestamps = []
    verbose = True
    module_whitelist = ["EmuBidirLSTM", "RecurrentAttention", "Classifier"]

    # grab one batch to build the model summary
    # (replaces the original's `for ...: break` idiom)
    src, tgt = next(iter(data_loader))
    (src, src_length) = src
    (tgt, tgt_length) = tgt
    src_length = torch.LongTensor(src_length).cuda()
    src = src.cuda()
    tgt = tgt.cuda()
    model_input = (src, src_length, tgt[:-1])
    summary = torchsummary.summary(model=self.model,
                                   module_whitelist=module_whitelist,
                                   model_input=model_input,
                                   verbose=True)

    end = time.time()
    NUM_STEPS_TO_PROFILE = 100  # profile 100 steps
    for i, (src, tgt) in enumerate(data_loader):
        self.save_counter += 1
        # measure data loading time
        data_time.update(time.time() - end)

        with torchprofiler.Profiling(self.model, module_whitelist) as p:
            # do a train/evaluate iteration
            stats = self.iterate(src, tgt, training=training)
            loss_per_token, loss_per_sentence, num_toks = stats
        print(str(p))
        layer_timestamps.append(p.processed_times())

        # measure accuracy and record loss
        losses_per_token.update(loss_per_token, num_toks['tgt'])
        losses_per_sentence.update(loss_per_sentence, batch_size)

        # measure elapsed time
        elapsed = time.time() - end
        batch_time.update(elapsed)
        src_tok_time.update(num_toks['src'] / elapsed)
        tgt_tok_time.update(num_toks['tgt'] / elapsed)
        tot_num_toks = num_toks['tgt'] + num_toks['src']
        tot_tok_time.update(tot_num_toks / elapsed)
        self.loss = losses_per_token.avg

        if training and i in eval_iters:
            # mid-epoch BLEU evaluation; restore train mode afterwards
            test_bleu, _ = self.translator.run(calc_bleu=True,
                                               epoch=self.epoch,
                                               iteration=i)
            log = []
            log += [f'TRAIN [{self.epoch}][{i}/{len(data_loader)}]']
            log += [f'BLEU: {test_bleu:.2f}']
            log = '\t'.join(log)
            logging.info(log)
            self.model.train()
            self.preallocate(data_loader, training=True)

        if i % self.print_freq == 0:
            phase = 'TRAIN' if training else 'VALIDATION'
            log = []
            log += [f'{phase} [{self.epoch}][{i}/{len(data_loader)}]']
            log += [f'Time {batch_time.val:.3f} ({batch_time.avg:.3f})']
            log += [f'Data {data_time.val:.5f} ({data_time.avg:.5f})']
            log += [f'Tok/s {tot_tok_time.val:.0f} ({tot_tok_time.avg:.0f})']
            if self.verbose:
                log += [f'Src tok/s {src_tok_time.val:.0f} ({src_tok_time.avg:.0f})']
                log += [f'Tgt tok/s {tgt_tok_time.val:.0f} ({tgt_tok_time.avg:.0f})']
            log += [f'Loss/sentence {losses_per_sentence.val:.1f} ({losses_per_sentence.avg:.1f})']
            log += [f'Loss/tok {losses_per_token.val:.4f} ({losses_per_token.avg:.4f})']
            lr = [param_group['lr'] for param_group in self.optimizer.param_groups]
            log += [f'Learning Rate {lr}']
            log = '\t'.join(log)
            logging.info(log)

        if i >= NUM_STEPS_TO_PROFILE:
            break

        save_chkpt = (self.save_counter % self.save_freq) == (self.save_freq - 1)
        if training and save_chkpt:
            self.save_counter = 0
            self.save_info['iteration'] = i
            identifier = next(self.checkpoint_counter, -1)
            if identifier != -1:
                with sync_workers() as rank:
                    if rank == 0:
                        self.save(identifier=identifier)
        end = time.time()

    # average per-layer forward/backward times (ms) over all profiled steps
    if verbose:
        print("\n==========================================================")
        print("Layer Type Forward Time (ms) Backward Time (ms)")
        print("==========================================================")
    tot_accounted_time = 0.0
    per_layer_times = []
    num_profiled_steps = len(layer_timestamps)  # hoisted loop invariant
    for i in range(len(layer_timestamps[0])):
        layer_type = str(layer_timestamps[0][i][0])
        layer_forward_time_sum = 0.0
        layer_backward_time_sum = 0.0
        for j in range(num_profiled_steps):
            layer_forward_time_sum += (layer_timestamps[j][i][2] / 1000)
            layer_backward_time_sum += (layer_timestamps[j][i][5] / 1000)
        per_layer_times.append(
            (layer_type,
             layer_forward_time_sum / num_profiled_steps,
             layer_backward_time_sum / num_profiled_steps))
        if verbose:
            print(per_layer_times[-1][0], per_layer_times[-1][1],
                  per_layer_times[-1][2])
        tot_accounted_time += (per_layer_times[-1][1] + per_layer_times[-1][2])
    print("Total accounted time: %.3f ms" % tot_accounted_time)

    # merge timings into the summary: each timing attaches to the first
    # summary entry with a matching layer name that has no timing yet
    # (removed the original's unused summary_i/per_layer_times_i/
    # last_summary_i/last_per_layer_times_i trackers; the destructive
    # while/pop(0) loop is now a plain iteration)
    for per_layer_time in per_layer_times:
        for summary_elem in summary:
            if str(summary_elem['layer_name']) != str(per_layer_time[0]):
                continue
            if 'forward_time' in summary_elem and 'backward_time' in summary_elem:
                continue
            summary_elem['forward_time'] = per_layer_time[1]
            summary_elem['backward_time'] = per_layer_time[2]
            break

    if training:
        create_graph(self.model, module_whitelist, (src, tgt), summary,
                     os.path.join("profiles", self.arch))

    # aggregate metrics across distributed workers
    tot_tok_time.reduce('sum')
    losses_per_token.reduce('mean')
    return losses_per_token.avg, tot_tok_time.avg