def advi_callback(params, t, g, results, delta_results, model, eval_function,
                  hparams):
    results.append(eval_function(params))
    if (t + 1) % hparams['advi_callback_iteration'] == 0:
        if len(results) > hparams['advi_callback_iteration']:
            previous_elbo = results[-(hparams['advi_callback_iteration'] + 1)]
        else:
            previous_elbo = 0.0
        current_elbo = results[-1]
        delta_results.append(relative_difference(previous_elbo, current_elbo))
        delta_elbo_mean = np.nanmean(delta_results)
        delta_elbo_median = np.nanmedian(delta_results)
        if (delta_elbo_median <= hparams['advi_convergence_threshold']
                or delta_elbo_mean <= hparams['advi_convergence_threshold']):
            tqdm.write("Converged early according to ADVI metrics for Median/Mean")
            tqdm.write(f"Iteration {t + 1}")
            tqdm.write(f"Rel. tolerance Δ threshold: "
                       f"{hparams['advi_convergence_threshold']}")
            tqdm.write(f"Rel. tolerance Δ mean: {delta_elbo_mean:.5f}")
            tqdm.write(f"Rel. tolerance Δ median: {delta_elbo_median:.5f}")
            return "exit"
    return None
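# A minimal usage sketch (not from the original source) of driving
# advi_callback from an optimizer that invokes callback(params, t, gradient)
# once per step. `elbo_estimate` and the toy gradient update are hypothetical
# stand-ins; `relative_difference` and `tqdm` are assumed to come from the
# module that defines advi_callback above.
from functools import partial

import numpy as np

def elbo_estimate(params):
    return -float(np.sum(params ** 2))  # toy ELBO surrogate

results, delta_results = [], []
hparams = {'advi_callback_iteration': 50, 'advi_convergence_threshold': 1e-4}
callback = partial(advi_callback, results=results, delta_results=delta_results,
                   model=None, eval_function=elbo_estimate, hparams=hparams)

params = np.ones(10)
for t in range(5000):
    g = 2 * params              # placeholder gradient of the toy objective
    params = params - 1e-2 * g
    if callback(params, t, g) == "exit":  # honor the early-exit sentinel
        break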
def train(model, optimizer, scheduler, dataloader, start_epoch, num_epochs,
          training_mode, output_dir='', save_freq=100, val_dataloader=None,
          loss_fn=None, finetune_fn=None, semantic_model=None):
    writer = SummaryWriter(output_dir)
    for epoch in tqdm(range(start_epoch, num_epochs + 1)):
        epoch_train_loss, epoch_train_misclassification_percentage = train_epoch(
            model, dataloader, optimizer, loss_fn, finetune_fn, semantic_model)
        epoch_val_loss, epoch_val_misclassification_percentage = val_epoch(
            model, val_dataloader, loss_fn, finetune_fn, semantic_model)
        scheduler.step()
        tqdm.write(
            f"Epoch: {epoch} \t Train Loss: {epoch_train_loss:.4f} \t "
            f"Train Misclassified %: {epoch_train_misclassification_percentage * 100:.2f} \t "
            f"Val Loss: {epoch_val_loss:.4f} \t "
            f"Val Misclassified %: {epoch_val_misclassification_percentage * 100:.2f}\t "
            f"{output_dir}")
        writer.add_scalar('Loss/Train', epoch_train_loss, epoch)
        writer.add_scalar('Loss/Val', epoch_val_loss, epoch)
        writer.add_scalar('Misclassified_Percentage/Train',
                          epoch_train_misclassification_percentage, epoch)
        writer.add_scalar('Misclassified_Percentage/Val',
                          epoch_val_misclassification_percentage, epoch)
        if semantic_model:
            torch.save({'model': semantic_model},
                       os.path.join(output_dir, 'semantic_latest.pth'))
        torch.save({'model': model,
                    'optimizer_state_dict': optimizer.state_dict(),
                    'scheduler_state_dict': scheduler.state_dict(),
                    'epoch': epoch},
                   os.path.join(output_dir, 'latest.pth'))
        if epoch % save_freq == 0:
            torch.save({'model': model,
                        'optimizer_state_dict': optimizer.state_dict(),
                        'scheduler_state_dict': scheduler.state_dict(),
                        'epoch': epoch},
                       os.path.join(output_dir, f'{epoch:04d}.pth'))
            if semantic_model:
                torch.save({'model': semantic_model},
                           os.path.join(output_dir, f'semantic_{epoch:04d}.pth'))
def validate(self, num_batch, device='cuda:0'):
    self.model.eval()
    loss = 0
    with torch.no_grad():
        for batch_idx in tqdm(range(num_batch)):
            velocities, positions, initial_position = self.val_dataset.__next__()
            velocities = velocities.to(device)
            positions = positions.to(device)
            initial_position = initial_position.to(device)
            predict = initial_position
            prediction = np.zeros(
                (self.val_dataset.batch_size, self.val_dataset.steps, 2))
            for step in range(self.val_dataset.steps):
                vel = velocities[:, step]
                pos = positions[:, step]
                predict = self.model.step(predict, vel)
                prediction[:, step] = predict.cpu().numpy()
                loss += F.mse_loss(predict, pos)
    loss /= self.val_dataset.steps
    loss /= num_batch
    tqdm.write('Test Loss: {:7f}'.format(loss.item()))
    self.save_fig(prediction, positions, name='epoch_{}'.format(self.epoch))
    self.create_halfspace_plot(self.model, self.val_dataset.bounds)
    return loss.item()
def sleep(self, e):
    tqdm.write("{color}[{message} {sleep_time}]{reset}".format(
        color=Fore.BLUE,
        sleep_time=e.sleep_time,
        message=e.msg,
        reset=Fore.RESET,
    ))
def emit(self, record: logging.LogRecord):
    r"""Format and write a log record without disrupting the progress bar.

    Args:
        record (logging.LogRecord): record to be formatted and written.
    """
    msg = self.format(record)
    tqdm.write(msg)
def log_message(self, message):
    """
    Logs a message, preserving the progress bar's output format.

    Args:
        message (str): string you wish to log.
    """
    from tqdm import tqdm
    tqdm.write(message, **self.tqdm_kwargs)
def run(self):
    """Runs the training scheme for the given number of epochs and sample steps.

    Generates samples periodically, after each epoch finishes. The tqdm
    progress bars show the current loss without printing every step.
    Uses nested progress bars; expect buggy behavior.

    Arguments
    -----------
    No arguments passed.

    Returns
    -----------
    Does not return anything."""
    with warnings.catch_warnings():
        warnings.simplefilter('ignore')
        with tqdm(range(self.args.num_epochs), dynamic_ncols=True,
                  initial=self.start_1) as pbar1:
            for epoch in pbar1:
                if self.args.increase_batch_size and (epoch + self.start_1) \
                        and (epoch + self.start_1) % self.args.increase_batch_size == 0:
                    self.wavenet.accumulate *= 2
                    self.train_data_loader.dataset.dataset_length *= 2
                    tqdm.write('Accumulate = {}'.format(self.wavenet.accumulate))
                with tqdm(self.train_data_loader, dynamic_ncols=True,
                          initial=self.start_2) as pbar2:
                    for target, condition in pbar2:
                        current_loss = self.wavenet.train(
                            target=target.cuda(non_blocking=True),
                            condition=condition.cuda(non_blocking=True),
                            output_length=self.args.output_length)
                        pbar2.set_postfix(loss=current_loss)
                self.start_2 = 0
                with torch.no_grad():
                    test_loss = []
                    with tqdm(self.test_data_loader, dynamic_ncols=True) as pbar3:
                        for target, condition in pbar3:
                            current_loss = self.wavenet.get_loss(
                                target=target.cuda(non_blocking=True),
                                condition=condition.cuda(non_blocking=True),
                                output_length=self.args.output_length).item()
                            test_loss.append(current_loss)
                            pbar3.set_postfix(loss=current_loss)
                    test_loss = sum(test_loss) / len(test_loss)
                    pbar1.set_postfix(loss=test_loss)
                    sampled_image = self.sample(num=1, name=self.wavenet.step)
                    self.write_test_loss(loss=test_loss, image=sampled_image)
                self.wavenet.save()
    self.test_writer.close()
    self.train_writer.close()
def emit(self, record):
    try:
        msg = self.format(record)
        tqdm.write(msg)
        self.flush()
    except (KeyboardInterrupt, SystemExit):
        raise
    except Exception:
        self.handleError(record)
def log_message(self, message: str) -> None:
    """
    Logs a message, preserving the progress bar's output format.

    Args:
        message: string you wish to log.
    """
    from tqdm import tqdm
    tqdm.write(message, file=self.tqdm_kwargs.get("file"))
def _compute_matrix_profile(self):
    """
    Compute the matrix profile using STAMP.
    """
    n_iter = 0  # defined up front so the interrupt handler can report it
    try:
        for n_iter, idx in enumerate(self._iterator):
            D = utils.mass(self.ts2[idx: idx + self.window_size], self.ts1)
            self._elementwise_min(D, idx)
    except KeyboardInterrupt:
        if self.verbose:
            tqdm.write("Calculation interrupted at iteration {}. "
                       "Approximate result returned.".format(n_iter))
def _compute_matrix_profile(self):
    """
    Compute the matrix profile using PreSCRIMP.
    """
    n_iter = 0  # defined up front so the interrupt handler can report it
    try:
        mu_T, sigma_T = utils.rolling_avg_sd(self.ts1, self.window_size)
        if self._same_ts:
            mu_Q, sigma_Q = mu_T, sigma_T
        else:
            mu_Q, sigma_Q = utils.rolling_avg_sd(self.ts2, self.window_size)
        for n_iter, idx in enumerate(self._iterator):
            D = utils.mass(self.ts2[idx: idx + self.window_size], self.ts1)
            self._elementwise_min(D, idx)
            jdx = np.argmin(D)  # the index of the closest profile to the current idx
            # compute diagonals until the next sampled point
            q1 = self.ts2[idx:idx + self.sample_interval + self.window_size - 1]
            q2 = self.ts1[jdx:jdx + self.sample_interval + self.window_size - 1]
            lq = min(len(q1), len(q2))
            q = q1[:lq] * q2[:lq]
            q = utils.rolling_sum(q, self.window_size)
            D = utils.calculate_distance_profile(
                q, self.window_size,
                mu_Q[idx:idx + len(q)], sigma_Q[idx:idx + len(q)],
                mu_T[jdx:jdx + len(q)], sigma_T[jdx:jdx + len(q)])
            self._index_profile[jdx:jdx + len(q)] = np.where(
                D < self._matrix_profile[jdx:jdx + len(q)],
                np.arange(idx, idx + len(q)),
                self._index_profile[jdx:jdx + len(q)])
            self._matrix_profile[jdx:jdx + len(q)] = np.minimum(
                D, self._matrix_profile[jdx:jdx + len(q)])
            if self._same_ts:
                self._index_profile[idx:idx + len(q)] = np.where(
                    D < self._matrix_profile[idx:idx + len(q)],
                    np.arange(jdx, jdx + len(q)),
                    self._index_profile[idx:idx + len(q)])
                self._matrix_profile[idx:idx + len(q)] = np.minimum(
                    D, self._matrix_profile[idx:idx + len(q)])
            # compute diagonals until the previous sampled point
            if idx != 0 and jdx != 0:
                q1 = self.ts2[max(0, idx - self.sample_interval):(idx + self.window_size - 1)]
                q2 = self.ts1[max(0, jdx - self.sample_interval):(jdx + self.window_size - 1)]
                lq = min(len(q1), len(q2))
                q = q1[-lq:] * q2[-lq:]
                q = utils.rolling_sum(q, self.window_size)
                D = utils.calculate_distance_profile(
                    q, self.window_size,
                    mu_Q[idx - len(q):idx], sigma_Q[idx - len(q):idx],
                    mu_T[jdx - len(q):jdx], sigma_T[jdx - len(q):jdx])
                self._index_profile[jdx - len(q):jdx] = np.where(
                    D < self._matrix_profile[jdx - len(q):jdx],
                    np.arange(idx - len(q), idx),
                    self._index_profile[jdx - len(q):jdx])
                self._matrix_profile[jdx - len(q):jdx] = np.minimum(
                    D, self._matrix_profile[jdx - len(q):jdx])
                if self._same_ts:
                    self._index_profile[idx - len(q):idx] = np.where(
                        D < self._matrix_profile[idx - len(q):idx],
                        np.arange(jdx - len(q), jdx),
                        self._index_profile[idx - len(q):idx])
                    self._matrix_profile[idx - len(q):idx] = np.minimum(
                        D, self._matrix_profile[idx - len(q):idx])
    except KeyboardInterrupt:
        if self.verbose:
            tqdm.write("Calculation interrupted at iteration {}. "
                       "Approximate result returned.".format(n_iter))
def emit(self, record):
    """
    Emit a record via the tqdm screen.
    """
    try:
        msg = self.format(record)
        tqdm.write(msg)
        self.flush()
    except (KeyboardInterrupt, SystemExit):
        raise
    except Exception:
        self.handleError(record)
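# Usage sketch (an assumption, not part of the source): the emit() above
# belongs to a logging.Handler subclass — hypothetically named
# TqdmLoggingHandler here — registered so that log records are routed through
# tqdm.write and printed above any active progress bar instead of through it.
import logging
from tqdm import tqdm

logger = logging.getLogger(__name__)
logger.setLevel(logging.INFO)
logger.addHandler(TqdmLoggingHandler())  # hypothetical class wrapping emit()

for i in tqdm(range(100)):
    if i % 10 == 0:
        logger.info("processed %d items", i)  # appears above the bar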
def _preprocess(self):
    """
    Run PreSCRIMP before the SCRIMP algorithm when pre_scrimp > 0.
    """
    if self.pre_scrimp > 0:
        if self.verbose:
            tqdm.write("PreSCRIMP:")
        self._pre_scrimp_class = PreSCRIMP(
            self.ts1, None if self._same_ts else self.ts2,
            window_size=self.window_size, exclusion_zone=self.ez,
            verbose=self.verbose, s_size=self.s_size,
            sample_rate=self.pre_scrimp)
        self._matrix_profile, self._index_profile = \
            self._pre_scrimp_class.get_profiles()
def train(model, optimizer, scheduler, dataloader, start_epoch, device,
          num_epochs, training_mode, context_mode,
          output_dir='./model_parameters/', save_freq=100,
          val_dataloader=None):
    writer = SummaryWriter(output_dir)
    for epoch in tqdm(range(start_epoch, num_epochs + 1)):
        epoch_train_loss, epoch_train_misclassification_percentage = \
            train_epoch(model, dataloader, training_mode, context_mode,
                        optimizer, device)
        epoch_val_loss, epoch_val_misclassification_percentage = \
            val_epoch(model, val_dataloader, training_mode, context_mode,
                      device)
        scheduler.step()
        tqdm.write(
            f"Epoch: {epoch} \t "
            f"Train Misclassified Percentage: {epoch_train_misclassification_percentage} \t "
            f"Val Misclassified Percentage: {epoch_val_misclassification_percentage}\t "
            f"{output_dir}")
        writer.add_scalar('Loss/Train', epoch_train_loss, epoch)
        writer.add_scalar('Loss/Val', epoch_val_loss, epoch)
        writer.add_scalar('Misclassified Percentage/Train',
                          epoch_train_misclassification_percentage, epoch)
        writer.add_scalar('Misclassified Percentage/Val',
                          epoch_val_misclassification_percentage, epoch)
        torch.save(
            {
                'model': model,
                'optimizer_state_dict': optimizer.state_dict(),
                'scheduler_state_dict': scheduler.state_dict(),
                'epoch': epoch
            }, os.path.join(output_dir, 'latest.pth'))
        if epoch % save_freq == 0:
            torch.save(
                {
                    'model': model,
                    'optimizer_state_dict': optimizer.state_dict(),
                    'scheduler_state_dict': scheduler.state_dict(),
                    'epoch': epoch
                }, os.path.join(output_dir, f'{epoch:04d}.pth'))
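# Resume sketch (an assumption, not from the source): the checkpoints saved
# above store the full model object plus optimizer/scheduler state dicts, so
# a run can be resumed by reading them back like this. `output_dir`,
# `optimizer`, and `scheduler` are assumed to already exist in scope.
ckpt = torch.load(os.path.join(output_dir, 'latest.pth'))
model = ckpt['model']
optimizer.load_state_dict(ckpt['optimizer_state_dict'])
scheduler.load_state_dict(ckpt['scheduler_state_dict'])
start_epoch = ckpt['epoch'] + 1  # continue from the epoch after the checkpoint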
def train(self):
    for n in range(FLAGS.epochs):
        # Note: tqdm.write returns None, so this prints the epoch header once
        # before the bar starts and leaves the bar's desc empty.
        for _ in tqdm(iterable=range(FLAGS.steps_per_epoch),
                      ncols=int(get_terminal_width() * .9),
                      desc=tqdm.write(f'Epoch {n + 1} / {FLAGS.epochs}'),
                      unit=' steps'):
            self.train_step(self.img)
        save_img(self.img.read_value(), n + 1)
def load(self, path: str):
    """Loads model state from a checkpoint.

    Arguments
    -----------
    path : string
        Path to the checkpoint to load from.

    Returns
    -----------
    Does not return anything.
    """
    tqdm.write('Loading from {}'.format(path))
    load = torch.load(path)
    self.net.load_state_dict(load['model'])
    self.optimizer.load_state_dict(load['optimizer'])
    self.accumulate = load['accumulate']
    self.step = load['step']
    self.count = load['count']
def sample(self, step, temperature=1., init=None, nonzero=None, diff=None,
           nonzero_diff=None, condition=None, length=2048):
    if not os.path.isdir('Samples'):
        os.mkdir('Samples')
    # note: `diff` is accepted but not forwarded to generate
    roll = self.generate(temperature, init, nonzero, nonzero_diff,
                         condition, length).detach().cpu().numpy()
    roll = clean(roll)
    save_roll(roll, step)
    midi = piano_rolls_to_midi(roll)
    midi.write('Samples/{}.mid'.format(step))
    tqdm.write('Saved to Samples/{}.mid'.format(step))
    roll = np.expand_dims(roll.T, axis=0)
    return roll
def _compute_matrix_profile(self):
    """
    Compute the matrix profile using SCRIMP.
    """
    if self.verbose and self.pre_scrimp > 0:
        tqdm.write("SCRIMP:")
    n_iter = 0  # defined up front so the interrupt handler can report it
    try:
        n1 = len(self.ts1)
        n2 = len(self.ts2)
        mu_T, sigma_T = utils.rolling_avg_sd(self.ts1, self.window_size)
        if self._same_ts:
            mu_Q, sigma_Q = mu_T, sigma_T
        else:
            mu_Q, sigma_Q = utils.rolling_avg_sd(self.ts2, self.window_size)
        for n_iter, k in enumerate(self._iterator):
            if k >= 0:
                # compute diagonals starting from a slot in the first column
                q = self.ts2[k:k + n1] * self.ts1[:n2 - k]
                q = utils.rolling_sum(q, self.window_size)
                D = utils.calculate_distance_profile(
                    q, self.window_size,
                    mu_Q[k:k + len(q)], sigma_Q[k:k + len(q)],
                    mu_T[:len(q)], sigma_T[:len(q)])
                self._index_profile[:len(q)] = np.where(
                    D < self._matrix_profile[:len(q)],
                    np.arange(k, k + len(q)),
                    self._index_profile[:len(q)])
                self._matrix_profile[:len(q)] = np.minimum(
                    D, self._matrix_profile[:len(q)])
                if self._same_ts:
                    self._index_profile[k:k + len(q)] = np.where(
                        D < self._matrix_profile[k:k + len(q)],
                        np.arange(len(q)),
                        self._index_profile[k:k + len(q)])
                    self._matrix_profile[k:k + len(q)] = np.minimum(
                        D, self._matrix_profile[k:k + len(q)])
            else:
                # compute diagonals starting from a slot in the first row
                k = -k
                q = self.ts2[:n1 - k] * self.ts1[k:k + n2]
                q = utils.rolling_sum(q, self.window_size)
                D = utils.calculate_distance_profile(
                    q, self.window_size,
                    mu_Q[:len(q)], sigma_Q[:len(q)],
                    mu_T[k:k + len(q)], sigma_T[k:k + len(q)])
                self._index_profile[k:k + len(q)] = np.where(
                    D < self._matrix_profile[k:k + len(q)],
                    np.arange(len(q)),
                    self._index_profile[k:k + len(q)])
                self._matrix_profile[k:k + len(q)] = np.minimum(
                    D, self._matrix_profile[k:k + len(q)])
    except KeyboardInterrupt:
        if self.verbose:
            tqdm.write("Calculation interrupted at iteration {}. "
                       "Approximate result returned.".format(n_iter))
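# Usage sketch (assumed API, inferred from the methods above): because the
# computation catches KeyboardInterrupt, a long run can be stopped with Ctrl-C
# and still yield an approximate result through get_profiles(). The
# constructor mirrors the PreSCRIMP call in _preprocess above; the SCRIMP
# entry point and its argument names are assumptions.
import numpy as np

ts = np.random.randn(10_000)
scrimp = SCRIMP(ts, window_size=100, verbose=True)  # hypothetical constructor
matrix_profile, index_profile = scrimp.get_profiles()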
def pbar(self, epoch):
    bar = tqdm(
        total=(int(self.train_dataset_size * self.reps)
               // int(self.batch_size.numpy())) * int(self.batch_size.numpy()),
        ncols=int(self.get_terminal_width() * .9),
        desc=tqdm.write(f' \n Epoch {int(epoch)}/{int(self.num_epochs.numpy())}'),
        postfix=self.postfix,
        bar_format=self.bar_format,
        unit=' Samples')
    return bar
def trainer(self, batch_gen, val_bg, batch_backward=8, max_epochs=30,
            step_summary=32, ckp_steps=5000, lr_steps=3000):
    overall_loss = 0.0
    bloss = 0.0
    step = 0
    epoch_bar = tqdm(total=max_epochs, desc='Epochs')
    epoch = 0
    while epoch < max_epochs:
        batch_iter = batch_gen.__iter__()
        true_train = []
        pred_train = []
        for _ in tqdm(range(len(batch_gen.data)), desc='Train Step', leave=False):
            x, y, epoch_pass = batch_iter.__next__()
            sloss, pred = self.train_iter(x, y)
            sloss = sloss / batch_backward
            true_train.append(y[0].item())
            pred_train.append(pred.argmax().item())
            sloss.backward()
            bloss += sloss.detach()
            step += 1
            if step % ckp_steps == 0:
                self.store_model('%s-%d.pt' % (self.model_name, step))
            if step % batch_backward == 0:
                torch.nn.utils.clip_grad_norm_(self.parameters(), 1)
                self.opt.step()
                self.opt.zero_grad()
                overall_loss += bloss
                bloss = 0.0
            if step % step_summary == 0:
                tqdm.write('Step %d, Loss %.3f'
                           % (step, overall_loss / (step_summary / batch_backward)))
                overall_loss = 0.0
            if step % lr_steps == 0:
                self.lr.step()
        epoch += 1
        train_acc = np.sum(np.array(true_train) == np.array(pred_train)) / len(batch_gen.data)
        val_acc = self.get_validation_acc(val_bg)
        tqdm.write('Epoch %d, Train Accuracy: %.3f, Validation Accuracy: %.3f'
                   % (epoch, train_acc, val_acc))
        epoch_bar.update(1)
    epoch_bar.close()
def vbar(total_images, epoch, epochs):
    bar = mybar(total=total_images,
                ncols=int(get_terminal_width() * .9),
                desc=tqdm.write(f'Epoch {epoch + 1}/{epochs}'),
                postfix={
                    'd_val_loss': f'{0:6.3f}',
                    1: 1
                },
                bar_format='{n_fmt}/{total_fmt} |{bar}| {rate_fmt} '
                           'ETA: {remaining} Elapsed Time: {elapsed} '
                           'D Loss: {postfix[d_val_loss]}',
                unit=' images',
                miniters=10)
    return bar
def sample(self, name: str, init: torch.Tensor, condition: torch.Tensor,
           temperature=1.):
    """Sampling: wraps generate and handles saving to MIDI and saving the
    roll as a plot.

    Arguments
    --------------
    name : int or str
        The name of the resulting sampled MIDI file.
    init : Tensor or None
        Initializing tensor for the Wavenet to enable fast generation.
        None is currently not supported.
    condition : Tensor or None
        Condition tensor for the Wavenet. None is currently not supported.
    temperature : float
        Sampling temperature; >1 means more randomness, <1 means less.

    Returns
    --------------
    to_image(roll) : np.array
        2D piano roll representation of the generated sample.
    """
    if not os.path.isdir('Samples'):
        os.mkdir('Samples')
    roll = clean(self.generate(init, condition, temperature))
    save_roll(roll, name)
    midi = piano_rolls_to_midi(roll)
    midi.write('Samples/{}.mid'.format(name))
    tqdm.write('Saved to Samples/{}.mid'.format(name))
    return to_image(roll)
def pbar(total_images, batch_size, epoch, epochs):
    bar = tqdm(total=(total_images // batch_size) * batch_size,
               ncols=int(get_terminal_width() * .9),
               desc=tqdm.write(f'Epoch {epoch + 1}/{epochs}'),
               postfix={
                   'g_loss': f'{0:6.3f}',
                   'd_loss': f'{0:6.3f}',
                   1: 1
               },
               bar_format='{n_fmt}/{total_fmt} |{bar}| {rate_fmt} '
                          'ETA: {remaining} Elapsed Time: {elapsed} '
                          'G Loss: {postfix[g_loss]} D Loss: {postfix[d_loss]}',
               unit=' images',
               miniters=10)
    return bar
def __init__(self, total_samples, batch_size, epoch, num_epochs, metrics):
    postfix = {m: f'{0:6.3f}' for m in metrics}
    postfix[1] = 1
    str_format = ('{n_fmt}/{total_fmt} |{bar}| {rate_fmt} '
                  'ETA: {remaining} Elapsed Time: {elapsed} '
                  + reduce(lambda x, y: x + y,
                           ["%s:{postfix[%s]} " % (m, m) for m in metrics], ""))
    super(ProgressBar, self).__init__(
        total=(total_samples // batch_size) * batch_size,
        ncols=int(ProgressBar._get_terminal_width() * .9),
        desc=tqdm.write(f'Epoch {epoch + 1}/{num_epochs}'),
        postfix=postfix,
        bar_format=str_format,
        unit='samples',
        miniters=10)
    self._batch_size = batch_size
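# Usage sketch (an assumption, not from the source): instantiating the
# ProgressBar subclass above. The metric names are arbitrary examples; they
# become keys of the postfix dict referenced by the generated bar_format.
bar = ProgressBar(total_samples=60_000, batch_size=128, epoch=0,
                  num_epochs=10, metrics=['g_loss', 'd_loss'])
for _ in range(60_000 // 128):
    bar.update(128)  # advance by one batch of samples
bar.close()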
def console_write(mode, message):
    # one color per mode; unknown modes print nothing, as before
    colors = {
        "sleep": Fore.BLUE,
        "error": Fore.RED,
        "exists": Fore.YELLOW,
        "fail": Fore.RED,
    }
    if mode in colors:
        tqdm.write("{color}[{message}]{reset}".format(
            color=colors[mode], message=message, reset=Fore.RESET))
def main(img_path, json_path=None, out_dir="hmr/output"):
    if config.img_path.endswith('.csv'):
        csv = pd.read_csv(config.img_path)
    else:
        raise NotImplementedError
    sess = tf.Session()
    model = RunModel(config, sess=sess)
    for ind, item in tqdm(csv.iterrows(), desc='Creating avatars'):
        tqdm.write('Creating avatar for %s' % item.img_path)
        out_dir = Path(out_dir)
        img_path = Path(item.img_path)
        json_path = Path(item.annot_path)
        dump_path = out_name(out_dir, img_path, suffix='_verts.pkl')
        if Path(dump_path).exists():
            tqdm.write('Avatar is already created')
            continue
        input_img, proc_param, img = preprocess_image(img_path, str(json_path))
        # Add batch dimension: 1 x D x D x 3
        input_img = np.expand_dims(input_img, 0)
        joints, verts, cams, joints3d, theta = model.predict(
            input_img, get_theta=True)
        # Write outputs
        joints_csv = os.path.join(
            str(out_dir), "csv/",
            os.path.splitext(os.path.basename(str(img_path)))[0] + ".csv")
        export_joints(joints3d, joints_csv)
        # pose = pd.DataFrame(theta[:, 3:75])
        # pose.to_csv("hmr/output/theta_test.csv", header=None, index=None)
        # print('THETA:', pose.shape, pose)
        # import cv2
        # rotations = [cv2.Rodrigues(aa)[0] for aa in pose.reshape(-1, 3)]
        # print('ROTATIONS:', rotations)
        out_images_dir = os.path.join(str(out_dir), "images")
        # measure(theta[0][0], verts[0][0])  # view, batch
        # Write avatar
        with open(str(dump_path), 'wb') as f:
            tqdm.write('Vertices dump was written to %s' % dump_path)
            pickle.dump(verts, f)
        visualize(str(img_path), img, proc_param, joints[0], verts[0], cams[0],
                  output=str(out_images_dir))
def train(args: Namespace,
          model: torch.nn.Module,
          train_dataset: Dataset,
          train_metrics: SequenceMetrics,
          train_output_composer: OutputComposer,
          valid_dataset: Optional[Dataset] = None,
          valid_metrics: Optional[SequenceMetrics] = None,
          valid_output_composer: Optional[OutputComposer] = None) -> None:
    """Train routine."""
    logger.info("***** Running training *****")
    train_dl, train_eval_dl, valid_dl = prepare_dataloaders(
        args, train_dataset, valid_dataset)
    optimizer, scheduler = prepare_optimizer_and_scheduler(
        args, model, num_batches=len(train_dl))

    # Multi-gpu, distributed and fp16 setup
    if args.fp16:
        try:
            from apex import amp
        except ImportError:
            msg = ("Please install apex from "
                   "https://www.github.com/nvidia/apex to use fp16 training.")
            raise ImportError(msg)
        model, optimizer = amp.initialize(model, optimizer,
                                          opt_level=args.fp16_opt_level)

    # multi-gpu training (should be after apex fp16 initialization)
    if args.n_gpu > 1:
        model = torch.nn.DataParallel(model)

    # Distributed training (should be after apex fp16 initialization)
    if args.local_rank != -1:
        model = torch.nn.parallel.DistributedDataParallel(
            model,
            device_ids=[args.local_rank],
            output_device=args.local_rank,
            find_unused_parameters=True)

    global_step = 0
    train_losses = []
    if valid_dataset:
        min_val_loss = float('inf')

    # Training loop
    try:
        epoch_tqdm = trange(int(args.num_train_epochs), desc="Epoch")
        loss_accum = RunningAccumulator()
        for epoch in epoch_tqdm:
            model.train()
            stats = {}
            train_tqdm = tqdm(train_dl, desc="Iter")
            for step, batch in enumerate(train_tqdm):
                if args.n_gpu == 1:
                    # multi-gpu does scattering itself
                    batch = tuple(t.to(args.device) for t in batch)

                # Unpack batch
                input_ids = batch[0]
                input_mask = batch[1]
                segment_ids = batch[2]
                label_ids = batch[3]
                prediction_mask = batch[4]
                # example_ixs = batch[5]
                # doc_span_ixs = batch[6]

                outs = model(input_ids, segment_ids, input_mask, label_ids,
                             prediction_mask)
                loss = outs['loss']
                if args.n_gpu > 1:
                    loss = loss.mean()  # mean() to average on multi-gpu
                if args.gradient_accumulation_steps > 1:
                    loss = loss / args.gradient_accumulation_steps

                loss_accum.accumulate(loss.item())
                running_mean_loss = loss_accum.mean()
                train_tqdm.set_postfix({'loss': running_mean_loss})

                if args.fp16:
                    with amp.scale_loss(loss, optimizer) as scaled_loss:
                        scaled_loss.backward()
                    clip_grad_norm_(amp.master_params(optimizer),
                                    args.max_grad_norm)
                else:
                    loss.backward()
                    clip_grad_norm_(model.parameters(), args.max_grad_norm)

                if (step + 1) % args.gradient_accumulation_steps == 0:
                    # Perform gradient clipping
                    for group in optimizer.param_groups:
                        for p in group['params']:
                            if p.grad is None:
                                continue
                            clip_grad_norm_(p, 1)
                    # optimizer.step() must precede scheduler.step() in
                    # PyTorch >= 1.1, otherwise the first LR value is skipped
                    optimizer.step()
                    scheduler.step()
                    optimizer.zero_grad()
                    global_step += 1

            train_losses.append(loss_accum.mean())
            stats['loss'] = format_tqdm_metric(train_losses[-1],
                                               float(min(train_losses)),
                                               fmt='{:.3e}')

            # Evaluate train set
            if epoch % 5 == 0 or epoch == args.num_train_epochs - 1:
                trn_epoch_metrics = evaluate(
                    args,
                    model,
                    tqdm(train_eval_dl, desc="Train metrics"),
                    train_output_composer,
                    train_metrics,
                )
                stats['trn_f1'] = format_tqdm_metric(
                    trn_epoch_metrics['f1_score'],
                    train_metrics.get_best('f1_score'),
                    fmt='{:.2%}')
                epoch_tqdm.set_postfix(stats)
                epoch_tqdm.refresh()

            if valid_dataset:
                # Evaluate validation set
                val_epoch_metrics = evaluate(
                    args,
                    model,
                    tqdm(valid_dl, desc="Validation"),
                    valid_output_composer,
                    valid_metrics,
                )
                # Show metrics on tqdm
                if 'loss' in val_epoch_metrics:
                    epoch_val_loss = val_epoch_metrics['loss']
                    min_val_loss = min(min_val_loss, epoch_val_loss)
                    stats['val_loss'] = format_tqdm_metric(epoch_val_loss,
                                                           min_val_loss,
                                                           fmt='{:.3e}')
                stats['val_f1'] = format_tqdm_metric(
                    val_epoch_metrics['f1_score'],
                    valid_metrics.get_best('f1_score'),
                    fmt='{:.2%}')
                best_epoch = valid_metrics.get_best_epoch('f1_score')
                stats['best_epoch'] = best_epoch

                # Save model if best epoch
                if best_epoch == epoch + 1:
                    tqdm.write('Best epoch. Saving model.')
                    save_model(model, args)

                epoch_tqdm.set_postfix(stats)
                epoch_tqdm.refresh()

        # End of training
        if args.valid_file:
            logger.info("  Validation F1 scores: %s",
                        valid_metrics.history['f1_score'])
            best_epoch = valid_metrics.get_best_epoch('f1_score')
            logger.info("  Validation confusion matrix:")
            logger.info("  Epoch %d", best_epoch)
            conf_mat = valid_metrics.get_value("confusion_matrix", best_epoch)
            logger.info("\n" + str(conf_mat))
            logger.info("  Validation classification report:")
            classif_report = valid_metrics.get_value("classification_report",
                                                     best_epoch)
            logger.info("\n" + str(classif_report))

    except KeyboardInterrupt:
        action = ''
        while action.lower() not in ('y', 'n'):
            action = input('\nInterrupted. Continue execution to save model '
                           'weights? [Y/n]')
        if action.lower() == 'n':
            sys.exit()

    if not valid_dataset:
        # If not using a valid dataset, save the model of the last epoch
        logger.info('Saving model from last epoch.')
        save_model(model, args)

    if args.results_file:
        # Append this run's results
        write_jsonl_results(
            compile_results(args, train_metrics, valid_metrics,
                            train_losses=train_losses),
            args.results_file,
        )
def pretrain_with_clusters(
        model,
        X_train,
        y_train,
        X_test,
        y_test,
        num,
        samples,
        epochs,
        iter_train,
        num_inference,
        batch=False,
        verbose=1,
        save=None,
):
    tqdm.write(
        "{:<10} {:<10} {:<10} {:<10} {:<10} {:<10} {:<10} {:<10} {:<10} "
        "{:<10} {:<10}".format(
            "epoch", "loss", "likelih", "z-prior", "y-prior", "trAMI",
            "teAMI", "trPUR", "tePUR", "attch_te", "nent",
        ))
    y_ohe = OneHotEncoder()
    y_train_ohe = np.array(
        y_ohe.fit_transform(y_train.reshape(-1, 1)).todense())
    y_test_ohe = np.array(y_ohe.transform(y_test.reshape(-1, 1)).todense())
    for i in range(epochs):
        # Set up the dataset
        dataset_train = (tf.data.Dataset.from_tensor_slices(
            (X_train, y_train_ohe)).repeat(iter_train).shuffle(
                X_train.shape[0]).batch(num))
        for x, y in dataset_train:
            model.pretrain_categories_step(x, y, samples=samples)
        if i % verbose == 0:
            # Evaluate training metrics
            recon, z_ent, y_ent = chain_call(model.entropy_fn, X_train,
                                             num_inference)
            recon = np.array(recon).mean()
            z_ent = np.array(z_ent).mean()
            y_ent = np.array(y_ent).mean()
            loss = -(recon + z_ent + y_ent)
            idx_tr = chain_call(model.predict, X_train,
                                num_inference).argmax(1)
            idx_te = chain_call(model.predict, X_test,
                                num_inference).argmax(1)
            ami_tr = adjusted_mutual_info_score(y_train, idx_tr,
                                                average_method="arithmetic")
            ami_te = adjusted_mutual_info_score(y_test, idx_te,
                                                average_method="arithmetic")
            attch_te = np.array(np.unique(
                idx_te, return_counts=True)[1]).max() / len(idx_te)
            purity_train = purity_score(y_train, idx_tr)
            purity_test = purity_score(y_test, idx_te)
            tqdm.write("{:10d} {:10.5f} {:10.5f} {:10.5f} {:10.5f} "
                       "{:10.5f} {:10.5f} "
                       "{:10.5f} {:10.5f} "
                       "{:10.2f} {:10.5f}".format(
                           i, loss, recon, z_ent, y_ent, ami_tr, ami_te,
                           purity_train, purity_test, attch_te, np.nan,
                       ))
            if save is not None:
                model.save_weights(save, save_format="tf")
def train(
        model,
        X_train,
        y_train,
        X_test,
        y_test,
        num,
        samples,
        epochs,
        iter_train,
        num_inference,
        batch=False,
        verbose=1,
        save=None,
        temperature_function=lambda _: 0.5,
        save_results=None,
        beta_z_method=lambda: 1.0,
        beta_y_method=lambda: 1.0,
        beta_d_method=lambda: 1.0,
        tensorboard="./logs",
):
    summary_writer = tf.summary.create_file_writer(tensorboard)
    if save_results is not None:
        header_str = (
            "{:<10}\t{:<10}\t{:<10}\t{:<10}\t{:<10}\t{:<10}\t{:<10}\t"
            "{:<10}\t{:<10}\t{:<10}\t{:<10}\t"
            "{:<10}\t{:<10}\t{:<10}").format(
                "epoch", "beta_z", "beta_y", "loss", "gan_ent", "likelih",
                "z-prior", "y-prior", "trAMI", "teAMI", "trPUR", "tePUR",
                "attch_te", "temp",
            )
        save_results = os.path.join(os.path.abspath(save_results))
        if not os.path.exists(save_results):
            with open(save_results, "w") as results_file:
                results_file.write(header_str)
        tqdm.write(header_str)
    for i in range(epochs):
        # Set up the dataset
        dataset_train = (tf.data.Dataset.from_tensor_slices(X_train).repeat(
            iter_train).shuffle(X_train.shape[0]).batch(num))
        iter = model.cooling_distance
        beta_z = beta_z_method()
        beta_y = beta_y_method()
        beta_d = beta_d_method()
        if temperature_function is not None:
            temp = temperature_function(iter)
        else:
            temp = 1.0
        for x in dataset_train:
            model.train_step(
                x,
                samples=samples,
                temperature=temp,
                beta_z=beta_z,
                beta_y=beta_y,
                beta_d=beta_d,
                gradient_clip=model.gradient_clip,
            )
        if i % verbose == 0:
            # Evaluate training metrics
            recon, z_ent, y_ent, desc_ent = chain_call(model.entropy_fn,
                                                       X_train, num_inference)
            recon = np.array(recon).mean()
            z_ent = np.array(z_ent).mean()
            y_ent = np.array(y_ent).mean()
            d_ent = np.array(desc_ent).mean()
            loss = -(recon + z_ent + y_ent)
            idx_tr = chain_call(model.predict, X_train,
                                num_inference).argmax(1)
            idx_te = chain_call(model.predict, X_test,
                                num_inference).argmax(1)
            ami_tr = adjusted_mutual_info_score(y_train, idx_tr,
                                                average_method="arithmetic")
            ami_te = adjusted_mutual_info_score(y_test, idx_te,
                                                average_method="arithmetic")
            attch_te = np.array(np.unique(
                idx_te, return_counts=True)[1]).max() / len(idx_te)
            purity_train = purity_score(y_train, idx_tr)
            purity_test = purity_score(y_test, idx_te)
            value_str = ("{:d}\t{:10.5f}\t{:10.5f}\t{:10.5f}\t"
                         "{:10.5f}\t{:10.5f}\t{:10.5f}\t{:10.5f}\t"
                         "{:10.5f}\t{:10.5f}\t"
                         "{:10.5f}\t{:10.5f}\t"
                         "{:10.2f}\t{:10.5f}".format(
                             iter, beta_z, beta_y, loss, d_ent, recon, z_ent,
                             y_ent, ami_tr, ami_te, purity_train, purity_test,
                             attch_te, temp,
                         ))
            if save_results is not None:
                with open(save_results, "a") as results_file:
                    results_file.write("\n" + value_str)
            tqdm.write(value_str)
            model.increment_cooling()
            # plot latent space
            latent_vectors = chain_call(model.latent_sample, X_test,
                                        num_inference)
            plt_latent_true = plot_latent(latent_vectors, y_test, idx_te)
            with summary_writer.as_default():
                tf.summary.scalar("beta_z", beta_z, step=iter)
                tf.summary.scalar("beta_y", beta_y, step=iter)
                tf.summary.scalar("loss", loss, step=iter)
                tf.summary.scalar("gan_entropy", d_ent, step=iter)
                tf.summary.scalar("likelihood", recon, step=iter)
                tf.summary.scalar("z_prior_entropy", z_ent, step=iter)
                tf.summary.scalar("y_prior_entropy", y_ent, step=iter)
                tf.summary.scalar("ami_train", ami_tr, step=iter)
                tf.summary.scalar("ami_test", ami_te, step=iter)
                tf.summary.scalar("purity_train", purity_train, step=iter)
                tf.summary.scalar("purity_test", purity_test, step=iter)
                tf.summary.scalar("max_cluster_attachment_test", attch_te,
                                  step=iter)
                tf.summary.image("latent", plot_to_image(plt_latent_true),
                                 step=iter)
def train(
        model,
        X_train,
        X_test,
        num,
        samples,
        epochs,
        iter_train,
        num_inference,
        batch=False,
        verbose=1,
        save=None,
        save_results=None,
        beta_z_method=lambda: 1.0,
        tensorboard="./logs",
):
    if tensorboard is not None:
        summary_writer = tf.summary.create_file_writer(tensorboard)
    if save_results is not None:
        header_str = ("{:<10}\t{:<10}\t{:<10}\t{:<10}\t{:<10}").format(
            "epoch", "beta_z", "loss", "likelih", "z-prior",
        )
        save_results = os.path.join(os.path.abspath(save_results))
        if not os.path.exists(save_results):
            with open(save_results, "w") as results_file:
                results_file.write(header_str)
        tqdm.write(header_str)
    for i in range(epochs):
        # Set up the dataset
        dataset_train = (tf.data.Dataset.from_tensor_slices(X_train).repeat(
            iter_train).shuffle(X_train.shape[0]).batch(num))
        iter = model.cooling_distance
        beta_z = beta_z_method()
        for x in dataset_train:
            model.train_step(
                # random binarization of the input
                tf.cast(
                    tf.greater(
                        tf.cast(x, tf.float64),
                        tf.cast(tf.random.uniform(tf.shape(x), 0, 1), x.dtype),
                    ),
                    x.dtype,
                ),
                samples=samples,
                batch=batch,
                beta_z=beta_z,
            )
        if i % verbose == 0:
            # Evaluate training metrics
            recon, z_ent = chain_call(model.entropy_fn, X_train,
                                      num_inference)
            recon = np.array(recon).mean()
            z_ent = np.array(z_ent).mean()
            loss = -(recon + z_ent)
            value_str = "{:d}\t{:10.5f}\t{:10.5f}\t{:10.5f}\t{:10.5f}".format(
                int(model.cooling_distance),
                beta_z,
                loss,
                recon,
                z_ent,
            )
            if save_results is not None:
                with open(save_results, "a") as results_file:
                    results_file.write("\n" + value_str)
            tqdm.write(value_str)
            if save is not None:
                model.save_weights(save, save_format="tf")
            model.increment_cooling()
            if tensorboard is not None:
                # plot latent space
                latent_vectors = chain_call(model.latent_sample, X_test,
                                            num_inference)
                # NOTE: y_test and idx_te are not parameters of this function;
                # they are assumed to exist in the enclosing scope (apparently
                # a leftover from the supervised variant above).
                plt_latent_true = plot_latent(latent_vectors, y_test, idx_te)
                with summary_writer.as_default():
                    tf.summary.scalar("beta_z", beta_z, step=iter)
                    tf.summary.scalar("loss", loss, step=iter)
                    tf.summary.scalar("likelihood", recon, step=iter)
                    tf.summary.scalar("z_prior_entropy", z_ent, step=iter)
                    tf.summary.image("latent", plot_to_image(plt_latent_true),
                                     step=iter)
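# Sketch (an assumption, not from the source) of a warm-up schedule usable as
# beta_z_method: a zero-argument callable that anneals beta from 0 to 1 over
# its first `warmup` invocations, matching the shape of the `lambda: 1.0`
# default above.
def make_beta_warmup(warmup=50):
    state = {'calls': 0}

    def beta_z():
        b = min(1.0, state['calls'] / warmup)
        state['calls'] += 1
        return b

    return beta_z

# e.g. train(model, X_train, X_test, num=64, samples=1, epochs=100,
#            iter_train=1, num_inference=1000,
#            beta_z_method=make_beta_warmup(20))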